inherit .Base;

// Database connection and the name of the table backing this queue.
Sql.Sql db;
// NOTE(review): url is declared here but is not assigned anywhere in
// the code visible in this section.
string url, table;

Web.Crawler.Stats stats;
Web.Crawler.Policy policy;
Web.Crawler.RuleSet allow, deny;

inherit Web.Crawler.Queue;

//! Returns the hex encoded MD5 digest of the UTF-8 encoded @[url].
//! The result is used as the lookup key (column uri_md5) in the table.
static string to_md5(string url)
{
  Crypto.MD5 hasher = Crypto.MD5();
  string encoded = string_to_utf8(url);
  hasher->update(encoded);
  string digest = hasher->digest();
  return String.string2hex(digest);
}

//! Stores the crawler collaborators, connects to the database given by
//! @[_url] and makes sure the table @[_table] exists.
//!
//! @param _allow
//! @param _deny
//!   Optional rule sets, later handed to check_link() by add_uri().
void create( Web.Crawler.Stats _stats,
             Web.Crawler.Policy _policy,
             string _url, string _table,
             void|Web.Crawler.RuleSet _allow,
             void|Web.Crawler.RuleSet _deny)
{
  stats = _stats;
  policy = _policy;

  allow = _allow;
  deny = _deny;

  // The table name and connection must be in place before the table
  // is created.
  table = _table;
  db = Sql.Sql( _url );
  perhaps_create_table( );
}

//! Creates the queue table unless it already exists.
static void perhaps_create_table( )
{
  string create_query =
#"
    create table IF NOT EXISTS "+table+#" (
        uri        blob not null,
        uri_md5    char(32) not null default '',
        template   varchar(255) not null default '',
        md5        char(32) not null default '',
        recurse    tinyint not null,
        stage      tinyint not null,
        UNIQUE(uri_md5),
        INDEX stage   (stage)
        )
    ";
  db->query( create_query );
}
// Cache of has_uri() results, keyed on the URI string.
static mapping hascache = ([]);

//! Throws away every cached has_uri() result.
void clear_cache()
{
  hascache = ([]);
}
//! Returns non-zero if @[uri] has a row in the queue table.
//!
//! Non-zero answers are served from @[hascache] on later calls; a zero
//! answer is looked up in the database again every time.
static int has_uri( string|Standards.URI uri )
{
  string key = (string)uri;

  // Keep the cache from growing without bound.
  if( sizeof(hascache) > 100000 )
    hascache = ([]);

  int known = hascache[key];
  if( !known )
  {
    array rows = db->query("select stage from "+table+" where uri_md5=%s",
                           to_md5(key));
    known = hascache[key] = sizeof(rows);
  }
  return known;
}

//! Normalizes @[uri] and adds it to the queue.
//!
//! The URI is only considered if @[force] is set or check_link()
//! accepts the original (not normalized) @[uri]. If the normalized URI
//! is already in the table, it is left alone unless @[force] is set, in
//! which case its stage is reset to 0 and its recurse flag is updated.
//!
//! @param recurse
//!   Stored in the recurse column of the row.
//! @param template
//!   Stored in the template column; zero is stored as "".
void add_uri( Standards.URI uri, int recurse, string template, void|int force )
{
  // The language is encoded in the fragment.
  Standards.URI r = Standards.URI( (string)uri );

  if( r->query )
    r->query = normalize_query( r->query );
  if( r->query && !strlen(r->query) )
    r->query = 0;

  // Remove any trailing index filename, keeping the slash before it.
  // Testing for "/" + index as a suffix (instead of indexing into the
  // reversed path) avoids an index-out-of-range error when the path
  // consists of nothing but the index filename.
  // FIXME: Make these configurable?
  string path = r->path;
  foreach( ({"index.xml", "index.html", "index.htm"}), string index )
    if( has_suffix(path, "/" + index) )
      path = path[..sizeof(path) - sizeof(index) - 1];

  r->path = combine_path(path);

  if( !force && !check_link(uri, allow, deny) )
    return;

  if( has_uri(r) )
  {
    // FIXME:
    // Race condition:
    // If a url is forced to be indexed *while* it's being indexed,
    // and it's changed since the indexing started, setting the stage
    // to 0 here might be worthless, since it could be overwritten before
    // it's fetched again.
    if( force )
    {
      set_stage(r, 0);
      set_recurse(r, recurse);
    }
    return;
  }

  // There's a race condition between the select query in has_uri()
  // and this query, so we ignore duplicate key errors from MySQL
  // by using the "ignore" keyword.
  string normalized = (string)r;
  db->query( "insert ignore into "+table+
             " (uri,uri_md5,recurse,template) values (%s,%s,%d,%s)",
             string_to_utf8(normalized),
             to_md5(normalized), recurse, (template||"") );
}

//! Records @[md5] for @[uri], both in the cached @[extra_data] entry
//! (when there is one) and in the md5 column of the table.
void set_md5( Standards.URI uri, string md5 )
{
  string key = (string)uri;

  if( extra_data[key] )
    extra_data[key]->md5 = md5;

  db->query( "update "+table+
             " set md5=%s WHERE uri_md5=%s", md5, to_md5(key) );
}
//! Updates the recurse flag for @[uri], both in the cached
//! @[extra_data] entry (when there is one, stored as a string) and in
//! the recurse column of the table.
void set_recurse( Standards.URI uri, int recurse )
{
  string key = (string)uri;

  if( extra_data[key] )
    extra_data[key]->recurse = (string)recurse;

  db->query( "update "+table+
             " set recurse=%d WHERE uri_md5=%s", recurse, to_md5(key));
}
// Rows for the URIs most recently fetched by get(), keyed on the
// decoded URI string.
mapping(string:mapping(string:string)) extra_data = ([]);

//! Returns the md5, recurse, stage and template values for @[uri].
//! A cached row in @[extra_data] is preferred; otherwise the table is
//! queried. Returns zero when the URI is not found.
mapping get_extra( Standards.URI uri )
{
  string key = (string)uri;

  mapping cached = extra_data[key];
  if( cached )
    return cached;

  array rows = db->query( "SELECT md5,recurse,stage,template "
                          "FROM "+table+" WHERE uri_md5=%s", to_md5(key) );
  if( sizeof( rows ) )
    return rows[0];

  return 0;
}

static int empty_count;
static int retry_count;

// cache, for performance reasons.
static array possible=({});
static int p_c;

//! Returns the next URI to fetch and marks it as stage 1.
//!
//! @returns
//!   A @[Standards.URI] to fetch, -1 if nothing can be handed out right
//!   now, or 0 once the queue is considered finished.
int|Standards.URI get()
{
  if( stats->concurrent_fetchers() > policy->max_concurrent_fetchers )
    return -1;

  // Refill the batch of candidates once the current one is used up.
  if( p_c >= sizeof( possible ) )
  {
    p_c = 0;
    possible = db->query( "select * from "+table+" where stage=0 limit 20" );
    array(string) decoded = map( possible->uri, utf8_to_string );
    extra_data = mkmapping( decoded, possible );
    possible = map( possible->uri, utf8_to_string );
  }

  while( p_c < sizeof( possible ) )
  {
    empty_count = 0;

    // Slots that have already been handed out are zeroed; skip them.
    if( !possible[ p_c ] )
    {
      p_c++;
      continue;
    }

    int slot = p_c++;
    Standards.URI candidate = Standards.URI( possible[slot] );

    if( stats->concurrent_fetchers( candidate->host ) >
        policy->max_concurrent_fetchers_per_host )
    {
      retry_count++;
      continue; // not this host..
    }

    possible[slot] = 0;
    retry_count = 0;
    set_stage( candidate, 1 );
    return candidate;
  }

  if( stats->concurrent_fetchers() )
    return -1;

  // This is needed for the following race condition scenario:
  // 1. The queue contains one page
  // 2. The crawler indexes the page
  // 3a. In thread/process A, document filtering and fetching is done, and
  //     links are found
  // 3b. In thread/process B, queue->get() returns 0 since the queue doesn't
  //     contain any more pages to crawl.
  //
  // The workaround is to wait 40 cycles (i.e. 4 seconds) after fetching
  // the last page.
  if( empty_count++ <= 40 )
    return -1;

  if( num_with_stage( 2 ) || num_with_stage( 3 ) )
  {
    empty_count = 0;
    return -1;
  }
  return 0;
}
//! Returns the URIs in the table as @[Standards.URI] objects.
//!
//! @param stage
//!   When non-zero, only URIs in that stage are returned. Zero (or
//!   omitted) returns every URI, so stage 0 cannot be selected on its
//!   own through this function.
array(Standards.URI) get_uris(void|int stage)
{
  array rows;

  if (!stage)
    rows = db->query( "select * from "+table );
  else
    rows = db->query( "select * from "+table+" where stage=%d", stage );

  array decoded = map(rows->uri, utf8_to_string);
  return map(decoded, Standards.URI);
}
//! Adds one URI, or every URI in an array, to the queue through
//! add_uri() with recurse set to 1 and no template.
void put(string|array(string)|Standards.URI|array(Standards.URI) uri)
{
  if(arrayp(uri))
  {
    foreach(uri, string|object entry)
      put(entry);
    return;
  }

  Standards.URI target = objectp(uri) ? uri : Standards.URI(uri);
  add_uri( target, 1, 0 );
}

//! Deletes every row in the table and resets the has_uri() cache.
void clear()
{
  hascache = ([ ]);
  db->query("delete from "+table);
}
//! Removes @[uri] from the table and clears its has_uri() cache entry.
void remove_uri(string|Standards.URI uri)
{
  string key = (string)uri;

  hascache[key] = 0;
  db->query("delete from "+table+" where uri_md5=%s", to_md5(key));
}
b575b82001-07-05Johan Schön 
//! Removes every URI that starts with @[uri] from the table and clears
//! the matching has_uri() cache entries.
void remove_uri_prefix(string|Standards.URI uri)
{
  string uri_string = (string)uri;

  foreach(indices(hascache), string _uri)
    if(has_prefix(_uri, uri_string))
      hascache[_uri]=0;

  // "%" and "_" are wildcards in a LIKE pattern and both are common in
  // URIs (percent encoding, underscores in paths). Escape them, and the
  // escape character itself, so that only real prefix matches are
  // deleted. The trailing "%" is the only intended wildcard.
  string pattern = replace(uri_string,
                           ({ "\\",   "%",   "_"   }),
                           ({ "\\\\", "\\%", "\\_" })) + "%";

  db->query("delete from "+table+" where uri like %s", pattern);
}
//! Moves every URI that is in any of the given @[stages] back to
//! stage 0.
void clear_stage( int ... stages )
{
  foreach( stages, int s )
    db->query( "update "+table+" set stage=0 where stage=%d", s );
}

//! Blanks the stored md5 for every URI in any of the given @[stages].
void clear_md5( int ... stages )
{
  foreach( stages, int s )
    db->query( "update "+table+" set md5='' where stage=%d", s );
}

//! Returns the number of URIs whose stage is one of @[stage].
//! Returns 0 when no stages are given.
int num_with_stage( int ... stage )
{
  // An empty IN () list is a syntax error, and nothing can match it.
  if( !sizeof(stage) )
    return 0;

  // The stage list is written straight into the statement. Passing it
  // as a %s argument would bind it as one quoted string ('2,3'), which
  // does not work as a list of values. The values are forced to
  // integers first, so nothing but digits and commas ends up in the
  // query text.
  string stage_list = ((array(string))(array(int))stage) * ",";

  return (int)
    db->query( "select COUNT(*) as c from "+table+
               " where stage IN ("+stage_list+")" )[ 0 ]->c;
}

//! Sets the stage column for @[uri] to @[stage].
void set_stage( Standards.URI uri, int stage )
{
  db->query( "update "+table+" set stage=%d where uri_md5=%s",stage,
             to_md5((string)uri));
}
086f552001-08-14Johan Schön 
//! Returns the stage stored for @[uri], or -1 if the URI is not in the
//! table.
int get_stage( Standards.URI uri )
{
  array rows = db->query( "select stage from "+table+" where uri_md5=%s",
                          to_md5((string)uri));
  return sizeof(rows) ? (int)rows[0]->stage : -1;
}