inherit .Base;

// Handle to the SQL database that stores the queue table.
Sql.Sql db;

// url:   the database URL that was handed to create().
// table: name of the table holding the queue.
string url, table;

// Crawler statistics, crawl policy and the allow/deny rule sets that
// were handed to create().
Web.Crawler.Stats stats;
Web.Crawler.Policy policy;
Web.Crawler.RuleSet allow, deny;

inherit Web.Crawler.Queue;

// Returns the hex encoded MD5 digest of the UTF-8 encoding of url.
// The result is what is stored in, and looked up through, the
// uri_md5 column.
static string to_md5(string url)
{
  object md5 = Crypto.md5();
  md5->update(string_to_utf8(url));
  return Crypto.string_to_hex(md5->digest());
}

// Sets up the queue.
//
// _stats, _policy: stored in stats and policy.
// _url:            database URL; used to open the connection in db.
// _table:          name of the queue table; created if it is missing.
// _allow, _deny:   optional rule sets, stored in allow and deny.
void create( Web.Crawler.Stats _stats,
             Web.Crawler.Policy _policy,
             string _url, string _table,
             void|Web.Crawler.RuleSet _allow,
             void|Web.Crawler.RuleSet _deny)
{
  stats = _stats;
  policy = _policy;
  allow = _allow;
  deny = _deny;
  // Remember the database URL as well; the url variable used to be
  // declared but never assigned.
  url = _url;
  table = _table;

  db = Sql.Sql( _url );
  perhaps_create_table( );
}

// Creates the queue table unless it already exists.
static void perhaps_create_table( )
{
  db->query(
#"
    create table IF NOT EXISTS "+table+#" (
        uri blob not null,
        uri_md5 char(32) not null default '',
        template varchar(255) not null default '',
        md5 char(32) not null default '',
        recurse tinyint not null,
        stage tinyint not null,
        UNIQUE(uri_md5),
        INDEX stage (stage)
        )
    ");
}
// Remembers the answers given by has_uri(), indexed on the URI string.
static mapping hascache = ([]);

// Throws away everything remembered in hascache.
void clear_cache()
{
  hascache = ([]);
}
// Returns non-zero if uri has a row in the queue table.
//
// Non-zero answers are remembered in hascache. The cache is thrown
// away wholesale once it holds more than 100000 entries.
static int has_uri( string|Standards.URI uri )
{
  uri = (string)uri;
  if( sizeof(hascache) > 100000 )
    hascache = ([]);
  return hascache[uri]||
    (hascache[uri]=
     sizeof(db->query("select stage from "+table+" where uri_md5=%s",
                      to_md5(uri))));
}

// Adds uri to the queue after normalizing it.
//
// uri:      the URI to add.
// recurse:  stored in the recurse column of a newly inserted row.
// template: stored in the template column; 0 is stored as "".
// force:    if set, the check_link() test is skipped, and a URI that is
//           already in the queue gets its stage reset to 0.
void add_uri( Standards.URI uri, int recurse, string template, void|int force )
{
  // The language is encoded in the fragment.
  Standards.URI r = Standards.URI( (string)uri );
  if( r->query )
    r->query = normalize_query( r->query );
  if(r->query && !strlen(r->query))
    r->query = 0;

  // Remove any trailing index filename. The path is reversed so that
  // the filename can be matched at position 0. The filename must be
  // preceded by a "/", i.e. be a complete path segment; otherwise a
  // path such as "/myindex.html" would be truncated to "/my".
  string rpath=reverse(r->path);
  // FIXME: Make these configurable?
  foreach( ({"index.xml", "index.html", "index.htm"}), string index)
    if(search(rpath,reverse(index))==0 &&
       // A range is used instead of a plain index since it is safe
       // when rpath is no longer than the index filename.
       rpath[sizeof(index)..sizeof(index)]=="/")
      rpath=rpath[sizeof(index)..];
  r->path=reverse(rpath);
  r->path = combine_path(r->path);

  if( force || check_link(uri, allow, deny) )
  {
    if(has_uri(r))
    {
      // FIXME:
      // Race condition:
      // If a url is forced to be indexed *while* it's being indexed,
      // and it's changed since the indexing started, setting the stage
      // to 0 here might be worthless, since it could be overwritten before
      // it's fetched again.
      if(force)
        set_stage(r, 0);
    }
    else
      db->query( "insert into "+table+
                 " (uri,uri_md5,recurse,template) values (%s,%s,%d,%s)",
                 string_to_utf8((string)r),
                 to_md5((string)r), recurse, (template||"") );
  }
}

// Stores md5 in the md5 column of the row belonging to uri.
void set_md5( Standards.URI uri, string md5 )
{
  db->query( "update "+table+
             " set md5=%s WHERE uri_md5=%s", md5, to_md5((string)uri) );
}

// The rows of the batch most recently fetched by get(), indexed on
// the decoded URI string.
mapping(string:mapping(string:string)) extra_data = ([]);

// Returns the stored data for uri: the complete row if the URI is part
// of the batch kept in extra_data, otherwise the md5, recurse, stage
// and template columns read from the database. Returns 0 if the URI
// is not found.
mapping get_extra( Standards.URI uri )
{
  if( extra_data[(string)uri] )
    return extra_data[(string)uri];
  array r = db->query( "SELECT md5,recurse,stage,template "
                       "FROM "+table+" WHERE uri_md5=%s", to_md5((string)uri) );
  if( sizeof( r ) )
    return r[0];

  return 0;
}

static int empty_count;
static int retry_count;

// cache, for performance reasons.
static array possible=({});
static int p_c;

// Returns the next URI to fetch, -1 if nothing can be handed out right
// now, or 0 when the queue is considered finished.
int|Standards.URI get()
{
  if(stats->concurrent_fetchers() > policy->max_concurrent_fetchers)
    return -1;

  // The current batch is used up; fetch a new one.
  if( sizeof( possible ) <= p_c )
  {
    p_c = 0;
    possible = db->query( "select * from "+table+" where stage=0 limit 20" );
    extra_data = mkmapping( map(possible->uri,utf8_to_string), possible );
    possible = map(possible->uri,utf8_to_string);
  }

  while( sizeof( possible ) > p_c )
  {
    empty_count=0;
    if( possible[ p_c ] )
    {
      Standards.URI uri = Standards.URI( possible[p_c++] );

      if( stats->concurrent_fetchers( uri->host ) >
          policy->max_concurrent_fetchers_per_host )
      {
        retry_count++;
        continue; // not this host..
      }
      // Mark the entry as handed out.
      possible[p_c-1] = 0;
      retry_count=0;
      set_stage( uri, 1 );
      return uri;
    }
    p_c++;
    continue;
  }

  if( stats->concurrent_fetchers() )
  {
    return -1;
  }

  // This is needed for the following race condition scenario:
  // 1. The queue contains one page
  // 2. The crawler indexes the page
  // 3a. In thread/process A, document filtering and fetching is done, and
  //     links are found
  // 3b. In thread/process B, queue->get() returns 0 since the queue doesn't
  //     contain any more pages to crawl.
  //
  // The workaround is to wait 40 cycles (i.e. 4 seconds) after fetching the
  // last page.
  if( empty_count++ > 40 )
  {
    if( num_with_stage( 2 ) || num_with_stage( 3 ) )
    {
      empty_count=0;
      return -1;
    }
    return 0;
  }
  return -1;
}

// Adds one URI, or every URI in an array, to the queue through
// add_uri() with recurse set to 1 and no template.
void put(string|array(string)|Standards.URI|array(Standards.URI) uri)
{
  if(arrayp(uri))
  {
    foreach(uri, string|object _uri)
      put(_uri);
    return;
  }
  if(!objectp(uri))
    uri=Standards.URI(uri);

  add_uri( uri, 1, 0 );
}

// Removes every row from the queue table and empties hascache.
void clear()
{
  hascache = ([ ]);
  db->query("delete from "+table);
}
// Removes uri from the queue table and zeroes its entry in hascache.
void remove_uri(string|Standards.URI uri)
{
  string key = (string)uri;
  hascache[key] = 0;
  db->query("delete from "+table+" where uri_md5=%s", to_md5(key));
}
// Sets the stage to 0 for every row whose stage is one of stages.
void clear_stage( int ... stages )
{
  foreach( stages, int s )
    db->query( "update "+table+" set stage=0 where stage=%d", s );
}

// Sets the md5 column to '' for every row whose stage is one of stages.
void clear_md5( int ... stages )
{
  foreach( stages, int s )
    db->query( "update "+table+" set md5='' where stage=%d", s );
}

// Returns the number of rows whose stage is any of the given stages.
// Returns 0 without querying the database if no stage is given.
int num_with_stage( int ... stage )
{
  // An empty IN () list is not valid SQL.
  if( !sizeof( stage ) )
    return 0;

  // The list is put into the query text directly. Passing it through a
  // %s placeholder would turn it into one quoted string, e.g.
  // IN ('2,3'), which is not a list of stages. The values are
  // integers, so no quoting is needed.
  return (int)
    db->query( "select COUNT(*) as c from "+table+
               " where stage IN ("+((array(string))stage)*","+")" )[ 0 ]->c;
}

// Sets the stage column to stage for the row belonging to uri.
void set_stage( Standards.URI uri, int stage )
{
  db->query( "update "+table+" set stage=%d where uri_md5=%s",stage,
             to_md5((string)uri));
}
086f552001-08-14Johan Schön 
// Returns the stage stored for uri, or -1 if the URI has no row in
// the queue table.
int get_stage( Standards.URI uri )
{
  array rows = db->query( "select stage from "+table+" where uri_md5=%s",
                          to_md5((string)uri) );
  if( !sizeof(rows) )
    return -1;
  return (int)rows[0]->stage;
}