d629562011-11-05Martin Nilsson #pike __REAL_VERSION__
b575b82001-07-05Johan Schön inherit .Base;
//! @[Search] crawler state stored in a @[Mysql] database.
// Sql.Sql URL of the database, and the name of the table holding the queue.
string url;
string table;

// Holds one database connection per thread.
protected Thread.Local _db = Thread.Local();

//! Getter for the database connection belonging to the calling thread.
//!
//! The connection cached for this thread is reused as long as its
//! @expr{ping()@} returns zero; otherwise a new connection to @[url]
//! is opened and cached.
Sql.Sql `db()
{
  // NB: We need to have a thread local connection,
  //     since the status functions may get called
  //     from some other thread while we're busy
  //     performing sql queries elsewhere.
  Sql.Sql conn = _db->get();
  if (!conn || conn->ping()) {
    // Nothing cached for this thread yet, or ping() reported something
    // other than zero: replace the cached connection.
    conn = _db->set(Sql.Sql(url));
  }
  return conn;
}

// Crawler collaborators handed to create().
Web.Crawler.Stats stats;
Web.Crawler.Policy policy;
Web.Crawler.RuleSet allow, deny;
// Returns the lowercase hexadecimal MD5 digest of the UTF-8 encoding
// of url. This is the value stored in (and looked up through) the
// uri_md5 column.
protected string to_md5(string url)
{
  string digest = Crypto.MD5()->update(string_to_utf8(url))->digest();
  return String.string2hex(digest);
}
//! @param _stats
//!   Stored in @[stats].
//!
//! @param _policy
//!   Stored in @[policy].
//!
//! @param _url
//!   @[Sql.Sql] URL for the database to store the queue.
//!
//! @param _table
//!   @[Sql.Sql] table name to store the queue in.
//!
//!   If the table doesn't exist it will be created.
//!
//! @param _allow
//! @param _deny
//!   Optional rule sets, stored in @[allow] and @[deny].
void create( Web.Crawler.Stats _stats, Web.Crawler.Policy _policy,
             string _url, string _table,
             void|Web.Crawler.RuleSet _allow,
             void|Web.Crawler.RuleSet _deny)
{
  // Connection parameters first; the db getter needs url, and
  // perhaps_create_table() needs table.
  url = _url;
  table = _table;

  stats = _stats;
  policy = _policy;
  allow = _allow;
  deny = _deny;

  perhaps_create_table();
}
// Creates the queue table if it does not exist, and makes sure that an
// index named "uri" is present on it (it is added with ALTER TABLE when
// SHOW INDEX does not list it).
protected void perhaps_create_table( )
{
  db->query(
#"
  create table IF NOT EXISTS "+table+#" (
  uri blob not null,
  uri_md5 char(32) not null default '',
  template varchar(255) not null default '',
  md5 char(32) not null default '',
  recurse tinyint not null,
  stage tinyint not null,
  UNIQUE(uri_md5),
  INDEX stage (stage),
  INDEX uri (uri(255))
  )
  ");

  // The CREATE above is a no-op for a table that already exists, so
  // check the index list separately.
  multiset index_names =
    (multiset)db->query("SHOW INDEX FROM " + table)->Key_name;
  if (index_names["uri"])
    return;

  db->query("ALTER TABLE " + table +
            " ADD INDEX uri (uri(255))");
}
// Cache used by has_uri(): maps URI strings to the (non-zero) result of
// an earlier lookup.
protected mapping hascache = ([]);

//! Discards all entries in the URI lookup cache.
void clear_cache()
{
  hascache = ([ ]);
}
// Returns non-zero if uri has a row in the queue table.
//
// Non-zero answers are remembered in hascache; a zero answer is stored
// too, but since it is false the database is asked again next time.
// The cache is emptied once it holds more than 100000 entries.
protected int has_uri( string|Standards.URI uri )
{
  uri = (string)uri;
  if( sizeof(hascache) > 100000 )
    hascache = ([]);
  return hascache[uri] ||
    (hascache[uri] =
     sizeof(db->query("select stage from "+table+" where uri_md5=%s",
                      to_md5(uri))));
}

//! Adds @[uri] to the queue after normalizing it.
//!
//! The query part is passed through @expr{normalize_query()@} (an empty
//! query is dropped), a trailing @expr{index.xml@}, @expr{index.html@}
//! or @expr{index.htm@} path segment is removed, and the path is run
//! through @[combine_path()].
//!
//! @param uri
//!   The URI to add.
//! @param recurse
//!   Value stored in the @expr{recurse@} column.
//! @param template
//!   Value stored in the @expr{template@} column (zero is stored as
//!   the empty string).
//! @param force
//!   If set, the @expr{check_link()@} test against @[allow] and @[deny]
//!   is skipped, and a URI that is already in the queue gets its stage
//!   reset to @expr{0@} and its recurse value updated.
void add_uri( Standards.URI uri, int recurse, string template, void|int force )
{
  // The language is encoded in the fragment.
  Standards.URI r = Standards.URI( (string)uri );
  if( r->query )
    r->query = normalize_query( r->query );
  if( r->query && !strlen(r->query) )
    r->query = 0;

  // Remove any trailing index filename, keeping the "/" before it.
  //
  // This tests for the suffix "/<index>" directly. The previous
  // reversed-string test indexed one position past the end of the path
  // (and threw) when the path was exactly an index filename with no
  // leading "/".
  //
  // FIXME: Make these configurable?
  foreach( ({"index.xml", "index.html", "index.htm"}), string index )
    if( has_suffix(r->path, "/" + index) )
    {
      r->path = r->path[..sizeof(r->path) - sizeof(index) - 1];
      // The path now ends with "/", so no other index name can match.
      break;
    }

  r->path = combine_path(r->path);

  if( force || check_link(uri, allow, deny) )
  {
    if(has_uri(r))
    {
      // FIXME:
      // Race condition:
      // If a url is forced to be indexed *while* it's being indexed,
      // and it's changed since the indexing started, setting the stage
      // to 0 here might be worthless, since it could be overwritten before
      // it's fetched again.
      if(force) {
        set_stage(r, 0);
        set_recurse(r, recurse);
      }
    }
    else
      // There's a race condition between the select query in has_uri()
      // and this query, so we ignore duplicate key errors from MySQL
      // by using the "ignore" keyword.
      db->query( "insert ignore into "+table+
                 " (uri,uri_md5,recurse,template) values (%s,%s,%d,%s)",
                 string_to_utf8((string)r),
                 to_md5((string)r), recurse, (template||"") );
  }
}

//! Stores @[md5] as the md5 value for @[uri], both in the database and
//! in the @[extra_data] entry for the URI if there is one.
void set_md5( Standards.URI uri, string md5 )
{
  if( extra_data[(string)uri] )
    extra_data[(string)uri]->md5 = md5;
  db->query( "update "+table+
             " set md5=%s WHERE uri_md5=%s", md5, to_md5((string)uri) );
}
//! Stores @[recurse] as the recurse value for @[uri], both in the
//! database and (as a string) in the @[extra_data] entry for the URI
//! if there is one.
void set_recurse( Standards.URI uri, int recurse )
{
  mapping cached = extra_data[(string)uri];
  if (cached)
    cached->recurse = (string)recurse;

  db->query( "update "+table+
             " set recurse=%d WHERE uri_md5=%s", recurse, to_md5((string)uri));
}
// Database rows for the URIs most recently loaded by get(), indexed on
// the URI string.
mapping(string:mapping(string:string)) extra_data = ([]);

//! Returns the md5, recurse, stage and template values for @[uri].
//!
//! The entry in @[extra_data] is returned if there is one; otherwise
//! the database is queried.
//!
//! @returns
//!   A mapping with the values, or an empty mapping if the URI is
//!   not found.
mapping get_extra( Standards.URI uri )
{
  mapping cached = extra_data[(string)uri];
  if (cached)
    return cached;

  array rows = db->query( "SELECT md5,recurse,stage,template "
                          "FROM "+table+" WHERE uri_md5=%s",
                          to_md5((string)uri) );
  if (!sizeof(rows))
    return ([ ]);
  return rows[0] || ([ ]);
}
// Number of consecutive get() calls that found nothing to hand out.
protected int empty_count;
// Incremented whenever a candidate is skipped because its host is busy.
protected int retry_count;

// cache, for performance reasons.
// possible holds the URI strings of the last batch read from the
// database (entries are zeroed once handed out); p_c is the position
// of the next entry to look at.
protected array possible=({});
protected int p_c;

//! Returns the next URI to fetch.
//!
//! @returns
//!   A @[Standards.URI] whose stage has been set to @expr{1@},
//!   @expr{-1@} if nothing can be handed out right now, or @expr{0@}
//!   once more than 40 consecutive calls came up empty and no row is
//!   in stage 2 or 3.
int|Standards.URI get()
{
  if (stats->concurrent_fetchers() > policy->max_concurrent_fetchers)
    return -1;

  if (p_c >= sizeof(possible))
  {
    // Batch used up: load up to 20 rows with stage 0.
    p_c = 0;
    possible = db->query( "select * from "+table+" where stage=0 limit 20" );
    array uris = map(possible->uri, utf8_to_string);
    extra_data = mkmapping(uris, possible);
    possible = uris;
  }

  while (p_c < sizeof(possible))
  {
    empty_count = 0;

    mixed candidate = possible[p_c++];
    if (!candidate)
      continue;                 // already handed out

    Standards.URI uri = Standards.URI(candidate);
    if (stats->concurrent_fetchers(uri->host) >
        policy->max_concurrent_fetchers_per_host)
    {
      retry_count++;
      continue; // not this host..
    }

    possible[p_c - 1] = 0;
    retry_count = 0;
    set_stage(uri, 1);
    return uri;
  }

  if (stats->concurrent_fetchers())
    return -1;

  // This is needed for the following race condition scenario:
  // 1. The queue contains one page
  // 2. The crawler indexes the page
  // 3a. In thread/process A, document filtering and fetching is done, and
  //     links are found
  // 3b. In thread/process B, queue->get() returns 0 since the queue doesn't
  //     contain any more pages to crawl.
  //
  // The workaround is to wait 40 cycles (i.e. 4 seconds) after fetching
  // the last page.
  if (empty_count++ <= 40)
    return -1;

  if (num_with_stage( 2 ) || num_with_stage( 3 ))
  {
    empty_count = 0;
    return -1;
  }
  return 0;
}
//! @param stage
//!   If given and non-zero, only URIs in this stage are returned.
//!
//! @returns
//!   Returns the URIs in the queue as @[Standards.URI] objects.
array(Standards.URI) get_uris(void|int stage)
{
  array rows = stage
    ? db->query( "select * from "+table+" where stage=%d", stage )
    : db->query( "select * from "+table );

  // The uri column holds UTF-8 encoded strings.
  array(string) decoded = map(rows->uri, utf8_to_string);
  return map(decoded, Standards.URI);
}
//! @returns
//!   Returns an array with all URI schemes currently used in the queue.
array(string) get_schemes()
{
  // FIXME: Consider using SUBSTRING_INDEX().
  array(string) heads = db->query("SELECT DISTINCT"
                                  " SUBSTRING(uri, 1, 20) AS scheme"
                                  " FROM "+table)->scheme;

  // Keep only what precedes the first ":" of each 20 character head.
  array(string) schemes = allocate(sizeof(heads));
  foreach (heads; int i; string head)
    schemes[i] = (head/":")[0];

  sort(schemes);
  return Array.uniq(schemes);
}
//! Adds one or several URIs to the queue with @[add_uri()], using
//! recurse @expr{1@} and no template.
//!
//! @param uri
//!   A URI, or an array of URIs, given as strings or
//!   @[Standards.URI] objects.
void put(string|array(string)|Standards.URI|array(Standards.URI) uri)
{
  if (!arrayp(uri))
  {
    add_uri( objectp(uri) ? uri : Standards.URI(uri), 1, 0 );
    return;
  }

  foreach(uri, string|object _uri)
    put(_uri);
}

//! Empties the URI lookup cache and deletes all rows from the
//! queue table.
void clear()
{
  hascache = ([ ]);
  db->query("delete from "+table);
}
//! Removes @[uri] from the queue table and marks it as not present
//! in the URI lookup cache.
void remove_uri(string|Standards.URI uri)
{
  string key = (string)uri;
  hascache[key] = 0;
  db->query("delete from "+table+" where uri_md5=%s", to_md5(key));
}
b575b82001-07-05Johan Schön 
//! Removes all URIs that start with @[uri] from the queue table, and
//! marks the matching entries in the URI lookup cache as not present.
//!
//! @param uri
//!   The prefix to match. It is matched literally; @expr{%@} and
//!   @expr{_@} in the prefix are not treated as wildcards.
void remove_uri_prefix(string|Standards.URI uri)
{
  string uri_string = (string)uri;

  foreach(indices(hascache), string _uri)
    if(has_prefix(_uri, uri_string))
      hascache[_uri] = 0;

  // Escape the LIKE metacharacters so that the database deletes exactly
  // the rows that has_prefix() above would match. "_" in particular is
  // common in URLs and would otherwise match any single character.
  // An explicit ESCAPE character is used so that the result doesn't
  // depend on how backslashes are treated by the server.
  string pattern = replace(uri_string,
                           ({ "|",  "%",  "_"  }),
                           ({ "||", "|%", "|_" })) + "%";

  // NOTE(review): the uri column is written UTF-8 encoded by add_uri();
  // the prefix is not encoded here (same as before) - confirm whether
  // non-ASCII prefixes need string_to_utf8().
  db->query("delete from "+table+" where uri like %s escape '|'", pattern);
}
//! Sets the stage to @expr{0@} (zero) for all URIs that currently are
//! in any of the given @[stages].
void clear_stage( int ... stages )
{
  foreach( stages, int from_stage )
  {
    db->query( "update "+table+" set stage=0 where stage=%d", from_stage );
  }
}
//! Deletes all URIs that are in the given @[stage] from the queue table.
void remove_stage (int stage)
{
  string sql = "delete from "+table+" where stage=%d";
  db->query( sql, stage );
}
//! Sets the md5 value to the empty string for all URIs that are in
//! any of the given @[stages].
void clear_md5( int ... stages )
{
  foreach( stages, int s )
    db->query( "update "+table+" set md5='' where stage=%d", s );
}

//! @returns
//!   Returns the number of URIs that are in any of the given stages,
//!   or @expr{0@} if no stage is given.
int num_with_stage( int ... stage )
{
  if( !sizeof(stage) )
    return 0;

  // The stage list is inserted into the query text directly. Passing
  // it as a %s argument makes it a single quoted string literal
  // (IN ('2,3')), which doesn't match the individual stages. The values
  // are integers, so no quoting is needed.
  string stage_list = ((array(string))stage) * ",";
  return (int)
    db->query( "select COUNT(*) as c from "+table+
               " where stage IN (" + stage_list + ")" )[ 0 ]->c;
}

//! Sets the stage for @[uri] to @[stage].
//!
//! @seealso
//!   @[get_stage()]
void set_stage( Standards.URI uri, int stage )
{
  db->query( "update "+table+" set stage=%d where uri_md5=%s", stage,
             to_md5((string)uri));
}
086f552001-08-14Johan Schön 
//! @returns
//!   Returns the current stage for the specified URI, or @expr{-1@}
//!   if the URI is not in the queue table.
//!
//! @seealso
//!   @[set_stage()]
int get_stage( Standards.URI uri )
{
  array rows = db->query( "select stage from "+table+" where uri_md5=%s",
                          to_md5((string)uri));
  return sizeof(rows) ? (int)rows[0]->stage : -1;
}
087ebe2011-05-25Henrik Grubbström (Grubba) 
//! Reset the stage to @expr{0@} (zero) for all URIs with the specified
//! @[uri_prefix]. If no @[uri_prefix] is specified reset the stage for
//! all URIs.
void reset_stage(string|void uri_prefix)
{
  if (!uri_prefix)
  {
    db->query("UPDATE " + table + " SET stage = 0");
    return;
  }

  // Compare the leading sizeof(uri_prefix) characters of the stored
  // URI with the prefix.
  db->query("UPDATE " + table + " SET stage = 0"
            " WHERE SUBSTRING(uri, 1, " + sizeof(uri_prefix) + ") = %s",
            uri_prefix);
}