pike.git/lib/modules/Search.pmod/Database.pmod/MySQL.pike


pike.git/lib/modules/Search.pmod/Database.pmod/MySQL.pike:1:
  #pike __REAL_VERSION__

  inherit .Base;

  // Creates the SQL tables we need.

  //#define SEARCH_DEBUG
+ //#define SEARCH_DB_CONSISTENCY_CHECKS
+
  #define DB_MAX_WORD_SIZE 64

- static
+ protected
  {
-   // This is the database object that all queries will be made to.
-   Sql.Sql db;
+   // This is the database that all queries will be made to.
    string host;
-
+   Sql.Sql get_db()
+   {
+     return Sql.Sql(host);
+   }
    mapping options;
    string mergefile_path;
    int mergefile_counter = 0;
    int init_done = 0;
  };

  void create(string db_url, void|mapping _options)
  {
-   db=Sql.Sql(host=db_url);
+   host = db_url;
+   get_db();
    options = _options || ([]);
    mergefile_path = options->mergefiles;

    if(!mergefile_path)
      mergefile_path = "/tmp/";

    if(options->mergefiles)
      foreach(get_mergefiles(), string fn)
        rm(fn);
  }

  #ifdef SEARCH_DEBUG
- void destroy()
+ void _destruct()
  {
    if (blobs_dirty)
      werror("Search.Database.MySQL: WARNING: Forgot to sync before "
             "abandoning db object?\n");
  }
  #endif

  string _sprintf()
  {
-   return sprintf("Search.Database.MySQL(%O,%O)", host, mergefile_path);
+   return sprintf("Search.Database.MySQL(%O,%O)",
+                  Sql.censor_sql_url(host), mergefile_path);
  }

  // Support for old- and new-style padded blobs must be determined at
  // runtime. This is because the format must be compatible with whatever
  // high-level Search module currently available, specifically the compactor.
  int cache_supports_padded_blobs = -1;

  int supports_padded_blobs()
  {
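The hunk above replaces the cached Sql.Sql member with a get_db() accessor that returns a fresh connection object each time it is called, so the constructor only stores the URL and probes the connection once. A minimal usage sketch; the URL and the mergefile directory below are made-up example values, not taken from the commit:

    // Hypothetical example values; any SQL URL accepted by Sql.Sql works.
    // The "mergefiles" entry both enables mergefile syncing and names the
    // directory used for the mergefile*.dat files.
    Search.Database.MySQL db =
      Search.Database.MySQL("mysql://user:password@localhost/search",
                            ([ "mergefiles" : "/var/tmp/search/" ]));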
pike.git/lib/modules/Search.pmod/Database.pmod/MySQL.pike:67:

  // ----------------------------------------------
  // Database initialization
  // ----------------------------------------------

  void init_tables()
  {
    int use_padded_blobs = supports_padded_blobs();

+   Sql.Sql db = get_db();
    db->query(
  #"create table if not exists uri (id int unsigned primary key
                          auto_increment not null,
                          uri blob not null,
                          uri_md5 varchar(32) binary not null,
                          UNIQUE(uri_md5))"
    );

    db->query(
  #"create table if not exists document (id int unsigned primary key
pike.git/lib/modules/Search.pmod/Database.pmod/MySQL.pike:152:
    db->query(
  #"create table if not exists field (id tinyint unsigned primary key not null,
                          name varchar(127) not null,
                          UNIQUE(name))");

  }

  void clear()
  {
-   db->query("delete from word_hit");
-   db->query("delete from uri");
-   db->query("delete from document");
-   db->query("delete from deleted_document");
-   db->query("delete from metadata");
-   db->query("delete from lastmodified");
+   Sql.Sql db = get_db();
+   db->query("TRUNCATE word_hit");
+   db->query("TRUNCATE uri");
+   db->query("TRUNCATE document");
+   db->query("TRUNCATE deleted_document");
+   db->query("TRUNCATE metadata");
+   db->query("TRUNCATE lastmodified");
  }

  // ----------------------------------------------
  // Utility functions
  // ----------------------------------------------

- static array(string) get_mergefiles()
+ protected array(string) get_mergefiles()
  {
    return map(glob("mergefile*.dat", get_dir(mergefile_path) || ({ })),
               lambda(string s) { return combine_path(mergefile_path, s);});
  }

- static string to_md5(string url)
+ protected string to_md5(string url)
  {
  #if constant(Crypto.md5) && constant(Crypto.string_to_hex)
    return Crypto.string_to_hex( Crypto.md5()->
                                 update( string_to_utf8(url) )->digest() );
  #else
    return String.string2hex( Crypto.MD5.hash( string_to_utf8(url) ) );
  #endif
  }

  // ----------------------------------------------
  // Document handling
  // ----------------------------------------------

  int get_uri_id(string uri, void|int do_not_create)
  {
-
+   Sql.Sql db = get_db();
    string s=sprintf("select id from uri where uri_md5='%s'", to_md5(uri));
    array a=db->query(s);
    if(sizeof(a))
      return (int)a[0]->id;

    if(do_not_create)
      return 0;

    db->query("insert into uri (uri,uri_md5) "
              "values (%s,%s)",
pike.git/lib/modules/Search.pmod/Database.pmod/MySQL.pike:209:
    return db->master_sql->insert_id();
  }

  int get_document_id(string uri, void|string language, void|int do_not_create)
  {
    int uri_id=get_uri_id(uri, do_not_create);

    if (!uri_id)
      return 0;

+   Sql.Sql db = get_db();
    string s=sprintf("select id from document where "
                     "uri_id='%d'", uri_id);
    if(language)
      s+=sprintf(" and language='%s'",db->quote(language));

    array a = db->query(s);

    if(sizeof(a))
      return (int)a[0]->id;

    db->query("insert into document (uri_id, language) "
              "values (%d,"+(language?"%s":"NULL")+")",
              uri_id, language);
    return db->master_sql->insert_id();
  }

  mapping get_uri_and_language(int|array(int) doc_id)
  {
-
+   Sql.Sql db = get_db();
    if(arrayp(doc_id))
    {
      array a=db->query("select document.id,document.language, uri.uri from document, uri "
                        "where uri.id=document.uri_id and document.id IN ("+
                        ((array(string))doc_id)*","+")");
      return mkmapping( (array(int))a->id, a );
    }
    else
    {
      array a=db->query("select document.language,uri.uri from document,uri "
                        "where uri.id=document.uri_id and document.id=%d",doc_id);
      if(!sizeof(a))
        return 0;

      return (["uri":1,"language":1]) & a[0];
    }
  }

  void remove_uri(string|Standards.URI uri)
  {
-
+   Sql.Sql db = get_db();
    db->query("delete from uri where uri_md5=%s", to_md5((string)uri));
  }

  void remove_uri_prefix(string|Standards.URI uri)
  {
-
+   Sql.Sql db = get_db();
    string uri_string = (string)uri;
    db->query("delete from uri where uri like '" + db->quote(uri_string) + "%%'");
  }

  #ifdef SEARCH_DEBUG
- static int docs;
- static int blobs_dirty;
+ protected int docs;
+ protected int blobs_dirty;
  #endif

  void remove_document(string|Standards.URI uri, void|string language)
  {
  #ifdef SEARCH_DEBUG
    docs++;
  #endif

    int uri_id=get_uri_id((string)uri, 1);

    if(!uri_id)
      return;
-
+   Sql.Sql db = get_db();
    array a;
    if(language) {
      // Need to remove this particular language fork as well as any
      // non-language version of the document (since they are mutually
      // exclusive).
      //
      // Note however that a document with several language forks where
      // one fork is removed will keep that entry since we cannot know
      // which entries that are garbage and hence leave them in place.
      // It is up to the query filter to only show valid forks.
pike.git/lib/modules/Search.pmod/Database.pmod/MySQL.pike:292:
                  "(language=%s OR language IS NULL)", uri_id, language);
    } else {
      // This also deletes any past language-specific forks
      a=db->query("select id from document where uri_id=%d",uri_id);
    }

    if(!sizeof(a))
      return;

    db->query("delete from document where id in ("+a->id*","+")");
-   db->query("insert delayed into deleted_document (doc_id) values "+
+   db->query("insert into deleted_document (doc_id) values "+
              "("+a->id*"),("+")");
  }

  void remove_document_prefix(string|Standards.URI uri)
  {
-
+   Sql.Sql db = get_db();
    array a =
      db->query("SELECT document.id AS id"
                "  FROM document, uri "
                " WHERE document.uri_id=uri.id "
                "   AND uri.uri like '" + db->quote(uri) + "%%'");
    if(!sizeof(a))
      return;

    array ids = a->id;
  #ifdef SEARCH_DEBUG
    docs += sizeof(ids);
  #endif
    db->query("DELETE FROM document "
              " WHERE id IN (" + (ids * ",") + ")");
-   db->query("INSERT DELAYED INTO deleted_document "
+   db->query("INSERT INTO deleted_document "
              "(doc_id) VALUES (" + (ids * "),(") + ")");
  }

- static Search.ResultSet deleted_documents = Search.ResultSet();
- static int deleted_max, deleted_count;
+ protected Search.ResultSet deleted_documents = Search.ResultSet();
+ protected int deleted_max, deleted_count;

  Search.ResultSet get_deleted_documents()
  {
    // FIXME: Make something better

-
+   Sql.Sql db = get_db();
    array a = db->query("select max(doc_id) as m, count(*) as c from deleted_document");
    int max_id = (int)a[0]->m;
    int count = (int)a[0]->c;

    if(max_id==deleted_max && count == deleted_count)
      return deleted_documents;
    else
    {
      array ids = (array(int))db->query("select doc_id from deleted_document "
                                        "order by doc_id")->doc_id;
      deleted_count = count;
      deleted_max = max_id;
      return deleted_documents = Search.ResultSet(ids);
    }
  }

  Search.ResultSet get_all_documents()
  {
-
+   Sql.Sql db = get_db();
    array ids =
      (array(int)) db->query("SELECT id FROM document ORDER BY id")->id;
    return Search.ResultSet(ids);
  }

  // ----------------------------------------------
  // Field handling
  // ----------------------------------------------

- static mapping(string:int) list_fields_cache;
+ protected mapping(string:int) list_fields_cache;

- static void init_fields()
+ protected void init_fields()
  {
    if(init_done)
      return;

    init_done=1;
    foreach(({"uri","path1", "path2"})+Search.get_filter_fields(), string field)
      allocate_field_id(field);
  }

  mapping(string:int) list_fields()
  {
    if(list_fields_cache)
      return list_fields_cache;
    init_fields();
-
+   Sql.Sql db = get_db();
    array a=db->query("select name,id from field") + ({ (["name":"body",
                                                          "id": "0"]) });
    return list_fields_cache=mkmapping(a->name, (array(int))a->id);
  }

  int allocate_field_id(string field)
  {
    if(!init_done)
      init_fields();
    if(field=="body")
      return 0;
-
+   Sql.Sql db = get_db();
    array a =db->query("select id from field where name=%s", field);
    if(sizeof(a))
      return (int)a[0]->id;
    db->query("lock tables field write");
    mixed err = catch {
      for(int i=1; i<64; i++)
      {
        array a=db->query("select name from field where id=%d",i);
        if(!sizeof(a))
        {
pike.git/lib/modules/Search.pmod/Database.pmod/MySQL.pike:403:
          return i;
        }
      }
    };
    mixed unlock_err = catch (db->query("unlock tables"));
    if (err) throw (err);
    if (unlock_err) throw (unlock_err);
    return -1;
  }

- static mapping field_cache = ([]);
+ protected mapping field_cache = ([]);
  int get_field_id(string field, void|int do_not_create)
  {
    // The one special case.
    if(field=="body") return 0;
    if(field_cache[field]) return field_cache[field];

    init_fields();
-
+   Sql.Sql db = get_db();
    string s=sprintf("select id from field where name='%s'",db->quote(field));
    array a=db->query(s);
    if(sizeof(a))
    {
      field_cache[field]=(int)a[0]->id;
      return (int)a[0]->id;
    }

    if(do_not_create)
      return -1;

    return allocate_field_id(field);
  }

  void remove_field(string field)
  {
    init_fields();
    m_delete(field_cache, field);
    list_fields_cache=0;
-
+   Sql.Sql db = get_db();
    db->query("delete from field where name=%s", field);
  }

  void safe_remove_field(string field)
  {
    if( search(({"uri","path1","path2"})+Search.get_filter_fields(), field) == -1 )
      remove_field( field );
  }

  // ----------------------------------------------
  // Word/blob handling
  // ----------------------------------------------

- static _WhiteFish.Blobs blobs = _WhiteFish.Blobs();
+ protected _WhiteFish.Blobs blobs = _WhiteFish.Blobs();

  #define MAXMEM 64*1024*1024

  void insert_words(Standards.URI|string uri, void|string language,
                    string field, array(string) words)
  {
    // Remove long words that won't fit into the database.
    words = filter(words, lambda (string word)
                          { return sizeof(string_to_utf8(word)) <= DB_MAX_WORD_SIZE; });
pike.git/lib/modules/Search.pmod/Database.pmod/MySQL.pike:474:

    if(blobs->memsize() > MAXMEM)
      if(options->mergefiles)
        mergefile_sync();
      else
        sync();
  }

  array(string) expand_word_glob(string g, void|int max_hits)
  {
-   g = replace( string_to_utf8(g), ({ "*", "?" }), ({ "%", "_" }) );
-   if(max_hits)
-     return map(db->query("select distinct word from word_hit where word like %s limit %d",
-                          g, max_hits)->word,utf8_to_string);
-   else
-     return map(db->query("select distinct word from word_hit where word like %s",g)->word,utf8_to_string);
+   string g_sql = replace(string_to_utf8(g), ({ "*", "?" }), ({ "%", "_" }) );
+   Sql.Sql db = get_db();
+   if (max_hits) {
+     // Sort candidates before capping based on offset of the first non-glob
+     // substring and then alphabetically. This gives a stable expansion
+     // where e.g. "*test*" prioritizes "testing" before "latest" and thus
+     // becomes more intuitive in conjunction with auto-globbing.
+     array(string) non_glob_words = (replace(g, "?", "*") / "*" - ({ "" }));
+     if (sizeof(non_glob_words)) {
+       string first_word_sql = string_to_utf8(non_glob_words[0]);
+       return map(db->query("SELECT DISTINCT word, "
+                            "       LOCATE(%s, word) AS score "
+                            "  FROM word_hit "
+                            " WHERE word LIKE %s "
+                            " ORDER BY score ASC, word ASC "
+                            " LIMIT %d",
+                            first_word_sql, g_sql, max_hits)->word,
+                  utf8_to_string);
+     } else {
+       return map(db->query("SELECT DISTINCT word "
+                            "  FROM word_hit "
+                            " WHERE word LIKE %s "
+                            " ORDER BY word ASC "
+                            " LIMIT %d",
+                            g_sql, max_hits)->word,
+                  utf8_to_string);
      }
-
+   } else {
+     return map(db->query("SELECT DISTINCT word "
+                          "  FROM word_hit "
+                          " WHERE word LIKE %s",
+                          g_sql)->word,
+                utf8_to_string);
+   }
+ }

  int get_padded_blob_length(int used_len)
  {
    // Suggest a padded length based on current length. We'll use this
    // strategy:
    //
    //  - no blobs smaller than 64 bytes
    //  - blobs grow 25% rounded up to nearest 64 bytes
    int new_len = (((used_len >> 2) + used_len) | 63) + 1;
    return min(new_len, max_blob_size);
  }

- static int blobs_per_select = 40;
+ protected int blobs_per_select = 40;

  string get_blob(string word, int num,
                  void|mapping(string:mapping(int:string)) blobcache)
  {
    word = string_to_utf8( word );
    if(blobcache[word] && blobcache[word][num])
      return blobcache[word][num];
    if( blobcache[word] && blobcache[word][-1] )
    {
  #ifdef SEARCH_DEBUG
      times[word] = 0;
  #endif
      return 0;
    }
  #ifdef SEARCH_DEBUG
    int t0 = gethrtime();
  #endif

    int use_padded_blobs = supports_padded_blobs();
+   Sql.Sql db = get_db();
    array a =
      db->query(" SELECT hits, first_doc_id " +
                (use_padded_blobs ? ", used_len, real_len " : "") +
                "  FROM word_hit "
                " WHERE word = %s "
                "ORDER BY first_doc_id "
                " LIMIT %d,%d",
                word, num, blobs_per_select);

  #ifdef SEARCH_DEBUG
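As a side note on the padding strategy in get_padded_blob_length() above (grow by 25%, round up to the next multiple of 64 bytes, cap at max_blob_size), here is a small stand-alone sketch of the same arithmetic with a few sample lengths. The helper name and the test values are illustrative only, not part of the module:

    int padded_length(int used_len, int max_blob_size)
    {
      // 25% growth, rounded up to the next multiple of 64, capped at the max.
      int new_len = (((used_len >> 2) + used_len) | 63) + 1;
      return min(new_len, max_blob_size);
    }

    int main()
    {
      foreach (({ 1, 64, 100, 1000, 512 * 1024 }), int len)
        write("%7d -> %7d\n", len, padded_length(len, 512 * 1024));
      // Prints: 1 -> 64, 64 -> 128, 100 -> 128, 1000 -> 1280, 524288 -> 524288.
      return 0;
    }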
pike.git/lib/modules/Search.pmod/Database.pmod/MySQL.pike:570:

  // ----------------------------------------------
  // Metadata handling
  // ----------------------------------------------

  void remove_metadata(Standards.URI|string uri, void|string language)
  {
    int doc_id;
    if(!intp(uri))
      doc_id = get_document_id((string)uri, language, 1);
+   Sql.Sql db = get_db();
    db->query("delete from metadata where doc_id = %d", doc_id);
  }

- static string make_fields_sql(void|array(string) wanted_fields)
+ protected string make_fields_sql(void|array(string) wanted_fields)
  {
-
+   Sql.Sql db = get_db();
    if(wanted_fields && sizeof(wanted_fields))
      return " and name IN ('"+map(wanted_fields,db->quote)*"','"+"')";
    else
      return "";
  }

  mapping(string:string) get_metadata(int|Standards.URI|string uri,
                                      void|string language,
                                      void|array(string) wanted_fields)
  {
    int doc_id;
    if(intp(uri))
      doc_id=uri;
    else
      doc_id = get_document_id((string)uri, language);
-
+   Sql.Sql db = get_db();
    array a=db->query("select name,value from metadata where doc_id=%d"+
                      make_fields_sql(wanted_fields),
                      doc_id);
    mapping md=mkmapping(a->name,a->value);
-
+ #if constant(Gz)
    if(md->body)
      md->body=Gz.inflate()->inflate(md->body);
-
+ #endif

    foreach(indices(md), string field)
      md[field] = utf8_to_string(md[field]);

    return md;
  }

  mapping(int:string) get_special_metadata(array(int) doc_ids,
                                           string wanted_field)
  {
-
+   Sql.Sql db = get_db();
    array a=db->query("select doc_id,value from metadata where doc_id IN ("+
                      ((array(string))doc_ids)*","+") and name = %s",
                      wanted_field);

    return mkmapping( (array(int))a->doc_id, a->value);
  }

  // ----------------------------------------------
  // Date stuff
  // ----------------------------------------------
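The #if constant(Gz) guards added in get_metadata() above (and in set_metadata() in the next hunk) make the body compression optional on Pikes built without the Gz module. A minimal sketch of the round trip the two functions rely on; the function name and the sample text are made up for illustration:

    void demo()
    {
      // What set_metadata() stores for the "body" field ...
      string packed = Gz.deflate(6)->deflate(string_to_utf8("some body text"),
                                             Gz.FINISH);
      // ... and what get_metadata() recovers from it.
      string restored = utf8_to_string(Gz.inflate()->inflate(packed));
      // restored is "some body text" again.
    }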
pike.git/lib/modules/Search.pmod/Database.pmod/MySQL.pike:631:
    if(!intp(uri))
      doc_id = get_document_id((string)uri, language);

    init_fields();

    // Still our one, single special case
    if(md->body)
    {
      if(sizeof(md->body))
        md->body = Unicode.normalize( Unicode.split_words_and_normalize( md->body ) * " ", "C");
+ #if constant(Gz)
      md->body = Gz.deflate(6)->deflate(string_to_utf8(md->body[..64000]),
                                        Gz.FINISH);
-
+ #endif
    }

    if(!sizeof(md))
      return 0;

    foreach(indices(md), string ind)
      if(ind!="body")
        md[ind]=string_to_utf8(md[ind]);

-
+   Sql.Sql db = get_db();
    string s=map(Array.transpose( ({ map(indices(md),db->quote),
                                     map(values(md), db->quote) }) ),
                 lambda(array a)
                 {
                   return sprintf("(%d,'%s','%s')", doc_id,
                                  a[0], a[1]);
                 }) * ", ";

-   db->query("replace delayed into metadata (doc_id, name, value) values "+s);
+   db->query("replace into metadata (doc_id, name, value) values "+s);
  }

  void set_lastmodified(Standards.URI|string uri,
                        void|string language,
                        int when)
  {
    int doc_id = get_document_id((string)uri, language);
-
+   Sql.Sql db = get_db();
    db->query("replace into lastmodified (doc_id, at) values (%d,%d)", doc_id, when);
  }

  int get_lastmodified(Standards.URI|string|array(Standards.URI|string) uri, void|string language)
  {
    int doc_id = get_document_id((string)uri, language);
-
+   Sql.Sql db = get_db();
    array q = db->query("select at from lastmodified where doc_id=%d", doc_id);
    if( sizeof( q ) )
      return (int)q[0]->at;
  }

  void randomize_dates()
  {
-
+   Sql.Sql db = get_db();
    foreach(db->query("select id from document")->id, string id)
      db->query("replace into lastmodified (doc_id,at) values (%s,%d)",
                id,
                random(365*24*3600)+time()-365*24*3600);

  }

- static
+ protected
  {
    _WhiteFish.DateSet dateset_cache;
    int dateset_cache_max_doc_id = -1;

    int get_max_doc_id()
    {
-
+     Sql.Sql db = get_db();
      array a = db->query("select doc_id from lastmodified order by doc_id desc limit 1");
      if(!sizeof(a))
        return 0;
      else
        return (int)a[0]->doc_id;
    }
  };

  _WhiteFish.DateSet get_global_dateset()
  {
    int max_doc_id = get_max_doc_id();
    if(max_doc_id == dateset_cache_max_doc_id)
      return dateset_cache;
    else
    {
-
+     Sql.Sql db = get_db();
      array a = db->query("select doc_id,at from lastmodified where "
                          "doc_id > %d order by doc_id asc", dateset_cache_max_doc_id);

      dateset_cache_max_doc_id = max_doc_id;
      if(!dateset_cache)
        dateset_cache = _WhiteFish.DateSet();
      dateset_cache->add_many( (array(int))a->doc_id,
                               (array(int))a->at );
      return dateset_cache;
    }
  }

- static
+ protected
  {
    _WhiteFish.DateSet publ_dateset_cache;
    int publ_dateset_cache_max_doc_id = -1;
  };

  _WhiteFish.DateSet get_global_publ_dateset()
  {
    int max_doc_id = get_max_doc_id();
    if(max_doc_id == publ_dateset_cache_max_doc_id)
      return publ_dateset_cache;
    else
    {
-
+     Sql.Sql db = get_db();
      array(mapping(string:mixed)) a =
        db->query("SELECT doc_id, value FROM metadata "
                  " WHERE name = 'publish-time' "
                  "   AND doc_id > %d ORDER BY doc_id ASC",
                  publ_dateset_cache_max_doc_id);

      publ_dateset_cache_max_doc_id = max_doc_id;
      if(!publ_dateset_cache)
        publ_dateset_cache = _WhiteFish.DateSet();
      publ_dateset_cache->add_many( (array(int))a->doc_id,
pike.git/lib/modules/Search.pmod/Database.pmod/MySQL.pike:766:
                              return get_document_id( (string)uri, language);
                            });

    string res =
      "replace into link (from_id, to_id) values " +
      map(to_ids,
          lambda(int to_id)
          {
            return sprintf("(%d, %d)", doc_id, to_id);
          }) * ", ";
+   Sql.Sql db = get_db();
    db->query(res);
  }

  void remove_links(Standards.URI|string uri,
                    void|string language)
  {
    int doc_id = get_document_id((string)uri, language, 1);

-
+   Sql.Sql db = get_db();
    db->query("delete from link where from_id=%d", doc_id);
  }

  array(int) get_broken_links()
  {
-
+   Sql.Sql db = get_db();
    db->query("select 'Not yet done :-)'");
  }

  // ----------------------------------------------
  // Sync stuff
  // ----------------------------------------------

- static function sync_callback;
+ protected function sync_callback;
  void set_sync_callback( function f )
  {
    sync_callback = f;
  }

  // The maximum blob size on disk must be at least big enough to hold as
  // many entries that can be found in a single document. This is needed so
  // a split blob doesn't get the same docid in separate records.
  //
  // We can get at most 255 occurrences of the same word from each document,
  // and if all of those are the same word AND the update takes place
  // incrementally we'll write [ docid | nhits | hit ] for every occurrence,
  // i.e. 7 bytes every time. Minimum blob size is therefore 1785 bytes.
  constant max_blob_size = 512 * 1024;

- static array(array(int|string)) split_blobs(int blob_size, string blob,
+ protected array(array(int|string)) split_blobs(int blob_size, string blob,
                                               int max_blob_size)
  {
    /*
      +-----------+----------+---------+---------+---------+
      | docid: 32 | nhits: 8 | hit: 16 | hit: 16 | hit: 16 |...
      +-----------+----------+---------+---------+---------+
    */

    sscanf(blob, "%4c", int first_doc_id);
    int ptr = blob_size;
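The layout comment in split_blobs() above is also what the 1785-byte minimum before it comes from: 255 occurrences times a full 7-byte entry (4-byte docid + 1-byte count + 2-byte hit) is 1785 bytes. A decoding sketch for that layout follows; decode_blob() is a hypothetical helper written for illustration, not part of the module:

    // Walks a blob according to the layout above:
    // 4-byte docid, 1-byte hit count, then 2 bytes per hit.
    array(mapping) decode_blob(string blob)
    {
      array(mapping) entries = ({});
      int ptr = 0;
      while (ptr < sizeof(blob)) {
        sscanf(blob[ptr..], "%4c%c", int doc_id, int nhits);
        array(int) hits = allocate(nhits);
        for (int i = 0; i < nhits; i++)
          sscanf(blob[ptr + 5 + 2*i..], "%2c", hits[i]);
        entries += ({ ([ "doc_id": doc_id, "hits": hits ]) });
        ptr += 5 + 2*nhits;
      }
      return entries;
    }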
pike.git/lib/modules/Search.pmod/Database.pmod/MySQL.pike:836:
      if (sizeof(me))
        sscanf(me, "%4c", first_doc_id);
      blobs += ({ ({ first_doc_id, me }) });
      start = end;
      blob_size=0;
    }

    return blobs;
  }

- static void store_to_db( void|string mergedfilename )
+ protected void store_to_db( void|string mergedfilename )
  {
    Search.MergeFile mergedfile;

    if(mergedfilename)
      mergedfile = Search.MergeFile(Stdio.File(mergedfilename, "r"));

    int use_padded_blobs = supports_padded_blobs();

    int s = time();
    int q;
-   Sql.Sql db = Sql.Sql( host );
+   Sql.Sql db = get_db();
  #ifdef SEARCH_DEBUG
    werror("----------- sync() %4d docs --------------\n", docs);
  #endif
    db->query("LOCK TABLES word_hit LOW_PRIORITY WRITE");

    mixed err = catch {
      String.Buffer multi_query = String.Buffer();

      do
      {
pike.git/lib/modules/Search.pmod/Database.pmod/MySQL.pike:884:
        // to store in sorted form since the hit analysis depend on it. The
        // data() method in Blob performs sorting so instantiate a temp blob
        // just to access this.
        blob = _WhiteFish.Blob(blob)->data();
      }

      q++;

      // Don't unlock and lock every word to reduce overhead
      if (q % 32 == 0) {
+       // Flush the accumulated updates before releasing the lock.
+       if( sizeof( multi_query ) )
+         db->query( multi_query->get());
+
        db->query("UNLOCK TABLES");
        db->query("LOCK TABLES word_hit LOW_PRIORITY WRITE");
      }

      // NOTE: Concatenation of hits info is strictly speaking not correct in
      // the general case since we may have the same docid repeated. In practice
      // the only code path that adds words also invalidates the old docid and
      // gets a fresh one.

-
+     // FIXME: The following two functions ought to be able to use multi_query().
      void add_padded_blobs(string word, array new_blobs)
      {
        // Write all blobs except the last one that should be padded
        foreach (new_blobs[..<1], array new_blob_pair) {
          [int first_doc_id, string blob] = new_blob_pair;
          int new_used_len = sizeof(blob);
          db->query("INSERT INTO word_hit "
                    "        (word, first_doc_id, used_len, real_len, hits)"
                    " VALUES (%s, %d, %d, %d, %s)",
                    word, first_doc_id, new_used_len, new_used_len, blob);
pike.git/lib/modules/Search.pmod/Database.pmod/MySQL.pike:915:
        // Write final blob with padding
        [int first_doc_id, string blob] = new_blobs[-1];
        int new_used_len = sizeof(blob);
        int new_real_len = get_padded_blob_length(new_used_len);
        int space_count = new_real_len - new_used_len;
        db->query("INSERT INTO word_hit "
                  "        (word, first_doc_id, used_len, real_len, hits)"
                  " VALUES (%s, %d, %d, %d, CONCAT(%s, SPACE(%d)))",
                  word, first_doc_id, new_used_len, new_real_len,
                  blob, space_count);
+ #ifdef SEARCH_DB_CONSISTENCY_CHECKS
+       array new = db->query("SELECT real_len, LENGTH(hits) AS actual_len "
+                             "  FROM word_hit "
+                             " WHERE word = %s "
+                             "   AND first_doc_id = %d",
+                             word, first_doc_id);
+       if (!sizeof(new)) {
+         werror("Search.Database: Added blob not in db!\n");
+       } else if (new_real_len != (int)new[0]->actual_len) {
+         werror("Search.Database: Added blob has different real_len: %d != %d\n",
+                new_real_len, (int)new[0]->actual_len);
+       }
+ #endif
      };

      void add_oldstyle_blobs(string word, array new_blobs)
      {
        // Write all blobs as new entries
        foreach (new_blobs, array new_blob_pair) {
          [int first_doc_id, string blob] = new_blob_pair;
          db->query("INSERT INTO word_hit "
                    "        (word, first_doc_id, hits)"
                    " VALUES (%s, %d, %s)",
                    word, first_doc_id, blob);
        }
      };

-
+ #ifdef SEARCH_DB_CONSISTENCY_CHECKS
+     string consistency_log = "";
+ #define CONSISTENCY_LOG(X ...) do {		\
+     consistency_log += sprintf(X);		\
+   } while(0)
+ #else
+ #define CONSISTENCY_LOG(X ...)
+ #endif
+
      // We only care about the most recent blob for the given word so look
      // for the highest document ID.
      int first_doc_id;
      array old;
      if (use_padded_blobs) {
        old = db->query(" SELECT first_doc_id, used_len, real_len "
-
+ #ifdef SEARCH_DB_CONSISTENCY_CHECKS
+                       "      , LENGTH(hits) AS actual_len "
+ #endif
                        "   FROM word_hit "
                        "  WHERE word=%s "
                        "ORDER BY first_doc_id DESC "
                        "  LIMIT 1", word);
      } else {
-       old = db->query(" SELECT first_doc_id, LENGTH(hits) AS used_len "
+       old = db->query(" SELECT first_doc_id, LENGTH(hits) AS used_len, "
+                       "        LENGTH(hits) AS real_len "
+ #ifdef SEARCH_DB_CONSISTENCY_CHECKS
+                       "      , LENGTH(hits) AS actual_len "
+ #endif
                        "   FROM word_hit "
                        "  WHERE word=%s "
                        "ORDER BY first_doc_id DESC "
                        "  LIMIT 1", word);
      }

      if (sizeof(old)) {
-       int used_len = (int) old[-1]->used_len;
-       int real_len = use_padded_blobs ? ((int) old[-1]->real_len) : used_len;
-       int first_doc_id = (int) old[-1]->first_doc_id;
+       int used_len = (int) old[0]->used_len;
+       int real_len = (int) old[0]->real_len;
+       int first_doc_id = (int) old[0]->first_doc_id;
+       int new_used_len = used_len + sizeof(blob);
+       int new_real_len = new_used_len;	// NB: No padding.

-       // Can the new blob fit in the existing padding space?
-       //
-       // NOTE: This is never true for old-style blobs.
-       if (real_len - used_len >= sizeof(blob)) {
-         // Yes, update in place
-         db->query(" UPDATE word_hit "
-                   "    SET hits = INSERT(hits, %d, %d, %s), "
-                   "        used_len = %d "
-                   "  WHERE word = %s "
-                   "    AND first_doc_id = %d",
-                   used_len + 1, sizeof(blob), blob,
-                   used_len + sizeof(blob),
-                   word, first_doc_id);
-       } else if (used_len + sizeof(blob) <= max_blob_size) {
-         // The old blob can grow to accomodate the new data without
-         // exceeding the maximum blob size.
+       array new_blobs = ({ blob });
+
+ #ifdef SEARCH_DB_CONSISTENCY_CHECKS
+       if (real_len != (int)old[0]->actual_len) {
+         werror("Search.Database: Broken accounting for old word %O: %d != %d\n",
+                word, real_len, (int)old[0]->actual_len);
+         CONSISTENCY_LOG("Broken accounting for old word %O: %d != %d\n",
+                         word, real_len, (int)old[0]->actual_len);
+       }
+ #endif
+
+       if (new_used_len > max_blob_size) {
+         // Need to split blobs
+         new_blobs = split_blobs(used_len, blob, max_blob_size);
+         CONSISTENCY_LOG("Splitting old %d byte blob into %d bytes.\n",
+                         sizeof(blob), sizeof(new_blobs[0][1]));
+         blob = new_blobs[0][1];
+         new_used_len = used_len + sizeof(blob);
+
+         // NB: No extra padding!
+         new_real_len = new_used_len;
+       } else if (use_padded_blobs) {
+         // Add padding.
+         new_real_len = get_padded_blob_length(new_used_len);
+       }
+
+       // Do we need to grow the old blob?
+       if (new_real_len != real_len) {
+         CONSISTENCY_LOG("Old (%d bytes) and new real_len (%d bytes) differ.\n",
+                         real_len, new_real_len);
          if (use_padded_blobs) {
-           // Make sure we make room for new padding for future use
-           int new_used_len = used_len + sizeof(blob);
-           int new_real_len = get_padded_blob_length(new_used_len);
-           int space_count = new_real_len - new_used_len;
+           // We can grow the old blob to accommodate the new data without
+           // exceeding the maximum blob size.
+
+           int space_count = new_real_len - real_len;
+           int repl_size = sizeof(blob);
+
+           if (space_count < 0) {
+             // Truncate hits to new_real_len size (typically == new_used_len).
+             //
+             // Increase the third argument to INSERT() with the number of
+             // padding bytes to remove. Note that space_count is negative.
+             repl_size -= space_count;
+             space_count = 0;
+             CONSISTENCY_LOG("Truncating old hits by %d bytes.\n",
+                             repl_size - sizeof(blob));
+           }
+
+           // NB: Concat the padding first, and then overwrite it with INSERT(),
+           // to work around the corner case that INSERT() doesn't support
+           // being a CONCAT().
            db->query("UPDATE word_hit "
-                     "   SET hits = INSERT(hits, %d, %d, CONCAT(%s, SPACE(%d))),"
+                     "   SET hits = INSERT(CONCAT(hits, SPACE(%d)), %d, %d, %s),"
                      "       used_len = %d, "
                      "       real_len = %d "
                      " WHERE word = %s "
                      "   AND first_doc_id = %d",
-                     used_len + 1, sizeof(blob) + space_count, blob, space_count,
+                     space_count, used_len + 1, repl_size, blob,
                      new_used_len,
                      new_real_len,
                      word, first_doc_id);
-
+           CONSISTENCY_LOG("Updating used_len %d ==> %d and real_len %d ==> %d.\n",
+                           used_len, new_used_len, real_len, new_real_len);
          } else {
            // Append blob data to old record
            db->query("UPDATE word_hit "
                      "   SET hits = CONCAT(hits, %s) "
                      " WHERE word = %s "
                      "   AND first_doc_id = %d",
                      blob, word, first_doc_id);
          }
        } else {
-         // Need to split blobs
-         array new_blobs = split_blobs(used_len, blob, max_blob_size);
-         blob = new_blobs[0][1];
+         // NOTE: This is never true for old-style blobs.
+         //
+         // Update in place
+         db->query(" UPDATE word_hit "
+                   "    SET hits = INSERT(hits, %d, %d, %s), "
+                   "        used_len = %d "
+                   "  WHERE word = %s "
+                   "    AND first_doc_id = %d",
+                   used_len + 1, sizeof(blob), blob,
+                   used_len + sizeof(blob),
+                   word, first_doc_id);
+         CONSISTENCY_LOG("Updating in place (real_len: %d). "
+                         "used_len %d ==> %d\n",
+                         real_len, used_len, used_len + sizeof(blob));
+       }

-
+ #ifdef SEARCH_DB_CONSISTENCY_CHECKS
        if (use_padded_blobs) {
-         // Write the first chunk at the end of the existing blob and remove
-         // any left-over padding by giving a sufficiently bigger blob size
-         // as third parameter compared to the actual data.
-         int new_used_len = used_len + sizeof(blob);
-         db->query("UPDATE word_hit "
-                   "   SET hits = INSERT(hits, %d, %d, %s), "
-                   "       used_len = %d, "
-                   "       real_len = %d "
+         array new = db->query("SELECT used_len, real_len, "
+                               "       LENGTH(hits) AS actual_len "
+                               "  FROM word_hit "
                                " WHERE word = %s "
                                "   AND first_doc_id = %d",
-                   used_len + 1, sizeof(blob) + max_blob_size, blob,
-                   new_used_len,
-                   new_used_len,
+                               word, first_doc_id);
-       } else {
-         // Write the first chunk at the end of the existing blob
-         db->query("UPDATE word_hit "
-                   "   SET hits = CONCAT(hits, %s) "
-                   " WHERE word = %s "
-                   "   AND first_doc_id = %d",
-                   blob, word, first_doc_id);
+         if (!sizeof(new)) {
+           werror("Search.Database: Lost track of word %O!\n", word);
+           werror("Log:\n%s\n", consistency_log);
+         } else if (new[0]->real_len != new[0]->actual_len) {
+           werror("Search.Database: Broken accounting for new word %O: %d != %d\n",
+                  word, (int)new[0]->real_len, (int)new[0]->actual_len);
+           werror("Log:\n%s\n", consistency_log);
          }
-
+       }
+ #endif

-
+       if (sizeof(new_blobs) > 1) {
          // Write remaining ones
          if (use_padded_blobs)
            add_padded_blobs(word, new_blobs[1..]);
          else
            add_oldstyle_blobs(word, new_blobs[1..]);
        }
      } else {
        // No existing entries so create new blobs
        if (sizeof(blob) > max_blob_size) {
          // Blobs must be split in several records
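Aside on the UPDATE statements in the hunk above: MySQL's INSERT(str, pos, len, newstr) replaces len characters of str starting at 1-based position pos, so concatenating the padding first and then overwriting it keeps the length of the stored string in step with the recorded real_len column. A rough Pike emulation of the string arithmetic; mysql_insert() and grow_hits() are illustrative helpers written here, not part of the module or of MySQL's client API:

    // Emulates MySQL INSERT(str, pos, len, newstr): 1-based pos, replace len chars.
    string mysql_insert(string str, int pos, int len, string newstr)
    {
      return str[..pos - 2] + newstr + str[pos - 1 + len..];
    }

    // Equivalent of:
    //   INSERT(CONCAT(hits, SPACE(space_count)), used_len + 1, repl_size, blob)
    // i.e. pad the old record out to the new real_len, then overwrite the
    // padding (and, in the truncation case, any excess old padding) with blob.
    string grow_hits(string hits, int used_len, int space_count, int repl_size,
                     string blob)
    {
      return mysql_insert(hits + " " * space_count, used_len + 1, repl_size, blob);
    }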
pike.git/lib/modules/Search.pmod/Database.pmod/MySQL.pike:1098: Inside #if defined(SEARCH_DEBUG)
    }
  #ifdef SEARCH_DEBUG
    werror("----------- sync() done %3ds %5dw -------\n", time()-s,q);
  #endif

  #ifdef SEARCH_DEBUG
    blobs_dirty = 0;
  #endif
  }

- static string get_mergefilename()
+ protected string get_mergefilename()
  {
    return combine_path(mergefile_path,
                        sprintf("mergefile%03d.dat", mergefile_counter));
  }

- static void mergefile_sync()
+ protected void mergefile_sync()
  {
  #ifdef SEARCH_DEBUG
    System.Timer t = System.Timer();
    werror("----------- mergefile_sync() %4d docs --------------\n", docs);
  #endif
    Search.MergeFile mergefile = Search.MergeFile(
      Stdio.File(get_mergefilename(), "wct"));

    mergefile->write_blobs(blobs);
pike.git/lib/modules/Search.pmod/Database.pmod/MySQL.pike:1127: Inside #if defined(SEARCH_DEBUG)
  #ifdef SEARCH_DEBUG
    werror("----------- mergefile_sync() done %.3f s %2.1f MB -------\n",
           t->get(),
           file_stat(get_mergefilename())->size/(1024.0*1024.0));
  #endif

    mergefile_counter++;
    blobs = _WhiteFish.Blobs();
  }

- static string merge_mergefiles(array(string) mergefiles)
+ protected string merge_mergefiles(array(string) mergefiles)
  {
  #ifdef SEARCH_DEBUG
    werror("merge_mergefiles( %s )\n", mergefiles*", ");
  #endif
    if(sizeof(mergefiles)==1)
      return mergefiles[0];

    if(sizeof(mergefiles)>2)
    {
      int pivot = sizeof(mergefiles)/2;
pike.git/lib/modules/Search.pmod/Database.pmod/MySQL.pike:1198:
  // Statistics
  // ----------------------------------------------

  int memsize()
  {
    return blobs->memsize();
  }

  mapping(string|int:int) get_language_stats()
  {
+   Sql.Sql db = get_db();
    array a=db->query("select count(id) as c,language from document group by language");
    return mkmapping( a->language, a->c);
  }

  int get_num_words()
  {
-
+   Sql.Sql db = get_db();
    return (int)(db->query("select count(distinct word) as c from word_hit") +
                 ({ (["c": 0]) }))[0]->c;
  }

  int get_database_size()
  {
-
+   Sql.Sql db = get_db();
    int size;
    foreach(db->query("show table status"), mapping table)
      size += (int)table->Data_length + (int)table->Index_length;
    return size;
  }

  int get_num_deleted_documents()
  {
-
+   Sql.Sql db = get_db();
    return (int)db->query("select count(*) as c from deleted_document")[0]->c;
  }

- static string my_denormalize(string in)
+ protected string my_denormalize(string in)
  {
    return Unicode.normalize(utf8_to_string(in), "C");
  }

  array(array) get_most_common_words(void|int count)
  {
-
+   Sql.Sql db = get_db();
    array a =
      db->query(" SELECT word, " +
                (supports_padded_blobs() ?
                 "  SUM(used_len) / 5 AS c " :
                 "  SUM(LENGTH(hits)) / 5 AS c ") +
                "   FROM word_hit "
                "  GROUP BY word "
                "  ORDER BY c DESC "
                "  LIMIT %d", count || 10);

    if(!sizeof(a))
      return ({ });
    else
      return Array.transpose( ({ map(a->word, my_denormalize),
                                 (array(int))a->c }) );
  }

  void list_url_by_prefix(string url_prefix, function(string:void) cb)
  {
-   Sql.sql_result q =
+   Sql.Sql db = get_db();
+   Sql.Result q =
      db->big_query("SELECT uri "
                    "  FROM uri "
                    " WHERE uri LIKE '"+db->quote(url_prefix)+"%'");
    for(;;) {
      array row = q->fetch_row();
      if(!row)
        break;
      cb(row[0]);
    }
  }