eb01b42010-10-26Martin Stjernholm #pike __REAL_VERSION__
9c9da72001-06-23Johan Schön 
f863bc2001-06-10Per Hedbor inherit .Base;
4c5a2f2000-05-15Martin Nilsson  // Creates the SQL tables we need.
ec7cbe2001-03-18Johan Schön 
40a44d2004-08-07Johan Schön //#define SEARCH_DEBUG
cf68c32007-11-07Marcus Wellhardh #define DB_MAX_WORD_SIZE 64
40a44d2004-08-07Johan Schön 
protected
{
  // URL of the database that all queries will be made to.
  string host;

  // Options given to create(); an empty mapping when none were passed.
  mapping options;

  // Directory in which merge files are looked up (see get_mergefiles()).
  string mergefile_path;

  int mergefile_counter = 0;

  // Set to 1 as soon as init_fields() has been entered.
  int init_done = 0;

  // Returns a new Sql.Sql object connected to host.
  Sql.Sql get_db()
  {
    return Sql.Sql(host);
  }
};
// Remembers the database URL and the options.
//
// db_url:   SQL URL stored in host.
// _options: optional settings; "mergefiles" names the merge file
//           directory. When it is given, every existing merge file in
//           that directory is deleted.
void create(string db_url, void|mapping _options)
{
  host = db_url;
  // Result is discarded; presumably this only verifies that the
  // database can be reached -- TODO confirm.
  get_db();

  options = _options || ([]);

  string merge_dir = options->mergefiles;
  mergefile_path = merge_dir || "/tmp/";

  if (merge_dir) {
    foreach(get_mergefiles(), string stale_file)
      rm(stale_file);
  }
}
#ifdef SEARCH_DEBUG
// Debug aid: warns when the object goes away while blobs_dirty is set.
void destroy()
{
  if (!blobs_dirty)
    return;
  werror("Search.Database.MySQL: WARNING: Forgot to sync before "
	 "abandoning db object?\n");
}
#endif
// Describes the object using the censored database URL and the merge
// file path.
string _sprintf()
{
  string safe_url = Sql.censor_sql_url(host);
  return sprintf("Search.Database.MySQL(%O,%O)", safe_url, mergefile_path);
}
// Support for old- and new-style padded blobs must be determined at
// runtime. This is because the format must be compatible with whatever
// high-level Search module currently available, specifically the
// compactor.

// -1 until the first call to supports_padded_blobs(), then 0 or 1.
int cache_supports_padded_blobs = -1;

// Returns 1 if Search.Process.Compactor resolves and has a true
// supports_padded_blobs member, otherwise 0. The answer is cached.
int supports_padded_blobs()
{
  if (cache_supports_padded_blobs >= 0)
    return cache_supports_padded_blobs;

  mixed compactor_class = master()->resolv("Search.Process.Compactor");
  int supported = 0;
  if (compactor_class && compactor_class->supports_padded_blobs)
    supported = 1;

  cache_supports_padded_blobs = supported;
  return cache_supports_padded_blobs;
}
40a44d2004-08-07Johan Schön // ---------------------------------------------- // Database initialization // ----------------------------------------------
// Creates every table used by this backend if it does not exist yet,
// and converts an existing word_hit table between the padded and the
// unpadded blob layout so that it matches supports_padded_blobs().
void init_tables()
{
  int use_padded_blobs = supports_padded_blobs();
  Sql.Sql db = get_db();

  db->query(
#"create table if not exists uri (id int unsigned primary key
 auto_increment not null,
 uri blob not null,
 uri_md5 varchar(32) binary not null,
 UNIQUE(uri_md5))"
	    );

  //FIXME: Remove index_language?
  db->query(
#"create table if not exists document (id int unsigned primary key
 auto_increment not null,
 uri_id int unsigned not null,
 language varchar(255) default null,
 INDEX index_language (language),
 INDEX index_uri_id (uri_id))"
	    );

  db->query("create table if not exists deleted_document (doc_id int unsigned not null primary key)");

  // The length columns only exist in the padded layout.
  db->query(
#"create table if not exists word_hit (word varchar("+DB_MAX_WORD_SIZE+#") binary not null,
 first_doc_id int not null, " +
	    (use_padded_blobs ? #"
 used_len int not null,
 real_len int not null, " : "") + #"
 hits mediumblob not null,
 primary key (word,first_doc_id))");

  // A non-empty DESCRIBE result means the table has a used_len column.
  int has_padded_blobs_fields =
    sizeof(db->query("DESCRIBE word_hit used_len"));

  if (use_padded_blobs) {
    if (!has_padded_blobs_fields) {
      // Add used_len and real_len to older tables
      werror("Search: Upgrading '%s.word_hit' table to support padded blobs.\n",
	     (host / "/")[-1]);
      db->query("ALTER TABLE word_hit "
		" ADD COLUMN used_len INT NOT NULL "
		" AFTER first_doc_id, "
		" ADD COLUMN real_len INT NOT NULL "
		" AFTER used_len");
      db->query("UPDATE word_hit "
		" SET used_len = LENGTH(hits), real_len = LENGTH(hits)");
    }
  } else if (has_padded_blobs_fields) {
    // Newer database format found in a context where we don't expect it.
    // In order to not misinterpret or even write incorrect records we
    // must drop the extra fields. (Trying to set the supports flag here
    // will not survive new instances of the MySQL object.)
    werror("Search: Downgrading '%s.word_hit' table to remove padded blobs.\n",
	   (host / "/")[-1]);
    db->query("UPDATE word_hit "
	      " SET hits = LEFT(hits, used_len) "
	      " WHERE used_len < real_len");
    db->query("ALTER TABLE word_hit "
	      " DROP COLUMN used_len, "
	      " DROP COLUMN real_len");
  }

  db->query(
#"create table if not exists lastmodified(doc_id int not null primary key, at int not null, index index_at(at))");

  db->query(
#"create table if not exists link(from_id int not null, to_id int not null, index index_from(from_id), index index_to(to_id))");

  db->query(
#"create table if not exists metadata (doc_id int not null,
 name varchar(32) not null,
 value mediumblob not null,
 index index_doc_id(doc_id))");

  db->query(
#"create table if not exists field (id tinyint unsigned primary key not null,
 name varchar(127) not null,
 UNIQUE(name))");
}
// Deletes all rows from the word_hit, uri, document, deleted_document,
// metadata and lastmodified tables, in that order.
void clear()
{
  Sql.Sql db = get_db();
  foreach(({ "word_hit", "uri", "document", "deleted_document",
	     "metadata", "lastmodified" }), string table)
    db->query("delete from " + table);
}
4c5a2f2000-05-15Martin Nilsson 
40a44d2004-08-07Johan Schön // ---------------------------------------------- // Utility functions // ----------------------------------------------
// Returns the paths of all files matching "mergefile*.dat" in
// mergefile_path. An unreadable directory yields an empty array.
protected array(string) get_mergefiles()
{
  array(string) names =
    glob("mergefile*.dat", get_dir(mergefile_path) || ({ }));

  array(string) paths = ({ });
  foreach(names, string name)
    paths += ({ combine_path(mergefile_path, name) });
  return paths;
}
// Returns the MD5 digest of the UTF-8 encoded url as a hex string.
// Uses the old Crypto API when it is available, otherwise Crypto.MD5.
protected string to_md5(string url)
{
  string encoded = string_to_utf8( url );
#if constant(Crypto.md5) && constant(Crypto.string_to_hex)
  string digest = Crypto.md5()->update( encoded )->digest();
  return Crypto.string_to_hex( digest );
#else
  string digest = Crypto.MD5.hash( encoded );
  return String.string2hex( digest );
#endif
}
40a44d2004-08-07Johan Schön  // ---------------------------------------------- // Document handling // ----------------------------------------------
1394222000-11-10Johan Schön 
// Looks up the id of uri in the uri table via its MD5 digest.
//
// Returns the existing id if there is one. Otherwise returns 0 when
// do_not_create is set, or inserts the uri and returns the new id.
int get_uri_id(string uri, void|int do_not_create)
{
  Sql.Sql db = get_db();
  string digest = to_md5(uri);

  array rows =
    db->query(sprintf("select id from uri where uri_md5='%s'", digest));
  if (sizeof(rows))
    return (int)rows[0]->id;

  if (do_not_create)
    return 0;

  db->query("insert into uri (uri,uri_md5) "
	    "values (%s,%s)",
	    string_to_utf8( uri ), digest);
  return db->master_sql->insert_id();
}
4c5a2f2000-05-15Martin Nilsson 
// Returns the id of the document row for uri and (optionally) language.
//
// uri:           the document URI.
// language:      when given, only a row with exactly this language
//                matches; when omitted, any row for the uri matches.
// do_not_create: when set, nothing is inserted and 0 is returned if
//                either the uri or the document row is missing.
//
// Without do_not_create a missing uri and/or document row is inserted
// and the new document id is returned.
int get_document_id(string uri, void|string language, void|int do_not_create)
{
  int uri_id = get_uri_id(uri, do_not_create);
  if (!uri_id)
    return 0;

  Sql.Sql db = get_db();
  string s = sprintf("select id from document where "
		     "uri_id='%d'", uri_id);
  if (language)
    s += sprintf(" and language='%s'", db->quote(language));

  array a = db->query(s);
  if (sizeof(a))
    return (int)a[0]->id;

  // Honour do_not_create for the document row as well, not only for the
  // uri row; a pure lookup must not insert anything.
  if (do_not_create)
    return 0;

  // Only pass as many arguments as the format string has placeholders.
  if (language)
    db->query("insert into document (uri_id, language) "
	      "values (%d,%s)", uri_id, language);
  else
    db->query("insert into document (uri_id, language) "
	      "values (%d,NULL)", uri_id);
  return db->master_sql->insert_id();
}
4c5a2f2000-05-15Martin Nilsson 
// Fetches uri and language for one or several documents.
//
// doc_id: a single document id, or an array of ids.
//
// For an array the result maps each found id to its row (id, language
// and uri); an empty array gives an empty mapping. For a single id the
// result is a mapping with "uri" and "language", or 0 if the document
// does not exist.
mapping get_uri_and_language(int|array(int) doc_id)
{
  if (arrayp(doc_id)) {
    // "IN ()" is not valid SQL, so answer the empty case directly.
    if (!sizeof(doc_id))
      return ([]);

    Sql.Sql db = get_db();
    array a = db->query("select document.id,document.language, uri.uri from document, uri "
			"where uri.id=document.uri_id and document.id IN ("+
			((array(string))doc_id)*","+")");
    return mkmapping( (array(int))a->id, a );
  }

  Sql.Sql db = get_db();
  array a = db->query("select document.language,uri.uri from document,uri "
		      "where uri.id=document.uri_id and document.id=%d", doc_id);
  if (!sizeof(a))
    return 0;

  // Keep only the two documented columns of the row.
  return (["uri":1,"language":1]) & a[0];
}
// Deletes the uri table row whose MD5 digest matches uri.
void remove_uri(string|Standards.URI uri)
{
  Sql.Sql db = get_db();
  string digest = to_md5((string)uri);
  db->query("delete from uri where uri_md5=%s", digest);
}

// Deletes every uri table row whose uri starts with the given prefix.
void remove_uri_prefix(string|Standards.URI uri)
{
  Sql.Sql db = get_db();
  string quoted_prefix = db->quote((string)uri);
  db->query("delete from uri where uri like '" + quoted_prefix + "%%'");
}
#ifdef SEARCH_DEBUG
// Debug bookkeeping: docs counts removed documents, blobs_dirty is set
// when words have been added to the in-memory blobs.
protected int docs;
protected int blobs_dirty;
#endif

// Removes the document rows for uri and records their ids in
// deleted_document. Does nothing if the uri or its documents are
// unknown.
void remove_document(string|Standards.URI uri, void|string language)
{
#ifdef SEARCH_DEBUG
  docs++;
#endif

  int uri_id = get_uri_id((string)uri, 1);
  if (!uri_id)
    return;

  Sql.Sql db = get_db();
  array rows;
  if (!language) {
    // This also deletes any past language-specific forks
    rows = db->query("select id from document where uri_id=%d", uri_id);
  } else {
    // Need to remove this particular language fork as well as any
    // non-language version of the document (since they are mutually
    // exclusive).
    //
    // Note however that a document with several language forks where
    // one fork is removed will keep that entry since we cannot know
    // which entries that are garbage and hence leave them in place.
    // It is up to the query filter to only show valid forks.
    rows = db->query("select id from document where uri_id=%d and "
		     "(language=%s OR language IS NULL)", uri_id, language);
  }

  if (!sizeof(rows))
    return;

  array ids = rows->id;
  db->query("delete from document where id in (" + ids * "," + ")");
  db->query("insert into deleted_document (doc_id) values " +
	    "(" + ids * "),(" + ")");
}
// Removes every document whose uri starts with the given prefix and
// records the removed ids in deleted_document. Does nothing when no
// document matches.
void remove_document_prefix(string|Standards.URI uri)
{
  Sql.Sql db = get_db();

  // uri may be a Standards.URI object; quote() must be given a string,
  // exactly as remove_uri_prefix() does.
  string uri_string = (string)uri;

  // NOTE(review): '%' and '_' inside the prefix appear to reach LIKE
  // unescaped and would then act as wildcards -- confirm whether that
  // is intended.
  array a = db->query("SELECT document.id AS id"
		      " FROM document, uri "
		      " WHERE document.uri_id=uri.id "
		      " AND uri.uri like '" + db->quote(uri_string) + "%%'");
  if (!sizeof(a))
    return;

  array ids = a->id;

#ifdef SEARCH_DEBUG
  docs += sizeof(ids);
#endif

  db->query("DELETE FROM document "
	    " WHERE id IN (" + (ids * ",") + ")");
  db->query("INSERT INTO deleted_document "
	    "(doc_id) VALUES (" + (ids * "),(") + ")");
}
// Cached result of get_deleted_documents() together with the max id and
// row count of deleted_document it was built from.
protected Search.ResultSet deleted_documents = Search.ResultSet();
protected int deleted_max, deleted_count;

// Returns the ids in deleted_document as a result set. The cached set
// is reused for as long as both the largest id and the row count of the
// table are unchanged.
Search.ResultSet get_deleted_documents()
{
  // FIXME: Make something better

  Sql.Sql db = get_db();
  mapping stats =
    db->query("select max(doc_id) as m, count(*) as c from deleted_document")[0];
  int max_id = (int)stats->m;
  int count = (int)stats->c;

  if (max_id == deleted_max && count == deleted_count)
    return deleted_documents;

  array ids = (array(int))db->query("select doc_id from deleted_document "
				    "order by doc_id")->doc_id;
  deleted_count = count;
  deleted_max = max_id;
  deleted_documents = Search.ResultSet(ids);
  return deleted_documents;
}
// Returns the ids of all rows in the document table, in ascending
// order, as a result set.
Search.ResultSet get_all_documents()
{
  Sql.Sql db = get_db();
  array rows = db->query("SELECT id FROM document ORDER BY id");
  array ids = (array(int))rows->id;
  return Search.ResultSet(ids);
}
40a44d2004-08-07Johan Schön // ---------------------------------------------- // Field handling // ----------------------------------------------
// Cached result of list_fields(); reset to 0 when the field table
// changes.
protected mapping(string:int) list_fields_cache;

// Allocates ids for the standard fields ("uri", "path1", "path2" and
// the filter fields) the first time it is called. init_done is set
// before the loop since allocate_field_id() calls back into this
// function.
protected void init_fields()
{
  if (init_done)
    return;
  init_done = 1;

  array(string) standard_fields =
    ({ "uri", "path1", "path2" }) + Search.get_filter_fields();
  foreach(standard_fields, string field)
    allocate_field_id(field);
}
// Returns a mapping from field name to field id for every row in the
// field table, plus the special field "body" with id 0. The result is
// cached in list_fields_cache.
mapping(string:int) list_fields()
{
  if (!list_fields_cache) {
    init_fields();
    Sql.Sql db = get_db();
    array rows = db->query("select name,id from field") +
      ({ (["name":"body", "id": "0"]) });
    list_fields_cache = mkmapping(rows->name, (array(int))rows->id);
  }
  return list_fields_cache;
}
// Returns the id of field, allocating one if needed.
//
// "body" always has id 0. An already registered field keeps its id.
// Otherwise the field table is locked and the lowest id in 1..63 with
// no row is assigned to the field. Returns -1 if none of those ids is
// free. Errors from the queries are rethrown after the tables have
// been unlocked.
int allocate_field_id(string field)
{
  if (!init_done)
    init_fields();

  if (field == "body")
    return 0;

  Sql.Sql db = get_db();
  array existing = db->query("select id from field where name=%s", field);
  if (sizeof(existing))
    return (int)existing[0]->id;

  db->query("lock tables field write");
  mixed err = catch {
      for (int candidate = 1; candidate < 64; candidate++) {
	array taken =
	  db->query("select name from field where id=%d", candidate);
	if (sizeof(taken))
	  continue;

	db->query("replace into field (id,name) values (%d,%s)",
		  candidate, field);
	// The set of fields changed, so the cached listing is stale.
	list_fields_cache = 0;
	db->query("unlock tables");
	return candidate;
      }
    };

  // Always try to release the lock, then report the first error.
  mixed unlock_err = catch (db->query("unlock tables"));
  if (err) throw (err);
  if (unlock_err) throw (unlock_err);
  return -1;
}
// Field name to field id, filled in by get_field_id().
protected mapping field_cache = ([]);

// Returns the id of field.
//
// "body" is always 0. A field found in the cache or in the field table
// returns its id. An unknown field gives -1 when do_not_create is set,
// otherwise the result of allocate_field_id().
int get_field_id(string field, void|int do_not_create)
{
  // The one special case.
  if (field == "body")
    return 0;

  mixed cached = field_cache[field];
  if (cached)
    return cached;

  init_fields();
  Sql.Sql db = get_db();
  array rows = db->query(sprintf("select id from field where name='%s'",
				 db->quote(field)));
  if (sizeof(rows)) {
    field_cache[field] = (int)rows[0]->id;
    return (int)rows[0]->id;
  }

  return do_not_create ? -1 : allocate_field_id(field);
}
// Removes field from the field table and from both field caches.
void remove_field(string field)
{
  init_fields();

  // Drop the cached knowledge first, then the row itself.
  m_delete(field_cache, field);
  list_fields_cache = 0;

  get_db()->query("delete from field where name=%s", field);
}
// Removes field unless it is one of the standard fields ("uri",
// "path1", "path2" or a filter field).
void safe_remove_field(string field)
{
  array(string) standard_fields =
    ({ "uri", "path1", "path2" }) + Search.get_filter_fields();
  if (has_value(standard_fields, field))
    return;
  remove_field( field );
}
40a44d2004-08-07Johan Schön // ---------------------------------------------- // Word/blob handling // ----------------------------------------------
8758c92001-05-25Johan Schön 
// In-memory accumulator for word hits that have not been written yet.
protected _WhiteFish.Blobs blobs = _WhiteFish.Blobs();

// Flush threshold for blobs->memsize().
#define MAXMEM 64*1024*1024

// Adds the words of one field of a document to the in-memory blobs.
//
// Words whose UTF-8 encoding is longer than DB_MAX_WORD_SIZE are
// dropped. When the accumulated blobs exceed MAXMEM they are flushed
// with mergefile_sync() if the "mergefiles" option is set, otherwise
// with sync().
void insert_words(Standards.URI|string uri, void|string language,
		  string field, array(string) words)
{
  // Remove long words that won't fit into the database.
  array(string) fitting = ({ });
  foreach(words, string word)
    if (sizeof(string_to_utf8(word)) <= DB_MAX_WORD_SIZE)
      fitting += ({ word });
  words = fitting;

  if (!sizeof(words))
    return;

  init_fields();

  int doc_id = get_document_id((string)uri, language);
  int field_id = get_field_id(field);

  blobs->add_words( doc_id, words, field_id );

#ifdef SEARCH_DEBUG
  blobs_dirty = 1;
#endif

  if (blobs->memsize() > MAXMEM) {
    if (options->mergefiles)
      mergefile_sync();
    else
      sync();
  }
}
86f8882001-03-28Johan Schön 
// Returns the distinct words in word_hit that match the glob g, where
// "*" and "?" are translated to the SQL LIKE wildcards "%" and "_".
// At most max_hits words are returned when max_hits is non-zero.
array(string) expand_word_glob(string g, void|int max_hits)
{
  g = replace( string_to_utf8(g), ({ "*", "?" }), ({ "%", "_" }) );

  Sql.Sql db = get_db();
  array rows;
  if (!max_hits)
    rows = db->query("select distinct word from word_hit where word like %s",
		     g);
  else
    rows = db->query("select distinct word from word_hit where word like %s limit %d",
		     g, max_hits);

  return map(rows->word, utf8_to_string);
}
// Suggests a padded length for a blob that currently uses used_len
// bytes:
//
//   - no blobs smaller than 64 bytes
//   - blobs grow 25% and are then raised to the next multiple of 64
//   - the result never exceeds max_blob_size
int get_padded_blob_length(int used_len)
{
  int grown = used_len + (used_len >> 2);
  // Setting the low six bits and adding one gives the next multiple of
  // 64 above the grown length.
  int new_len = (grown | 63) + 1;
  return min(new_len, max_blob_size);
}
// Number of word_hit rows fetched by each query in get_blob().
protected int blobs_per_select = 40;

// Returns blob number num (counted in first_doc_id order) for word, or
// 0 when there is no such blob.
//
// word:      the word to look up (not yet UTF-8 encoded).
// num:       zero-based index of the wanted blob.
// blobcache: optional cache, keyed on the UTF-8 encoded word and then
//            on blob index. Index -1 marks that the end of the word's
//            blobs has been seen. Up to blobs_per_select blobs starting
//            at num are stored in it on a miss. When omitted, a
//            temporary cache is used for the duration of the call.
string get_blob(string word, int num,
		void|mapping(string:mapping(int:string)) blobcache)
{
  // The argument is optional, so don't index into 0.
  if (!blobcache)
    blobcache = ([]);

  word = string_to_utf8( word );

  mapping(int:string) cached = blobcache[word];
  if (cached) {
    if (cached[num])
      return cached[num];
    if (cached[-1]) {
      // The last fetch already reached the end of this word's blobs.
#ifdef SEARCH_DEBUG
      times[word] = 0;
#endif
      return 0;
    }
  }

#ifdef SEARCH_DEBUG
  int t0 = gethrtime();
#endif

  int use_padded_blobs = supports_padded_blobs();
  Sql.Sql db = get_db();
  array a = db->query(" SELECT hits, first_doc_id " +
		      (use_padded_blobs ? ", used_len, real_len " : "") +
		      " FROM word_hit "
		      " WHERE word = %s "
		      "ORDER BY first_doc_id "
		      " LIMIT %d,%d",
		      word, num, blobs_per_select);

#ifdef SEARCH_DEBUG
  int t1 = gethrtime()-t0;
  times[word] += t1;
  werror("word: %O time accum: %.2f ms delta_t: %.2f\n",
	 word, times[word]/1000.0, t1/1000.0);
#endif

  blobcache[word] = ([]);
  // A short result means there are no more blobs after these.
  if (sizeof( a ) < blobs_per_select)
    blobcache[word][-1] = "";

  if (!sizeof(a)) {
#ifdef SEARCH_DEBUG
    times[word] = 0;
#endif
    return 0;
  }

  foreach(a, mapping m) {
    if (use_padded_blobs) {
      // Each blob may be padded with trailing space to reduce fragmentation.
      // The feeder requesting the data will however not understand that so
      // we cut it off. In the unlikely event that real_len isn't the true
      // length we take care of that as well (this would indicate something
      // fishy in the writing of padded blobs).
      int used_len = (int) m->used_len;
      int real_len = (int) m->real_len;
      if ((used_len < real_len) || (real_len != sizeof(m->hits)))
	m->hits = m->hits[..(used_len - 1)];
    }

    blobcache[word][num++] = m->hits;
  }

  // The loop trimmed the row mappings in place, so this is the trimmed
  // first blob.
  return a[0]->hits;
}
40a44d2004-08-07Johan Schön  // ---------------------------------------------- // Metadata handling // ----------------------------------------------
// Deletes all metadata rows of a document.
//
// uri:      the document URI, or an integer document id.
// language: language fork to look up when uri is not an id.
//
// A uri is resolved with do_not_create set; if that yields 0 the delete
// targets doc_id 0.
void remove_metadata(int|Standards.URI|string uri, void|string language)
{
  int doc_id;
  // An integer is already a document id; previously this case left
  // doc_id at 0.
  if (intp(uri))
    doc_id = uri;
  else
    doc_id = get_document_id((string)uri, language, 1);

  Sql.Sql db = get_db();
  db->query("delete from metadata where doc_id = %d", doc_id);
}
// Builds the " and name IN (...)" SQL fragment restricting a metadata
// query to wanted_fields, or "" when no fields are given.
protected string make_fields_sql(void|array(string) wanted_fields)
{
  Sql.Sql db = get_db();
  if (wanted_fields && sizeof(wanted_fields))
    return " and name IN ('" + map(wanted_fields, db->quote) * "','" + "')";
  return "";
}

// Returns the metadata of a document as a mapping from field name to
// decoded value.
//
// uri:           an integer document id, or a URI that is resolved with
//                get_document_id().
// language:      language fork to look up when uri is not an id.
// wanted_fields: when given and non-empty, only these fields are
//                fetched.
//
// The "body" field is inflated when Gz is available, and every value is
// decoded from UTF-8.
mapping(string:string) get_metadata(int|Standards.URI|string uri,
				    void|string language,
				    void|array(string) wanted_fields)
{
  int doc_id;
  if (intp(uri))
    doc_id = uri;
  else
    doc_id = get_document_id((string)uri, language);

  Sql.Sql db = get_db();
  array a = db->query("select name,value from metadata where doc_id=%d" +
		      make_fields_sql(wanted_fields),
		      doc_id);
  mapping md = mkmapping(a->name, a->value);

#if constant(Gz)
  if (md->body)
    md->body = Gz.inflate()->inflate(md->body);
#endif

  foreach(indices(md), string field)
    md[field] = utf8_to_string(md[field]);

  return md;
}

// Returns a mapping from document id to the stored value of
// wanted_field for the given documents. The values are returned as
// stored. An empty doc_ids gives an empty mapping.
mapping(int:string) get_special_metadata(array(int) doc_ids,
					 string wanted_field)
{
  // "IN ()" is not valid SQL, so answer the empty case directly.
  if (!sizeof(doc_ids))
    return ([]);

  Sql.Sql db = get_db();
  array a = db->query("select doc_id,value from metadata where doc_id IN (" +
		      ((array(string))doc_ids) * "," +
		      ") and name = %s", wanted_field);

  return mkmapping( (array(int))a->doc_id, a->value );
}

// ----------------------------------------------
// Date stuff
// ----------------------------------------------
39fa4d2001-08-20Johan Schön 
// Stores metadata for a document, replacing existing rows with the
// same field names.
//
// uri:      the document URI (the document is created if needed), or an
//           integer document id.
// language: language fork to use when uri is not an id.
// md:       field name to value. The mapping itself is left untouched.
//
// The "body" field is word-normalized and UTF-8 encoded; when Gz is
// available it is first cut to its first 64001 characters and then
// deflated. All other values are UTF-8 encoded. Nothing is written
// when md is empty.
void set_metadata(int|Standards.URI|string uri, void|string language,
		  mapping(string:string) md)
{
  int doc_id;
  // An integer is already a document id; previously this case left
  // doc_id at 0.
  if (intp(uri))
    doc_id = uri;
  else
    doc_id = get_document_id((string)uri, language);

  init_fields();

  if (!sizeof(md))
    return;

  // Work on a private copy so the caller's mapping isn't overwritten
  // with encoded and compressed values.
  md = md + ([]);

  // Still our one, single special case
  if (md->body)
  {
    if (sizeof(md->body))
      md->body = Unicode.normalize( Unicode.split_words_and_normalize( md->body ) * " ", "C");
#if constant(Gz)
    md->body = Gz.deflate(6)->deflate(string_to_utf8(md->body[..64000]),
				      Gz.FINISH);
#else
    // get_metadata() decodes every value from UTF-8, so the body must
    // be encoded even when it can't be compressed.
    md->body = string_to_utf8(md->body);
#endif
  }

  foreach(indices(md), string ind)
    if (ind != "body")
      md[ind] = string_to_utf8(md[ind]);

  Sql.Sql db = get_db();
  array(string) tuples = ({ });
  foreach(md; string name; string value)
    tuples += ({ sprintf("(%d,'%s','%s')", doc_id,
			 db->quote(name), db->quote(value)) });

  db->query("replace into metadata (doc_id, name, value) values " +
	    tuples * ", ");
}
// Records when as the last-modified time of the document for uri and
// language, replacing any earlier value.
void set_lastmodified(Standards.URI|string uri, void|string language,
		      int when)
{
  int doc_id = get_document_id((string)uri, language);
  get_db()->query("replace into lastmodified (doc_id, at) values (%d,%d)",
		  doc_id, when);
}
// Returns the stored last-modified time of the document for uri and
// language, or 0 when none is stored.
// NOTE(review): the declared type also allows an array of URIs, but the
// argument is always cast to a single string -- confirm intended use.
int get_lastmodified(Standards.URI|string|array(Standards.URI|string) uri,
		     void|string language)
{
  int doc_id = get_document_id((string)uri, language);
  Sql.Sql db = get_db();
  array rows =
    db->query("select at from lastmodified where doc_id=%d", doc_id);
  if (!sizeof( rows ))
    return 0;
  return (int)rows[0]->at;
}
// Gives every document a random last-modified time within the 365 days
// before now.
void randomize_dates()
{
  Sql.Sql db = get_db();
  array(string) doc_ids = db->query("select id from document")->id;
  foreach(doc_ids, string id) {
    int at = random(365*24*3600) + time() - 365*24*3600;
    db->query("replace into lastmodified (doc_id,at) values (%s,%d)",
	      id, at);
  }
}
protected
{
  // Date set built from lastmodified, and the largest doc_id that was
  // present when it was last extended.
  _WhiteFish.DateSet dateset_cache;
  int dateset_cache_max_doc_id = -1;

  // Returns the largest doc_id in lastmodified, or 0 if the table is
  // empty.
  int get_max_doc_id()
  {
    Sql.Sql db = get_db();
    array rows =
      db->query("select doc_id from lastmodified order by doc_id desc limit 1");
    return sizeof(rows) ? (int)rows[0]->doc_id : 0;
  }
};

// Returns a date set with the last-modified time of every document in
// lastmodified. The set is extended with rows above the previously
// seen largest doc_id.
_WhiteFish.DateSet get_global_dateset()
{
  int max_doc_id = get_max_doc_id();
  if (max_doc_id == dateset_cache_max_doc_id)
    return dateset_cache;

  Sql.Sql db = get_db();
  array rows = db->query("select doc_id,at from lastmodified where "
			 "doc_id > %d order by doc_id asc",
			 dateset_cache_max_doc_id);

  dateset_cache_max_doc_id = max_doc_id;
  if (!dateset_cache)
    dateset_cache = _WhiteFish.DateSet();
  dateset_cache->add_many( (array(int))rows->doc_id,
			   (array(int))rows->at );
  return dateset_cache;
}
protected
{
  // Date set built from 'publish-time' metadata, and the get_max_doc_id()
  // value seen when it was last extended.
  _WhiteFish.DateSet publ_dateset_cache;
  int publ_dateset_cache_max_doc_id = -1;
};

// Returns a date set with the 'publish-time' metadata value of every
// document that has one. The set is extended with rows above the
// previously seen get_max_doc_id() value.
_WhiteFish.DateSet get_global_publ_dateset()
{
  int max_doc_id = get_max_doc_id();
  if (max_doc_id == publ_dateset_cache_max_doc_id)
    return publ_dateset_cache;

  Sql.Sql db = get_db();
  array(mapping(string:mixed)) rows =
    db->query("SELECT doc_id, value FROM metadata "
	      " WHERE name = 'publish-time' "
	      " AND doc_id > %d ORDER BY doc_id ASC",
	      publ_dateset_cache_max_doc_id);

  publ_dateset_cache_max_doc_id = max_doc_id;
  if (!publ_dateset_cache)
    publ_dateset_cache = _WhiteFish.DateSet();
  publ_dateset_cache->add_many( (array(int))rows->doc_id,
				(array(int))rows->value );
  return publ_dateset_cache;
}
// ----------------------------------------------
// Link handling
// ----------------------------------------------

// Records a link from the document for uri to each document in links.
// Source and target documents are looked up with get_document_id()
// using the same language. Does nothing when links is 0 or empty.
void add_links(Standards.URI|string uri, void|string language,
	       array(Standards.URI|string) links)
{
  if (!(links && sizeof(links)))
    return;

  int doc_id = get_document_id((string)uri, language);

  array(string) tuples = ({ });
  foreach(links, Standards.URI|string link) {
    int to_id = get_document_id( (string)link, language );
    tuples += ({ sprintf("(%d, %d)", doc_id, to_id) });
  }

  string res = "replace into link (from_id, to_id) values " + tuples * ", ";
  Sql.Sql db = get_db();
  db->query(res);
}
feb96a2001-05-31Johan Schön 
// Deletes all links going out from the document for uri and language.
// The document is looked up with do_not_create set.
void remove_links(Standards.URI|string uri, void|string language)
{
  int from_id = get_document_id((string)uri, language, 1);
  get_db()->query("delete from link where from_id=%d", from_id);
}

// Not implemented: runs a placeholder query and returns 0.
array(int) get_broken_links()
{
  get_db()->query("select 'Not yet done :-)'");
  return 0;
}
40a44d2004-08-07Johan Schön // ---------------------------------------------- // Sync stuff // ----------------------------------------------
// Callback registered with set_sync_callback().
protected function sync_callback;

// Registers f as the sync callback, replacing any previous one.
void set_sync_callback( function f )
{
  sync_callback = f;
}
86f8882001-03-28Johan Schön 
// The maximum blob size on disk must be at least big enough to hold as
// many entries that can be found in a single document. This is needed so
// a split blob doesn't get the same docid in separate records.
//
// We can get at most 255 occurrences of the same word from each document,
// and if all of those are the same word AND the update takes place
// incrementally we'll write [ docid | nhits | hit ] for every occurrence,
// i.e. 7 bytes every time. Minimum blob size is therefore 1785 bytes.
constant max_blob_size = 512 * 1024;
40a44d2004-08-07Johan Schön 
// Splits blob into chunks on entry boundaries.
//
// Entry layout inside a blob:
//
//   +-----------+----------+---------+---------+---------+
//   | docid: 32 | nhits: 8 | hit: 16 | hit: 16 | hit: 16 |...
//   +-----------+----------+---------+---------+---------+
//
// blob_size      Number of bytes already occupied in the record that the
//                first chunk will be appended to (0 for a fresh record).
// blob           The hit data to split.
// max_blob_size  Upper bound for a record. Entries are added to a chunk
//                only while the running size is below max_blob_size - 517.
//
// Returns an array of ({ first_doc_id, chunk }) pairs. The first chunk may
// be the empty string when blob_size is already at the limit; it is then
// paired with the docid of the first entry in blob.
//
// Trailing data of 5 bytes or less is never emitted since the loops require
// end + 5 < sizeof(blob). If max_blob_size is 517 or less no entry is ever
// consumed and the outer loop does not terminate.
protected array(array(int|string)) split_blobs(int blob_size, string blob,
                                               int max_blob_size)
{
  // docid (4 bytes) + nhits (1 byte) precede the hits of every entry.
  constant entry_header_len = 4 + 1;

  int first_doc_id;
  sscanf(blob, "%4c", first_doc_id);

  int blob_len = sizeof(blob);
  int fill_limit = max_blob_size - 517;
  int start = 0, end = 0;
  array chunks = ({});

  while (end + 5 < blob_len) {
    // Consume whole entries until the chunk is full or the data ends.
    while (end + 5 < blob_len && blob_size < fill_limit) {
      int entry_len = entry_header_len + 2 * blob[end + 4];
      end += entry_len;
      blob_size += entry_len;
    }

    string chunk = blob[start..end - 1];
    if (sizeof(chunk))
      sscanf(chunk, "%4c", first_doc_id);
    chunks += ({ ({ first_doc_id, chunk }) });

    // Every chunk after the first goes into a fresh record.
    start = end;
    blob_size = 0;
  }

  return chunks;
}
// Writes pending word/blob pairs to the word_hit table.
//
// mergedfilename  Optional path to a merge file. When given, word/blob
//                 pairs are read from that file, and the file is closed and
//                 removed after a successful run. Otherwise pairs are read
//                 from the in-memory blobs object, in which case the word is
//                 UTF-8 encoded and the blob is re-serialized before use.
//
// The word_hit table is locked for writing during the run; the lock is
// released and re-taken every 32 words. If the write loop throws, the
// tables are unlocked first and the error is then re-thrown; the sync
// callback and merge file cleanup are skipped in that case.
protected void store_to_db( void|string mergedfilename )
{
  Search.MergeFile mergedfile;
  if (mergedfilename)
    mergedfile = Search.MergeFile(Stdio.File(mergedfilename, "r"));

  int use_padded_blobs = supports_padded_blobs();

  int s = time();
  int q;
  Sql.Sql db = get_db();

  // Inserts new_blobs as new records for word. All records except the last
  // are stored unpadded; the last one gets trailing padding for future
  // in-place appends. Does nothing for an empty list.
  void add_padded_blobs(string word, array new_blobs)
  {
    if (!sizeof(new_blobs))
      return;

    //  Write all blobs except the last one that should be padded
    foreach (new_blobs[..<1], array new_blob_pair) {
      [int first_doc_id, string blob] = new_blob_pair;
      int new_used_len = sizeof(blob);
      db->query("INSERT INTO word_hit "
                " (word, first_doc_id, used_len, real_len, hits)"
                " VALUES (%s, %d, %d, %d, %s)",
                word, first_doc_id, new_used_len, new_used_len, blob);
    }

    //  Write final blob with padding
    [int first_doc_id, string blob] = new_blobs[-1];
    int new_used_len = sizeof(blob);
    int new_real_len = get_padded_blob_length(new_used_len);
    int space_count = new_real_len - new_used_len;
    db->query("INSERT INTO word_hit "
              " (word, first_doc_id, used_len, real_len, hits)"
              " VALUES (%s, %d, %d, %d, CONCAT(%s, SPACE(%d)))",
              word, first_doc_id, new_used_len, new_real_len,
              blob, space_count);
  };

  // Inserts every entry of new_blobs as a new unpadded record for word.
  void add_oldstyle_blobs(string word, array new_blobs)
  {
    foreach (new_blobs, array new_blob_pair) {
      [int first_doc_id, string blob] = new_blob_pair;
      db->query("INSERT INTO word_hit "
                " (word, first_doc_id, hits)"
                " VALUES (%s, %d, %s)",
                word, first_doc_id, blob);
    }
  };

#ifdef SEARCH_DEBUG
  werror("----------- sync() %4d docs --------------\n", docs);
#endif
  db->query("LOCK TABLES word_hit LOW_PRIORITY WRITE");

  mixed err = catch {
    // Single-record inserts for previously unseen words are collected here
    // and sent as one multi-row INSERT.
    String.Buffer multi_query = String.Buffer();

    while (1) {
      string word, blob;
      if (mergedfilename) {
        array a = mergedfile->get_next_word_blob();
        if (!a)
          break;
        [word, blob] = a;
      } else {
        [word, blob] = blobs->read();
        if (!word)
          break;
        word = string_to_utf8(word);

        //  Blob hits are grouped by docid but not sorted internally. We need
        //  to store in sorted form since the hit analysis depend on it. The
        //  data() method in Blob performs sorting so instantiate a temp blob
        //  just to access this.
        blob = _WhiteFish.Blob(blob)->data();
      }

      q++;

      //  Don't unlock and lock every word to reduce overhead
      if (q % 32 == 0) {
        db->query("UNLOCK TABLES");
        db->query("LOCK TABLES word_hit LOW_PRIORITY WRITE");
      }

      //  We only care about the most recent blob for the given word so look
      //  for the highest document ID.
      array old;
      if (use_padded_blobs) {
        old = db->query(" SELECT first_doc_id, used_len, real_len "
                        " FROM word_hit "
                        " WHERE word=%s "
                        "ORDER BY first_doc_id DESC "
                        " LIMIT 1", word);
      } else {
        old = db->query(" SELECT first_doc_id, LENGTH(hits) AS used_len "
                        " FROM word_hit "
                        " WHERE word=%s "
                        "ORDER BY first_doc_id DESC "
                        " LIMIT 1", word);
      }

      if (sizeof(old)) {
        //  NOTE: Concatenation of hits info is strictly speaking not correct
        //  in the general case since we may have the same docid repeated. In
        //  practice the only code path that adds words also invalidates the
        //  old docid and gets a fresh one.
        int used_len = (int) old[-1]->used_len;
        int real_len =
          use_padded_blobs ? ((int) old[-1]->real_len) : used_len;
        int first_doc_id = (int) old[-1]->first_doc_id;

        //  Can the new blob fit in the existing padding space?
        //
        //  NOTE: This is never true for old-style blobs.
        if (real_len - used_len >= sizeof(blob)) {
          //  Yes, update in place
          db->query(" UPDATE word_hit "
                    " SET hits = INSERT(hits, %d, %d, %s), "
                    " used_len = %d "
                    " WHERE word = %s "
                    " AND first_doc_id = %d",
                    used_len + 1, sizeof(blob), blob,
                    used_len + sizeof(blob),
                    word, first_doc_id);
        } else if (used_len + sizeof(blob) <= max_blob_size) {
          //  The old blob can grow to accommodate the new data without
          //  exceeding the maximum blob size.
          if (use_padded_blobs) {
            //  Make sure we make room for new padding for future use
            int new_used_len = used_len + sizeof(blob);
            int new_real_len = get_padded_blob_length(new_used_len);
            int space_count = new_real_len - new_used_len;
            db->query("UPDATE word_hit "
                      " SET hits = INSERT(hits, %d, %d, CONCAT(%s, SPACE(%d))),"
                      " used_len = %d, "
                      " real_len = %d "
                      " WHERE word = %s "
                      " AND first_doc_id = %d",
                      used_len + 1, sizeof(blob) + space_count,
                      blob, space_count,
                      new_used_len, new_real_len,
                      word, first_doc_id);
          } else {
            //  Append blob data to old record
            db->query("UPDATE word_hit "
                      " SET hits = CONCAT(hits, %s) "
                      " WHERE word = %s "
                      " AND first_doc_id = %d",
                      blob, word, first_doc_id);
          }
        } else {
          //  Need to split blobs
          array new_blobs = split_blobs(used_len, blob, max_blob_size);
          blob = new_blobs[0][1];

          if (use_padded_blobs) {
            //  Write the first chunk at the end of the existing blob and
            //  remove any left-over padding by giving a sufficiently bigger
            //  blob size as third parameter compared to the actual data.
            int new_used_len = used_len + sizeof(blob);
            db->query("UPDATE word_hit "
                      " SET hits = INSERT(hits, %d, %d, %s), "
                      " used_len = %d, "
                      " real_len = %d "
                      " WHERE word = %s "
                      " AND first_doc_id = %d",
                      used_len + 1, sizeof(blob) + max_blob_size, blob,
                      new_used_len, new_used_len,
                      word, first_doc_id);
          } else {
            //  Write the first chunk at the end of the existing blob
            db->query("UPDATE word_hit "
                      " SET hits = CONCAT(hits, %s) "
                      " WHERE word = %s "
                      " AND first_doc_id = %d",
                      blob, word, first_doc_id);
          }

          //  Write remaining ones
          if (use_padded_blobs)
            add_padded_blobs(word, new_blobs[1..]);
          else
            add_oldstyle_blobs(word, new_blobs[1..]);
        }
      } else {
        //  No existing entries so create new blobs
        if (sizeof(blob) > max_blob_size) {
          //  Blobs must be split in several records
          array new_blobs = split_blobs(0, blob, max_blob_size);
          if (use_padded_blobs)
            add_padded_blobs(word, new_blobs);
          else
            add_oldstyle_blobs(word, new_blobs);
        } else {
          //  Queue writing of single blob
          int first_doc_id;
          sscanf(blob, "%4c", first_doc_id);
          string new_query;
          if (use_padded_blobs) {
            int new_used_len = sizeof(blob);
            int new_real_len = get_padded_blob_length(new_used_len);
            int space_count = new_real_len - new_used_len;
            new_query =
              sprintf("('%s', %d, %d, %d, CONCAT('%s', SPACE(%d)))",
                      db->quote(word), first_doc_id,
                      new_used_len, new_real_len,
                      db->quote(blob), space_count);
          } else {
            new_query = sprintf("('%s', %d, '%s')",
                                db->quote(word), first_doc_id,
                                db->quote(blob));
          }

          //  If aggregated query is too big we run the old one now.
          //  get() also empties the buffer.
          if (sizeof(multi_query) + sizeof(new_query) > 900 * 1024)
            db->query(multi_query->get());

          //  Append to delayed query
          if (!sizeof(multi_query)) {
            multi_query->add("INSERT INTO word_hit ",
                             (use_padded_blobs ?
                              " (word, first_doc_id, used_len, real_len, hits) " :
                              " (word, first_doc_id, hits) "),
                             "VALUES ",
                             new_query);
          } else {
            multi_query->add(",", new_query);
          }
        }
      }
    }

    if (sizeof(multi_query))
      db->query(multi_query->get());
  }; // catch

  // Always release the table lock; the loop error takes precedence over a
  // failure to unlock.
  mixed unlock_err = catch (db->query("UNLOCK TABLES"));
  if (err) throw (err);
  if (unlock_err) throw (unlock_err);

  if (sync_callback)
    sync_callback();

  if (mergedfilename) {
    mergedfile->close();
    rm(mergedfilename);
  }
#ifdef SEARCH_DEBUG
  werror("----------- sync() done %3ds %5dw -------\n", time()-s,q);
#endif

#ifdef SEARCH_DEBUG
  blobs_dirty = 0;
#endif
}
// Returns the path of the merge file for the current mergefile_counter:
// "mergefileNNN.dat" (counter zero-padded to at least three digits)
// combined with mergefile_path.
protected string get_mergefilename()
{
  string basename = sprintf("mergefile%03d.dat", mergefile_counter);
  return combine_path(mergefile_path, basename);
}
// Writes the in-memory blobs to a new merge file, invokes the sync
// callback if one is set, then advances the merge file counter and
// replaces blobs with a fresh, empty collection.
protected void mergefile_sync()
{
#ifdef SEARCH_DEBUG
  System.Timer t = System.Timer();
  werror("----------- mergefile_sync() %4d docs --------------\n", docs);
#endif
  // "wct": open for writing, create if missing, truncate existing content.
  string fn = get_mergefilename();
  Search.MergeFile mergefile = Search.MergeFile(Stdio.File(fn, "wct"));
  mergefile->write_blobs(blobs);

  if (sync_callback)
    sync_callback();
#ifdef SEARCH_DEBUG
  werror("----------- mergefile_sync() done %.3f s %2.1f MB -------\n",
         t->get(),
         file_stat(fn)->size/(1024.0*1024.0));
#endif

  mergefile_counter++;
  blobs = _WhiteFish.Blobs();
}
// Merges the given merge files into one and returns the file name of the
// result.
//
// A single file is returned as is. More than two files are split in two
// halves that are merged recursively (first half first) before the two
// results are merged. Exactly two files are merged into a newly named
// merge file, after which both inputs are removed.
//
// The list is expected to contain at least one file name.
protected string merge_mergefiles(array(string) mergefiles)
{
#ifdef SEARCH_DEBUG
  werror("merge_mergefiles( %s )\n", mergefiles*", ");
#endif
  int n = sizeof(mergefiles);

  if (n == 1)
    return mergefiles[0];

  if (n > 2) {
    int pivot = n / 2;
    string first_half = merge_mergefiles(mergefiles[..pivot-1]);
    string second_half = merge_mergefiles(mergefiles[pivot..]);
    return merge_mergefiles( ({ first_half, second_half }) );
  }

  // Else: actually merge two mergefiles
  string mergedfile_fn = get_mergefilename();
  mergefile_counter++;

  Search.MergeFile mergedfile =
    Search.MergeFile(Stdio.File(mergedfile_fn, "wct"));

  System.Timer t = System.Timer();
  Search.MergeFile in_a = Search.MergeFile(Stdio.File(mergefiles[0], "r"));
  Search.MergeFile in_b = Search.MergeFile(Stdio.File(mergefiles[1], "r"));
  mergedfile->merge_mergefiles(in_a, in_b);

#ifdef SEARCH_DEBUG
  werror("Merging %s (%.1f MB) took %.1f s\n",
         mergedfile_fn, file_stat(mergedfile_fn)->size/(1024.0*1024.0),
         t->get());
#endif

  rm(mergefiles[0]);
  rm(mergefiles[1]);
  return mergedfile_fn;
}
// Flushes pending word data to the database.
//
// Without the mergefiles option the in-memory blobs are stored directly
// and then replaced with an empty collection. With the option set, the
// blobs are first written to a merge file, after which all merge files
// (sorted by name) are merged and the result is stored.
void sync()
{
  if (!options->mergefiles) {
    store_to_db();
    blobs = _WhiteFish.Blobs();
  } else {
    mergefile_sync();
    array pending = sort(get_mergefiles());
    string merged_fn = merge_mergefiles(pending);
    store_to_db(merged_fn);
  }
#ifdef SEARCH_DEBUG
  docs = 0;
#endif
}
#ifdef SEARCH_DEBUG
mapping times = ([ ]);
#endif

// ----------------------------------------------
// Statistics
// ----------------------------------------------

// Returns the memory size reported by the in-memory blobs collection.
int memsize()
{
  int size = blobs->memsize();
  return size;
}
d8aff32001-07-16Johan Schön 
// Returns a mapping from language to the number of rows in the document
// table with that language.
//
// The counts are cast to integers so that the result matches the declared
// mapping(string|int:int) type instead of carrying the raw result column.
mapping(string|int:int) get_language_stats()
{
  Sql.Sql db = get_db();
  array a =
    db->query("select count(id) as c,language from document group by language");
  return mkmapping( a->language, (array(int))a->c );
}

// Returns the number of distinct words in the word_hit table, or 0 if the
// query yields no rows.
int get_num_words()
{
  Sql.Sql db = get_db();
  // The appended fallback row guarantees that index 0 exists.
  return (int)(db->query("select count(distinct word) as c from word_hit") +
               ({ (["c": 0]) }))[0]->c;
}

// Returns the sum of Data_length and Index_length over all tables listed
// by "show table status".
int get_database_size()
{
  Sql.Sql db = get_db();
  int size;
  foreach(db->query("show table status"), mapping table)
    size += (int)table->Data_length + (int)table->Index_length;
  return size;
}

// Returns the number of rows in the deleted_document table.
int get_num_deleted_documents()
{
  Sql.Sql db = get_db();
  return (int)db->query("select count(*) as c from deleted_document")[0]->c;
}
// Decodes the UTF-8 encoded string in and returns it normalized to
// Unicode normalization form C.
protected string my_denormalize(string in)
{
  string decoded = utf8_to_string(in);
  return Unicode.normalize(decoded, "C");
}
// Returns the words with the largest total hit data as an array of
// ({ word, c }) pairs in descending order of c, where c is the summed
// blob length for the word divided by 5, cast to int.
//
// count  Maximum number of words to return; 10 when zero or omitted.
//
// Returns an empty array if the word_hit table has no rows.
array(array) get_most_common_words(void|int count)
{
  Sql.Sql db = get_db();

  // Padded blobs track their payload size in used_len; old-style blobs
  // have no padding so the length of hits is used.
  string count_column = supports_padded_blobs() ?
    " SUM(used_len) / 5 AS c " :
    " SUM(LENGTH(hits)) / 5 AS c ";

  array rows =
    db->query(" SELECT word, " +
              count_column +
              " FROM word_hit "
              " GROUP BY word "
              " ORDER BY c DESC "
              " LIMIT %d", count || 10);

  if (!sizeof(rows))
    return ({ });

  array words = map(rows->word, my_denormalize);
  array(int) counts = (array(int))rows->c;
  return Array.transpose( ({ words, counts }) );
}
// Calls cb once for every uri in the uri table that starts with
// url_prefix.
//
// The prefix is matched literally: the LIKE metacharacters '%' and '_' as
// well as the escape character '\' are escaped before the value is quoted
// into the pattern, so e.g. an underscore in the prefix no longer matches
// an arbitrary character.
//
// NOTE(review): relies on '\' being the LIKE escape character, which is
// the MySQL default -- confirm that NO_BACKSLASH_ESCAPES is not in use.
void list_url_by_prefix(string url_prefix, function(string:void) cb)
{
  Sql.Sql db = get_db();

  string literal_prefix = replace(url_prefix,
                                  ({ "\\",   "%",   "_"   }),
                                  ({ "\\\\", "\\%", "\\_" }));

  Sql.sql_result q =
    db->big_query("SELECT uri "
                  " FROM uri "
                  " WHERE uri LIKE '"+db->quote(literal_prefix)+"%'");

  array row;
  while ((row = q->fetch_row()))
    cb(row[0]);
}