eb01b4 | 2010-10-26 | Martin Stjernholm | | #pike __REAL_VERSION__
|
9c9da7 | 2001-06-23 | Johan Schön | |
|
f863bc | 2001-06-10 | Per Hedbor | | inherit .Base;
|
4c5a2f | 2000-05-15 | Martin Nilsson | |
|
ec7cbe | 2001-03-18 | Johan Schön | |
|
40a44d | 2004-08-07 | Johan Schön | |
|
ed6e88 | 2017-09-20 | Henrik Grubbström (Grubba) | |
|
cf68c3 | 2007-11-07 | Marcus Wellhardh | | #define DB_MAX_WORD_SIZE 64
|
40a44d | 2004-08-07 | Johan Schön | |
|
protected
{
  //! SQL URL of the database holding the index; assigned in create().
  string host;

  //! Returns a database handle for @[host]. A new Sql.Sql object is
  //! constructed on every call.
  Sql.Sql get_db()
  {
    Sql.Sql connection = Sql.Sql(host);
    return connection;
  }

  //! Option mapping; insert_words() consults @expr{options->mergefiles@}.
  mapping options;

  //! Directory searched by get_mergefiles(); create() defaults it to "/tmp/".
  string mergefile_path;

  // NOTE(review): not referenced in the visible part of the file;
  // presumably numbers the merge files written by mergefile_sync().
  int mergefile_counter = 0;

  //! Set to 1 by init_fields() once the default fields have been allocated.
  int init_done = 0;
};
|
//! Sets up the database backend.
//!
//! @param db_url
//!   SQL URL of the database; a connection is opened right away so
//!   that an unusable URL fails here.
//! @param options
//!   Optional settings. @expr{options->mergefiles@} is the directory
//!   used for merge files; when it is set, stale merge files found
//!   there are removed.
protected void create(string db_url, mapping options = ([]))
{
  host = db_url;
  get_db();

  // Tolerate an explicit zero from callers.
  if (!options)
    options = ([]);

  // The parameter shadows the object variable of the same name, so the
  // object variable has to be assigned explicitly. insert_words() reads
  // it later and would otherwise index an uninitialized value.
  this::options = options;

  mergefile_path = options->mergefiles;
  if (!mergefile_path)
    mergefile_path = "/tmp/";

  if (options->mergefiles)
    foreach(get_mergefiles(), string fn)
      rm(fn);
}
|
335c2d | 2013-02-06 | Jonas Walldén | | #ifdef SEARCH_DEBUG
|
// Debug aid: complains on stderr when the object is destructed while
// words added by insert_words() are still flagged as unsynced.
protected void _destruct()
{
  if (!blobs_dirty)
    return;

  werror("Search.Database.MySQL: WARNING: Forgot to sync before "
         "abandoning db object?\n");
}
#endif
|
// Printable representation showing the censored database URL and the
// merge file directory. The format character @[t] is not consulted.
protected string _sprintf(int t)
{
  string censored_url = Sql.censor_sql_url(host);
  return sprintf("Search.Database.MySQL(%O,%O)",
                 censored_url, mergefile_path);
}
|
4005e5 | 2013-06-14 | Jonas Walldén | |
// Tri-state cache for supports_padded_blobs(): -1 means "not probed yet".
int cache_supports_padded_blobs = -1;

//! Returns 1 if Search.Process.Compactor can be resolved and exposes a
//! true @expr{supports_padded_blobs@} value, otherwise 0. The answer is
//! computed on the first call and cached afterwards.
int supports_padded_blobs()
{
  if (cache_supports_padded_blobs >= 0)
    return cache_supports_padded_blobs;

  mixed compactor = master()->resolv("Search.Process.Compactor");
  cache_supports_padded_blobs =
    (compactor && compactor->supports_padded_blobs) ? 1 : 0;
  return cache_supports_padded_blobs;
}
|
40a44d | 2004-08-07 | Johan Schön | |
|
// Creates the tables uri, document, deleted_document, word_hit,
// lastmodified, link, metadata and field unless they already exist,
// and migrates word_hit between the padded and non-padded blob layout
// depending on what supports_padded_blobs() reports.
dc5a14 | 2001-06-27 | Johan Schön | | void init_tables()
|
7ec58b | 2000-10-26 | Johan Schön | | {
|
// Decides whether word_hit carries the used_len/real_len columns.
4005e5 | 2013-06-14 | Jonas Walldén | | int use_padded_blobs = supports_padded_blobs();
|
352471 | 2015-05-26 | Martin Nilsson | |
|
0f020c | 2015-10-07 | Henrik Grubbström (Grubba) | | Sql.Sql db = get_db();
|
4c5a2f | 2000-05-15 | Martin Nilsson | | db->query(
|
dc5a14 | 2001-06-27 | Johan Schön | | #"create table if not exists uri (id int unsigned primary key
|
f863bc | 2001-06-10 | Per Hedbor | | auto_increment not null,
uri blob not null,
uri_md5 varchar(32) binary not null,
|
3452ab | 2001-05-31 | Johan Schön | | UNIQUE(uri_md5))"
|
098208 | 2001-01-05 | Martin Nilsson | | );
db->query(
|
dc5a14 | 2001-06-27 | Johan Schön | | #"create table if not exists document (id int unsigned primary key
|
f863bc | 2001-06-10 | Per Hedbor | | auto_increment not null,
|
ec7cbe | 2001-03-18 | Johan Schön | | uri_id int unsigned not null,
|
2a13d1 | 2001-08-31 | Johan Schön | | language varchar(255) default null,
|
32976a | 2001-07-02 | Johan Schön | | INDEX index_language (language),
|
ec7cbe | 2001-03-18 | Johan Schön | | INDEX index_uri_id (uri_id))"
|
16a29b | 2001-09-26 | Johan Schön | | );
|
352471 | 2015-05-26 | Martin Nilsson | |
|
193fb0 | 2011-02-01 | Henrik Grubbström (Grubba) | | db->query("create table if not exists deleted_document (doc_id int unsigned not null primary key)");
|
081308 | 2001-05-17 | Johan Schön | |
|
40a44d | 2004-08-07 | Johan Schön | |
|
352471 | 2015-05-26 | Martin Nilsson | |
|
081308 | 2001-05-17 | Johan Schön | | db->query(
|
cf68c3 | 2007-11-07 | Marcus Wellhardh | | #"create table if not exists word_hit (word varchar("+DB_MAX_WORD_SIZE+#") binary not null,
|
4005e5 | 2013-06-14 | Jonas Walldén | | first_doc_id int not null, " +
(use_padded_blobs ? #"
used_len int not null,
real_len int not null, " : "") + #"
|
feb96a | 2001-05-31 | Johan Schön | | hits mediumblob not null,
|
40a44d | 2004-08-07 | Johan Schön | | primary key (word,first_doc_id))");
|
352471 | 2015-05-26 | Martin Nilsson | |
|
// Schema migration for a word_hit table that already existed: DESCRIBE
// returns a row only if the used_len column is present. Columns are
// added (and initialised to the current blob length) or dropped (after
// trimming each blob to its used length) to match use_padded_blobs.
4005e5 | 2013-06-14 | Jonas Walldén | | int has_padded_blobs_fields =
sizeof(db->query("DESCRIBE word_hit used_len"));
if (use_padded_blobs && !has_padded_blobs_fields) {
werror("Search: Upgrading '%s.word_hit' table to support padded blobs.\n",
(host / "/")[-1]);
db->query("ALTER TABLE word_hit "
" ADD COLUMN used_len INT NOT NULL "
" AFTER first_doc_id, "
" ADD COLUMN real_len INT NOT NULL "
" AFTER used_len");
db->query("UPDATE word_hit "
" SET used_len = LENGTH(hits), real_len = LENGTH(hits)");
} else if (!use_padded_blobs && has_padded_blobs_fields) {
werror("Search: Downgrading '%s.word_hit' table to remove padded blobs.\n",
(host / "/")[-1]);
db->query("UPDATE word_hit "
" SET hits = LEFT(hits, used_len) "
" WHERE used_len < real_len");
db->query("ALTER TABLE word_hit "
" DROP COLUMN used_len, "
" DROP COLUMN real_len");
}
|
feb96a | 2001-05-31 | Johan Schön | |
|
// Remaining auxiliary tables.
40a44d | 2004-08-07 | Johan Schön | | db->query(
#"create table if not exists lastmodified(doc_id int not null primary key,
at int not null,
index index_at(at))");
db->query(
#"create table if not exists link(from_id int not null,
to_id int not null,
index index_from(from_id),
index index_to(to_id))");
|
352471 | 2015-05-26 | Martin Nilsson | |
|
feb96a | 2001-05-31 | Johan Schön | | db->query(
|
dc5a14 | 2001-06-27 | Johan Schön | | #"create table if not exists metadata (doc_id int not null,
|
feb96a | 2001-05-31 | Johan Schön | | name varchar(32) not null,
value mediumblob not null,
|
f70d32 | 2001-09-26 | Johan Schön | | index index_doc_id(doc_id))");
|
ec7cbe | 2001-03-18 | Johan Schön | |
db->query(
|
dc5a14 | 2001-06-27 | Johan Schön | | #"create table if not exists field (id tinyint unsigned primary key not null,
|
b3b480 | 2001-03-19 | Johan Schön | | name varchar(127) not null,
|
733b7b | 2001-06-26 | Johan Schön | | UNIQUE(name))");
|
83c6bd | 2001-06-23 | Johan Schön | |
|
4c5a2f | 2000-05-15 | Martin Nilsson | | }
|
//! Empties the word_hit, uri, document, deleted_document, metadata and
//! lastmodified tables.
// NOTE(review): the link and field tables are left untouched and the
// in-memory caches are not reset here; confirm that this is intended.
void clear()
{
  Sql.Sql db = get_db();

  foreach(({ "word_hit", "uri", "document",
             "deleted_document", "metadata", "lastmodified" }),
          string table)
    db->query("TRUNCATE " + table);
}
|
4c5a2f | 2000-05-15 | Martin Nilsson | |
|
40a44d | 2004-08-07 | Johan Schön | |
|
// Lists the full paths of all "mergefile*.dat" files found directly in
// mergefile_path. An unreadable directory yields an empty array.
protected array(string) get_mergefiles()
{
  array(string) entries = get_dir(mergefile_path) || ({ });
  array(string) names = glob("mergefile*.dat", entries);

  return map(names,
             lambda(string name) {
               return combine_path(mergefile_path, name);
             });
}
|
// Returns the hexadecimal MD5 digest of the UTF-8 encoding of url.
// The old Crypto.md5 API is used when it is available, otherwise
// Crypto.MD5.
protected string to_md5(string url)
{
  string encoded = string_to_utf8(url);
#if constant(Crypto.md5) && constant(Crypto.string_to_hex)
  return Crypto.string_to_hex( Crypto.md5()->update( encoded )->digest() );
#else
  return String.string2hex( Crypto.MD5.hash( encoded ) );
#endif
}
|
40a44d | 2004-08-07 | Johan Schön | |
|
139422 | 2000-11-10 | Johan Schön | |
|
//! Looks up the id of @[uri] in the uri table via its MD5 digest.
//!
//! @param do_not_create
//!   When set, 0 is returned for an unknown URI instead of inserting it.
//! @returns
//!   The id of the existing or newly inserted row, or 0.
int get_uri_id(string uri, void|int do_not_create)
{
  Sql.Sql db = get_db();

  array rows =
    db->query(sprintf("select id from uri where uri_md5='%s'", to_md5(uri)));
  if (sizeof(rows))
    return (int)rows[0]->id;

  if (do_not_create)
    return 0;

  // Unknown URI: store it UTF-8 encoded together with its digest.
  db->query("insert into uri (uri,uri_md5) "
            "values (%s,%s)",
            string_to_utf8( uri ), to_md5(uri));
  return db->insert_id();
}
|
4c5a2f | 2000-05-15 | Martin Nilsson | |
|
//! Returns the id of the document for @[uri] and, if given, @[language].
//!
//! @param language
//!   Optional language; when omitted any document for the URI matches.
//! @param do_not_create
//!   When set, neither a uri row nor a document row is created and 0 is
//!   returned if the document does not exist.
//! @returns
//!   The id of the existing or newly created document, or 0.
int get_document_id(string uri, void|string language, void|int do_not_create)
{
  int uri_id = get_uri_id(uri, do_not_create);
  if (!uri_id)
    return 0;

  Sql.Sql db = get_db();

  string s = sprintf("select id from document where "
                     "uri_id='%d'", uri_id);
  if (language)
    s += sprintf(" and language='%s'", db->quote(language));

  array a = db->query(s);
  if (sizeof(a))
    return (int)a[0]->id;

  // Honour do_not_create for the document row too. Previously a known
  // URI without a document got a document row inserted even when the
  // caller (e.g. remove_metadata() or remove_links()) asked for a pure
  // lookup.
  if (do_not_create)
    return 0;

  db->query("insert into document (uri_id, language) "
            "values (%d,"+(language?"%s":"NULL")+")",
            uri_id, language);
  return db->insert_id();
}
|
4c5a2f | 2000-05-15 | Martin Nilsson | |
|
//! Looks up URI and language for one or several documents.
//!
//! @param doc_id
//!   A single document id or an array of ids.
//! @returns
//!   For an array: a mapping from document id to the result row (with
//!   the fields id, language and uri); an empty array gives an empty
//!   mapping. For a single id: a mapping with the "uri" and "language"
//!   fields, or 0 if the document is unknown.
mapping|zero get_uri_and_language(int|array(int) doc_id)
{
  if (arrayp(doc_id))
  {
    // "IN ()" is not valid SQL, so answer an empty request directly.
    if (!sizeof(doc_id))
      return ([]);

    Sql.Sql db = get_db();
    // Casting through int guarantees only digits reach the query text.
    array(string) id_list = (array(string))(array(int))doc_id;
    array a = db->query("select document.id,document.language, uri.uri from document, uri "
                        "where uri.id=document.uri_id and document.id IN ("+
                        id_list*","+")");
    return mkmapping( (array(int))a->id, a );
  }

  Sql.Sql db = get_db();
  array a = db->query("select document.language,uri.uri from document,uri "
                      "where uri.id=document.uri_id and document.id=%d", doc_id);
  if (!sizeof(a))
    return 0;

  // Keep only the two fields of interest from the result row.
  return (["uri":1,"language":1]) & a[0];
}
|
//! Deletes the uri table row whose digest matches @[uri].
void remove_uri(string|Standards.URI uri)
{
  string digest = to_md5((string)uri);
  Sql.Sql db = get_db();
  db->query("delete from uri where uri_md5=%s", digest);
}
//! Deletes every uri table row whose URI starts with @[uri].
void remove_uri_prefix(string|Standards.URI uri)
{
  Sql.Sql db = get_db();

  // URIs are stored UTF-8 encoded (see get_uri_id()), so the prefix
  // must be encoded the same way to match.
  string prefix = string_to_utf8((string)uri);

  // Escape the LIKE metacharacters so that e.g. "_" in the prefix only
  // matches a literal underscore and not any character.
  string pattern = replace(prefix,
                           ({ "\\",   "%",   "_"   }),
                           ({ "\\\\", "\\%", "\\_" })) + "%";

  db->query("delete from uri where uri like %s", pattern);
}
|
335c2d | 2013-02-06 | Jonas Walldén | | #ifdef SEARCH_DEBUG
|
// Debug-only state, compiled in when SEARCH_DEBUG is defined.
// docs is bumped by remove_document() and remove_document_prefix().
// blobs_dirty is set by insert_words(); _destruct() warns if it is set.
ff1796 | 2014-08-15 | Martin Nilsson | | protected int docs;
protected int blobs_dirty;
|
335c2d | 2013-02-06 | Jonas Walldén | | #endif
|
40a44d | 2004-08-07 | Johan Schön | |
//! Removes the documents belonging to @[uri] and records their ids in
//! the deleted_document table.
//!
//! @param language
//!   When given, only documents in that language or without a language
//!   are removed; otherwise all documents for the URI are removed.
void remove_document(string|Standards.URI uri, void|string language)
{
#ifdef SEARCH_DEBUG
  docs++;
#endif

  int uri_id = get_uri_id((string)uri, 1);
  if (!uri_id)
    return;

  Sql.Sql db = get_db();
  array rows = language
    ? db->query("select id from document where uri_id=%d and "
                "(language=%s OR language IS NULL)", uri_id, language)
    : db->query("select id from document where uri_id=%d", uri_id);
  if (!sizeof(rows))
    return;

  array doc_ids = rows->id;
  db->query("delete from document where id in (" + doc_ids * "," + ")");
  db->query("insert into deleted_document (doc_id) values " +
            "(" + doc_ids * "),(" + ")");
}
|
//! Removes every document whose URI starts with @[uri] and records the
//! removed ids in the deleted_document table.
void remove_document_prefix(string|Standards.URI uri)
{
  Sql.Sql db = get_db();

  // uri may be a Standards.URI object, so it must be cast before it is
  // used as text. Stored URIs are UTF-8 encoded (see get_uri_id()).
  string prefix = string_to_utf8((string)uri);

  // Escape the LIKE metacharacters so that "%" and "_" in the prefix
  // match themselves only.
  string pattern = replace(prefix,
                           ({ "\\",   "%",   "_"   }),
                           ({ "\\\\", "\\%", "\\_" })) + "%";

  array a =
    db->query("SELECT document.id AS id"
              "  FROM document, uri "
              " WHERE document.uri_id=uri.id "
              "   AND uri.uri like %s", pattern);
  if (!sizeof(a))
    return;

  array ids = a->id;
#ifdef SEARCH_DEBUG
  docs += sizeof(ids);
#endif

  db->query("DELETE FROM document "
            " WHERE id IN (" + (ids * ",") + ")");
  db->query("INSERT INTO deleted_document "
            "(doc_id) VALUES (" + (ids * "),(") + ")");
}
|
// Cached result of get_deleted_documents() together with the max id
// and row count of deleted_document at the time it was built.
protected Search.ResultSet deleted_documents = Search.ResultSet();
protected int deleted_max, deleted_count;

//! Returns the ids in the deleted_document table as a result set. The
//! set is rebuilt only when the table's max id or row count differs
//! from the values seen when the cached set was built.
Search.ResultSet get_deleted_documents()
{
  Sql.Sql db = get_db();

  mapping stats =
    db->query("select max(doc_id) as m, count(*) as c from deleted_document")[0];
  int max_id = (int)stats->m;
  int count = (int)stats->c;

  if (max_id != deleted_max || count != deleted_count)
  {
    array ids = (array(int))db->query("select doc_id from deleted_document "
                                      "order by doc_id")->doc_id;
    deleted_count = count;
    deleted_max = max_id;
    deleted_documents = Search.ResultSet(ids);
  }
  return deleted_documents;
}
|
26c341 | 2008-03-26 | Jonas Wallden | |
//! Returns a result set with the ids of all rows in the document
//! table, in ascending id order.
Search.ResultSet get_all_documents()
{
  Sql.Sql db = get_db();
  array rows = db->query("SELECT id FROM document ORDER BY id");
  return Search.ResultSet((array(int)) rows->id);
}
|
40a44d | 2004-08-07 | Johan Schön | |
|
// Cache for list_fields(); reset to 0 whenever the field table changes.
protected mapping(string:int) list_fields_cache;

// Allocates ids for the built-in fields ("uri", "path1", "path2") and
// the filter fields, once per object. init_done is raised before the
// allocation loop because allocate_field_id() calls back into this
// function while the flag is unset.
protected void init_fields()
{
  if (!init_done)
  {
    init_done = 1;
    array(string) builtin =
      ({"uri","path1", "path2"}) + Search.get_filter_fields();
    foreach(builtin, string field)
      allocate_field_id(field);
  }
}
|
//! Returns a mapping from field name to field id for all rows in the
//! field table, plus the implicit "body" field with id 0. The result
//! is cached until the field table is modified.
mapping(string:int) list_fields()
{
  if (!list_fields_cache)
  {
    init_fields();

    Sql.Sql db = get_db();
    array rows = db->query("select name,id from field");
    rows += ({ (["name":"body",
                 "id": "0"]) });
    list_fields_cache = mkmapping(rows->name, (array(int))rows->id);
  }
  return list_fields_cache;
}
|
// Returns the id for the given field name, allocating one if needed.
// "body" always maps to 0. An existing row in the field table is
// reused; otherwise the lowest unused id in the range 1..63 is taken.
// Returns -1 when all ids in that range are in use.
693cb1 | 2001-06-23 | Johan Schön | | int allocate_field_id(string field)
{
|
f3735c | 2001-06-23 | Johan Schön | | if(!init_done)
init_fields();
|
83c6bd | 2001-06-23 | Johan Schön | | if(field=="body")
return 0;
|
0f020c | 2015-10-07 | Henrik Grubbström (Grubba) | | Sql.Sql db = get_db();
|
51a5c0 | 2001-08-09 | Johan Schön | | array a =db->query("select id from field where name=%s", field);
if(sizeof(a))
return (int)a[0]->id;
|
// The table is write-locked while a free id is searched for and
// claimed. On success the lock is released inside the loop, right
// before returning the new id, and list_fields_cache is invalidated.
f3735c | 2001-06-23 | Johan Schön | | db->query("lock tables field write");
|
2af737 | 2005-06-02 | Martin Stjernholm | | mixed err = catch {
for(int i=1; i<64; i++)
{
array a=db->query("select name from field where id=%d",i);
if(!sizeof(a))
{
a=db->query("replace into field (id,name) values (%d,%s)",
i, field);
list_fields_cache=0;
db->query("unlock tables");
return i;
}
}
};
// Reached only on error or when no id was free: always try to unlock,
// then rethrow the original error in preference to an unlock error.
mixed unlock_err = catch (db->query("unlock tables"));
if (err) throw (err);
if (unlock_err) throw (unlock_err);
|
32976a | 2001-07-02 | Johan Schön | | return -1;
|
693cb1 | 2001-06-23 | Johan Schön | | }
|
// Field name to id cache filled by get_field_id().
protected mapping field_cache = ([]);

//! Returns the id of @[field]. "body" is always 0.
//!
//! @param do_not_create
//!   When set, -1 is returned for an unknown field instead of
//!   allocating a new id.
int get_field_id(string field, void|int do_not_create)
{
  if (field == "body")
    return 0;

  mixed cached = field_cache[field];
  if (cached)
    return cached;

  init_fields();

  Sql.Sql db = get_db();
  array rows =
    db->query(sprintf("select id from field where name='%s'",db->quote(field)));

  if (!sizeof(rows))
    return do_not_create ? -1 : allocate_field_id(field);

  int id = (int)rows[0]->id;
  field_cache[field] = id;
  return id;
}
|
//! Deletes @[field] from the field table and drops it from both
//! in-memory field caches.
void remove_field(string field)
{
  init_fields();

  // Invalidate cached knowledge about the field before touching the DB.
  list_fields_cache = 0;
  m_delete(field_cache, field);

  Sql.Sql db = get_db();
  db->query("delete from field where name=%s", field);
}
|
//! Like remove_field(), but leaves the built-in fields ("uri",
//! "path1", "path2") and the filter fields alone.
void safe_remove_field(string field)
{
  array(string) reserved =
    ({"uri","path1","path2"}) + Search.get_filter_fields();

  if (search(reserved, field) != -1)
    return;

  remove_field( field );
}
|
40a44d | 2004-08-07 | Johan Schön | |
|
8758c9 | 2001-05-25 | Johan Schön | |
|
// In-memory accumulator for word hits that have not been written yet.
protected _WhiteFish.Blobs blobs = _WhiteFish.Blobs();

// Size limit for the accumulator, compared against blobs->memsize()
// in insert_words().
#define MAXMEM 64*1024*1024

//! Adds @[words] found in @[field] of the document given by @[uri] and
//! @[language] to the pending word blobs. Words whose UTF-8 encoding is
//! longer than DB_MAX_WORD_SIZE are dropped. When the accumulated blobs
//! exceed MAXMEM they are flushed, using mergefile_sync() if the
//! mergefiles option is set and sync() otherwise.
void insert_words(Standards.URI|string uri, void|string language,
                  string field, array(string) words)
{
  array(string) storable =
    filter(words, lambda (string word) {
                    return sizeof(string_to_utf8(word)) <= DB_MAX_WORD_SIZE;
                  });
  if (!sizeof(storable))
    return;

  init_fields();

  int doc_id = get_document_id((string)uri, language);
  int field_id = get_field_id(field);

  blobs->add_words( doc_id, storable, field_id );
#ifdef SEARCH_DEBUG
  blobs_dirty = 1;
#endif

  if (blobs->memsize() > MAXMEM)
  {
    if (options->mergefiles)
      mergefile_sync();
    else
      sync();
  }
}
|
86f888 | 2001-03-28 | Johan Schön | |
|
//! Returns the indexed words matching the glob @[g], where "*" matches
//! any sequence of characters and "?" any single character.
//!
//! @param max_hits
//!   When non-zero, at most this many words are returned. Words are
//!   then ordered by the position of the first literal part of the glob
//!   within the word and alphabetically, or just alphabetically when
//!   the glob has no literal part.
array(string) expand_word_glob(string g, void|int max_hits)
{
  // Translate the glob to a LIKE pattern in a single pass. Characters
  // that are special to LIKE but literal in the glob are escaped first
  // so that e.g. "_" in the glob does not match arbitrary characters.
  string g_sql = replace(string_to_utf8(g),
                         ({ "\\",   "%",   "_",   "*", "?" }),
                         ({ "\\\\", "\\%", "\\_", "%", "_" }));

  Sql.Sql db = get_db();

  if (max_hits) {
    array(string) non_glob_words = (replace(g, "?", "*") / "*" - ({ "" }));
    if (sizeof(non_glob_words)) {
      // Rank by where the first literal fragment occurs in the word.
      string first_word_sql = string_to_utf8(non_glob_words[0]);
      return map(db->query("SELECT DISTINCT word, "
                           "       LOCATE(%s, word) AS score "
                           "  FROM word_hit "
                           " WHERE word LIKE %s "
                           " ORDER BY score ASC, word ASC "
                           " LIMIT %d",
                           first_word_sql, g_sql, max_hits)->word,
                 utf8_to_string);
    }
    return map(db->query("SELECT DISTINCT word "
                         "  FROM word_hit "
                         " WHERE word LIKE %s "
                         " ORDER BY word ASC "
                         " LIMIT %d",
                         g_sql, max_hits)->word,
               utf8_to_string);
  }

  return map(db->query("SELECT DISTINCT word "
                       "  FROM word_hit "
                       " WHERE word LIKE %s",
                       g_sql)->word,
             utf8_to_string);
}
|
4005e5 | 2013-06-14 | Jonas Walldén | |
//! Returns the padded storage size for a blob holding @[used_len]
//! bytes: a quarter of the used length is added as headroom, the sum is
//! raised to the next multiple of 64 above it, and the result is capped
//! at max_blob_size.
int get_padded_blob_length(int used_len)
{
  int with_headroom = used_len + (used_len >> 2);
  int padded = (with_headroom | 63) + 1;
  return min(padded, max_blob_size);
}
|
// Number of word_hit rows fetched per database round trip in get_blob().
protected int blobs_per_select = 40;

//! Returns hit blob number @[num] (ordered by first_doc_id) for @[word],
//! or 0 when there is no such blob.
//!
//! @param blobcache
//!   Optional cache shared between calls. Up to blobs_per_select blobs
//!   are fetched at a time and stored under the UTF-8 encoded word,
//!   keyed by blob number; the key -1 marks that the last blob of the
//!   word has been fetched.
string|zero get_blob(string word, int num,
                     void|mapping(string:mapping(int:string)) blobcache)
{
  // The cache argument is optional; without this guard the lookups
  // below would index a zero value when it is left out.
  if (!blobcache)
    blobcache = ([]);

  word = string_to_utf8( word );
  if(blobcache[word] && blobcache[word][num])
    return blobcache[word][num];
  if( blobcache[word] && blobcache[word][-1] )
  {
    // The end marker is set and the blob is not cached: nothing more
    // to fetch for this word.
#ifdef SEARCH_DEBUG
    times[word] = 0;
#endif
    return 0;
  }
#ifdef SEARCH_DEBUG
  int t0 = gethrtime();
#endif

  int use_padded_blobs = supports_padded_blobs();

  Sql.Sql db = get_db();
  array a =
    db->query("  SELECT hits, first_doc_id " +
              (use_padded_blobs ? ", used_len, real_len " : "") +
              "    FROM word_hit "
              "   WHERE word = %s "
              "ORDER BY first_doc_id "
              "   LIMIT %d,%d",
              word, num, blobs_per_select);

#ifdef SEARCH_DEBUG
  int t1 = gethrtime()-t0;
  times[word] += t1;
  werror("word: %O  time accum: %.2f ms   delta_t: %.2f\n", word, times[word]/1000.0, t1/1000.0);
#endif

  // Replace whatever was cached for the word with the new window.
  blobcache[word] = ([]);
  if( sizeof( a ) < blobs_per_select )
    blobcache[word][-1]="";
  if(!sizeof(a))
  {
#ifdef SEARCH_DEBUG
    times[word] = 0;
#endif
    return 0;
  }

  foreach(a, mapping m) {
    if (use_padded_blobs) {
      // Strip the padding so that only the used part is handed out.
      int used_len = (int) m->used_len;
      int real_len = (int) m->real_len;
      if ((used_len < real_len) || (real_len != sizeof(m->hits)))
        m->hits = m->hits[..(used_len - 1)];
    }

    blobcache[word][num++] = m->hits;
  }

  return a[0]->hits;
}
|
40a44d | 2004-08-07 | Johan Schön | |
|
//! Deletes all metadata rows of the document given by @[uri] and
//! @[language]. The document is looked up with do_not_create set; the
//! delete is issued with whatever id that lookup returns.
void remove_metadata(Standards.URI|string uri, void|string language)
{
  int doc_id = intp(uri) ? 0 : get_document_id((string)uri, language, 1);

  Sql.Sql db = get_db();
  db->query("delete from metadata where doc_id = %d", doc_id);
}
|
// Builds an " and name IN (...)" SQL fragment restricting a metadata
// query to wanted_fields, with every name quoted. Returns the empty
// string when no fields are requested.
protected string make_fields_sql(void|array(string) wanted_fields)
{
  Sql.Sql db = get_db();

  if (!wanted_fields || !sizeof(wanted_fields))
    return "";

  array(string) quoted = map(wanted_fields, db->quote);
  return " and name IN ('" + quoted * "','" + "')";
}
//! Returns the metadata of a document as a mapping from field name to
//! decoded value.
//!
//! @param uri
//!   Either a document id, or a URI that is resolved together with
//!   @[language] through get_document_id().
//! @param wanted_fields
//!   Optional list of field names to restrict the result to.
mapping(string:string) get_metadata(int|Standards.URI|string uri,
                                    void|string language,
                                    void|array(string) wanted_fields)
{
  int doc_id;
  if (!intp(uri))
    doc_id = get_document_id((string)uri, language);
  else
    doc_id = uri;

  Sql.Sql db = get_db();
  array rows = db->query("select name,value from metadata where doc_id=%d"+
                         make_fields_sql(wanted_fields),
                         doc_id);
  mapping md = mkmapping(rows->name, rows->value);

#if constant(Gz)
  // The body is stored deflated when Gz is available.
  if (md->body)
    md->body = Gz.inflate()->inflate(md->body);
#endif

  // Every value is stored UTF-8 encoded.
  foreach(indices(md), string field)
    md[field] = utf8_to_string(md[field]);
  return md;
}
//! Returns the raw stored value of the metadata field @[wanted_field]
//! for each of the documents in @[doc_ids], as a mapping from document
//! id to value. Documents without the field are left out; an empty
//! @[doc_ids] gives an empty mapping.
mapping(int:string) get_special_metadata(array(int) doc_ids,
                                         string wanted_field)
{
  // "IN ()" is not valid SQL, so answer an empty request directly.
  if (!doc_ids || !sizeof(doc_ids))
    return ([]);

  Sql.Sql db = get_db();
  // Casting through int guarantees only digits reach the query text.
  array(string) id_list = (array(string))(array(int))doc_ids;
  array a = db->query("select doc_id,value from metadata where doc_id IN ("+
                      id_list*","+") and name = %s",
                      wanted_field);
  return mkmapping( (array(int))a->doc_id, a->value);
}
|
39fa4d | 2001-08-20 | Johan Schön | |
|
//! Stores the metadata fields in @[md] for the document given by @[uri]
//! and @[language], replacing existing rows with the same key.
//!
//! All values are stored UTF-8 encoded. The "body" field is word-split
//! and normalized first and, when Gz is available, truncated to its
//! first 64001 characters and deflated. The caller's mapping is not
//! modified.
void set_metadata(Standards.URI|string uri, void|string language,
                  mapping(string:string) md)
{
  int doc_id;
  if(!intp(uri))
    doc_id = get_document_id((string)uri, language);

  init_fields();

  // Work on a shallow copy: the encoding below would otherwise
  // overwrite the values in the mapping owned by the caller.
  md = md + ([]);

  if(md->body)
  {
    if(sizeof(md->body))
      md->body = Unicode.normalize( Unicode.split_words_and_normalize( md->body ) * " ", "C");
#if constant(Gz)
    md->body = Gz.deflate(6)->deflate(string_to_utf8(md->body[..64000]),
                                      Gz.FINISH);
#else
    // get_metadata() UTF-8 decodes every field, including the body, so
    // the body has to be encoded even when it cannot be compressed.
    md->body = string_to_utf8(md->body);
#endif
  }

  if(!sizeof(md))
    return;

  // The body has already been encoded above.
  foreach(indices(md), string ind)
    if(ind!="body")
      md[ind]=string_to_utf8(md[ind]);

  Sql.Sql db = get_db();

  // One "(doc_id,'name','value')" tuple per field, with quoted strings.
  string s=map(Array.transpose( ({ map(indices(md),db->quote),
                                   map(values(md), db->quote) }) ),
               lambda(array a)
               {
                 return sprintf("(%d,'%s','%s')", doc_id,
                                a[0], a[1]);
               }) * ", ";

  db->query("replace into metadata (doc_id, name, value) values "+s);
}
|
//! Records @[when] as the last-modified value of the document given by
//! @[uri] and @[language], replacing any previous value.
void set_lastmodified(Standards.URI|string uri,
                      void|string language,
                      int when)
{
  int doc_id = get_document_id((string)uri, language);

  get_db()->query("replace into lastmodified (doc_id, at) values (%d,%d)",
                  doc_id, when);
}
|
//! Returns the stored last-modified value of the document given by
//! @[uri] and @[language], or 0 if the document is unknown or has no
//! value recorded.
// NOTE(review): the declared type admits an array for uri, but the
// value is cast to a string as a whole; confirm what callers pass.
int get_lastmodified(Standards.URI|string|array(Standards.URI|string) uri, void|string language)
{
  // A pure lookup must not create uri/document rows as a side effect.
  int doc_id = get_document_id((string)uri, language, 1);
  if (!doc_id)
    return 0;

  Sql.Sql db = get_db();
  array q = db->query("select at from lastmodified where doc_id=%d", doc_id);
  if( sizeof( q ) )
    return (int)q[0]->at;

  return 0;
}
|
//! Assigns every document a random last-modified time within the past
//! 365 days.
void randomize_dates()
{
  Sql.Sql db = get_db();
  array(string) doc_ids = db->query("select id from document")->id;

  foreach(doc_ids, string id)
  {
    int when = random(365*24*3600)+time()-365*24*3600;
    db->query("replace into lastmodified (doc_id,at) values (%s,%d)",
              id, when);
  }
}
|
protected
{
  // Cache for get_global_dateset() and the highest lastmodified doc_id
  // it has been filled up to (-1 before the first fill).
  _WhiteFish.DateSet dateset_cache;
  int dateset_cache_max_doc_id = -1;

  // Returns the highest doc_id in the lastmodified table, or 0 when
  // the table is empty.
  int get_max_doc_id()
  {
    Sql.Sql db = get_db();
    array rows =
      db->query("select doc_id from lastmodified order by doc_id desc limit 1");
    return sizeof(rows) ? (int)rows[0]->doc_id : 0;
  }
};
//! Returns a date set with the last-modified values of all documents.
//! The set is cached and extended with rows whose doc_id is above the
//! highest one seen at the previous call.
_WhiteFish.DateSet get_global_dateset()
{
  int newest = get_max_doc_id();

  if (newest != dateset_cache_max_doc_id)
  {
    Sql.Sql db = get_db();
    array rows = db->query("select doc_id,at from lastmodified where "
                           "doc_id > %d order by doc_id asc",
                           dateset_cache_max_doc_id);
    dateset_cache_max_doc_id = newest;

    if (!dateset_cache)
      dateset_cache = _WhiteFish.DateSet();
    dateset_cache->add_many( (array(int))rows->doc_id,
                             (array(int))rows->at );
  }
  return dateset_cache;
}
|
// Cache used by get_global_publ_dateset(), and the doc_id the cache
// has been filled up to (-1 before the first fill).
ff1796 | 2014-08-15 | Martin Nilsson | | protected
|
a5b631 | 2010-01-20 | Martin Jonsson | | {
_WhiteFish.DateSet publ_dateset_cache;
int publ_dateset_cache_max_doc_id = -1;
};
//! Returns a date set built from the "publish-time" metadata of all
//! documents. The set is cached and extended with rows whose doc_id is
//! above the value recorded at the previous call; that value comes
//! from get_max_doc_id().
_WhiteFish.DateSet get_global_publ_dateset()
{
  int newest = get_max_doc_id();

  if (newest != publ_dateset_cache_max_doc_id)
  {
    Sql.Sql db = get_db();
    array(mapping(string:mixed)) rows =
      db->query("SELECT doc_id, value FROM metadata "
                " WHERE name = 'publish-time' "
                "   AND doc_id > %d ORDER BY doc_id ASC",
                publ_dateset_cache_max_doc_id);
    publ_dateset_cache_max_doc_id = newest;

    if (!publ_dateset_cache)
      publ_dateset_cache = _WhiteFish.DateSet();
    publ_dateset_cache->add_many( (array(int))rows->doc_id,
                                  (array(int))rows->value );
  }
  return publ_dateset_cache;
}
|
40a44d | 2004-08-07 | Johan Schön | |
//! Stores one row in the link table for every entry of @[links],
//! going from the document identified by @[uri] and @[language] to the
//! document identified by the link and the same @[language].
//!
//! Does nothing when @[links] is zero or empty. Document ids are
//! obtained through get_document_id(); rows are written with a single
//! "replace into" statement.
void add_links(Standards.URI|string uri,
               void|string language,
               array(Standards.URI|string) links)
{
  if (!(links && sizeof(links)))
    return;

  int from_id = get_document_id((string)uri, language);

  // Resolve every target first, then render the "(from, to)" tuples.
  array(int) target_ids = ({});
  foreach (links, Standards.URI|string target)
    target_ids += ({ get_document_id((string)target, language) });

  array(string) tuples = ({});
  foreach (target_ids, int target_id)
    tuples += ({ sprintf("(%d, %d)", from_id, target_id) });

  string statement =
    "replace into link (from_id, to_id) values " + tuples * ", ";

  get_db()->query(statement);
}
|
feb96a | 2001-05-31 | Johan Schön | |
|
//! Deletes every row of the link table whose from_id is the id of the
//! document identified by @[uri] and @[language].
void remove_links(Standards.URI|string uri,
                  void|string language)
{
  // NOTE(review): the third argument (1) presumably keeps
  // get_document_id() from creating a missing document - confirm
  // against its definition.
  int from_id = get_document_id((string)uri, language, 1);

  get_db()->query("delete from link where from_id=%d", from_id);
}
//! Placeholder: only sends a dummy query to the database and does not
//! return a value explicitly.
array(int) get_broken_links()
{
  get_db()->query("select 'Not yet done :-)'");
}
|
40a44d | 2004-08-07 | Johan Schön | |
|
// Optional callback installed with set_sync_callback(); store_to_db()
// and mergefile_sync() call it (without arguments) when it is set.
ff1796 | 2014-08-15 | Martin Nilsson | | protected function sync_callback;
|
//! Installs @[f] as the callback that store_to_db() and
//! mergefile_sync() invoke once they have written their data.
//! Passing zero removes a previously installed callback.
void set_sync_callback(function f)
{
  sync_callback = f;
}
|
86f888 | 2001-03-28 | Johan Schön | |
|
ef2c39 | 2013-06-14 | Jonas Walldén | |
// Size limit in bytes (512 KiB) for the hits blob of one word_hit row;
// store_to_db() splits blobs that would grow beyond it via split_blobs().
constant max_blob_size = 512 * 1024;
|
40a44d | 2004-08-07 | Johan Schön | |
|
//! Cuts @[blob] into consecutive chunks at entry boundaries.
//!
//! An entry is read as 4 bytes of doc_id, one count byte and two
//! bytes per counted item.
//!
//! @param blob_size
//!   Number of bytes already accounted for in the first chunk (for
//!   instance the used length of an existing row the first chunk will
//!   be appended to). Later chunks start counting from zero.
//! @param blob
//!   The data to split.
//! @param max_blob_size
//!   Size limit; a chunk stops growing once its accounted size reaches
//!   @expr{max_blob_size - 517@}.
//!
//! @returns
//!   An array of @expr{({ first_doc_id, chunk })@} pairs. A chunk may be
//!   the empty string (when @[blob_size] already reaches the limit); it
//!   then carries the doc_id read from the start of @[blob].
protected array(array(int|string)) split_blobs(int blob_size, string blob,
                                               int max_blob_size)
{
  int first_doc_id;
  sscanf(blob, "%4c", first_doc_id);

  // NOTE(review): 517 is presumably headroom for one maximal entry
  // (4 + 1 + 2*255 = 515 bytes) - the exact origin is not visible here.
  int fill_limit = max_blob_size - 517;
  int total = sizeof(blob);

  array chunks = ({});
  int chunk_start = 0, pos = 0;

  while (pos + 5 < total) {
    // Grow the current chunk entry by entry until the limit is reached
    // or no complete entry header remains.
    for (; pos + 5 < total && blob_size < fill_limit; ) {
      int entry_len = 4 + 1 + 2 * blob[pos + 4];
      pos += entry_len;
      blob_size += entry_len;
    }

    string chunk = blob[chunk_start..pos - 1];
    if (sizeof(chunk))
      sscanf(chunk, "%4c", first_doc_id);
    chunks += ({ ({ first_doc_id, chunk }) });

    chunk_start = pos;
    blob_size = 0;
  }

  return chunks;
}
|
//! Writes word/blob pairs into the word_hit table.
//!
//! @param mergedfilename
//!   When given, the pairs are read from this merge file, which is
//!   closed and removed once everything has been stored. Otherwise the
//!   pairs are read from the in-memory @expr{blobs@} object; in that
//!   case each word is UTF-8 encoded and each blob is passed through
//!   @expr{_WhiteFish.Blob()->data()@} before being stored.
//!
//! For every word the newest existing row (highest first_doc_id) is
//! extended with the new data; data that would push the row beyond
//! @expr{max_blob_size@} is split with split_blobs() and the remainder
//! goes into new rows. Words without any existing row are collected
//! into a multi-row INSERT that is flushed periodically.
//!
//! The word_hit table is write-locked while rows are written. The lock
//! is released and re-taken every 32 words. Errors thrown while writing
//! are rethrown after an UNLOCK TABLES has been attempted.
protected void store_to_db( void|string mergedfilename )
{
  Search.MergeFile mergedfile;

  if(mergedfilename)
    mergedfile = Search.MergeFile(Stdio.File(mergedfilename, "r"));

  // Selects between the padded layout (used_len/real_len columns) and
  // the old-style layout (hits column only).
  int use_padded_blobs = supports_padded_blobs();

  int s = time();
  int q;
  Sql.Sql db = get_db();
#ifdef SEARCH_DEBUG
  werror("----------- sync() %4d docs --------------\n", docs);
#endif
  db->query("LOCK TABLES word_hit LOW_PRIORITY WRITE");

  mixed err = catch {
    // Accumulates one multi-row INSERT for words that have no row yet.
    String.Buffer multi_query = String.Buffer();

    do
    {
      string word, blob;
      if(mergedfilename)
      {
        array a = mergedfile->get_next_word_blob();
        if( !a )
          break;
        [word, blob] = a;
      }
      else
      {
        [word, blob] = blobs->read();
        if(!word)
          break;
        word = string_to_utf8(word);
        blob = _WhiteFish.Blob(blob)->data();
      }

      q++;

      // Every 32 words: flush the pending INSERT and briefly drop the
      // table lock before taking it again.
      if (q % 32 == 0) {
        if( sizeof( multi_query ) )
          db->query( multi_query->get());
        db->query("UNLOCK TABLES");
        db->query("LOCK TABLES word_hit LOW_PRIORITY WRITE");
      }

      // Inserts new rows in the padded layout. All rows but the last are
      // stored unpadded; the last one is padded with spaces up to
      // get_padded_blob_length().
      void add_padded_blobs(string word, array new_blobs)
      {
        foreach (new_blobs[..<1], array new_blob_pair) {
          [int first_doc_id, string blob] = new_blob_pair;
          int new_used_len = sizeof(blob);
          db->query("INSERT INTO word_hit "
                    " (word, first_doc_id, used_len, real_len, hits)"
                    " VALUES (%s, %d, %d, %d, %s)",
                    word, first_doc_id, new_used_len, new_used_len, blob);
        }

        [int first_doc_id, string blob] = new_blobs[-1];
        int new_used_len = sizeof(blob);
        int new_real_len = get_padded_blob_length(new_used_len);
        int space_count = new_real_len - new_used_len;
        db->query("INSERT INTO word_hit "
                  " (word, first_doc_id, used_len, real_len, hits)"
                  " VALUES (%s, %d, %d, %d, CONCAT(%s, SPACE(%d)))",
                  word, first_doc_id, new_used_len, new_real_len,
                  blob, space_count);
#ifdef SEARCH_DB_CONSISTENCY_CHECKS
        array new = db->query("SELECT real_len, LENGTH(hits) AS actual_len "
                              " FROM word_hit "
                              " WHERE word = %s "
                              " AND first_doc_id = %d",
                              word, first_doc_id);
        if (!sizeof(new)) {
          werror("Search.Database: Added blob not in db!\n");
        } else if (new_real_len != (int)new[0]->actual_len) {
          werror("Search.Database: Added blob has different real_len: %d != %d\n",
                 new_real_len, (int)new[0]->actual_len);
        }
#endif
      };

      // Inserts new rows in the old-style layout (no length columns).
      void add_oldstyle_blobs(string word, array new_blobs)
      {
        foreach (new_blobs, array new_blob_pair) {
          [int first_doc_id, string blob] = new_blob_pair;
          db->query("INSERT INTO word_hit "
                    " (word, first_doc_id, hits)"
                    " VALUES (%s, %d, %s)",
                    word, first_doc_id, blob);
        }
      };

#ifdef SEARCH_DB_CONSISTENCY_CHECKS
      string consistency_log = "";
#define CONSISTENCY_LOG(X ...) do { \
    consistency_log += sprintf(X); \
  } while(0)
#else
#define CONSISTENCY_LOG(X ...)
#endif

      int first_doc_id;

      // Look up the newest row for this word. In old-style mode both
      // lengths are derived from the actual blob length.
      array old;
      if (use_padded_blobs) {
        old = db->query(" SELECT first_doc_id, used_len, real_len "
#ifdef SEARCH_DB_CONSISTENCY_CHECKS
                        " , LENGTH(hits) AS actual_len "
#endif
                        " FROM word_hit "
                        " WHERE word=%s "
                        "ORDER BY first_doc_id DESC "
                        " LIMIT 1", word);
      } else {
        old = db->query(" SELECT first_doc_id, LENGTH(hits) AS used_len, "
                        " LENGTH(hits) AS real_len "
#ifdef SEARCH_DB_CONSISTENCY_CHECKS
                        " , LENGTH(hits) AS actual_len "
#endif
                        " FROM word_hit "
                        " WHERE word=%s "
                        "ORDER BY first_doc_id DESC "
                        " LIMIT 1", word);
      }

      if (sizeof(old)) {
        // The word already has a row: extend it.
        int used_len = (int) old[0]->used_len;
        int real_len = (int) old[0]->real_len;
        int first_doc_id = (int) old[0]->first_doc_id;
        int new_used_len = used_len + sizeof(blob);
        int new_real_len = new_used_len;

        array new_blobs = ({ blob });
#ifdef SEARCH_DB_CONSISTENCY_CHECKS
        if (real_len != (int)old[0]->actual_len) {
          werror("Search.Database: Broken accounting for old word %O: %d != %d\n",
                 word, real_len, (int)old[0]->actual_len);
          CONSISTENCY_LOG("Broken accounting for old word %O: %d != %d\n",
                          word, real_len, (int)old[0]->actual_len);
        }
#endif
        if (new_used_len > max_blob_size) {
          // Too large for the existing row: only the first chunk is
          // appended to it, the remaining chunks become new rows below.
          new_blobs = split_blobs(used_len, blob, max_blob_size);
          CONSISTENCY_LOG("Splitting old %d byte blob into %d bytes.\n",
                          sizeof(blob), sizeof(new_blobs[0][1]));
          blob = new_blobs[0][1];
          new_used_len = used_len + sizeof(blob);
          new_real_len = new_used_len;
        } else if (use_padded_blobs) {
          new_real_len = get_padded_blob_length(new_used_len);
        }

        if (new_real_len != real_len) {
          CONSISTENCY_LOG("Old (%d bytes) and new real_len (%d bytes) differ.\n",
                          real_len, new_real_len);
          if (use_padded_blobs) {
            // Grow the stored blob with spaces (or, when it shrinks,
            // widen the replaced region so the old tail is cut off) and
            // write the new data right after the used part.
            int space_count = new_real_len - real_len;
            int repl_size = sizeof(blob);
            if (space_count < 0) {
              repl_size -= space_count;
              space_count = 0;
              CONSISTENCY_LOG("Truncating old hits by %d bytes.\n",
                              repl_size - sizeof(blob));
            }

            db->query("UPDATE word_hit "
                      " SET hits = INSERT(CONCAT(hits, SPACE(%d)), %d, %d, %s),"
                      " used_len = %d, "
                      " real_len = %d "
                      " WHERE word = %s "
                      " AND first_doc_id = %d",
                      space_count, used_len + 1, repl_size, blob,
                      new_used_len,
                      new_real_len,
                      word, first_doc_id);
            CONSISTENCY_LOG("Updating used_len %d ==> %d and real_len %d ==> %d.\n",
                            used_len, new_used_len, real_len, new_real_len);
          } else {
            db->query("UPDATE word_hit "
                      " SET hits = CONCAT(hits, %s) "
                      " WHERE word = %s "
                      " AND first_doc_id = %d",
                      blob, word, first_doc_id);
          }
        } else if (use_padded_blobs) {
          // The new data fits into the existing padding: overwrite it in
          // place.
          //
          // This branch is restricted to the padded layout. In old-style
          // mode real_len equals used_len, so equal lengths mean there is
          // nothing to append, and no other old-style statement in this
          // function refers to a used_len column.
          db->query(" UPDATE word_hit "
                    " SET hits = INSERT(hits, %d, %d, %s), "
                    " used_len = %d "
                    " WHERE word = %s "
                    " AND first_doc_id = %d",
                    used_len + 1, sizeof(blob), blob,
                    used_len + sizeof(blob),
                    word, first_doc_id);
          CONSISTENCY_LOG("Updating in place (real_len: %d). used_len %d ==> %d\n",
                          real_len, used_len, used_len + sizeof(blob));
        }

#ifdef SEARCH_DB_CONSISTENCY_CHECKS
        if (use_padded_blobs) {
          // Re-read the row that was just updated (table name fixed:
          // word_hit, not work_hit).
          array new = db->query("SELECT used_len, real_len, "
                                " LENGTH(hits) AS actual_len "
                                " FROM word_hit "
                                " WHERE word = %s "
                                " AND first_doc_id = %d",
                                word, first_doc_id);
          if (!sizeof(new)) {
            werror("Search.Database: Lost track of word %O!\n", word);
            werror("Log:\n%s\n", consistency_log);
          } else if (new[0]->real_len != new[0]->actual_len) {
            werror("Search.Database: Broken accounting for new word %O: %d != %d\n",
                   word, (int)new[0]->real_len, (int)new[0]->actual_len);
            werror("Log:\n%s\n", consistency_log);
          }
        }
#endif
        // Chunks left over from a split go into rows of their own.
        if (sizeof(new_blobs) > 1) {
          if (use_padded_blobs)
            add_padded_blobs(word, new_blobs[1..]);
          else
            add_oldstyle_blobs(word, new_blobs[1..]);
        }
      } else {
        // First occurrence of this word.
        if (sizeof(blob) > max_blob_size) {
          array new_blobs = split_blobs(0, blob, max_blob_size);
          if (use_padded_blobs)
            add_padded_blobs(word, new_blobs);
          else
            add_oldstyle_blobs(word, new_blobs);
        } else {
          sscanf(blob, "%4c", first_doc_id);
          string new_query;
          if (use_padded_blobs) {
            int new_used_len = sizeof(blob);
            int new_real_len = get_padded_blob_length(new_used_len);
            int space_count = new_real_len - new_used_len;
            new_query =
              sprintf("('%s', %d, %d, %d, CONCAT('%s', SPACE(%d)))",
                      db->quote(word), first_doc_id,
                      new_used_len, new_real_len,
                      db->quote(blob), space_count);
          } else {
            new_query =
              sprintf("('%s', %d, '%s')",
                      db->quote(word), first_doc_id, db->quote(blob));
          }

          // Flush before the accumulated statement grows past 900 KiB.
          if (sizeof(multi_query) + sizeof(new_query) > 900 * 1024)
            db->query(multi_query->get());

          if (!sizeof(multi_query)) {
            multi_query->add("INSERT INTO word_hit ",
                             (use_padded_blobs ?
                              " (word, first_doc_id, used_len, real_len, hits) " :
                              " (word, first_doc_id, hits) "),
                             "VALUES ",
                             new_query);
          } else {
            multi_query->add(",", new_query);
          }
        }
      }
    } while( 1 );

    // Write whatever is still pending.
    if( sizeof( multi_query ) )
      db->query( multi_query->get());
  };

  // Always try to release the lock; the original error takes precedence.
  mixed unlock_err = catch (db->query("UNLOCK TABLES"));
  if (err) throw (err);
  if (unlock_err) throw (unlock_err);

  if( sync_callback )
    sync_callback();

  if(mergedfilename)
  {
    mergedfile->close();
    rm(mergedfilename);
  }
#ifdef SEARCH_DEBUG
  werror("----------- sync() done %3ds %5dw -------\n", time()-s,q);
#endif

#ifdef SEARCH_DEBUG
  blobs_dirty = 0;
#endif
}
|
//! Returns the path of the merge file for the current value of
//! @expr{mergefile_counter@}: "mergefileNNN.dat" (counter zero-padded
//! to three digits) below @expr{mergefile_path@}.
protected string get_mergefilename()
{
  string basename = sprintf("mergefile%03d.dat", mergefile_counter);
  return combine_path(mergefile_path, basename);
}
|
//! Writes the in-memory @expr{blobs@} to a new merge file named by
//! get_mergefilename(), calls the sync callback if one is installed,
//! then advances @expr{mergefile_counter@} and replaces @expr{blobs@}
//! with a fresh @expr{_WhiteFish.Blobs@} object.
protected void mergefile_sync()
{
#ifdef SEARCH_DEBUG
  System.Timer t = System.Timer();
  werror("----------- mergefile_sync() %4d docs --------------\n", docs);
#endif
  // "wct": open for writing, create if missing, truncate if present.
  Stdio.File out_file = Stdio.File(get_mergefilename(), "wct");
  Search.MergeFile mergefile = Search.MergeFile(out_file);

  mergefile->write_blobs(blobs);

  if (sync_callback)
    sync_callback();
#ifdef SEARCH_DEBUG
  werror("----------- mergefile_sync() done %.3f s %2.1f MB -------\n",
         t->get(),
         file_stat(get_mergefilename())->size/(1024.0*1024.0));
#endif

  // The next merge file gets a new name; start collecting from scratch.
  mergefile_counter++;
  blobs = _WhiteFish.Blobs();
}
|
//! Merges the given merge files into one and returns its file name.
//!
//! A single file is returned unchanged. More than two files are split
//! in half, each half is merged recursively and the two results are
//! merged. A pair is merged into a new file named by
//! get_mergefilename(), after which both input files are removed.
protected string merge_mergefiles(array(string) mergefiles)
{
#ifdef SEARCH_DEBUG
  werror("merge_mergefiles( %s )\n", mergefiles*", ");
#endif
  int file_count = sizeof(mergefiles);

  if (file_count == 1)
    return mergefiles[0];

  if (file_count > 2) {
    // Divide and conquer: reduce each half to one file, then merge those.
    int middle = file_count / 2;
    string lower_half = merge_mergefiles(mergefiles[..middle-1]);
    string upper_half = merge_mergefiles(mergefiles[middle..]);
    return merge_mergefiles( ({ lower_half, upper_half }) );
  }

  // Pairwise merge into a freshly named file.
  string target_fn = get_mergefilename();
  mergefile_counter++;
  Search.MergeFile target =
    Search.MergeFile(Stdio.File(target_fn, "wct"));
  System.Timer t = System.Timer();

  Search.MergeFile first_input =
    Search.MergeFile(Stdio.File(mergefiles[0], "r"));
  Search.MergeFile second_input =
    Search.MergeFile(Stdio.File(mergefiles[1], "r"));
  target->merge_mergefiles(first_input, second_input);
#ifdef SEARCH_DEBUG
  werror("Merging %s (%.1f MB) took %.1f s\n",
         target_fn, file_stat(target_fn)->size/(1024.0*1024.0),
         t->get());
#endif
  // The inputs are now contained in the merged file.
  rm(mergefiles[0]);
  rm(mergefiles[1]);

  return target_fn;
}
|
//! Stores the collected word data in the database.
//!
//! With the @expr{mergefiles@} option set, the in-memory blobs are
//! first written to a merge file, all merge files are merged into one,
//! and that file is stored. Without it the in-memory blobs are stored
//! directly and then replaced with a fresh @expr{_WhiteFish.Blobs@}.
void sync()
{
  if (!options->mergefiles) {
    store_to_db();
    blobs = _WhiteFish.Blobs();
  } else {
    mergefile_sync();
    array(string) pending_files = sort(get_mergefiles());
    store_to_db(merge_mergefiles(pending_files));
  }
#ifdef SEARCH_DEBUG
  docs = 0;
#endif
}
|
// Mapping that only exists in SEARCH_DEBUG builds; it starts out empty.
// NOTE(review): no use of it is visible in this part of the file -
// presumably a holder for timing statistics; confirm before relying on it.
40a44d | 2004-08-07 | Johan Schön | | #ifdef SEARCH_DEBUG
mapping times = ([ ]);
#endif
//! Returns the value reported by @expr{memsize()@} of the in-memory
//! @expr{blobs@} object.
int memsize()
{
  int blob_bytes = blobs->memsize();
  return blob_bytes;
}
|
d8aff3 | 2001-07-16 | Johan Schön | |
|
//! Returns a mapping from each language value in the document table to
//! the number of documents with that language, as delivered by the
//! query ("count(id) ... group by language").
mapping(string|int:int) get_language_stats()
{
  array(mapping) rows =
    get_db()->query("select count(id) as c,language from document group by language");

  array languages = rows->language;
  array counts = rows->c;
  return mkmapping(languages, counts);
}
//! Returns the number of distinct words in the word_hit table, or 0
//! when the query yields no row.
int get_num_words()
{
  array(mapping) rows =
    get_db()->query("select count(distinct word) as c from word_hit");

  if (!sizeof(rows))
    return 0;
  return (int)rows[0]->c;
}
//! Returns the sum of Data_length and Index_length over all rows
//! reported by "show table status".
int get_database_size()
{
  int total_bytes = 0;

  foreach (get_db()->query("show table status"), mapping status) {
    total_bytes += (int)status->Data_length;
    total_bytes += (int)status->Index_length;
  }

  return total_bytes;
}
//! Returns the number of rows in the deleted_document table.
int get_num_deleted_documents()
{
  array(mapping) rows =
    get_db()->query("select count(*) as c from deleted_document");
  return (int)rows[0]->c;
}
|
//! Decodes the UTF-8 encoded string @[in] and returns it normalized
//! with @expr{Unicode.normalize(..., "C")@}.
protected string my_denormalize(string in)
{
  string decoded = utf8_to_string(in);
  return Unicode.normalize(decoded, "C");
}
|
//! Returns the words with the largest total blob length.
//!
//! @param count
//!   Maximum number of words to return; 10 when zero or omitted.
//!
//! @returns
//!   An array of @expr{({ word, c })@} pairs in descending order of
//!   @expr{c@}, where @expr{c@} is the summed blob length of the word
//!   divided by 5 (cast to int) and @expr{word@} has been passed through
//!   my_denormalize(). Empty when word_hit holds no rows.
array(array) get_most_common_words(void|int count)
{
  Sql.Sql db = get_db();

  // The padded layout tracks the used length in a column of its own.
  string weight_column = supports_padded_blobs() ?
    " SUM(used_len) / 5 AS c " :
    " SUM(LENGTH(hits)) / 5 AS c ";

  array(mapping) rows =
    db->query(" SELECT word, " +
              weight_column +
              " FROM word_hit "
              " GROUP BY word "
              " ORDER BY c DESC "
              " LIMIT %d", count || 10);

  if (!sizeof(rows))
    return ({});

  array(string) words = map(rows->word, my_denormalize);
  array(int) weights = (array(int))rows->c;
  return Array.transpose( ({ words, weights }) );
}
|
7ad8a3 | 2009-06-30 | Fredrik Noring | |
//! Calls @[cb] once for every uri in the uri table that starts with
//! @[url_prefix].
//!
//! @param url_prefix
//!   The prefix to match. It is matched literally: the LIKE wildcards
//!   @expr{%@} and @expr{_@} (and the escape character @expr{\@}) in it
//!   carry no special meaning.
//! @param cb
//!   Called with each matching uri, in the order the database returns
//!   the rows.
void list_url_by_prefix(string url_prefix, function(string:void) cb)
{
  Sql.Sql db = get_db();

  // db->quote() only protects the string literal. Without this step a
  // "_" or "%" inside the prefix would act as a LIKE wildcard and match
  // URLs that do not actually start with the prefix. Backslash is the
  // default LIKE escape character in MySQL.
  string like_prefix = replace(url_prefix,
                               ({ "\\", "%", "_" }),
                               ({ "\\\\", "\\%", "\\_" }));

  Sql.Result q =
    db->big_query("SELECT uri "
                  " FROM uri "
                  " WHERE uri LIKE '" + db->quote(like_prefix) + "%'");

  // Stream the result row by row instead of loading it all at once.
  array row;
  while ((row = q->fetch_row()))
    cb(row[0]);
}
|