eb01b42010-10-26Martin Stjernholm #pike __REAL_VERSION__
87e9262001-06-22Martin Nilsson 
// Debug helper: renders a binary blob as a human-readable string.
//
// The blob is decoded as a sequence of entries, each consisting of a
// 4-byte document id, a 1-byte hit count and then that many 2-byte
// hit values (fewer if the blob ends early). A zero blob is rendered
// as "Blob(empty)".
static string debug_blob(string b)
{
  if (!b)
    return "Blob(empty)";

  array(string) entries = ({ });
  while (sizeof(b) >= 5) {
    // Entry header: document id and number of hits that follow.
    sscanf(b, "%4c%c%s", int docid, int nhits, b);

    array(string) hits = ({ });
    // Stop early if the blob is truncated in the middle of the hit list.
    for (int remaining = nhits; remaining-- && sizeof(b) >= 2; ) {
      sscanf(b, "%2c%s", int hit, b);
      hits += ({ (string) hit });
    }
    entries += ({ sprintf("[docid:%O hits:%s]", docid, hits * ",") });
  }
  return "Blob(" + entries * "" + ")";
}

// Builds the callback that hands blobs for the given words to the
// _WhiteFish query functions.
//
// The returned function takes (word, <unused int>, blob_stream_id) and
// returns db->get_blob(word, n, blobcache), where n counts how many
// times that word has been requested on that stream (starting at 0).
static function(string,int,int:string) blobfeeder(Search.Database.Base db,
                                                  array words)
{
  // Create state per word and stream so multiple occurrences of the same
  // word are kept apart.
  mapping state = mkmapping(words, allocate(sizeof(words), ([ ]) ));

  // Cache handed to every get_blob() call made through this feeder.
  mapping(string:mapping(int:string)) blobcache = ([ ]);

  return lambda(string word, int foo, int blob_stream_id)
         {
           mapping per_stream = state[word];
           // Post-increment: use the current counter, then advance it.
           int blob_no = per_stream[blob_stream_id]++;
           return db->get_blob(word, blob_no, blobcache);
         };
}
764c852008-03-25Tobias Liin 
// Returns a copy of a with duplicate strings removed. The first
// occurrence of every string is kept and the relative order of the
// kept elements is unchanged.
static array(string) uniq_preserve_order(array(string) a)
{
  // Track already emitted strings in a multiset so each membership test
  // is a lookup instead of a linear search() through the result array.
  multiset(string) seen = (< >);
  array(string) result = ({ });

  foreach (a, string s)
    if (!seen[s]) {
      seen[s] = 1;
      result += ({ s });
    }

  return result;
}
965c0b2001-05-29Johan Schön 
//! Thin wrapper around @[_WhiteFish.do_query_or]: passes @[words], the
//! field ranking, proximity ranking and cutoff of @[ranking], and a
//! blob feeder reading from @[db].
//!
//! @returns
//!   The value returned by @[_WhiteFish.do_query_or], unmodified.
Search.ResultSet do_query_or(Search.Database.Base db,
                             array(string) words,
                             Search.RankingProfile ranking)
{
  return _WhiteFish.do_query_or(words,
                                ranking->field_ranking,
                                ranking->proximity_ranking,
                                ranking->cutoff,
                                blobfeeder(db, words));
}
//! Thin wrapper around @[_WhiteFish.do_query_and]: passes @[words], the
//! field ranking, proximity ranking and cutoff of @[ranking], and a
//! blob feeder reading from @[db].
//!
//! @returns
//!   The value returned by @[_WhiteFish.do_query_and], unmodified.
Search.ResultSet do_query_and(Search.Database.Base db,
                              array(string) words,
                              Search.RankingProfile ranking)
{
  return _WhiteFish.do_query_and(words,
                                 ranking->field_ranking,
                                 ranking->proximity_ranking,
                                 ranking->cutoff,
                                 blobfeeder(db, words));
}
//! Thin wrapper around @[_WhiteFish.do_query_phrase]: passes @[words],
//! the field ranking of @[ranking] and a blob feeder reading from @[db].
//!
//! @note
//!   Unlike @[do_query_or] and @[do_query_and], neither the proximity
//!   ranking nor the cutoff is passed on; the cutoff argument is
//!   commented out in the call.
//!
//! @returns
//!   The value returned by @[_WhiteFish.do_query_phrase], unmodified.
Search.ResultSet do_query_phrase(Search.Database.Base db,
                                 array(string) words,
                                 Search.RankingProfile ranking)
{
  return _WhiteFish.do_query_phrase(words,
                                    ranking->field_ranking,
                                    // ranking->cutoff,
                                    blobfeeder(db, words));
}
//! Sort orders understood by @[execute]. The numbering starts at 1, so
//! a zero (omitted) order can be told apart from every real value.
enum search_order {
  RELEVANCE      = 1,
  DATE_ASC       = 2,
  DATE_DESC      = 3,
  NONE           = 4,
  PUBL_DATE_ASC  = 5,
  PUBL_DATE_DESC = 6,
};

// NOTE(review): this is an empty stub. It has no return statement and
// is not called from the visible part of this file; confirm whether it
// can be removed or should be implemented.
static Search.ResultSet sort_resultset(Search.ResultSet resultset,
                                       search_order order,
                                       Search.Database.Base db)
{
}
//! Parses @[query], evaluates it against @[db] and returns the sorted
//! result together with the words and globs that were searched for.
//!
//! @param db
//!   The search database.
//! @param parser
//!   The parser used to turn @[query] into a parse tree.
//! @param query
//!   The query string entered by user.
//! @param ranking
//!   Used as is when searching in the field "any:". For any other field
//!   a copy of it is used, with the field ranking replaced so that only
//!   the requested field counts.
//! @param stop_words
//!   Optional list of words that are removed from the parsed query
//!   before it is executed.
//! @param order
//!   Optional sort order for the result. Defaults to @[RELEVANCE] when
//!   omitted or zero; @[NONE] leaves the result unsorted.
//!
//! @returns
//!   An array with three elements:
//!   @array
//!     @elem Search.ResultSet 0
//!       The ResultSet containing the hits.
//!     @elem array(string) 1
//!       All wanted words in the query. (I.e. not the words that were
//!       preceded by minus.)
//!     @elem array(mapping) 2
//!       All wanted globs in the query. (I.e. not the globs that were
//!       preceded by minus.) Each element maps the field name to the
//!       glob pattern.
//!   @endarray
//!
//! @throws
//!   The string returned by @[Search.Grammar.validate] if the query
//!   does not validate.
//!
array(Search.ResultSet|array(string)) execute(Search.Database.Base db,
                                              Search.Grammar.AbstractParser parser,
                                              string query,
                                              Search.RankingProfile ranking,
                                              void|array(string) stop_words,
                                              void|search_order order)
{
  Search.Grammar.ParseNode q = parser->parse(query);
  if (stop_words && sizeof(stop_words))
    Search.Grammar.remove_stop_words(q, stop_words);

  q = Search.Grammar.optimize(q);

  if (!q)                                  // The query was a null query
    return ({ Search.ResultSet(), ({}), ({}) }); // so return an empty resultset

  // Deliberately not named "error": the executor class below calls the
  // error() function and must not see a local variable of that name.
  string validation_error = Search.Grammar.validate(q);
  if (validation_error)
    throw (validation_error);

  array(Search.ResultSet|array(string)) res = class {
      static Search.RankingProfile defaultRanking;
      static Search.Database.Base db;

      // Used when search is limited to another field than "any:".
      static Search.RankingProfile specialRanking;

      static void create(Search.Database.Base _db,
                         Search.RankingProfile _defaultRanking)
      {
        db = _db;
        defaultRanking = _defaultRanking;
        specialRanking = defaultRanking->copy();
        pop = stack->pop;
        push = stack->push;
      }

      // Splits words into ({ plain words, glob patterns }); a word is a
      // glob pattern if it contains "*" or "?".
      static array(array(string)) split_words(array(string) words)
      {
        array a=({}),b=({});
        foreach(words, string word)
          if(has_value(word, "*") || has_value(word, "?"))
            b+=({ word });
          else
            a+=({ word });
        return ({ a, b });
      }

      static constant ParseNode = Search.Grammar.ParseNode;

      // Wanted (non-minus) words and phrase words collected by exec().
      static array(array(string)|string) words = ({ });
      // Wanted (non-minus) globs collected by exec(), as ([ field : glob ]).
      static array(array(string)|string) glob_words = ({ });

      // Evaluation stack of intermediate result sets.
      static ADT.Stack stack = ADT.Stack();
      static function(Search.ResultSet:void) push;
      static function(void:Search.ResultSet) pop;

      // Evaluates the parse tree q and returns
      // ({ result set, wanted words, wanted globs }).
      array(Search.ResultSet|array(string)) execute(ParseNode q) {
        exec(q);
        if (sizeof(stack) != 1)
          error("Stack should have exactly one item!");
        return ({ pop(), words, glob_words });
      }

      // Evaluates one node and leaves exactly one result set on the
      // stack. use_AND_optimization is set when the node is a non-first
      // child of an "and" node, i.e. when the set on top of the stack
      // will be intersected with this node's result.
      void exec(ParseNode q, void|int use_AND_optimization) {
        int max_globs = 100;
        switch (q->op) {
          case "and":
            {
              int first = 1;
              foreach (q->children, ParseNode child)
              {
                // Every child but the first may use the AND optimization.
                exec(child, !first);
                if (!first) {
                  Search.ResultSet r2 = pop();
                  Search.ResultSet r1 = pop();
                  push(r1 & r2);
                }
                else
                  first = 0;
              }
            }
            break;

          case "or":
            {
              int first = 1;
              foreach (q->children, ParseNode child) {
                exec(child, 0);
                if (!first) {
                  Search.ResultSet r2 = pop();
                  Search.ResultSet r1 = pop();
                  push(r1 | r2);
                }
                else
                  first = 0;
              }
            }
            break;

          case "date":
            _WhiteFish.DateSet global_dateset = db->get_global_dateset();

            // Nothing to restrict; the empty set itself is the result.
            if(!sizeof(global_dateset)) {
              push(global_dateset);
              break;
            }

            int t_low, t_high;
            catch {
              t_low = t_high = Calendar.ISO.dwim_day(String.trim_whites(q->date))->unix_time();
              t_high += 24*60*60-1; // Extend to the last second of the day.
            };

            // Fix to allow year-month "%04d-%02d" timerange.
            if(!t_low && sscanf(q->date, "%4d-%2d", int y, int m) == 2)
              catch {
                Calendar.ISO.Month month = Calendar.ISO.Month(y, m);
                t_low = month->unix_time();
                t_high = month->next()->unix_time()-1;
              };

            // Fix to allow year "%04d" timerange.
            if(!t_low && sscanf(q->date, "%4d", int y))
              catch {
                Calendar.ISO.Year year = Calendar.ISO.Year(y);
                t_low = year->unix_time();
                t_high = year->next()->unix_time()-1;
              };

            if(t_low <= 0 || t_high <= 0 ||
               object_program(t_low) || object_program(t_high))
            {
              // Guard against out-of-bounds and bignums.
              push(_WhiteFish.DateSet());
              break;
            }

            _WhiteFish.DateSet restriction;
            // NOTE(review): the +/-1 adjustments suggest that the DateSet
            // range methods use exclusive bounds -- confirm against
            // _WhiteFish.DateSet.
            switch(q->operator[1]) {
              case "=":
                restriction =
                  global_dateset->between(t_low-1, t_high+1)->finalize();
                break;
              case "<>":
              case "!=":
                restriction =
                  global_dateset->not_between(t_low, t_high)->finalize();
                break;
              case "<=":
                restriction = global_dateset->before(t_high+1)->finalize();
                break;
              case ">=":
                restriction = global_dateset->after(t_low-1)->finalize();
                break;
              case "<":
                restriction = global_dateset->before(t_low)->finalize();
                break;
              case ">":
                restriction = global_dateset->after(t_high)->finalize();
                break;
            }
            // Unknown operators leave restriction zero => empty set.
            push(restriction || _WhiteFish.DateSet());
            break;

          case "text":
            {
              Search.RankingProfile ranking = defaultRanking;

              if (q->field != "any")
              {
                ranking = specialRanking;
                int fieldID = db->get_field_id(q->field, 1);
                if (!fieldID && q->field != "body")
                {
                  // There was no such field, so we push an empty ResultSet!
                  push(Search.ResultSet());
                  break;
                }
                // Only the requested field contributes to the ranking.
                ranking->field_ranking = allocate(65);
                ranking->field_ranking[fieldID] = 1;
              }

              [array plusWords, array plusWordGlobs] =
                split_words(q->plusWords);
              [array ordinaryWords, array ordinaryWordGlobs] =
                split_words(q->words);
              [array minusWords, array minusWordGlobs] =
                split_words(q->minusWords);

//            werror("[%-10s] plus: %-15s  ordinary: %-15s  minus: %-15s\n",
//                   q->field, q->plusWords*", ", q->words*", ",
//                   q->minusWords*", ");

              // Subtracting "*" gives empty result
              if (has_value(minusWordGlobs, "*")) {
                push(Search.ResultSet());
                break;
              }

              int hasPlus = sizeof(q->plusWords) || sizeof(q->plusPhrases);
              int hasOrdinary = sizeof(q->words) || sizeof(q->phrases);
              int hasMinus = sizeof(q->minusWords) || sizeof(q->minusPhrases);
              int hasEverything =
                has_value(plusWordGlobs, "*") ||
                has_value(ordinaryWordGlobs, "*");

              if (hasEverything) {
                // FIXME: Ranking?
                if (use_AND_optimization && sizeof(stack)) {
                  // If the current operation is AND we can never get a
                  // result set after subtraction containing more entries than
                  // the set pushed on the stack from our previous siblings.
                  push(stack->top());
                } else {
                  push(db->get_all_documents());
                }
                // "*" already matches everything, so the remaining plus
                // and ordinary terms are not evaluated.
                hasPlus = 0;
                hasOrdinary = 0;
                plusWordGlobs -= ({ "*" });
                ordinaryWordGlobs -= ({ "*" });
              }

              foreach (Array.uniq(plusWordGlobs | ordinaryWordGlobs), string w)
                glob_words += ({ ([ q->field : w]) });

              if(hasPlus)
              {
                // All plus terms are intersected.
                int first = 1;
                if(sizeof(plusWords))
                {
                  words += plusWords;
                  push(do_query_and(db, plusWords, ranking));
                  first = 0;
                }

                foreach(plusWordGlobs, string plusWordGlob)
                {
                  push(do_query_or(db,
                                   db->expand_word_glob(plusWordGlob, max_globs),
                                   ranking));
                  if (!first)
                  {
                    Search.ResultSet r2 = pop();
                    Search.ResultSet r1 = pop();
                    push(r1 & r2);
                  }
                  first = 0;
                }

                foreach (q->plusPhrases, array(string) ph)
                {
                  words += ph;
                  push(do_query_phrase(db, ph, ranking));
                  if (!first)
                  {
                    Search.ResultSet r2 = pop();
                    Search.ResultSet r1 = pop();
                    push(r1 & r2);
                  }
                  first = 0;
                }
              }

              if(hasOrdinary)
              {
                // All ordinary terms are unioned.
                int first = 1;
                if (sizeof(ordinaryWords))
                {
                  words += ordinaryWords;
                  push(do_query_or(db, ordinaryWords, ranking));
                  first = 0;
                }

                foreach(ordinaryWordGlobs, string ordinaryWordGlob)
                {
                  push(do_query_or(db,
                                   db->expand_word_glob(ordinaryWordGlob,
                                                        max_globs),
                                   ranking));
                  if (!first)
                  {
                    Search.ResultSet r2 = pop();
                    Search.ResultSet r1 = pop();
                    push(r1 | r2);
                  }
                  first = 0;
                }

                foreach (q->phrases, array(string) ph)
                {
                  words += ph;
                  push(do_query_phrase(db, ph, ranking));
                  if(!first)
                  {
                    Search.ResultSet r2 = pop();
                    Search.ResultSet r1 = pop();
                    push(r1 | r2);
                  }
                  first = 0;
                }
              }

              if(hasPlus && hasOrdinary)
              {
                Search.ResultSet r2 = pop();
                Search.ResultSet r1 = pop();
                // If a document contains must-have words AND ALSO may-have
                // words, its ranking is increased.
                push(r1->add_ranking(r2));
              }

              if((hasPlus || hasOrdinary || hasEverything) && hasMinus)
              {
                // Union all minus terms, then subtract them from the
                // result set produced above.
                int first = 1;
                // Use the split word list here, like the plus and ordinary
                // branches do; glob patterns are handled by the loop below
                // and must not be looked up as literal words.
                if (sizeof(minusWords))
                {
                  push(do_query_or(db, minusWords, ranking));
                  first = 0;
                }

                foreach(minusWordGlobs, string minusWordGlob)
                {
                  push(do_query_or(db,
                                   db->expand_word_glob(minusWordGlob, max_globs),
                                   ranking));
                  if(!first)
                  {
                    Search.ResultSet r2 = pop();
                    Search.ResultSet r1 = pop();
                    push(r1 | r2);
                  }
                  first = 0;
                }

                foreach (q->minusPhrases, array(string) ph)
                {
                  push(do_query_phrase(db, ph, ranking));
                  if (!first)
                  {
                    Search.ResultSet r2 = pop();
                    Search.ResultSet r1 = pop();
                    push(r1 | r2);
                  }
                  first = 0;
                }

                Search.ResultSet r2 = pop();
                Search.ResultSet r1 = pop();
                push(r1 - r2);
              }
            }
            break;

          default:
            error("Unknown type of ParseNode!");
        } // switch (q->op)
      }

    } (db, ranking)->execute(q);

  // Never report documents that are marked as deleted in the database.
  res[0] -= db->get_deleted_documents();

  if(!order)
    order = RELEVANCE;

  switch(order)
  {
    case RELEVANCE:
      res[0]->sort();
      break;

    case DATE_ASC:
    case DATE_DESC:
      // Replace the ranking with one derived from the global date set
      // and sort on that.
      res[0] = res[0]->finalize()->add_ranking(db->get_global_dateset());
      if(order==DATE_DESC)
        res[0]->sort();
      else
        res[0]->sort_rev();
      break;

    case PUBL_DATE_ASC:
    case PUBL_DATE_DESC:
      // Same as above, but based on the publication date set.
      res[0] =
        res[0]->finalize()->add_ranking(db->get_global_publ_dateset());
      if(order==PUBL_DATE_DESC)
        res[0]->sort();
      else
        res[0]->sort_rev();
      break;

    case NONE:
      // Leave the result unsorted.
      break;
  }

  return res;
}