eb01b42010-10-26Martin Stjernholm #pike __REAL_VERSION__
0844382001-06-10Per Hedbor 
//! Abstract parse tree node.
//!
//! Holds an operator name and an ordered list of child nodes, and can
//! render itself as a parenthesised, indented string.
class ParseNode {
  //! Operator name; printed first inside the parentheses by @[print()].
  string op = "<node>";

  //! Child nodes, in the order they were added.
  array(ParseNode) children = ({});

  //! Appends @[n] to @[children]. A zero @[n] is ignored.
  void addChild(ParseNode n) {
    if (!n)
      return;
    children += ({ n });
  }

  // Joins the strings in stuff with "\n" + indent between them. Newlines
  // already present inside an element are followed by the same indent, so
  // multi-line elements stay aligned with the rest.
  protected string indentArray(array(string) stuff, string indent) {
    string lineBreak = "\n" + indent;
    array(string) shifted = ({});
    foreach (stuff, string piece)
      shifted += ({ replace(piece, "\n", lineBreak) });
    return shifted * lineBreak;
  }

  // Renders every child with print() and joins the results through
  // indentArray().
  protected string printChildren(string indent) {
    array(string) rendered = children->print();
    return indentArray(rendered, indent);
  }

  //! Returns the node formatted as @expr{"(op children)"@}, one child per
  //! line.
  string print() {
    // Continuation lines are indented by the length of op plus two.
    int width = sizeof(op) + 2;
    return sprintf("(%s %s)", op, printChildren(" " * width));
  }

  // sprintf() hook: for 'O' the result is "ParseNode" followed by print();
  // any other format character yields 0.
  protected string _sprintf(int t) {
    if (t != 'O')
      return 0;
    return "ParseNode" + print();
  }
}
//! And node.
//!
//! A @[ParseNode] whose @[op] is @expr{"and"@}.
class AndNode {
  inherit ParseNode;

  string op = "and";
}
//! Or node.
//!
//! A @[ParseNode] whose @[op] is @expr{"or"@}.
class OrNode {
  inherit ParseNode;

  string op = "or";
}
//! Date node.
//!
//! A @[ParseNode] whose @[op] is @expr{"date"@}. It carries a date string
//! and an operator value, both of which are shown by @[print()].
class DateNode {
  inherit ParseNode;

  string op = "date";

  // NOTE(review): the layout of this array is not visible from this
  // class; confirm against the code that fills it in.
  array operator;

  //! The date as a string.
  string date;

  //! Returns @expr{"(date DATE OPERATOR)"@}, with both values formatted
  //! using %O. Children are not printed.
  string print() {
    return sprintf("(%s %O %O)", op, date, operator);
  }
}
//! Text node.
//!
//! A @[ParseNode] whose @[op] is @expr{"text"@}. It holds the words and
//! phrases for one field, split into plain, "+"-prefixed and
//! "-"-prefixed groups.
class TextNode {
  inherit ParseNode;

  string op = "text";

  //! Name of the field this node applies to.
  string field;

  //! Words without a prefix, with a "+" prefix and with a "-" prefix.
  array(string) words = ({});
  array(string) plusWords = ({});
  array(string) minusWords = ({});

  //! Phrases (each an array of words) without a prefix, with a "+"
  //! prefix and with a "-" prefix.
  array(array(string)) phrases = ({});
  array(array(string)) plusPhrases = ({});
  array(array(string)) minusPhrases = ({});

  //! Returns @expr{"(text FIELD ...)"@} with one word or phrase per line.
  //! Words come first, then phrases; within each kind the order is
  //! plain, "+", "-". Phrases are shown in double quotes with their words
  //! joined by single spaces.
  string print() {
    // The leading empty string puts every entry on its own line below
    // the "(text FIELD" header.
    array(string) parts = ({ "" });

    array(array) wordGroups = ({
      ({ "",  words }),
      ({ "+", plusWords }),
      ({ "-", minusWords }),
    });
    foreach (wordGroups, array group) {
      string prefix = group[0];
      foreach (group[1], string word)
        parts += ({ prefix + word });
    }

    array(array) phraseGroups = ({
      ({ "",  phrases }),
      ({ "+", plusPhrases }),
      ({ "-", minusPhrases }),
    });
    foreach (phraseGroups, array group) {
      string prefix = group[0];
      foreach (group[1], array(string) phrase)
        parts += ({ prefix + "\"" + (phrase * " ") + "\"" });
    }

    string indent = " " * (sizeof(op) + 2);
    return sprintf("(%s %O %s)", op, field, indentArray(parts, indent));
  }
}
//! Returns the default field names as a multiset: "anchor", "any",
//! "body", "keywords", "title" and "url".
multiset(string) getDefaultFields() {
  multiset(string) defaults = (<
    "anchor",
    "any",
    "body",
    "keywords",
    "title",
    "url"
  >);
  return defaults;
}

// AND merge: Can merge all nodes with - or + before each thing.
// OR merge: Can merge all nodes without any - or +.
// Merges the text nodes in a that share a field, as far as op allows.
//
// With op "and", a node can be merged when it has no unprefixed words or
// phrases. With op "or", a node can be merged when it has no "+" or "-"
// words or phrases. With any other op nothing is merged.
//
// For each field the result holds the nodes that could not be merged,
// followed by one new TextNode that collects the contents of all the
// mergeable ones (if there were any).
private array(TextNode) mergeTextNodes(array(TextNode) a, string op) {
  // Bucket the nodes by field; only nodes in the same field are merged.
  mapping(string:array(TextNode)) fields = ([]);
  foreach (a, TextNode node) {
    array(TextNode) bucket = fields[node->field] || ({});
    fields[node->field] = bucket + ({ node });
  }

  array(TextNode) result = ({});
  foreach (indices(fields), string field) {
    array(TextNode) kept = ({});
    TextNode combined = 0;

    foreach (fields[field], TextNode node) {
      int mergeable;
      switch (op) {
        case "and":
          mergeable = !sizeof(node->words) && !sizeof(node->phrases);
          break;
        case "or":
          mergeable = !sizeof(node->plusWords) &&
                      !sizeof(node->plusPhrases) &&
                      !sizeof(node->minusWords) &&
                      !sizeof(node->minusPhrases);
          break;
        default:
          mergeable = 0;
      }

      if (!mergeable) {
        kept += ({ node });
        continue;
      }

      // Create the collecting node on first use.
      if (!combined) {
        combined = TextNode();
        combined->field = field;
      }
      combined->words += node->words;
      combined->plusWords += node->plusWords;
      combined->minusWords += node->minusWords;
      combined->phrases += node->phrases;
      combined->plusPhrases += node->plusPhrases;
      combined->minusPhrases += node->minusPhrases;
    }

    result += kept;
    if (combined)
      result += ({ combined });
  }

  return result;
}
//! Simplifies the parse tree rooted at @[node], modifying nodes in
//! place, and returns the resulting root, or 0 if nothing is left.
//!
//! @ul
//!   @item
//!     Children are optimized first; children that become 0 are dropped.
//!   @item
//!     "and"/"or" nodes without children are removed. Children with the
//!     same operator are flattened into the node, and text children are
//!     merged per field where possible. A node left with exactly one
//!     child is replaced by that child.
//!   @item
//!     In "and" nodes, text children that only exclude things are moved
//!     to the end of the child list.
//!   @item
//!     "date" nodes without a date are removed.
//!   @item
//!     "text" nodes without any words or phrases are removed. A text node
//!     that only has minus words or phrases gets "*" as its word list.
//! @endul
//!
//! @param node
//!   Root of the tree to optimize; may be 0.
//! @param parentOp
//!   Operator of the parent node. Passed along in the recursion but not
//!   otherwise used here.
ParseNode optimize(ParseNode node, string|void parentOp) {
  if (!node)
    return 0;

  // Work bottom-up: optimize the children first and drop the ones that
  // were optimized away.
  node->children = map(node->children, optimize, node->op) - ({0});
  array(ParseNode) newChildren = 0;

  switch (node->op) {
    case "and":
    case "or":
      if (!sizeof(node->children))
        return 0;
      newChildren = ({});

      // Flatten children that have the same operator as this node, and
      // check if we can merge TextNodes with the same field.
      {
        array(TextNode) toMerge = ({});
        foreach (node->children, ParseNode child) {
          if (child->op == node->op)
            newChildren += child->children;
          else if (child->op == "text")
            toMerge += ({ child });
          else
            newChildren += ({ child });
        }
        newChildren += mergeTextNodes(toMerge, node->op);
      }

      if (sizeof(newChildren) == 1)
        return newChildren[0];

      if (node->op == "and") {
        // If we have a negative word only we try to place it at the
        // end of the children list. This will save us a fetch of all
        // document IDs in the execution pass to compute the negation.
        int(0..1) is_minus_only(ParseNode n) {
          return
            n->op == "text" &&
            !sizeof(n->plusWords) &&
            !sizeof(n->plusPhrases) &&
            (!sizeof(n->words) || has_value(n->words, "*")) &&
            !sizeof(n->phrases) &&
            (sizeof(n->minusWords) || sizeof(n->minusPhrases));
        };
        array(ParseNode) minus_only_items =
          filter(newChildren, is_minus_only);
        newChildren = (newChildren - minus_only_items) + minus_only_items;
      }
      break;

    case "date":
      if (!node->date || node->date == "")
        return 0;
      break;

    case "text":
      // Test emptiness by size. `==' on arrays compares identity, not
      // contents, so comparing against a literal ({}) only matches when
      // both sides happen to be the very same empty array.
      if (!sizeof(node->words) &&
          !sizeof(node->plusWords) &&
          !sizeof(node->minusWords) &&
          !sizeof(node->phrases) &&
          !sizeof(node->plusPhrases) &&
          !sizeof(node->minusPhrases))
        return 0;

      // Fix "minus" queries without corresponding starting selection
      // by adding "*" as the starting selection.
      if (sizeof(node->minusWords) || sizeof(node->minusPhrases)) {
        if (!sizeof(node->plusWords) &&
            !sizeof(node->plusPhrases) &&
            !sizeof(node->words) &&
            !sizeof(node->phrases)) {
          node->words = ({ "*" });
        }
      }
      break;
  }

  if (newChildren)
    node->children = newChildren;
  return node;
}
// Recursively checks node and its children, children first. Problems are
// reported by throwing a plain string:
//   - a "date" node whose parent is missing or is not an "and" node;
//   - a "text" node that has minus words/phrases but nothing else.
private void _validate(ParseNode node, ParseNode parent) {
  foreach (node->children, ParseNode child)
    _validate(child, node);

  string kind = node->op;

  if (kind == "date") {
    int restricts = parent && parent->op == "and";
    if (!restricts)
      throw ("date must restrict query");
  }
  else if (kind == "text") {
    // NOTE: This should no longer be happening since we adjust these
    //       nodes during the optimization phase to be constructed as
    //       ("*" - phrase).
    int hasMinus = sizeof(node->minusWords) || sizeof(node->minusPhrases);
    if (hasMinus) {
      int hasOther = sizeof(node->plusWords) || sizeof(node->plusPhrases) ||
                     sizeof(node->words) || sizeof(node->phrases);
      if (!hasOther)
        throw ("negative query not allowed");
    }
  }
  // Other node kinds (e.g. "and") need no checks of their own.
}

// Returns 0 if OK, a string with error message if error.
// Anything thrown by the check that is not a string is rethrown.
string validate(ParseNode node) {
  // A null query is also valid.
  if (!node)
    return 0;

  mixed err = catch (_validate(node, 0));
  if (err) {
    if (!stringp(err))
      throw (err);
    return err;
  }
  return 0;
}
e650c62001-08-07David Norlin 
//! Removes the words in @[stop_words] from the tree rooted at @[node],
//! modifying the nodes in place. Does nothing when @[node] is zero or
//! @[stop_words] is empty.
void remove_stop_words(ParseNode node, array(string) stop_words) {
  if (node && sizeof(stop_words))
    low_remove_stop_words(node, stop_words);
}
// Does the actual work for remove_stop_words(). "and" and "or" nodes
// pass the request on to each child; "text" nodes have the stop words
// subtracted from their plain, plus and minus word lists (phrases are
// left as they are). Other node kinds are not touched.
protected void low_remove_stop_words(ParseNode node, array(string) stop_words) {
  string kind = node->op;

  if (kind == "or" || kind == "and") {
    foreach (node->children, ParseNode child)
      remove_stop_words(child, stop_words);
    return;
  }

  if (kind == "text") {
    node->words -= stop_words;
    node->plusWords -= stop_words;
    node->minusWords -= stop_words;
  }
}