pike.git / lib / modules / Search.pmod / Grammar.pmod / Parser.pmod

version» Context lines:

pike.git/lib/modules/Search.pmod/Grammar.pmod/Parser.pmod:1: + inherit .Lexer;    -  + #include "debug.h" +  + class ParseNode { +  string op = "<node>"; +  array(ParseNode) children = ({}); +  void addChild(ParseNode n) { if (n) children += ({ n }); } +  +  static string indentArray(array(string) stuff, string indent) { +  return map(stuff, +  lambda(string s) { +  return replace(s, "\n", "\n" + indent); +  }) * ("\n" + indent); +  } +  +  static string printChildren(string indent) { +  return indentArray(map(children, +  lambda(ParseNode n) { return n->print(); } +  ), +  indent); +  } +  +  string print() { +  string indent = " " * (strlen(op) + 2); +  return sprintf("(%s %s)", op, printChildren(indent)); +  } + } +  + class AndNode { +  inherit ParseNode; +  string op = "and"; + } +  + class OrNode { +  inherit ParseNode; +  string op = "or"; + } +  + class DateNode { +  inherit ParseNode; +  string op = "date"; +  string date; +  string print() { return sprintf("(%s %O)", op, date); } + } +  + class TextNode { +  inherit ParseNode; +  string op = "text"; +  string field; +  array(string) words = ({}); +  array(string) plusWords = ({}); +  array(string) minusWords = ({}); +  array(array(string)) phrases = ({}); +  array(array(string)) plusPhrases = ({}); +  array(array(string)) minusPhrases = ({}); +  string print() { +  array(string) a = ({ "" }); +  +  foreach (words, string w) a += ({ w }); +  foreach (plusWords, string w) a += ({ "+" + w }); +  foreach (minusWords, string w) a += ({ "-" + w }); +  +  foreach (phrases, array(string) p) a += ({ "\"" + p * " " + "\"" }); +  foreach (plusPhrases, array(string) p) a += ({ "+\"" + p * " " + "\"" }); +  foreach (minusPhrases, array(string) p) a += ({ "-\"" + p * " " + "\"" }); +  return sprintf("(%s %O %s)", op, field, +  indentArray(a, " " * (strlen(op) + 2))); +  } + } +  + static int isFieldSpecWord(string word) { +  return (< +  "any", +  "date", +  "title", +  "description", +  "url", +  "keywords", +  >) [lower_case(word)]; + } +  + // Splits a string into its words... + // TODO: Fix a better phrase function.... + static array(string) splitPhrase(string phrase) { +  return phrase / " " - ({ "" }); + } +  + // AND merge: Can merge all nodes with - or + bef. each thing. + // OR merge: Can merge all nodes without any - or +. + static array(TextNode) mergeTextNodes(array(TextNode) a, string op) { +  array(TextNode) result = ({}); +  mapping(string:array(TextNode)) fields = ([]); +  foreach (a, TextNode t) +  fields[t->field] = (fields[t->field] || ({ })) + ({ t }); +  +  // Only merge nodes in the same field +  foreach (indices(fields), string field) { +  array(TextNode) unMerged = ({}); +  TextNode merged = 0; +  foreach (fields[field], TextNode t) { +  int canMerge = 0; +  if (op == "and") +  canMerge = (sizeof(t->words) == 0 +  && sizeof(t->phrases) == 0); +  else if (op == "or") +  canMerge = (sizeof(t->plusWords) == 0 +  && sizeof(t->plusPhrases) == 0 +  && sizeof(t->minusWords) == 0 +  && sizeof(t->minusPhrases) == 0); +  if (canMerge) { +  merged = merged || TextNode(); +  merged->field = field; +  merged->words += t->words; +  merged->plusWords += t->plusWords; +  merged->minusWords += t->minusWords; +  merged->phrases += t->phrases; +  merged->plusPhrases += t->plusPhrases; +  merged->minusPhrases += t->minusPhrases; +  } +  else +  unMerged += ({ t }); +  } +  result += unMerged; +  if (merged) +  result += ({ merged }); +  } +  return result; + } +  + public ParseNode optimize(ParseNode node, string|void parentOp) { +  if (!node) +  return 0; +  node->children = filter(map(node->children, optimize, node->op), +  lambda(ParseNode n) { +  return n != 0; +  }); +  array(ParseNode) newChildren = 0; +  switch (node->op) { +  case "and": +  if (!sizeof(node->children)) +  return 0; +  newChildren = ({}); +  // Check if we can merge TextNodes with the same field +  { +  array(TextNode) toMerge = ({}); +  foreach (node->children, ParseNode child) { +  if (child->op == "and") +  newChildren += child->children; +  else if (child->op == "text") +  toMerge += ({ child }); +  else +  newChildren += ({ child }); +  } +  newChildren += mergeTextNodes(toMerge, "and"); +  } +  break; +  case "or": +  if (!sizeof(node->children)) +  return 0; +  newChildren = ({}); +  +  { +  array(TextNode) toMerge = ({}); +  foreach (node->children, ParseNode child) { +  if (child->op == "or") +  newChildren += child->children; +  else if (child->op == "text") +  toMerge += ({ child }); +  else +  newChildren += ({ child }); +  } +  newChildren += mergeTextNodes(toMerge, "or"); +  } +  break; +  case "date": +  if (!node->date || node->date == "") +  return 0; +  break; +  } +  if (newChildren) +  node->children = newChildren; +  return node; + } +  + static void v(ParseNode node, ParseNode parent) { +  map(node->children, v, node); +  switch (node->op) { +  case "date": +  if (!parent || parent->op != "and") +  throw ("date must restrict query"); +  break; +  case "and": +  break; +  case "text": +  if (node->minusWords || node->minusPhrases) { +  if (!sizeof(node->plusWords) +  && !sizeof(node->plusPhrases) +  && !sizeof(node->words) +  && !sizeof(node->phrases)) +  throw ("negative query not allowed"); +  } +  } + } +  + public string validate(ParseNode node) { +  mixed err = catch (v(node, 0)); +  if (err) +  if (stringp(err)) +  return err; +  else +  throw (err); +  return 0; + } +  + static void lowlevel(string s, mixed ... args) { +  werror(s, @args); +  werror("\n"); + } +  + public void execute(ParseNode q) { +  switch (q->op) { +  case "and": +  { +  int first = 1; +  foreach (q->children, ParseNode child) +  if (child->op != "date") { +  execute(child); +  if (!first) +  lowlevel("AND"); +  else +  first = 0; +  } +  foreach (q->children, ParseNode child) +  if (child->op == "date") +  execute(child); +  } +  break; +  case "or": +  int first = 1; +  foreach (q->children, ParseNode child) { +  execute(child); +  if (!first) +  lowlevel("OR"); +  else +  first = 0; +  } +  break; +  case "date": +  lowlevel("DATE_FILTER %O", q->date); +  break; +  case "text": +  { +  int hasPlus = sizeof(q->plusWords) || sizeof(q->plusPhrases); +  int hasOrdinary = sizeof(q->words) || sizeof(q->phrases); +  int hasMinus = sizeof(q->minusWords) || sizeof(q->minusPhrases); +  if (hasPlus) { +  int first = 1; +  if (sizeof(q->plusWords)) { +  lowlevel("QUERY_AND field:%O %{ %O%}", q->field, q->plusWords); +  first = 0; +  } +  foreach (q->plusPhrases, array(string) ph) { +  lowlevel("QUERY_PHRASE field:%O %{ %O%}", q->field, ph); +  if (first) +  first = 0; +  else +  lowlevel("AND"); +  } +  } +  if (hasOrdinary) { +  int first = 1; +  if (sizeof(q->words)) { +  lowlevel("QUERY_OR field:%O %{ %O%}", q->field, q->words); +  first = 0; +  } +  foreach (q->phrases, array(string) ph) { +  lowlevel("QUERY_PHRASE field:%O %{ %O%}", q->field, ph); +  if (first) +  first = 0; +  else +  lowlevel("OR"); +  } +  } +  +  if (hasPlus && hasOrdinary) +  lowlevel("UPRANK"); // the XXX operation :) +  +  if (hasMinus) { +  int first = 1; +  if (sizeof(q->minusWords)) { +  lowlevel("QUERY_OR field:%O %{ %O%}", q->field, q->minusWords); +  first = 0; +  } +  foreach (q->minusPhrases, array(string) ph) { +  lowlevel("QUERY_PHRASE field:%O %{ %O%}", q->field, ph); +  if (first) +  first = 0; +  else +  lowlevel("OR"); +  } +  lowlevel("SUB"); // Only - words are not allowed +  } +  +  break; +  } +  } // switch (q->op) + } +  + // ========================================================================= + // GRAMMAR FOR IMPLICIT <anything> + // ========================================================================= + // + // START : query + // ; + // + // query : query 'or' expr0 + // | expr0 + // ; + // + // expr0 : expr0 expr1 // imlicit AND + // | expr0 'and' expr1 // (on this level) + // | expr1 + // ; + // + // expr1 : expr2 + // ; + // + // expr2 : expr3 + // | field ':' expr3 + // | 'date' ':' date + // | '(' query ')' + // ; + // + // date : (word <not followed by ':'> + // | '-' | ':' | <unknown character> )* + // ; + // + // NOTE: when looking for an operator here (expr3 - expr5), we have to check + // that it is not followed by a "field:", or "(". + // + // expr3 : expr3 'or' expr4 + // | expr4 + // | <empty> + // ; + // + // expr4 : expr4 'and' expr5 + // | expr5 + // ; + // + // expr5 : expr5 expr6 + // | expr6 + // ; + // + // expr6 : '-' expr7 + // | '+' expr7 + // | expr7 + // ; + // + // expr7 : word + // | phrase + // ; +  + public class Parser { +  +  static array(array(Token|string)) tokens; +  static array(string) fieldstack; +  mapping(string:string) options; +  +  static array(Token|string) peek(void|int lookahead) { +  if (lookahead >= sizeof(tokens)) +  lookahead = sizeof(tokens) - 1; +  return tokens[lookahead]; +  } +  +  static void advance() { +  if (sizeof(tokens) > 1) +  tokens = tokens[1 .. ]; +  } +  +  static void create(mapping(string:string)|void opt) { +  options = opt || ([]); +  } +  +  ParseNode parse(string q) { +  fieldstack = ({ "any" }); +  tokens = tokenize(q); +  return parseQuery(); +  } +  +  static ParseNode parseQuery() { +  ParseNode or = OrNode(); +  for (;;) { +  ParseNode n = parseExpr0(); +  or->addChild(n); +  if (peek()[0] == TOKEN_OR) +  advance(); +  else +  break; +  } +  if (sizeof(or->children) == 1) +  return or->children[0]; +  return or; +  } +  +  static ParseNode parseExpr0() { +  ParseNode and = AndNode(); +  for (;;) { +  ParseNode n = parseExpr1(); +  and->addChild(n); +  if (peek()[0] == TOKEN_AND) +  advance(); +  else if ((< TOKEN_END, +  TOKEN_RPAREN, +  TOKEN_OR >)[ peek()[0] ]) +  break; +  // implicit AND +  } +  if (sizeof(and->children) == 1) +  return and->children[0]; +  return and; +  } +  +  static ParseNode parseExpr1() { +  return parseExpr2(); +  } +  +  static ParseNode parseExpr2() { +  +  // field ':' expr3 +  if (peek()[0] == TOKEN_WORD +  && isFieldSpecWord(peek()[1]) +  && peek(1)[0] == TOKEN_COLON) +  { +  fieldstack = ({ peek()[1] }) + fieldstack; +  advance(); +  advance(); +  ParseNode n = fieldstack[0] == "date" +  ? parseDate() +  : parseExpr3(); +  fieldstack = fieldstack[1 .. ]; +  return n; +  } +  +  // '(' query ')' +  if (peek()[0] == TOKEN_LPAREN) { +  advance(); +  ParseNode n = parseQuery(); +  if (peek()[0] == TOKEN_RPAREN) +  advance(); +  return n; +  } +  return parseExpr3(); +  } +  +  static ParseNode parseExpr3() { +  if (peek()[0] == TOKEN_WORD +  && peek(1)[0] == TOKEN_COLON) +  return 0; +  ParseNode or = OrNode(); +  for (;;) { +  ParseNode n = parseExpr4(); +  or->addChild(n); +  if (peek()[0] == TOKEN_OR) +  if (peek(1)[0] == TOKEN_WORD +  && peek(2)[0] == TOKEN_COLON) +  break; // it was a higher level OR +  else +  advance(); +  else +  break; +  } +  if (sizeof(or->children) == 1) +  return or->children[0]; +  return or; +  } +  +  static ParseNode parseExpr4() { +  ParseNode and = AndNode(); +  for (;;) { +  ParseNode n = parseExpr5(); +  and->addChild(n); +  // NOTE: No implicit and here! +  if (peek()[0] == TOKEN_AND +  && !(peek(1)[0] == TOKEN_WORD +  && peek(2)[0] == TOKEN_COLON // it was a higher level AND +  || peek(1)[0] == TOKEN_LPAREN)) +  advance(); +  else +  break; +  } +  if (sizeof(and->children) == 1) +  return and->children[0]; +  return and; +  } +  +  static ParseNode parseExpr5() { +  ParseNode text = TextNode(); +  text->field = fieldstack[0]; +  for (;;) { +  parseExpr6(text); +  if ( (< TOKEN_END, +  TOKEN_RPAREN, +  TOKEN_AND, +  TOKEN_OR >) [ peek()[0] ] +  || (peek()[0] == TOKEN_WORD +  && peek(1)[0] == TOKEN_COLON) +  || (peek()[0] == TOKEN_LPAREN)) +  break; // it was a higher level IMPLICIT AND +  if (peek()[0] == TOKEN_OR) +  if (peek(1)[0] == TOKEN_WORD +  && peek(2)[0] == TOKEN_COLON +  || peek(1)[0] == TOKEN_LPAREN) +  break; // it was a higher level OR +  else +  advance(); +  } +  if (sizeof(text->words) +  || sizeof(text->phrases) +  || sizeof(text->plusWords) +  || sizeof(text->plusPhrases) +  || sizeof(text->minusWords) +  || sizeof(text->minusPhrases)) +  return text; +  return 0; +  } +  +  static void parseExpr6(TextNode node) { +  int prefix = 0; +  +  if (peek()[0] == TOKEN_MINUS) { +  advance(); +  prefix = '-'; +  } +  else if (peek()[0] == TOKEN_PLUS) { +  advance(); +  prefix = '+'; +  } +  +  if (!prefix && options["implicit"] == "and") +  prefix = '+'; +  +  while (!(< TOKEN_PHRASE, +  TOKEN_WORD, +  TOKEN_END >) [ peek()[0] ]) +  advance(); // ... ????????? or something smarter ????? +  +  if (peek()[0] == TOKEN_PHRASE +  || peek()[0] == TOKEN_WORD) { +  string phrase = peek()[1]; +  advance(); +  array(string) words = splitPhrase(phrase); +  if (!words || !sizeof(words)) +  return; +  if (sizeof(words) == 1) +  switch (prefix) { +  case '+': node->plusWords += words; break; +  case '-': node->minusWords += words; break; +  default: node->words += words; break; +  } +  else if (sizeof(words) > 1) +  switch (prefix) { +  case '+': node->plusPhrases += ({ words }); break; +  case '-': node->minusPhrases += ({ words }); break; +  default: node->phrases += ({ words }); break; +  } +  } +  } +  +  static ParseNode parseDate() { +  DateNode n = DateNode(); +  n->date = ""; +  loop: +  for (;;) { +  switch (peek()[0]) { +  case TOKEN_WORD: +  if (isFieldSpecWord(peek()[1]) +  && peek(1)[0] == TOKEN_COLON) +  break loop; // it's a field specifier +  break; +  case TOKEN_UNKNOWN: +  case TOKEN_MINUS: +  case TOKEN_COLON: +  break; +  default: +  break loop; +  } +  n->date += peek()[2]; // with spaces preserved! +  advance(); +  } +  return n; +  } +  + } // class Parser   Newline at end of file added.