// This file is part of Roxen Search
// Copyright © 2001 Roxen IS. All rights reserved.
//
// $Id: DefaultParser.pike,v 1.8 2002/03/11 14:26:45 grubba Exp $

static inherit Search.Grammar.AbstractParser;
static inherit Search.Grammar.Lexer;
static private inherit "./module.pmod";

//static constant ParseNode = Search.Grammar.ParseNode;
//static constant OrNode = Search.Grammar.OrNode;
//static constant AndNode = Search.Grammar.AndNode;
//static constant TextNode = Search.Grammar.TextNode;

#include "debug.h"

// =========================================================================
// GRAMMAR FOR IMPLICIT <anything>
// =========================================================================
//
//  START           : query
//                  ;
//
//  query           : query 'or' expr0
//                  | expr0
//                  ;
//
//  expr0           : expr0 expr1                 // implicit AND
//                  | expr0 'and' expr1           // (on this level)
//                  | expr1
//                  ;
//
//  expr1           : expr2
//                  ;
//
//  expr2           : expr3
//                  | field ':' expr3
//                  | 'date' ':' date
//                  | '(' query ')'
//                  ;
//
//  date            : (word <not followed by ':'>
//                  | '-' | ':' | <unknown character> )*
//                  ;
//
// NOTE: when looking for an operator here (expr3 - expr5), we have to check
// that it is not followed by a "field:", or "(".
//
//  expr3           : expr3 'or' expr4
//                  | expr4
//                  | <empty>
//                  ;
//
//  expr4           : expr4 'and' expr5
//                  | expr5
//                  ;
//
//  expr5           : expr5 expr6
//                  | expr6
//                  ;
//
//  expr6           : '-' expr7
//                  | '+' expr7
//                  | expr7
//                  ;
//
//  expr7           : word
//                  | phrase
//                  ;

// Tokens not yet consumed by the parser. The parser reads element 0 of a
// token as its type (compared against the TOKEN_* constants), element 1 as
// its text and element 2 as its text with surrounding spacing preserved.
static array(array(Token|string)) tokens;

// Stack of field names currently in effect; index 0 is the innermost one.
static array(string) fieldstack;

// Parser options. The keys read in this file are "implicit" (compared
// against "and") and "fields" (a multiset of recognised field names).
mapping(string:mixed) options;
// Returns the token `lookahead` positions ahead of the current one without
// consuming anything; 0 (or an omitted argument) is the current token.
// Positions past the end are clamped to the last token in the stream.
// NOTE(review): assumes `tokens` is never empty, i.e. that the lexer always
// delivers at least a terminating token -- confirm against tokenize().
static array(Token|string) peek(void|int lookahead) {
  if (lookahead >= sizeof(tokens))
    lookahead = sizeof(tokens) - 1;
  return tokens[lookahead];
}

// Consumes the current token. The last remaining token is never removed,
// so peek() keeps returning it once the rest of the input is exhausted.
static void advance() {
  if (sizeof(tokens) > 1)
    tokens = tokens[1 .. ];
}
// Returns true if the token at `offset` (default: the current token) is a
// text token naming a known field and is immediately followed by a colon.
// Field names are matched case-insensitively against options["fields"].
static int lookingAtFieldStart(void|int offset) {
  multiset(string) fields = options["fields"];
  //  SHOW(tokens);
  return peek(offset)[0] == TOKEN_TEXT
    && fields[ lower_case(peek(offset)[1]) ]
    && peek(offset + 1)[0] == TOKEN_COLON;
}

// Returns true if `node` holds at least one word or phrase of any kind
// (plain, '+'-prefixed or '-'-prefixed).
static int textNodeHasTerms(ParseNode node) {
  return sizeof(node->words)     || sizeof(node->phrases)
      || sizeof(node->plusWords) || sizeof(node->plusPhrases)
      || sizeof(node->minusWords) || sizeof(node->minusPhrases);
}

// Discards tokens until the current one is a text token or the end token.
// ... ????????? or something smarter ?????
static void skipToTextOrEnd() {
  while (!(< TOKEN_TEXT, TOKEN_END >) [ peek()[0] ])
    advance();
}

// Creates a parser. `opt` is used as the option mapping when given,
// otherwise ([ "implicit" : "or" ]) is used. If no "fields" entry is
// present, one is filled in from getDefaultFields(); note that this writes
// into the mapping passed by the caller.
static void create(mapping(string:mixed)|void opt) {
  options = opt || ([ "implicit" : "or" ]);
  if (!options["fields"])
    options["fields"] = getDefaultFields();
}

// Tokenizes the query string `q` and parses it according to the grammar
// above, starting in the field "any". Returns the root of the parse tree.
ParseNode parse(string q) {
  fieldstack = ({ "any" });
  tokens = tokenize(q);
  return parseQuery();
}

// query : query 'or' expr0 | expr0
// A single alternative is returned as is, without a wrapping OrNode.
static ParseNode parseQuery() {
  //  TRACE;
  ParseNode or = OrNode();
  for (;;) {
    ParseNode n = parseExpr0();
    or->addChild(n);
    if (peek()[0] == TOKEN_OR)
      advance();
    else
      break;
  }
  if (sizeof(or->children) == 1)
    return or->children[0];
  return or;
}

// expr0 : expr0 expr1 | expr0 'and' expr1 | expr1
// Anything that is not an end token, ')' or 'or' after an operand is
// treated as the start of another, implicitly AND:ed, operand.
static ParseNode parseExpr0() {
  //  TRACE;
  ParseNode and = AndNode();
  for (;;) {
    ParseNode n = parseExpr1();
    and->addChild(n);
    if (peek()[0] == TOKEN_AND)
      advance();
    else if ((< TOKEN_END, TOKEN_RPAREN, TOKEN_OR >)[ peek()[0] ])
      break;
    // implicit AND
  }
  if (sizeof(and->children) == 1)
    return and->children[0];
  return and;
}

// expr1 : expr2
static ParseNode parseExpr1() {
  //  TRACE;
  return parseExpr2();
}

// expr2 : expr3 | field ':' expr3 | 'date' ':' date | '(' query ')'
static ParseNode parseExpr2() {
  //  TRACE;

  // field ':' expr3
  if (lookingAtFieldStart())
  {
    //  TRACE;
    // The field was recognised case-insensitively, so store the
    // lower-cased name: that is the key that matched options["fields"],
    // and it makes "Date:" select the date parser just like "date:".
    fieldstack = ({ lower_case(peek()[1]) }) + fieldstack;
    advance();    // field name
    advance();    // ':'
    ParseNode n = fieldstack[0] == "date"
      ? parseDate()
      : parseExpr3();
    fieldstack = fieldstack[1 .. ];
    return n;
  }

  // '(' query ')'
  if (peek()[0] == TOKEN_LPAREN) {
    advance();
    ParseNode n = parseQuery();
    // A missing ')' is tolerated.
    if (peek()[0] == TOKEN_RPAREN)
      advance();
    return n;
  }

  return parseExpr3();
}

// expr3 : expr3 'or' expr4 | expr4 | <empty>
// Returns 0 for the empty case, i.e. when a new "field:" starts here.
// NOTE(review): callers hand the result straight to addChild(), which is
// presumably expected to cope with 0 -- confirm in ParseNode.
static ParseNode parseExpr3() {
  //  TRACE;
  if (lookingAtFieldStart())
    return 0;
  ParseNode or = OrNode();
  for (;;) {
    ParseNode n = parseExpr4();
    or->addChild(n);
    if (peek()[0] != TOKEN_OR)
      break;
    // Per the NOTE in the grammar: an 'or' followed by "field:" or "("
    // belongs to the query level, so leave it for parseQuery().
    if (lookingAtFieldStart(1)
        || peek(1)[0] == TOKEN_LPAREN)
      break;
    advance();
  }
  if (sizeof(or->children) == 1)
    return or->children[0];
  return or;
}

// expr4 : expr4 'and' expr5 | expr5
static ParseNode parseExpr4() {
  //  TRACE;
  ParseNode and = AndNode();
  for (;;) {
    ParseNode n = parseExpr5();
    and->addChild(n);
    // NOTE: No implicit and here!
    if (peek()[0] == TOKEN_AND
        && !(lookingAtFieldStart(1)            // it was a higher level AND
             || peek(1)[0] == TOKEN_LPAREN))
      advance();
    else
      break;
  }
  if (sizeof(and->children) == 1)
    return and->children[0];
  return and;
}

// expr5 : expr5 expr6 | expr6
// Collects consecutive words and phrases into one TextNode for the current
// field. A prefixed "field:" term (e.g. after '+' or '-') gets a TextNode
// of its own. Returns the single resulting node, an AndNode when there are
// several, or 0 when nothing was collected.
static ParseNode parseExpr5() {
  //  TRACE;
  ParseNode text = TextNode();
  ParseNode res = AndNode();
  text->field = fieldstack[0];
  for (;;) {
    int prefix = 0;

    if (peek()[0] == TOKEN_MINUS) {
      advance();
      prefix = '-';
    }
    else if (peek()[0] == TOKEN_PLUS) {
      advance();
      prefix = '+';
    }

    // With implicit AND every unprefixed term is required.
    if (!prefix && options["implicit"] == "and")
      prefix = '+';

    skipToTextOrEnd();

    if (lookingAtFieldStart()) {
      // Special case...
      ParseNode tmp = TextNode();
      tmp->field = lower_case(peek()[1]);   // same normalisation as expr2
      advance();    // field name
      advance();    // ':'
      skipToTextOrEnd();
      parseExpr6(prefix, tmp);
      if (textNodeHasTerms(tmp))
        res->addChild(tmp);
    } else {
      parseExpr6(prefix, text);
    }

    // An operator, a parenthesis, a new field or the end of input all
    // belong to a higher level; anything else is another term for us.
    if ( (< TOKEN_END,
            TOKEN_RPAREN,
            TOKEN_AND,
            TOKEN_OR >) [ peek()[0] ]
         || lookingAtFieldStart()
         || (peek()[0] == TOKEN_LPAREN))
      break;    // it was a higher level IMPLICIT AND
  }

  if (textNodeHasTerms(text))
    res->addChild(text);
  if (sizeof(res->children) > 1)
    return res;
  if (sizeof(res->children) == 1)
    return res->children[0];
  return 0;
}
// expr6 : '-' expr7 | '+' expr7 | expr7      (expr7 : word | phrase)
// If the current token is text, consumes it and stores it in `node`:
// text that splits into one word goes into the word list, text that
// splits into several words is stored as one phrase. `prefix` ('+', '-'
// or 0) selects the plus, minus or plain list. Any other token is left
// untouched.
static void parseExpr6(int prefix, TextNode node) {
  //  TRACE;

  if (peek()[0] != TOKEN_TEXT)
    return;

  string text = peek()[1];
  advance();

  array(string) words = Unicode.split_words_and_normalize(text);
  if (!words || !sizeof(words))
    return;   // nothing searchable in this token

  if (sizeof(words) == 1)
    switch (prefix) {
      case '+': node->plusWords += words;  break;
      case '-': node->minusWords += words; break;
      default:  node->words += words;      break;
    }
  else
    switch (prefix) {
      case '+': node->plusPhrases += ({ words });  break;
      case '-': node->minusPhrases += ({ words }); break;
      default:  node->phrases += ({ words });      break;
    }
}

// date : (word <not followed by ':'> | '-' | ':' | <unknown character> )*
// Concatenates the following text, '-' and ':' tokens into the date string
// of a new DateNode, stopping at the start of a new "field:" or at any
// other token type.
static ParseNode parseDate() {
  //  TRACE;
  DateNode n = DateNode();
  n->date = "";
loop:
  for (;;) {
    switch (peek()[0]) {
      case TOKEN_TEXT:
        if (lookingAtFieldStart())
          break loop;   // it's a field specifier
        break;
      case TOKEN_MINUS:
      case TOKEN_COLON:
        break;
      default:
        break loop;
    }
    n->date += peek()[2];   // with spaces preserved!
    advance();
  }
  return n;
}