eb01b42010-10-26Martin Stjernholm #pike __REAL_VERSION__
0844382001-06-10Per Hedbor 
ff17962014-08-15Martin Nilsson protected inherit .AbstractParser; protected inherit .Lexer;
40a44d2004-08-07Johan Schön import ".";
070ae42001-06-01David Norlin 
6ec0572001-05-31David Norlin #include "debug.h" // =========================================================================
40a44d2004-08-07Johan Schön // GRAMMAR FOR IMPLICIT AND/OR
6ec0572001-05-31David Norlin // ========================================================================= // // START : query // ;
3524712015-05-26Martin Nilsson //
6ec0572001-05-31David Norlin // query : query 'or' expr0 // | expr0 // ;
3524712015-05-26Martin Nilsson //
6ec0572001-05-31David Norlin // expr0 : expr0 expr1 // implicit AND // | expr0 'and' expr1 // (on this level) // | expr1 // ; // // expr1 : expr2 // ;
3524712015-05-26Martin Nilsson //
6ec0572001-05-31David Norlin // expr2 : expr3 // | field ':' expr3
40a44d2004-08-07Johan Schön // | 'date' '>' date // | 'date' '<' date // | 'date' '=' date // | 'date' '!=' date // | 'date' '<>' date
6ec0572001-05-31David Norlin // | '(' query ')' // ;
40a44d2004-08-07Johan Schön //
6ec0572001-05-31David Norlin // date : (word <not followed by ':'> // | '-' | ':' | <unknown character> )* // ; // // NOTE: when looking for an operator here (expr3 - expr5), we have to check // that it is not followed by a "field:", or "(".
3524712015-05-26Martin Nilsson //
6ec0572001-05-31David Norlin // expr3 : expr3 'or' expr4 // | expr4 // | <empty> // ; // // expr4 : expr4 'and' expr5 // | expr5 // ;
3524712015-05-26Martin Nilsson //
6ec0572001-05-31David Norlin // expr5 : expr5 expr6 // | expr6 // ; // // expr6 : '-' expr7 // | '+' expr7 // | expr7 // ;
3524712015-05-26Martin Nilsson //
6ec0572001-05-31David Norlin // expr7 : word // | phrase // ;
ff17962014-08-15Martin Nilsson protected array(array(Token|string)) tokens; protected array(string) fieldstack;
40a44d2004-08-07Johan Schön  // fields : multiset(string) // implicit : "or"/"and" // auto-glob : when true, a single word without "*" or "?" is wrapped as "*word*" //!
070ae42001-06-01David Norlin mapping(string:mixed) options;
6ec0572001-05-31David Norlin 
// Returns the token located lookahead positions ahead of the current
// one, without consuming anything. A lookahead at or beyond the end of
// the token list is clamped to the last token.
protected array(Token|string) peek(void|int lookahead)
{
  int last = sizeof(tokens) - 1;
  return tokens[lookahead > last ? last : lookahead];
}
// Returns the current token and moves on to the next one. When only one
// token remains it is returned but never removed, so the token list
// does not become empty through this function.
protected array advance()
{
  array head = tokens[0];
  if (sizeof(tokens) <= 1)
    return head;
  tokens = tokens[1 .. ];
  return head;
}
// Tells whether the tokens starting at offset form the beginning of a
// field specifier: a text token whose lower-cased text is a member of
// options["fields"], directly followed by a colon token.
protected int lookingAtFieldStart(void|int offset)
{
  multiset(string) known_fields = options["fields"];
  // SHOW(tokens);
  array(Token|string) name = peek(offset);
  if (name[0] != TOKEN_TEXT)
    return 0;
  if (!known_fields[ lower_case(name[1]) ])
    return 0;
  return peek(offset + 1)[0] == TOKEN_COLON;
}
// Tells whether the tokens starting at offset form the beginning of a
// date expression: the word "date" (in any letter case) directly
// followed by one of the comparison operator tokens.
protected int lookingAtDateStart(void|int offset)
{
  // SHOW(tokens);
  array(Token|string) keyword = peek(offset);
  if (keyword[0] != TOKEN_TEXT)
    return 0;
  if (lower_case(keyword[1]) != "date")
    return 0;
  multiset comparison_ops = (< TOKEN_EQUAL,
                               TOKEN_LESSEQUAL,
                               TOKEN_GREATEREQUAL,
                               TOKEN_NOTEQUAL,
                               TOKEN_LESS,
                               TOKEN_GREATER >);
  return comparison_ops[ peek(offset + 1)[0] ];
}

//!
//! @param opt
//!   Parser options. When omitted, a mapping with @expr{"implicit"@} set
//!   to @expr{"or"@} is used. The given mapping is stored as is (not
//!   copied), and a @expr{"fields"@} entry is added to it from
//!   getDefaultFields() when it has none.
protected void create(mapping(string:mixed)|void opt)
{
  if (opt)
    options = opt;
  else
    options = ([ "implicit" : "or" ]);

  if (!options["fields"])
    options["fields"] = getDefaultFields();
}
40a44d2004-08-07Johan Schön //!
//! Tokenizes the query string @[q] and parses the resulting tokens.
//!
//! The field stack is reset to the single field @expr{"any"@} before
//! parsing starts.
//!
//! @returns
//!   The result of parsing the whole token list as a query.
ParseNode parse(string q)
{
  fieldstack = ({ "any" });
  tokens = tokenize(q);
  ParseNode root = parseQuery();
  return root;
}
// query : query 'or' expr0 | expr0
//
// Collects expr0 results under an OrNode. Parsing continues after an
// explicit 'or' token, or - when options->implicit is "or" - as long as
// the next token is neither the end token nor a right parenthesis.
// An OrNode with exactly one child is replaced by that child.
protected ParseNode parseQuery()
{
  // TRACE;
  ParseNode alternatives = OrNode();
  while (1) {
    alternatives->addChild(parseExpr0());

    if (peek()[0] == TOKEN_OR) {
      advance();
      continue;
    }
    // No explicit 'or': stop at the end of the (sub)query, or when
    // adjacent expressions are not implicitly or:ed together.
    if ((< TOKEN_END, TOKEN_RPAREN >)[ peek()[0] ])
      break;
    if (options->implicit != "or")
      break;
  }

  if (sizeof(alternatives->children) != 1)
    return alternatives;
  return alternatives->children[0];
}
// expr0 : expr0 expr1 | expr0 'and' expr1 | expr1
//
// Collects expr1 results under an AndNode. Parsing continues after an
// explicit 'and' token, or - when options->implicit is "and" - as long
// as the next token is not the end token, a right parenthesis or 'or'.
// An AndNode with exactly one child is replaced by that child.
protected ParseNode parseExpr0()
{
  // TRACE;
  ParseNode conjunction = AndNode();
  while (1) {
    conjunction->addChild(parseExpr1());

    if (peek()[0] == TOKEN_AND) {
      advance();
      continue;
    }
    // No explicit 'and': stop at tokens that belong to an enclosing
    // level, or when adjacent expressions are not implicitly and:ed.
    if ((< TOKEN_END, TOKEN_RPAREN, TOKEN_OR >)[ peek()[0] ])
      break;
    if (options->implicit != "and")
      break;
    // implicit AND
  }

  if (sizeof(conjunction->children) != 1)
    return conjunction;
  return conjunction->children[0];
}
// expr1 : expr2
//
// Pure pass-through level of the grammar.
protected ParseNode parseExpr1()
{
  // TRACE;
  ParseNode node = parseExpr2();
  return node;
}
// expr2 : field ':' expr3
//       | 'date' <op> date
//       | '(' query ')'
//       | expr3
protected ParseNode parseExpr2()
{
  // TRACE;

  // field ':' expr3
  if (lookingAtFieldStart()) {
    // TRACE;
    array field_token = advance();  // the field name
    advance();                      // the ':'
    // The field applies to everything parsed by the nested expr3 and is
    // popped again afterwards.
    fieldstack = ({ field_token[1] }) + fieldstack;
    ParseNode scoped = parseExpr3();
    fieldstack = fieldstack[1 .. ];
    return scoped;
  }

  // 'date' <op> date
  if (lookingAtDateStart()) {
    advance();                      // the word "date"
    array operator = advance();     // the comparison operator token
    return parseDate(operator);
  }

  // Anything that is not a parenthesized query is a plain expr3.
  if (peek()[0] != TOKEN_LPAREN)
    return parseExpr3();

  // '(' query ')'
  advance();
  ParseNode grouped = parseQuery();
  // A missing closing parenthesis is tolerated.
  if (peek()[0] == TOKEN_RPAREN)
    advance();
  return grouped;
}
// expr3 : expr3 'or' expr4 | expr4 | <empty>
//
// Returns 0 (the empty alternative) when positioned at the start of a
// field specifier or a date expression. Otherwise collects expr4
// results under an OrNode; an OrNode with exactly one child is replaced
// by that child.
protected ParseNode parseExpr3()
{
  // TRACE;
  if (lookingAtFieldStart() || lookingAtDateStart())
    return 0;

  ParseNode or = OrNode();
  for (;;) {
    ParseNode n = parseExpr4();
    or->addChild(n);

    if (peek()[0] != TOKEN_OR)
      break;

    // An 'or' followed by "field:", a date expression or "(" belongs to
    // a higher level and must be left for the caller. The "(" case
    // matters because parseExpr5() skips non-text tokens: consuming the
    // 'or' here would silently drop the parenthesis and leave its ")"
    // to terminate the surrounding query prematurely.
    if (lookingAtFieldStart(1)
        || lookingAtDateStart(1)
        || peek(1)[0] == TOKEN_LPAREN)
      break;

    advance();
  }

  if (sizeof(or->children) == 1)
    return or->children[0];
  return or;
}
// expr4 : expr4 'and' expr5 | expr5
//
// Collects expr5 results under an AndNode. Only explicit 'and' tokens
// are handled here. An AndNode with exactly one child is replaced by
// that child.
protected ParseNode parseExpr4()
{
  // TRACE;
  ParseNode conjunction = AndNode();
  while (1) {
    conjunction->addChild(parseExpr5());

    // NOTE: No implicit and here!
    if (peek()[0] != TOKEN_AND)
      break;

    // An 'and' followed by "field:", a date expression or "(" is a
    // higher level AND and is left for the caller.
    if (lookingAtFieldStart(1)
        || lookingAtDateStart(1)
        || peek(1)[0] == TOKEN_LPAREN)
      break;

    advance();
  }

  if (sizeof(conjunction->children) != 1)
    return conjunction;
  return conjunction->children[0];
}
// expr5 : expr5 expr6 | expr6
//
// Gathers a run of adjacent, optionally '+'/'-' prefixed, words and
// phrases into a TextNode for the field on top of the field stack.
// A prefixed "field:" term inside the run gets a TextNode of its own.
//
// Returns 0 when nothing was collected, the single node when only one
// node was collected, and otherwise an OrNode (options->implicit is
// "or") or AndNode (any other setting) holding the collected nodes.
protected ParseNode parseExpr5()
{
  // TRACE;
  ParseNode text = TextNode();
  ParseNode res;

  text->field = fieldstack[0];

  if (options->implicit == "or") {
    res = OrNode();
  } else {
    res = AndNode();
  }

  for (;;) {
    int prefix = 0;

    if (peek()[0] == TOKEN_MINUS) {
      advance();
      prefix = '-';
    }
    else if (peek()[0] == TOKEN_PLUS) {
      advance();
      prefix = '+';
    }

    // With implicit AND every unprefixed term is treated as required.
    if (!prefix && options["implicit"] == "and")
      prefix = '+';

    // Skip everything that cannot start a term.
    while (!(< TOKEN_TEXT, TOKEN_END >) [ peek()[0] ])
      advance();   // ... ?????????  or something smarter ?????

    if (lookingAtFieldStart()) {
      // Special case: a field specifier reached after a prefix or after
      // skipped tokens. Its term is collected in a separate TextNode
      // bound to that field.
      ParseNode tmp = TextNode();
      tmp->field = peek()[1];
      advance();    // the field name
      advance();    // the ':'
      while (!(< TOKEN_TEXT, TOKEN_END >) [ peek()[0] ])
        advance();   // ... ?????????  or something smarter ?????
      parseExpr6(prefix, tmp);
      // Only keep the node if parseExpr6() actually put something in it.
      if (sizeof(tmp->words)
          || sizeof(tmp->phrases)
          || sizeof(tmp->plusWords)
          || sizeof(tmp->plusPhrases)
          || sizeof(tmp->minusWords)
          || sizeof(tmp->minusPhrases)) {
        res->addChild(tmp);
      }
    } else {
      parseExpr6(prefix, text);
    }

    // Stop at anything that belongs to a higher level. 'or' is part of
    // this set, so no separate handling of TOKEN_OR is needed (or
    // reachable) after this test.
    if ( (< TOKEN_END, TOKEN_RPAREN, TOKEN_AND, TOKEN_OR >) [ peek()[0] ]
         || lookingAtFieldStart()
         || lookingAtDateStart()
         || (peek()[0] == TOKEN_LPAREN))
      break;  // it was a higher level IMPLICIT AND
  }

  // The node for the current field is added last, after any
  // field-specific nodes created in the loop.
  if (sizeof(text->words)
      || sizeof(text->phrases)
      || sizeof(text->plusWords)
      || sizeof(text->plusPhrases)
      || sizeof(text->minusWords)
      || sizeof(text->minusPhrases))
    res->addChild(text);

  if (sizeof(res->children) > 1)
    return res;
  if (sizeof(res->children) == 1)
    return res->children[0];
  return 0;
}
// expr6 : '-' expr7 | '+' expr7 | expr7     (prefix already consumed)
// expr7 : word | phrase
//
// If the current token is a text token it is consumed, split into
// normalized words and stored in node: a single word goes to words,
// plusWords or minusWords, several words are stored as one phrase in
// phrases, plusPhrases or minusPhrases. prefix is '+', '-' or 0 and
// selects the list. Does nothing when the current token is not text.
protected void parseExpr6(int prefix, TextNode node)
{
  // TRACE;

  if (peek()[0] != TOKEN_TEXT)
    return;

  string text = advance()[1];

  // Kludge: the glob characters are swapped for digit strings before
  // the text is split and normalized, and swapped back in every
  // resulting word afterwards.
  string star = "86196759014593256";
  string questionmark = "76196758925470133";
  text = replace(text, ({ "*", "?" }), ({ star, questionmark }));
  array(string) words = Unicode.split_words_and_normalize(text);
  foreach (words; int i; string w)
    words[i] = replace(w, ({ star, questionmark }), ({ "*", "?" }));
  // End of abominable kludge

  // NOTE(review): this guard comes after words has already been
  // iterated over above.
  if (!words)
    return;

  // If search phrase, remove empty globs. This might promote to an
  // ordinary search word that does support remaining globs.
  if (sizeof(words) > 1)
    words = filter(words,
                   lambda(string w) { return (w - "*" - "?") != ""; });

  if (!sizeof(words))
    return;

  if (sizeof(words) == 1) {
    // If auto-globbing has been requested we take a word not using any
    // glob characters and wrap it in "*" + word + "*".
    string word = words[0];
    if (options["auto-glob"]
        && !has_value(word, "*")
        && !has_value(word, "?"))
      words[0] = "*" + word + "*";

    if (prefix == '+')
      node->plusWords += words;
    else if (prefix == '-')
      node->minusWords += words;
    else
      node->words += words;
    return;
  }

  // More than one word: a phrase. No use of globs at this point so
  // remove them. Auto-globbing isn't used in this case either.
  words = map(words, lambda(string w) { return w - "*" - "?"; });

  if (prefix == '+')
    node->plusPhrases += ({ words });
  else if (prefix == '-')
    node->minusPhrases += ({ words });
  else
    node->phrases += ({ words });
}
// date : (word <not followed by ':'> | '-' | ':')*
//
// Builds a DateNode for a date comparison. operator is the comparison
// operator token that followed the word "date". The date string is the
// concatenation of element 2 of each consumed token (the token text
// with spaces preserved). Text, minus and colon tokens are consumed
// until any other token is found, or until a text token starts a field
// specifier or another date expression.
protected ParseNode parseDate(array operator)
{
  // TRACE;
  DateNode n = DateNode();
  n->date = "";
  n->operator = operator;

loop:
  for (;;) {
    switch (peek()[0]) {
      case TOKEN_TEXT:
        // A "field:" or a new "date <op>" is not part of this date;
        // leave it for the caller so that e.g. "date>X date<Y" yields
        // two date expressions instead of swallowing the second "date".
        if (lookingAtFieldStart() || lookingAtDateStart())
          break loop;
        break;
      case TOKEN_MINUS:
      case TOKEN_COLON:
        break;
      default:
        break loop;
    }
    n->date += peek()[2];   // with spaces preserved!
    advance();
  }

  return n;
}