pike.git
/
lib
/
modules
/
Search.pmod
/
Grammar.pmod
/
Parser.pmod
version
»
Context lines:
10
20
40
80
file
none
3
pike.git/lib/modules/Search.pmod/Grammar.pmod/Parser.pmod:1:
+
inherit .Lexer;
-
+
#include "debug.h"
+
+
class ParseNode {
+
string op = "<node>";
+
array(ParseNode) children = ({});
+
void addChild(ParseNode n) { if (n) children += ({ n }); }
+
+
static string indentArray(array(string) stuff, string indent) {
+
return map(stuff,
+
lambda(string s) {
+
return replace(s, "\n", "\n" + indent);
+
}) * ("\n" + indent);
+
}
+
+
static string printChildren(string indent) {
+
return indentArray(map(children,
+
lambda(ParseNode n) { return n->print(); }
+
),
+
indent);
+
}
+
+
string print() {
+
string indent = " " * (strlen(op) + 2);
+
return sprintf("(%s %s)", op, printChildren(indent));
+
}
+
}
+
+
class AndNode {
+
inherit ParseNode;
+
string op = "and";
+
}
+
+
class OrNode {
+
inherit ParseNode;
+
string op = "or";
+
}
+
+
class DateNode {
+
inherit ParseNode;
+
string op = "date";
+
string date;
+
string print() { return sprintf("(%s %O)", op, date); }
+
}
+
+
class TextNode {
+
inherit ParseNode;
+
string op = "text";
+
string field;
+
array(string) words = ({});
+
array(string) plusWords = ({});
+
array(string) minusWords = ({});
+
array(array(string)) phrases = ({});
+
array(array(string)) plusPhrases = ({});
+
array(array(string)) minusPhrases = ({});
+
string print() {
+
array(string) a = ({ "" });
+
+
foreach (words, string w) a += ({ w });
+
foreach (plusWords, string w) a += ({ "+" + w });
+
foreach (minusWords, string w) a += ({ "-" + w });
+
+
foreach (phrases, array(string) p) a += ({ "\"" + p * " " + "\"" });
+
foreach (plusPhrases, array(string) p) a += ({ "+\"" + p * " " + "\"" });
+
foreach (minusPhrases, array(string) p) a += ({ "-\"" + p * " " + "\"" });
+
return sprintf("(%s %O %s)", op, field,
+
indentArray(a, " " * (strlen(op) + 2)));
+
}
+
}
+
+
static int isFieldSpecWord(string word) {
+
return (<
+
"any",
+
"date",
+
"title",
+
"description",
+
"url",
+
"keywords",
+
>) [lower_case(word)];
+
}
+
+
// Splits a string into its words...
+
// TODO: Fix a better phrase function....
+
static array(string) splitPhrase(string phrase) {
+
return phrase / " " - ({ "" });
+
}
+
+
// AND merge: Can merge all nodes with - or + bef. each thing.
+
// OR merge: Can merge all nodes without any - or +.
+
static array(TextNode) mergeTextNodes(array(TextNode) a, string op) {
+
array(TextNode) result = ({});
+
mapping(string:array(TextNode)) fields = ([]);
+
foreach (a, TextNode t)
+
fields[t->field] = (fields[t->field] || ({ })) + ({ t });
+
+
// Only merge nodes in the same field
+
foreach (indices(fields), string field) {
+
array(TextNode) unMerged = ({});
+
TextNode merged = 0;
+
foreach (fields[field], TextNode t) {
+
int canMerge = 0;
+
if (op == "and")
+
canMerge = (sizeof(t->words) == 0
+
&& sizeof(t->phrases) == 0);
+
else if (op == "or")
+
canMerge = (sizeof(t->plusWords) == 0
+
&& sizeof(t->plusPhrases) == 0
+
&& sizeof(t->minusWords) == 0
+
&& sizeof(t->minusPhrases) == 0);
+
if (canMerge) {
+
merged = merged || TextNode();
+
merged->field = field;
+
merged->words += t->words;
+
merged->plusWords += t->plusWords;
+
merged->minusWords += t->minusWords;
+
merged->phrases += t->phrases;
+
merged->plusPhrases += t->plusPhrases;
+
merged->minusPhrases += t->minusPhrases;
+
}
+
else
+
unMerged += ({ t });
+
}
+
result += unMerged;
+
if (merged)
+
result += ({ merged });
+
}
+
return result;
+
}
+
+
public ParseNode optimize(ParseNode node, string|void parentOp) {
+
if (!node)
+
return 0;
+
node->children = filter(map(node->children, optimize, node->op),
+
lambda(ParseNode n) {
+
return n != 0;
+
});
+
array(ParseNode) newChildren = 0;
+
switch (node->op) {
+
case "and":
+
if (!sizeof(node->children))
+
return 0;
+
newChildren = ({});
+
// Check if we can merge TextNodes with the same field
+
{
+
array(TextNode) toMerge = ({});
+
foreach (node->children, ParseNode child) {
+
if (child->op == "and")
+
newChildren += child->children;
+
else if (child->op == "text")
+
toMerge += ({ child });
+
else
+
newChildren += ({ child });
+
}
+
newChildren += mergeTextNodes(toMerge, "and");
+
}
+
break;
+
case "or":
+
if (!sizeof(node->children))
+
return 0;
+
newChildren = ({});
+
+
{
+
array(TextNode) toMerge = ({});
+
foreach (node->children, ParseNode child) {
+
if (child->op == "or")
+
newChildren += child->children;
+
else if (child->op == "text")
+
toMerge += ({ child });
+
else
+
newChildren += ({ child });
+
}
+
newChildren += mergeTextNodes(toMerge, "or");
+
}
+
break;
+
case "date":
+
if (!node->date || node->date == "")
+
return 0;
+
break;
+
}
+
if (newChildren)
+
node->children = newChildren;
+
return node;
+
}
+
+
static void v(ParseNode node, ParseNode parent) {
+
map(node->children, v, node);
+
switch (node->op) {
+
case "date":
+
if (!parent || parent->op != "and")
+
throw ("date must restrict query");
+
break;
+
case "and":
+
break;
+
case "text":
+
if (node->minusWords || node->minusPhrases) {
+
if (!sizeof(node->plusWords)
+
&& !sizeof(node->plusPhrases)
+
&& !sizeof(node->words)
+
&& !sizeof(node->phrases))
+
throw ("negative query not allowed");
+
}
+
}
+
}
+
+
public string validate(ParseNode node) {
+
mixed err = catch (v(node, 0));
+
if (err)
+
if (stringp(err))
+
return err;
+
else
+
throw (err);
+
return 0;
+
}
+
+
static void lowlevel(string s, mixed ... args) {
+
werror(s, @args);
+
werror("\n");
+
}
+
+
public void execute(ParseNode q) {
+
switch (q->op) {
+
case "and":
+
{
+
int first = 1;
+
foreach (q->children, ParseNode child)
+
if (child->op != "date") {
+
execute(child);
+
if (!first)
+
lowlevel("AND");
+
else
+
first = 0;
+
}
+
foreach (q->children, ParseNode child)
+
if (child->op == "date")
+
execute(child);
+
}
+
break;
+
case "or":
+
int first = 1;
+
foreach (q->children, ParseNode child) {
+
execute(child);
+
if (!first)
+
lowlevel("OR");
+
else
+
first = 0;
+
}
+
break;
+
case "date":
+
lowlevel("DATE_FILTER %O", q->date);
+
break;
+
case "text":
+
{
+
int hasPlus = sizeof(q->plusWords) || sizeof(q->plusPhrases);
+
int hasOrdinary = sizeof(q->words) || sizeof(q->phrases);
+
int hasMinus = sizeof(q->minusWords) || sizeof(q->minusPhrases);
+
if (hasPlus) {
+
int first = 1;
+
if (sizeof(q->plusWords)) {
+
lowlevel("QUERY_AND field:%O %{ %O%}", q->field, q->plusWords);
+
first = 0;
+
}
+
foreach (q->plusPhrases, array(string) ph) {
+
lowlevel("QUERY_PHRASE field:%O %{ %O%}", q->field, ph);
+
if (first)
+
first = 0;
+
else
+
lowlevel("AND");
+
}
+
}
+
if (hasOrdinary) {
+
int first = 1;
+
if (sizeof(q->words)) {
+
lowlevel("QUERY_OR field:%O %{ %O%}", q->field, q->words);
+
first = 0;
+
}
+
foreach (q->phrases, array(string) ph) {
+
lowlevel("QUERY_PHRASE field:%O %{ %O%}", q->field, ph);
+
if (first)
+
first = 0;
+
else
+
lowlevel("OR");
+
}
+
}
+
+
if (hasPlus && hasOrdinary)
+
lowlevel("UPRANK"); // the XXX operation :)
+
+
if (hasMinus) {
+
int first = 1;
+
if (sizeof(q->minusWords)) {
+
lowlevel("QUERY_OR field:%O %{ %O%}", q->field, q->minusWords);
+
first = 0;
+
}
+
foreach (q->minusPhrases, array(string) ph) {
+
lowlevel("QUERY_PHRASE field:%O %{ %O%}", q->field, ph);
+
if (first)
+
first = 0;
+
else
+
lowlevel("OR");
+
}
+
lowlevel("SUB"); // Only - words are not allowed
+
}
+
+
break;
+
}
+
} // switch (q->op)
+
}
+
+
// =========================================================================
+
// GRAMMAR FOR IMPLICIT <anything>
+
// =========================================================================
+
//
+
// START : query
+
// ;
+
//
+
// query : query 'or' expr0
+
// | expr0
+
// ;
+
//
+
// expr0 : expr0 expr1 // imlicit AND
+
// | expr0 'and' expr1 // (on this level)
+
// | expr1
+
// ;
+
//
+
// expr1 : expr2
+
// ;
+
//
+
// expr2 : expr3
+
// | field ':' expr3
+
// | 'date' ':' date
+
// | '(' query ')'
+
// ;
+
//
+
// date : (word <not followed by ':'>
+
// | '-' | ':' | <unknown character> )*
+
// ;
+
//
+
// NOTE: when looking for an operator here (expr3 - expr5), we have to check
+
// that it is not followed by a "field:", or "(".
+
//
+
// expr3 : expr3 'or' expr4
+
// | expr4
+
// | <empty>
+
// ;
+
//
+
// expr4 : expr4 'and' expr5
+
// | expr5
+
// ;
+
//
+
// expr5 : expr5 expr6
+
// | expr6
+
// ;
+
//
+
// expr6 : '-' expr7
+
// | '+' expr7
+
// | expr7
+
// ;
+
//
+
// expr7 : word
+
// | phrase
+
// ;
+
+
public class Parser {
+
+
static array(array(Token|string)) tokens;
+
static array(string) fieldstack;
+
mapping(string:string) options;
+
+
static array(Token|string) peek(void|int lookahead) {
+
if (lookahead >= sizeof(tokens))
+
lookahead = sizeof(tokens) - 1;
+
return tokens[lookahead];
+
}
+
+
static void advance() {
+
if (sizeof(tokens) > 1)
+
tokens = tokens[1 .. ];
+
}
+
+
static void create(mapping(string:string)|void opt) {
+
options = opt || ([]);
+
}
+
+
ParseNode parse(string q) {
+
fieldstack = ({ "any" });
+
tokens = tokenize(q);
+
return parseQuery();
+
}
+
+
static ParseNode parseQuery() {
+
ParseNode or = OrNode();
+
for (;;) {
+
ParseNode n = parseExpr0();
+
or->addChild(n);
+
if (peek()[0] == TOKEN_OR)
+
advance();
+
else
+
break;
+
}
+
if (sizeof(or->children) == 1)
+
return or->children[0];
+
return or;
+
}
+
+
static ParseNode parseExpr0() {
+
ParseNode and = AndNode();
+
for (;;) {
+
ParseNode n = parseExpr1();
+
and->addChild(n);
+
if (peek()[0] == TOKEN_AND)
+
advance();
+
else if ((< TOKEN_END,
+
TOKEN_RPAREN,
+
TOKEN_OR >)[ peek()[0] ])
+
break;
+
// implicit AND
+
}
+
if (sizeof(and->children) == 1)
+
return and->children[0];
+
return and;
+
}
+
+
static ParseNode parseExpr1() {
+
return parseExpr2();
+
}
+
+
static ParseNode parseExpr2() {
+
+
// field ':' expr3
+
if (peek()[0] == TOKEN_WORD
+
&& isFieldSpecWord(peek()[1])
+
&& peek(1)[0] == TOKEN_COLON)
+
{
+
fieldstack = ({ peek()[1] }) + fieldstack;
+
advance();
+
advance();
+
ParseNode n = fieldstack[0] == "date"
+
? parseDate()
+
: parseExpr3();
+
fieldstack = fieldstack[1 .. ];
+
return n;
+
}
+
+
// '(' query ')'
+
if (peek()[0] == TOKEN_LPAREN) {
+
advance();
+
ParseNode n = parseQuery();
+
if (peek()[0] == TOKEN_RPAREN)
+
advance();
+
return n;
+
}
+
return parseExpr3();
+
}
+
+
static ParseNode parseExpr3() {
+
if (peek()[0] == TOKEN_WORD
+
&& peek(1)[0] == TOKEN_COLON)
+
return 0;
+
ParseNode or = OrNode();
+
for (;;) {
+
ParseNode n = parseExpr4();
+
or->addChild(n);
+
if (peek()[0] == TOKEN_OR)
+
if (peek(1)[0] == TOKEN_WORD
+
&& peek(2)[0] == TOKEN_COLON)
+
break; // it was a higher level OR
+
else
+
advance();
+
else
+
break;
+
}
+
if (sizeof(or->children) == 1)
+
return or->children[0];
+
return or;
+
}
+
+
static ParseNode parseExpr4() {
+
ParseNode and = AndNode();
+
for (;;) {
+
ParseNode n = parseExpr5();
+
and->addChild(n);
+
// NOTE: No implicit and here!
+
if (peek()[0] == TOKEN_AND
+
&& !(peek(1)[0] == TOKEN_WORD
+
&& peek(2)[0] == TOKEN_COLON // it was a higher level AND
+
|| peek(1)[0] == TOKEN_LPAREN))
+
advance();
+
else
+
break;
+
}
+
if (sizeof(and->children) == 1)
+
return and->children[0];
+
return and;
+
}
+
+
static ParseNode parseExpr5() {
+
ParseNode text = TextNode();
+
text->field = fieldstack[0];
+
for (;;) {
+
parseExpr6(text);
+
if ( (< TOKEN_END,
+
TOKEN_RPAREN,
+
TOKEN_AND,
+
TOKEN_OR >) [ peek()[0] ]
+
|| (peek()[0] == TOKEN_WORD
+
&& peek(1)[0] == TOKEN_COLON)
+
|| (peek()[0] == TOKEN_LPAREN))
+
break; // it was a higher level IMPLICIT AND
+
if (peek()[0] == TOKEN_OR)
+
if (peek(1)[0] == TOKEN_WORD
+
&& peek(2)[0] == TOKEN_COLON
+
|| peek(1)[0] == TOKEN_LPAREN)
+
break; // it was a higher level OR
+
else
+
advance();
+
}
+
if (sizeof(text->words)
+
|| sizeof(text->phrases)
+
|| sizeof(text->plusWords)
+
|| sizeof(text->plusPhrases)
+
|| sizeof(text->minusWords)
+
|| sizeof(text->minusPhrases))
+
return text;
+
return 0;
+
}
+
+
static void parseExpr6(TextNode node) {
+
int prefix = 0;
+
+
if (peek()[0] == TOKEN_MINUS) {
+
advance();
+
prefix = '-';
+
}
+
else if (peek()[0] == TOKEN_PLUS) {
+
advance();
+
prefix = '+';
+
}
+
+
if (!prefix && options["implicit"] == "and")
+
prefix = '+';
+
+
while (!(< TOKEN_PHRASE,
+
TOKEN_WORD,
+
TOKEN_END >) [ peek()[0] ])
+
advance(); // ... ????????? or something smarter ?????
+
+
if (peek()[0] == TOKEN_PHRASE
+
|| peek()[0] == TOKEN_WORD) {
+
string phrase = peek()[1];
+
advance();
+
array(string) words = splitPhrase(phrase);
+
if (!words || !sizeof(words))
+
return;
+
if (sizeof(words) == 1)
+
switch (prefix) {
+
case '+': node->plusWords += words; break;
+
case '-': node->minusWords += words; break;
+
default: node->words += words; break;
+
}
+
else if (sizeof(words) > 1)
+
switch (prefix) {
+
case '+': node->plusPhrases += ({ words }); break;
+
case '-': node->minusPhrases += ({ words }); break;
+
default: node->phrases += ({ words }); break;
+
}
+
}
+
}
+
+
static ParseNode parseDate() {
+
DateNode n = DateNode();
+
n->date = "";
+
loop:
+
for (;;) {
+
switch (peek()[0]) {
+
case TOKEN_WORD:
+
if (isFieldSpecWord(peek()[1])
+
&& peek(1)[0] == TOKEN_COLON)
+
break loop; // it's a field specifier
+
break;
+
case TOKEN_UNKNOWN:
+
case TOKEN_MINUS:
+
case TOKEN_COLON:
+
break;
+
default:
+
break loop;
+
}
+
n->date += peek()[2]; // with spaces preserved!
+
advance();
+
}
+
return n;
+
}
+
+
} // class Parser
Newline at end of file added.