inherit Search.Grammar.Lexer; |
|
#include "debug.h" |
|
//! Base class for all nodes of the parsed query tree.
//!
//! A node carries an operator name in @[op] and an ordered list of
//! child nodes. Subclasses override @[op] and, where needed, @[print].
class ParseNode {
  //! Operator name used when printing and dispatching on the node.
  string op = "<node>";

  //! Child nodes, in the order they were added.
  array(ParseNode) children = ({});

  //! Appends @[n] to @[children]. A zero @[n] is silently ignored.
  void addChild(ParseNode n) {
    if (!n)
      return;
    children += ({ n });
  }

  //! Joins @[stuff] with newline + @[indent], and also inserts
  //! @[indent] after every newline embedded in the individual strings,
  //! so that multi-line entries stay aligned.
  static string indentArray(array(string) stuff, string indent) {
    string separator = "\n" + indent;
    array(string) shifted = ({});
    foreach (stuff, string s)
      shifted += ({ replace(s, "\n", separator) });
    return shifted * separator;
  }

  //! Prints every child and lays the results out using @[indent].
  static string printChildren(string indent) {
    array(string) printed = ({});
    foreach (children, ParseNode n)
      printed += ({ n->print() });
    return indentArray(printed, indent);
  }

  //! Returns an s-expression style rendering "(op child child ...)".
  //! Continuation lines are indented to line up with the first child.
  string print() {
    int width = strlen(op) + 2;
    return sprintf("(%s %s)", op, printChildren(" " * width));
  }
}
|
//! Parse tree node whose operator name is "and".
class AndNode {
  inherit ParseNode;

  //! Operator name; overrides the placeholder in ParseNode.
  string op = "and";
}
|
//! Parse tree node whose operator name is "or".
class OrNode {
  inherit ParseNode;

  //! Operator name; overrides the placeholder in ParseNode.
  string op = "or";
}
|
//! Parse tree node holding a date expression as raw text.
class DateNode {
  inherit ParseNode;

  //! Operator name; overrides the placeholder in ParseNode.
  string op = "date";

  //! The date expression exactly as it was collected by the parser.
  string date;

  //! Renders the node as "(date <quoted date>)"; children are not
  //! printed.
  string print() {
    string quoted = sprintf("%O", date);
    return "(" + op + " " + quoted + ")";
  }
}
|
//! Parse tree node holding the words and phrases to look for in one
//! field. Terms are kept in six lists: ordinary, "+"-prefixed and
//! "-"-prefixed, each for single words and for multi-word phrases.
class TextNode {
  inherit ParseNode;

  //! Operator name; overrides the placeholder in ParseNode.
  string op = "text";

  //! Name of the field the terms apply to.
  string field;

  array(string) words = ({});
  array(string) plusWords = ({});
  array(string) minusWords = ({});
  array(array(string)) phrases = ({});
  array(array(string)) plusPhrases = ({});
  array(array(string)) minusPhrases = ({});

  //! Renders the node as "(text <quoted field> term term ...)", one
  //! term per line. Phrases are shown inside double quotes and the
  //! "+"/"-" prefixes are reproduced.
  string print() {
    // Turns a phrase (a list of words) into its quoted display form.
    function(array(string):string) quote =
      lambda(array(string) p) { return "\"" + p * " " + "\""; };

    // The leading empty entry puts the first term on a line of its own.
    array(string) items = ({ "" });
    items += words;
    items += map(plusWords, lambda(string w) { return "+" + w; });
    items += map(minusWords, lambda(string w) { return "-" + w; });
    items += map(phrases, quote);
    items += map(plusPhrases,
                 lambda(array(string) p) { return "+" + quote(p); });
    items += map(minusPhrases,
                 lambda(array(string) p) { return "-" + quote(p); });

    string indent = " " * (strlen(op) + 2);
    return sprintf("(%s %O %s)", op, field, indentArray(items, indent));
  }
}
|
//! Tells whether @[word] names one of the fields that may be used as a
//! "field:" prefix in a query. The comparison ignores case.
//!
//! @returns
//!   1 if @[word] is a known field name, 0 otherwise.
static int isFieldSpecWord(string word) {
  multiset(string) fieldNames =
    (< "any", "date", "title", "description", "url", "keywords" >);
  string key = lower_case(word);
  return fieldNames[key];
}
|
|
|
//! Splits @[phrase] into words at space characters. Empty pieces,
//! produced by leading, trailing or repeated spaces, are dropped.
static array(string) splitPhrase(string phrase) {
  array(string) pieces = phrase / " ";
  return pieces - ({ "" });
}
|
|
|
//! Combines text nodes that refer to the same field into a single node
//! where that is possible under the operator @[op].
//!
//! Under "and" a node is merged only if it has no ordinary words or
//! phrases; under "or" only if it has no "+" or "-" words or phrases.
//! For any other @[op] nothing is merged. Nodes that cannot be merged
//! are passed through unchanged.
//!
//! @returns
//!   For each field, its unmergeable nodes in their original order
//!   followed by the merged node, if any.
static array(TextNode) mergeTextNodes(array(TextNode) a, string op) {
  // Group the incoming nodes by field name.
  mapping(string:array(TextNode)) byField = ([]);
  foreach (a, TextNode node) {
    if (!byField[node->field])
      byField[node->field] = ({});
    byField[node->field] += ({ node });
  }

  array(TextNode) result = ({});
  foreach (indices(byField), string field) {
    TextNode combined = 0;
    foreach (byField[field], TextNode node) {
      int mergeable = 0;
      switch (op) {
        case "and":
          mergeable = !sizeof(node->words) && !sizeof(node->phrases);
          break;
        case "or":
          mergeable = !sizeof(node->plusWords)
            && !sizeof(node->plusPhrases)
            && !sizeof(node->minusWords)
            && !sizeof(node->minusPhrases);
          break;
      }

      if (!mergeable) {
        result += ({ node });
        continue;
      }

      // The merged node is created lazily, on the first mergeable node.
      if (!combined)
        combined = TextNode();
      combined->field = field;
      combined->words += node->words;
      combined->plusWords += node->plusWords;
      combined->minusWords += node->minusWords;
      combined->phrases += node->phrases;
      combined->plusPhrases += node->plusPhrases;
      combined->minusPhrases += node->minusPhrases;
    }
    if (combined)
      result += ({ combined });
  }
  return result;
}
|
//! Simplifies the parse tree rooted at @[node], bottom-up and in place.
//!
//! Children are optimized first and those that vanish are dropped.
//! An "and"/"or" node without children vanishes; otherwise children
//! with the same operator are replaced by their own children, and text
//! children are combined with @[mergeTextNodes]. A "date" node with no
//! or empty date vanishes.
//!
//! @param parentOp
//!   Operator of the parent node. Accepted but not used here.
//!
//! @returns
//!   The (modified) @[node], or 0 if the node vanished.
public ParseNode optimize(ParseNode node, string|void parentOp) {
  if (!node)
    return 0;

  // Optimize the subtrees first; keep only the ones that survive.
  array(ParseNode) surviving = ({});
  foreach (node->children, ParseNode child) {
    ParseNode optimized = optimize(child, node->op);
    if (optimized != 0)
      surviving += ({ optimized });
  }
  node->children = surviving;

  switch (node->op) {
    case "and":
    case "or":
      {
        if (!sizeof(node->children))
          return 0;

        array(ParseNode) flattened = ({});
        array(TextNode) texts = ({});
        foreach (node->children, ParseNode child) {
          if (child->op == node->op)
            flattened += child->children;   // lift nested same-op node
          else if (child->op == "text")
            texts += ({ child });
          else
            flattened += ({ child });
        }
        node->children = flattened + mergeTextNodes(texts, node->op);
      }
      break;

    case "date":
      if (!node->date || node->date == "")
        return 0;
      break;
  }
  return node;
}
|
//! Recursively checks the tree rooted at @[node] and throws a plain
//! string describing the first problem found.
//!
//! Children are checked before the node itself. The rules are:
//! a "date" node must be a direct child of an "and" node, and a "text"
//! node that has "-" words or phrases must also have at least one
//! ordinary or "+" word or phrase.
//!
//! @param parent
//!   The parent of @[node], or 0 for the root.
static void v(ParseNode node, ParseNode parent) {
  map(node->children, v, node);
  switch (node->op) {
    case "date":
      if (!parent || parent->op != "and")
        throw ("date must restrict query");
      break;

    case "and":
      break;

    case "text":
      // An empty array is still a true value in Pike, so the sizes
      // have to be tested, not the arrays themselves.
      if (sizeof(node->minusWords) || sizeof(node->minusPhrases)) {
        if (!sizeof(node->plusWords)
            && !sizeof(node->plusPhrases)
            && !sizeof(node->words)
            && !sizeof(node->phrases))
          throw ("negative query not allowed");
      }
      break;
  }
}
|
//! Checks the parse tree rooted at @[node].
//!
//! @returns
//!   0 if no problem was found, otherwise the message string thrown by
//!   the checker. Anything thrown that is not a string is rethrown.
public string validate(ParseNode node) {
  mixed err = catch {
    v(node, 0);
  };

  if (!err)
    return 0;
  if (!stringp(err))
    throw (err);
  return err;
}
|
//! Writes one low-level operation to stderr, followed by a newline.
//! @[s] and @[args] are handed to werror(), i.e. @[s] acts as a format
//! string when arguments are supplied.
static void lowlevel(string s, mixed ... args) {
  werror(s + "\n", @args);
}
|
//! Emits the operations for one group of terms of a text node: first a
//! single @[wordsOp] operation covering all @[words] (if there are
//! any), then one QUERY_PHRASE per entry in @[phrases]. From the second
//! emitted operand on, each one is followed by @[combineOp], so the
//! group is written operands-first.
static void executeTermGroup(string field,
                             array(string) words,
                             array(array(string)) phrases,
                             string wordsOp,
                             string combineOp) {
  int first = 1;
  if (sizeof(words)) {
    lowlevel(wordsOp + " field:%O %{ %O%}", field, words);
    first = 0;
  }
  foreach (phrases, array(string) ph) {
    lowlevel("QUERY_PHRASE field:%O %{ %O%}", field, ph);
    if (first)
      first = 0;
    else
      lowlevel(combineOp);
  }
}

//! Walks the tree rooted at @[q] and emits the corresponding sequence
//! of low-level operations through @[lowlevel], operands before their
//! operator.
//!
//! For "and", the non-date children are emitted and joined with AND,
//! then the date children are emitted after them. For "or", all
//! children are joined with OR. A "date" node emits DATE_FILTER. A
//! "text" node emits its "+" terms (joined with AND), its ordinary
//! terms (joined with OR), UPRANK if both kinds were present, and
//! finally its "-" terms (joined with OR) followed by SUB.
public void execute(ParseNode q) {
  switch (q->op) {
    case "and":
      {
        int first = 1;
        foreach (q->children, ParseNode child)
          if (child->op != "date") {
            execute(child);
            if (!first)
              lowlevel("AND");
            else
              first = 0;
          }
        // Date children are emitted last, after everything else.
        foreach (q->children, ParseNode child)
          if (child->op == "date")
            execute(child);
      }
      break;

    case "or":
      {
        int first = 1;
        foreach (q->children, ParseNode child) {
          execute(child);
          if (!first)
            lowlevel("OR");
          else
            first = 0;
        }
      }
      break;

    case "date":
      lowlevel("DATE_FILTER %O", q->date);
      break;

    case "text":
      {
        int hasPlus = sizeof(q->plusWords) || sizeof(q->plusPhrases);
        int hasOrdinary = sizeof(q->words) || sizeof(q->phrases);
        int hasMinus = sizeof(q->minusWords) || sizeof(q->minusPhrases);

        if (hasPlus)
          executeTermGroup(q->field, q->plusWords, q->plusPhrases,
                           "QUERY_AND", "AND");

        if (hasOrdinary)
          executeTermGroup(q->field, q->words, q->phrases,
                           "QUERY_OR", "OR");

        if (hasPlus && hasOrdinary)
          lowlevel("UPRANK");

        if (hasMinus) {
          executeTermGroup(q->field, q->minusWords, q->minusPhrases,
                           "QUERY_OR", "OR");
          lowlevel("SUB");
        }
      }
      break;
  }
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
//! Recursive-descent parser that turns a query string into a tree of
//! ParseNode objects.
//!
//! The token stream comes from tokenize(); each token is indexed as
//! [0] token type, [1] text used for words and phrases, [2] text that
//! is appended when collecting a date expression.
//! NOTE(review): the exact token layout is defined by the inherited
//! lexer, which is not visible here - confirm against it.
public class Parser {

  //! Tokens not yet consumed. The last token is never removed.
  static array(array(Token|string)) tokens;

  //! Stack of active field names; element 0 is the current field.
  static array(string) fieldstack;

  //! Parser options. Only "implicit" is read here: when it is "and",
  //! terms without an explicit prefix are treated as "+" terms.
  mapping(string:string) options;

  //! Returns the token @[lookahead] positions ahead without consuming
  //! anything. Looking past the end yields the last token.
  static array(Token|string) peek(void|int lookahead) {
    if (lookahead >= sizeof(tokens))
      lookahead = sizeof(tokens) - 1;
    return tokens[lookahead];
  }

  //! Consumes the current token, unless it is the only one left.
  static void advance() {
    if (sizeof(tokens) > 1)
      tokens = tokens[1 .. ];
  }

  //! Tells whether the tokens starting @[offset] positions ahead form a
  //! field prefix, i.e. a word that is a known field name directly
  //! followed by a colon.
  //!
  //! Every production uses this one predicate, so that they all agree
  //! on what a field prefix is. A word followed by a colon that is not
  //! a known field name is treated as ordinary text.
  static int lookingAtFieldStart(void|int offset) {
    return peek(offset)[0] == TOKEN_WORD
      && isFieldSpecWord(peek(offset)[1])
      && peek(offset + 1)[0] == TOKEN_COLON;
  }

  //! @param opt
  //!   Optional parser options, see @[options].
  static void create(mapping(string:string)|void opt) {
    options = opt || ([]);
  }

  //! Parses the query string @[q].
  //!
  //! @returns
  //!   The root of the parse tree, or 0 if the query has no content.
  ParseNode parse(string q) {
    fieldstack = ({ "any" });
    tokens = tokenize(q);
    return parseQuery();
  }

  //! query: expr0 ( OR expr0 )*
  //!
  //! A single alternative is returned as is, without an "or" wrapper.
  static ParseNode parseQuery() {
    ParseNode or = OrNode();
    for (;;) {
      ParseNode n = parseExpr0();
      or->addChild(n);
      if (peek()[0] == TOKEN_OR)
        advance();
      else
        break;
    }
    if (sizeof(or->children) == 1)
      return or->children[0];
    return or;
  }

  //! expr0: expr1 ( [AND] expr1 )*
  //!
  //! The AND token is optional; the sequence ends at the end of input,
  //! at a right parenthesis or at an OR. A single operand is returned
  //! as is, without an "and" wrapper.
  static ParseNode parseExpr0() {
    ParseNode and = AndNode();
    for (;;) {
      ParseNode n = parseExpr1();
      and->addChild(n);
      if (peek()[0] == TOKEN_AND)
        advance();
      else if ((< TOKEN_END,
                  TOKEN_RPAREN,
                  TOKEN_OR >)[ peek()[0] ])
        break;
      // Anything else: implicit AND, just parse the next operand.
    }
    if (sizeof(and->children) == 1)
      return and->children[0];
    return and;
  }

  //! expr1: expr2
  static ParseNode parseExpr1() {
    return parseExpr2();
  }

  //! expr2: field ':' ( date | expr3 )
  //!      | '(' query ')'
  //!      | expr3
  static ParseNode parseExpr2() {
    if (lookingAtFieldStart()) {
      // Field names are matched without regard to case, so the name is
      // stored in lower case; otherwise "Date:" would miss the "date"
      // test below and "Title"/"title" would count as distinct fields.
      fieldstack = ({ lower_case(peek()[1]) }) + fieldstack;
      advance();   // the field name
      advance();   // the colon
      ParseNode n = fieldstack[0] == "date"
        ? parseDate()
        : parseExpr3();
      fieldstack = fieldstack[1 .. ];
      return n;
    }

    if (peek()[0] == TOKEN_LPAREN) {
      advance();
      ParseNode n = parseQuery();
      // A missing right parenthesis is tolerated.
      if (peek()[0] == TOKEN_RPAREN)
        advance();
      return n;
    }
    return parseExpr3();
  }

  //! expr3: expr4 ( OR expr4 )*
  //!
  //! Returns 0 without consuming anything when positioned at a field
  //! prefix. An OR that is followed by a field prefix is left for the
  //! caller.
  static ParseNode parseExpr3() {
    if (lookingAtFieldStart())
      return 0;
    ParseNode or = OrNode();
    for (;;) {
      ParseNode n = parseExpr4();
      or->addChild(n);
      if (peek()[0] != TOKEN_OR)
        break;
      if (lookingAtFieldStart(1))
        break;
      advance();
    }
    if (sizeof(or->children) == 1)
      return or->children[0];
    return or;
  }

  //! expr4: expr5 ( AND expr5 )*
  //!
  //! An AND that is followed by a field prefix or by a left
  //! parenthesis is left for the caller.
  static ParseNode parseExpr4() {
    ParseNode and = AndNode();
    for (;;) {
      ParseNode n = parseExpr5();
      and->addChild(n);

      if (peek()[0] == TOKEN_AND
          && !(lookingAtFieldStart(1)
               || peek(1)[0] == TOKEN_LPAREN))
        advance();
      else
        break;
    }
    if (sizeof(and->children) == 1)
      return and->children[0];
    return and;
  }

  //! expr5: expr6+
  //!
  //! Collects consecutive terms for the current field into one
  //! TextNode. Collection stops at the end of input, at a right or
  //! left parenthesis, at AND or OR, and at a field prefix.
  //!
  //! @returns
  //!   The text node, or 0 if no term was collected.
  static ParseNode parseExpr5() {
    ParseNode text = TextNode();
    text->field = fieldstack[0];
    for (;;) {
      parseExpr6(text);
      if ( (< TOKEN_END,
              TOKEN_RPAREN,
              TOKEN_AND,
              TOKEN_OR >) [ peek()[0] ]
           || lookingAtFieldStart()
           || (peek()[0] == TOKEN_LPAREN))
        break;
    }
    if (sizeof(text->words)
        || sizeof(text->phrases)
        || sizeof(text->plusWords)
        || sizeof(text->plusPhrases)
        || sizeof(text->minusWords)
        || sizeof(text->minusPhrases))
      return text;
    return 0;
  }

  //! expr6: [ '+' | '-' ] ( word | phrase )
  //!
  //! Parses one optionally prefixed term and stores it in @[node].
  //! Tokens between the prefix and the next word, phrase or end of
  //! input are skipped. The term is split at spaces: a single word
  //! goes into the word lists, several words into the phrase lists.
  static void parseExpr6(TextNode node) {
    int prefix = 0;

    if (peek()[0] == TOKEN_MINUS) {
      advance();
      prefix = '-';
    }
    else if (peek()[0] == TOKEN_PLUS) {
      advance();
      prefix = '+';
    }

    if (!prefix && options["implicit"] == "and")
      prefix = '+';

    while (!(< TOKEN_PHRASE,
               TOKEN_WORD,
               TOKEN_END >) [ peek()[0] ])
      advance();

    if (peek()[0] == TOKEN_PHRASE
        || peek()[0] == TOKEN_WORD) {
      string phrase = peek()[1];
      advance();
      array(string) words = splitPhrase(phrase);
      if (!words || !sizeof(words))
        return;
      if (sizeof(words) == 1)
        switch (prefix) {
          case '+': node->plusWords += words; break;
          case '-': node->minusWords += words; break;
          default:  node->words += words; break;
        }
      else if (sizeof(words) > 1)
        switch (prefix) {
          case '+': node->plusPhrases += ({ words }); break;
          case '-': node->minusPhrases += ({ words }); break;
          default:  node->phrases += ({ words }); break;
        }
    }
  }

  //! Collects a date expression into a DateNode.
  //!
  //! Words, unknown tokens, minus signs and colons are concatenated,
  //! using element 2 of each token, until a field prefix or any other
  //! kind of token is reached.
  static ParseNode parseDate() {
    DateNode n = DateNode();
    n->date = "";
  loop:
    for (;;) {
      switch (peek()[0]) {
        case TOKEN_WORD:
          if (lookingAtFieldStart())
            break loop;
          break;
        case TOKEN_UNKNOWN:
        case TOKEN_MINUS:
        case TOKEN_COLON:
          break;
        default:
          break loop;
      }
      n->date += peek()[2];
      advance();
    }
    return n;
  }

}
|
|