4855d3 | 2001-05-31 | Johan Schön | |
#include "debug.h"
//! Base class for every node in the parsed query tree.
class ParseNode {
  //! Operator name of this node. Inheriting classes redeclare it.
  string op = "<node>";

  //! Sub-nodes, in the order they were added.
  array(ParseNode) children = ({});

  //! Appends @[n] to the children. A zero @[n] is silently ignored.
  void addChild(ParseNode n) {
    if (!n)
      return;
    children += ({ n });
  }

  //! Joins @[stuff] with newline + @[indent], and also re-indents any
  //! newline already embedded in an element by the same amount.
  static string indentArray(array(string) stuff, string indent) {
    string separator = "\n" + indent;
    array(string) shifted = ({});
    foreach (stuff, string item)
      shifted += ({ replace(item, "\n", separator) });
    return shifted * separator;
  }

  //! Prints every child and lays the results out using @[indent].
  static string printChildren(string indent) {
    array(string) rendered = ({});
    foreach (children, ParseNode child)
      rendered += ({ child->print() });
    return indentArray(rendered, indent);
  }

  //! Returns an s-expression style dump: "(op child child ...)".
  //! Children after the first are aligned under the first one.
  string print() {
    string pad = " " * (sizeof(op) + 2);
    return sprintf("(%s %s)", op, printChildren(pad));
  }
}
//! Parse tree node whose operator name is "and".
class AndNode {
inherit ParseNode;
// Redeclares the operator name inherited from ParseNode.
string op = "and";
}
//! Parse tree node whose operator name is "or".
class OrNode {
inherit ParseNode;
// Redeclares the operator name inherited from ParseNode.
string op = "or";
}
//! Leaf node carrying a date expression as an unparsed string.
class DateNode {
  inherit ParseNode;

  string op = "date";

  //! The date text collected by the parser.
  string date;

  //! Returns "(date <date>)" with the date rendered through %O.
  string print() {
    string rendered = sprintf("%O", date);
    return "(" + op + " " + rendered + ")";
  }
}
//! Leaf node holding the search terms for one field, split into
//! ordinary, "+"-prefixed and "-"-prefixed words and phrases.
class TextNode {
  inherit ParseNode;

  string op = "text";

  //! Name of the field the terms apply to.
  string field;

  // Single-word terms.
  array(string) words = ({});
  array(string) plusWords = ({});
  array(string) minusWords = ({});

  // Multi-word terms; each phrase is an array of its words.
  array(array(string)) phrases = ({});
  array(array(string)) plusPhrases = ({});
  array(array(string)) minusPhrases = ({});

  //! Prepends @[prefix] to @[word].
  static string prefixWord(string word, string prefix) {
    return prefix + word;
  }

  //! Renders @[phrase] as a double-quoted, space-separated string
  //! preceded by @[prefix].
  static string quotePhrase(array(string) phrase, string prefix) {
    return prefix + "\"" + (phrase * " ") + "\"";
  }

  //! Returns "(text <field> ...terms...)", one term per line.
  string print() {
    // The leading empty element puts the first term on a line of its own.
    array(string) lines =
      ({ "" })
      + words
      + map(plusWords, prefixWord, "+")
      + map(minusWords, prefixWord, "-")
      + map(phrases, quotePhrase, "")
      + map(plusPhrases, quotePhrase, "+")
      + map(minusPhrases, quotePhrase, "-");
    string pad = " " * (sizeof(op) + 2);
    return sprintf("(%s %O %s)", op, field, indentArray(lines, pad));
  }
}
//! Returns non-zero if @[word], compared case-insensitively, is one of
//! the names accepted as a field specifier.
static int isFieldSpecWord(string word) {
  multiset(string) fieldNames = (<
    "any",
    "date",
    "title",
    "description",
    "url",
    "keywords",
  >);
  string key = lower_case(word);
  return fieldNames[key];
}
//! Splits @[phrase] on single spaces and drops the empty pieces that
//! leading, trailing or repeated spaces produce.
static array(string) splitPhrase(string phrase) {
  array(string) pieces = phrase / " ";
  return filter(pieces,
                lambda(string piece) {
                  return piece != "";
                });
}
//! Combines text nodes that share a field into as few nodes as possible.
//!
//! @param a
//!   The text nodes to combine.
//! @param op
//!   Operator of the enclosing node. Under "and" only nodes without
//!   ordinary words/phrases are merged; under "or" only nodes without
//!   any plus/minus words/phrases are merged. Any other value merges
//!   nothing.
//! @returns
//!   For each field: the nodes that could not be merged, in their
//!   original order, followed by the merged node if there is one.
static array(TextNode) mergeTextNodes(array(TextNode) a, string op) {
  // Bucket the nodes per field, keeping their relative order.
  mapping(string:array(TextNode)) byField = ([]);
  foreach (a, TextNode node) {
    if (!byField[node->field])
      byField[node->field] = ({});
    byField[node->field] += ({ node });
  }

  array(TextNode) result = ({});
  foreach (indices(byField), string field) {
    TextNode combined = 0;
    foreach (byField[field], TextNode node) {
      int hasOrdinary = sizeof(node->words) || sizeof(node->phrases);
      int hasSigned = sizeof(node->plusWords)
        || sizeof(node->plusPhrases)
        || sizeof(node->minusWords)
        || sizeof(node->minusPhrases);
      int mergeable = (op == "and" && !hasOrdinary)
        || (op == "or" && !hasSigned);

      if (!mergeable) {
        result += ({ node });
        continue;
      }

      if (!combined) {
        combined = TextNode();
        combined->field = field;
      }
      combined->words += node->words;
      combined->plusWords += node->plusWords;
      combined->minusWords += node->minusWords;
      combined->phrases += node->phrases;
      combined->plusPhrases += node->plusPhrases;
      combined->minusPhrases += node->minusPhrases;
    }
    // The merged node goes after the unmerged nodes of the same field.
    if (combined)
      result += ({ combined });
  }
  return result;
}
//! Simplifies the tree rooted at @[node], bottom-up and in place.
//!
//! Children are optimized first and dropped if they become zero. For
//! "and"/"or" nodes, children with the same operator are replaced by
//! their own children, and text children are combined through
//! mergeTextNodes(). An "and"/"or" node left without children, and a
//! "date" node with a zero or empty date, optimize to zero.
//!
//! @param node
//!   Root of the subtree to optimize; may be zero.
//! @param parentOp
//!   Operator of the parent node. Passed along by the recursion but not
//!   otherwise used.
//! @returns
//!   @[node] itself after modification, or zero if it was removed.
public ParseNode optimize(ParseNode node, string|void parentOp) {
  if (!node)
    return 0;

  // Optimize the children and discard those that vanished.
  node->children = map(node->children, optimize, node->op) - ({ 0 });

  switch (node->op) {
  case "and":
  case "or":
    {
      if (!sizeof(node->children))
        return 0;

      array(ParseNode) kept = ({});
      array(TextNode) textNodes = ({});
      foreach (node->children, ParseNode child) {
        if (child->op == node->op)
          kept += child->children;      // Flatten same-operator nesting.
        else if (child->op == "text")
          textNodes += ({ child });
        else
          kept += ({ child });
      }
      node->children = kept + mergeTextNodes(textNodes, node->op);
    }
    break;

  case "date":
    if (!node->date || node->date == "")
      return 0;
    break;
  }

  return node;
}
//! Recursively checks @[node] and everything below it, throwing a
//! string describing the first problem found.
//!
//! @param node
//!   The node to check.
//! @param parent
//!   The node directly above @[node], or zero for the root.
//! @throws
//!   "date must restrict query" if a date node is not a direct child of
//!   an "and" node, and "negative query not allowed" if a text node has
//!   minus terms but no plus or ordinary terms.
static void v(ParseNode node, ParseNode parent) {
  // Children are checked first, each with this node as its parent.
  map(node->children, v, node);
  switch (node->op) {
  case "date":
    if (!parent || parent->op != "and")
      throw ("date must restrict query");
    break;
  case "and":
    break;
  case "text":
    // An empty array is true in Pike, so the sizes have to be tested
    // explicitly; testing the arrays themselves would always succeed.
    if (sizeof(node->minusWords) || sizeof(node->minusPhrases)) {
      if (!sizeof(node->plusWords)
          && !sizeof(node->plusPhrases)
          && !sizeof(node->words)
          && !sizeof(node->phrases))
        throw ("negative query not allowed");
    }
    break;
  }
}
//! Validates the tree rooted at @[node].
//!
//! @returns
//!   Zero if the tree is valid, otherwise the message string thrown by
//!   the check. Anything thrown that is not a string is rethrown.
public string validate(ParseNode node) {
  mixed problem = catch {
    v(node, 0);
  };
  if (!problem)
    return 0;
  if (!stringp(problem))
    throw (problem);
  return problem;
}
//! Writes one formatted line to stderr: @[s] is used as a format string
//! for @[args], and a newline is appended.
static void lowlevel(string s, mixed ... args) {
  werror(s + "\n", @args);
}
//! Emits the instructions for one group of terms of a text node.
//!
//! The words, if any, are emitted as one instruction using
//! @[wordsFormat]. Each phrase is then emitted as a QUERY_PHRASE
//! instruction, followed by @[joinOp] whenever something was already
//! emitted for this group.
static void executeTermGroup(string field, string wordsFormat, string joinOp,
                             array(string) wordList,
                             array(array(string)) phraseList) {
  int emitted = 0;
  if (sizeof(wordList)) {
    lowlevel(wordsFormat, field, wordList);
    emitted = 1;
  }
  foreach (phraseList, array(string) phrase) {
    lowlevel("QUERY_PHRASE field:%O %{ %O%}", field, phrase);
    if (emitted)
      lowlevel(joinOp);
    emitted = 1;
  }
}

//! Walks the tree rooted at @[q] and writes the corresponding
//! instruction sequence through lowlevel(). Operands are emitted before
//! the operator that combines them.
public void execute(ParseNode q) {
  switch (q->op) {
  case "and":
    {
      // All non-date children first, joined pairwise with AND ...
      int emitted = 0;
      foreach (q->children, ParseNode child) {
        if (child->op == "date")
          continue;
        execute(child);
        if (emitted)
          lowlevel("AND");
        emitted = 1;
      }
      // ... then the date children, which emit their own instruction.
      foreach (q->children, ParseNode child)
        if (child->op == "date")
          execute(child);
    }
    break;

  case "or":
    {
      int emitted = 0;
      foreach (q->children, ParseNode child) {
        execute(child);
        if (emitted)
          lowlevel("OR");
        emitted = 1;
      }
    }
    break;

  case "date":
    lowlevel("DATE_FILTER %O", q->date);
    break;

  case "text":
    {
      int hasPlus = sizeof(q->plusWords) || sizeof(q->plusPhrases);
      int hasOrdinary = sizeof(q->words) || sizeof(q->phrases);
      int hasMinus = sizeof(q->minusWords) || sizeof(q->minusPhrases);

      if (hasPlus)
        executeTermGroup(q->field, "QUERY_AND field:%O %{ %O%}", "AND",
                         q->plusWords, q->plusPhrases);

      if (hasOrdinary)
        executeTermGroup(q->field, "QUERY_OR field:%O %{ %O%}", "OR",
                         q->words, q->phrases);

      if (hasPlus && hasOrdinary)
        lowlevel("UPRANK");

      if (hasMinus) {
        executeTermGroup(q->field, "QUERY_OR field:%O %{ %O%}", "OR",
                         q->minusWords, q->minusPhrases);
        lowlevel("SUB");
      }
    }
    break;
  }
}
//! Recursive-descent parser turning a query string into a ParseNode
//! tree.
//!
//! The token stream comes from tokenize() (defined elsewhere). A word
//! accepted by isFieldSpecWord() that is directly followed by a colon
//! switches the current field for the expression after it.
public class Parser {
  //! Remaining tokens. Element 0 of a token is its type, element 1 its
  //! text, and element 2 is what parseDate() appends to the date string.
  static array(array(Token|string)) tokens;

  //! Stack of active field names; element 0 is the current field.
  static array(string) fieldstack;

  //! Parser options. Only "implicit" is read here: the value "and"
  //! makes unprefixed terms behave as if prefixed with "+".
  mapping(string:string) options;

  //! Returns the token @[lookahead] positions ahead without consuming
  //! it. Looking past the end yields the last token.
  static array(Token|string) peek(void|int lookahead) {
    if (lookahead >= sizeof(tokens))
      lookahead = sizeof(tokens) - 1;
    return tokens[lookahead];
  }

  //! Consumes the current token. The last token is never consumed.
  static void advance() {
    if (sizeof(tokens) > 1)
      tokens = tokens[1 .. ];
  }

  //! Returns non-zero if the token at @[offset] is a recognized field
  //! name immediately followed by a colon.
  //!
  //! The field name check matters: a plain word followed by a colon
  //! (e.g. "foo:bar") is not consumed as a field specifier by
  //! parseExpr2(), so treating it as a field start elsewhere would make
  //! the parser stop in front of it without ever consuming it.
  static int lookingAtFieldStart(void|int offset) {
    return peek(offset)[0] == TOKEN_WORD
      && isFieldSpecWord(peek(offset)[1])
      && peek(offset + 1)[0] == TOKEN_COLON;
  }

  //! @param opt
  //!   Optional parser options; see @[options].
  static void create(mapping(string:string)|void opt) {
    options = opt || ([]);
  }

  //! Parses the query string @[q] with "any" as the initial field.
  //!
  //! @returns
  //!   The root of the parse tree, or zero if nothing was parsed.
  ParseNode parse(string q) {
    fieldstack = ({ "any" });
    tokens = tokenize(q);
    return parseQuery();
  }

  //! query: expr0 ( OR expr0 )*
  //! A single operand is returned as is, without an "or" wrapper.
  static ParseNode parseQuery() {
    ParseNode or = OrNode();
    for (;;) {
      ParseNode n = parseExpr0();
      or->addChild(n);
      if (peek()[0] == TOKEN_OR)
        advance();
      else
        break;
    }
    if (sizeof(or->children) == 1)
      return or->children[0];
    return or;
  }

  //! expr0: expr1 ( [AND] expr1 )*
  //! Operands are collected until the end token, a right parenthesis or
  //! an OR is reached; the AND between operands is optional.
  static ParseNode parseExpr0() {
    ParseNode and = AndNode();
    for (;;) {
      ParseNode n = parseExpr1();
      and->addChild(n);
      if (peek()[0] == TOKEN_AND)
        advance();
      else if ((< TOKEN_END,
                  TOKEN_RPAREN,
                  TOKEN_OR >)[ peek()[0] ])
        break;
    }
    if (sizeof(and->children) == 1)
      return and->children[0];
    return and;
  }

  //! expr1: currently just expr2.
  static ParseNode parseExpr1() {
    return parseExpr2();
  }

  //! expr2: field ":" ( date | expr3 )  |  "(" query [")"]  |  expr3
  static ParseNode parseExpr2() {
    if (lookingAtFieldStart()) {
      // Push the field, parse what it applies to, then pop it again.
      fieldstack = ({ peek()[1] }) + fieldstack;
      advance();
      advance();
      ParseNode n = fieldstack[0] == "date"
        ? parseDate()
        : parseExpr3();
      fieldstack = fieldstack[1 .. ];
      return n;
    }
    if (peek()[0] == TOKEN_LPAREN) {
      advance();
      ParseNode n = parseQuery();
      // A missing right parenthesis is tolerated.
      if (peek()[0] == TOKEN_RPAREN)
        advance();
      return n;
    }
    return parseExpr3();
  }

  //! expr3: expr4 ( OR expr4 )* within the current field.
  //! Returns zero without consuming anything if a new field starts here.
  static ParseNode parseExpr3() {
    if (lookingAtFieldStart())
      return 0;
    ParseNode or = OrNode();
    for (;;) {
      ParseNode n = parseExpr4();
      or->addChild(n);
      if (peek()[0] == TOKEN_OR)
        // An OR in front of a field specifier belongs to an outer level.
        // NOTE(review): unlike parseExpr4(), an OR followed by "(" is
        // consumed here - confirm whether that asymmetry is intended.
        if (lookingAtFieldStart(1))
          break;
        else
          advance();
      else
        break;
    }
    if (sizeof(or->children) == 1)
      return or->children[0];
    return or;
  }

  //! expr4: expr5 ( AND expr5 )* within the current field.
  //! An AND in front of a field specifier or a left parenthesis is left
  //! for an outer level.
  static ParseNode parseExpr4() {
    ParseNode and = AndNode();
    for (;;) {
      ParseNode n = parseExpr5();
      and->addChild(n);
      if (peek()[0] == TOKEN_AND
          && !(lookingAtFieldStart(1)
               || peek(1)[0] == TOKEN_LPAREN))
        advance();
      else
        break;
    }
    if (sizeof(and->children) == 1)
      return and->children[0];
    return and;
  }

  //! expr5: a run of terms collected into one TextNode for the current
  //! field. Stops at the end token, a right or left parenthesis, AND,
  //! OR, or the start of a new field.
  //!
  //! @returns
  //!   The text node, or zero if no term was collected.
  static ParseNode parseExpr5() {
    ParseNode text = TextNode();
    text->field = fieldstack[0];
    for (;;) {
      parseExpr6(text);
      if ( (< TOKEN_END,
              TOKEN_RPAREN,
              TOKEN_AND,
              TOKEN_OR >) [ peek()[0] ]
           || lookingAtFieldStart()
           || (peek()[0] == TOKEN_LPAREN))
        break;
    }
    if (sizeof(text->words)
        || sizeof(text->phrases)
        || sizeof(text->plusWords)
        || sizeof(text->plusPhrases)
        || sizeof(text->minusWords)
        || sizeof(text->minusPhrases))
      return text;
    return 0;
  }

  //! expr6: [ "+" | "-" ] ( word | phrase )
  //! Adds one term to @[node]. Tokens between the optional prefix and
  //! the next word, phrase or end token are skipped. A term that splits
  //! into one word is stored as a word, a longer one as a phrase.
  static void parseExpr6(TextNode node) {
    int prefix = 0;
    if (peek()[0] == TOKEN_MINUS) {
      advance();
      prefix = '-';
    }
    else if (peek()[0] == TOKEN_PLUS) {
      advance();
      prefix = '+';
    }
    if (!prefix && options["implicit"] == "and")
      prefix = '+';

    while (!(< TOKEN_PHRASE,
               TOKEN_WORD,
               TOKEN_END >) [ peek()[0] ])
      advance();

    if (peek()[0] == TOKEN_PHRASE
        || peek()[0] == TOKEN_WORD) {
      string phrase = peek()[1];
      advance();
      array(string) words = splitPhrase(phrase);
      if (!words || !sizeof(words))
        return;
      if (sizeof(words) == 1)
        switch (prefix) {
        case '+': node->plusWords += words; break;
        case '-': node->minusWords += words; break;
        default:  node->words += words; break;
        }
      else if (sizeof(words) > 1)
        switch (prefix) {
        case '+': node->plusPhrases += ({ words }); break;
        case '-': node->minusPhrases += ({ words }); break;
        default:  node->phrases += ({ words }); break;
        }
    }
  }

  //! Collects the tokens following "date:" into a DateNode.
  //! Word, unknown, minus and colon tokens are appended to the date
  //! string; any other token, or the start of a new field, ends it.
  static ParseNode parseDate() {
    DateNode n = DateNode();
    n->date = "";
  loop:
    for (;;) {
      switch (peek()[0]) {
      case TOKEN_WORD:
        if (lookingAtFieldStart())
          break loop;
        break;
      case TOKEN_UNKNOWN:
      case TOKEN_MINUS:
      case TOKEN_COLON:
        break;
      default:
        break loop;
      }
      n->date += peek()[2];
      advance();
    }
    return n;
  }
}
|