|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#include "global.h" |
#include <ctype.h> |
#ifdef HAVE_STRING_H |
#include <string.h> |
#endif /* HAVE_STRING_H */ |
#include "pike_regexp.h" |
#include "pike_memory.h" |
#include "pike_error.h" |
#include "interpret.h" |
|
#undef NOTHING |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/*
 * Opcodes.  The opcode is the first byte of every compiled node.
 * The "opnd" column tells what follows the three-byte node header:
 *   no   - nothing
 *   str  - a NUL-terminated byte string
 *   node - another node
 */
/*      name       value     opnd  meaning */
#define END        0      /* no    End of program. */
#define BOL        1      /* no    Match "" at beginning of line. */
#define EOL        2      /* no    Match "" at end of line. */
#define ANY        3      /* no    Match any one character. */
#define ANYOF      4      /* str   Match any character in this string. */
#define ANYBUT     5      /* str   Match any character not in this string. */
#define BRANCH     6      /* node  Match this alternative, or the next... */
#define BACK       7      /* no    Match "", "next" ptr points backward. */
#define EXACTLY    8      /* str   Match this string. */
#define NOTHING    9      /* no    Match empty string. */
#define STAR       10     /* node  Match this (simple) thing 0 or more times. */
#define WORDSTART  11     /* no    Match "" at the start of a word. */
#define WORDEND    12     /* no    Match "" at the end of a word. */
#define KPLUS      13     /* node  Match this (simple) thing 1 or more times. */
#define OPEN       20     /* no    Mark this point in input as start of #n;
                           *       OPEN+1 is number 1, etc. */

#define CLOSE      (OPEN + NSUBEXP)  /* no  Analogous to OPEN. */
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/*
 * Node accessors.  A node is laid out as:
 *   byte 0      opcode
 *   bytes 1..2  "next" offset, big-endian, 16 bits (0 means no next node)
 *   bytes 3..   operand, if the opcode has one
 */
#define OP(p)       (*(p))
#define NEXT(p)     ((((p)[1] & 0xff) << 8) + ((p)[2] & 0xff))
#define OPERAND(p)  ((p) + 3)
|
|
|
|
|
/*
 * Utility definitions.
 *
 * regerror() and FAIL() are written so that they behave like a single
 * statement: regerror() expands to one expression (no trailing ';') and
 * FAIL() is wrapped in do { } while (0).  That keeps
 *     if (cond) FAIL("..."); else ...
 * well-formed.  Every call site supplies its own ';'.
 */
#define regerror(X)  Pike_error("Regexp: %s\n", (X))

/*
 * Pattern characters are preprocessed into shorts; a metacharacter that
 * is active (not backslash-quoted) gets the SPECIAL bit or'ed in.
 */
#define SPECIAL  0x100
#define LBRAC    ('('|SPECIAL)
#define RBRAC    (')'|SPECIAL)
#define ASTERIX  ('*'|SPECIAL)
#define PLUS     ('+'|SPECIAL)
#define OR_OP    ('|'|SPECIAL)
#define DOLLAR   ('$'|SPECIAL)
#define DOT      ('.'|SPECIAL)
#define CARET    ('^'|SPECIAL)
#define LSQBRAC  ('['|SPECIAL)
#define RSQBRAC  (']'|SPECIAL)
#define LSHBRAC  ('<'|SPECIAL)
#define RSHBRAC  ('>'|SPECIAL)

/* Report error m and return NULL from the enclosing function. */
#define FAIL(m)      do { regerror(m); return (NULL); } while (0)

/* True if c is an active repetition operator. */
#define ISMULT(c)    ((c) == ASTERIX || (c) == PLUS)
#define META         "^$.[()|*+\\"

/* Fetch the byte at p as a non-negative int. */
#ifndef CHARBITS
#define CHARBITS     0xff
#define UCHARAT(p)   ((int)*(unsigned char *)(p))
#else
#define UCHARAT(p)   ((int)*(p)&CHARBITS)
#endif

/* True if c can be part of a word (used by the word-boundary opcodes). */
#define ISWORDPART(c) ( isalnum(c) || (c) == '_' )
|
|
|
|
/*
 * Flag bits passed up and down the parser functions through *flagp.
 */
#define HASWIDTH  0x1  /* Known never to match null string. */
#define SIMPLE    0x2  /* Simple enough to be STAR or KPLUS operand. */
#define SPSTART   0x4  /* Starts with * or + */
#define WORST     0    /* Worst case: no flag set. */
|
|
|
|
static short *regparse; |
static int regnpar; |
static char *regcode; |
static long regsize; |
static char regdummy[3] = { NOTHING, 0, 0 }; |
|
|
|
|
/*
 * Forward declarations for the compiler's helper functions.
 */
static char *reg(int paren, int *flagp);
static char *regbranch(int *flagp);
static char *regpiece(int *flagp);
static char *regatom(int *flagp);
static char *regnode(char op);
static char *regnext(char *p);
static void regc(char b);
static void reginsert(char op, char *opnd);
static void regtail(char *p, const char *val);
static void regoptail(char *p, const char *val);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
regexp *pike_regcomp(const char *exp) |
{ |
regexp *r = NULL; |
char *scan; |
int flags; |
short *exp2,*dest,c; |
|
if (exp == NULL) |
FAIL("NULL argument"); |
|
exp2=xcalloc( (strlen(exp)+1), sizeof(short) ); |
for ( dest=exp2; (c=UCHARAT(exp++)); ) { |
switch (c) { |
case '(': |
case ')': |
case '.': |
case '*': |
case '+': |
case '|': |
case '$': |
case '^': |
case '[': |
case ']': |
*dest++ = c | SPECIAL; |
break; |
case '\\': |
switch ( c = *exp++ ) { |
case '(': |
case ')': |
*dest++ = c; |
break; |
case '<': |
case '>': |
*dest++ = c | SPECIAL; |
break; |
case '{': |
case '}': |
FAIL("sorry, unimplemented operator"); |
case 'b': *dest++ = '\b'; break; |
case 't': *dest++ = '\t'; break; |
case 'r': *dest++ = '\r'; break; |
default: |
*dest++ = c; |
} |
break; |
default: |
*dest++ = c; |
} |
} |
|
|
regparse = exp2; |
regnpar = 1; |
regsize = 0L; |
regcode = regdummy; |
if (reg(0, &flags) == NULL) |
goto exit_regcomp; |
|
|
if (regsize >= 32767L) |
{ |
free(exp2); |
FAIL("regexp too big"); |
} |
|
|
r = malloc(sizeof(regexp) + (unsigned) regsize); |
if(!r) |
{ |
free(exp2); |
Pike_error(msg_out_of_mem); |
} |
|
|
regparse = exp2; |
regnpar = 1; |
regcode = r->program; |
if (reg(0, &flags) == NULL) |
{ |
free(r); |
r = NULL; |
goto exit_regcomp; |
} |
|
|
r->regstart = '\0'; |
r->reganch = 0; |
r->regmust = NULL; |
r->regmlen = 0; |
scan = r->program; |
if (OP(regnext(scan)) == END) { |
scan = OPERAND(scan); |
|
|
if (OP(scan) == EXACTLY) |
r->regstart = *OPERAND(scan); |
else if (OP(scan) == BOL) |
r->reganch++; |
|
|
|
|
|
|
|
|
|
if (flags & SPSTART) { |
char *longest = NULL; |
size_t len = 0; |
for (; scan != NULL; scan = regnext(scan)) |
if (OP(scan) == EXACTLY && |
strlen(OPERAND(scan)) >= (size_t)len) { |
longest = OPERAND(scan); |
len = strlen(OPERAND(scan)); |
} |
r->regmust = longest; |
r->regmlen = len; |
} |
} |
|
exit_regcomp: |
free(exp2); |
return r; |
} |
|
|
|
|
|
|
|
|
|
|
static char *reg(int paren,int *flagp) |
{ |
char *ret; |
char *br; |
char *ender; |
int parno=0; |
int flags; |
|
*flagp = HASWIDTH; |
|
|
if (paren) |
{ |
if (regnpar >= NSUBEXP) |
FAIL("too many ()"); |
parno = regnpar; |
regnpar++; |
ret = regnode((char)(OPEN + parno)); |
} |
else |
ret = NULL; |
|
|
br = regbranch(&flags); |
if (br == NULL) |
return NULL; |
if (ret != NULL) |
regtail(ret, br); |
else |
ret = br; |
|
if (!(flags & HASWIDTH)) |
*flagp &= ~HASWIDTH; |
*flagp |= flags & SPSTART; |
|
while (*regparse == OR_OP) { |
regparse++; |
br = regbranch(&flags); |
if (br == NULL) |
return NULL; |
regtail(ret, br); |
if (!(flags & HASWIDTH)) |
*flagp &= ~HASWIDTH; |
*flagp |= flags & SPSTART; |
} |
|
|
ender = regnode((char)((paren) ? (CLOSE + parno) : END)); |
regtail(ret, ender); |
|
|
for (br = ret; br != NULL; br = regnext(br)) |
regoptail(br, ender); |
|
|
if (paren && *regparse++ != RBRAC) |
{ |
FAIL("unmatched ()"); |
} |
else if (!paren && *regparse != '\0') |
{ |
if (*regparse == RBRAC) |
{ |
FAIL("unmatched ()"); |
} |
else |
FAIL("junk on end"); |
} |
|
return ret; |
} |
|
|
|
|
|
|
static char *regbranch(int *flagp) |
{ |
char *ret; |
char *chain; |
char *latest; |
int flags; |
|
*flagp = WORST; |
ret = regnode(BRANCH); |
chain = NULL; |
|
while (*regparse != '\0' && *regparse != OR_OP && *regparse != RBRAC) { |
latest = regpiece(&flags); |
if (latest == NULL) |
return NULL; |
*flagp |= flags & HASWIDTH; |
if (chain == NULL) |
*flagp |= flags & SPSTART; |
else |
regtail(chain, latest); |
chain = latest; |
} |
if (chain == NULL) |
regnode(NOTHING); |
|
return ret; |
} |
|
|
|
|
|
|
|
|
|
|
static char *regpiece(int *flagp) |
{ |
char *ret; |
short op; |
int flags; |
|
ret = regatom(&flags); |
if (ret == NULL) |
return NULL; |
|
op = *regparse; |
if (!ISMULT(op)) { |
*flagp = flags; |
return ret; |
} |
|
|
if (!(flags & HASWIDTH)) |
FAIL("* or + operand could be empty"); |
*flagp = (WORST | SPSTART); |
|
if(op == ASTERIX) |
{ |
if (flags & SIMPLE) |
{ |
reginsert(STAR, ret); |
} |
else |
{ |
|
reginsert(BRANCH, ret); |
regoptail(ret, regnode(BACK)); |
regoptail(ret, ret); |
regtail(ret, regnode(BRANCH)); |
regtail(ret, regnode(NOTHING)); |
} |
} |
else if(op == PLUS) |
{ |
if (flags & SIMPLE) |
{ |
reginsert(KPLUS, ret); |
} |
else |
{ |
|
|
|
|
|
|
char *tmp; |
tmp=regnode(BACK); |
reginsert(BRANCH, tmp); |
regtail(ret, tmp); |
regoptail(tmp, ret); |
regtail(ret, regnode(BRANCH)); |
regtail(ret, regnode(NOTHING)); |
} |
} |
|
regparse++; |
if (ISMULT(*regparse)) |
FAIL("nested * or +"); |
|
return ret; |
} |
|
|
|
|
|
|
|
|
|
static char *regatom(int *flagp) |
{ |
char *ret; |
int flags; |
|
*flagp = WORST; |
|
switch (*regparse++) |
{ |
case CARET: |
ret = regnode(BOL); |
break; |
case DOLLAR: |
ret = regnode(EOL); |
break; |
case DOT: |
ret = regnode(ANY); |
*flagp |= HASWIDTH | SIMPLE; |
break; |
case LSHBRAC: |
ret = regnode(WORDSTART); |
break; |
case RSHBRAC: |
ret = regnode(WORDEND); |
break; |
case LSQBRAC: |
{ |
int range; |
int rangeend; |
|
if (*regparse == CARET) |
{ |
|
ret = regnode(ANYBUT); |
regparse++; |
} |
else |
ret = regnode(ANYOF); |
|
if (*regparse == RSQBRAC || *regparse == '-') |
regc((char)(*regparse++)); |
|
while (*regparse != '\0' && *regparse != RSQBRAC) |
{ |
if (*regparse == '-') |
{ |
regparse++; |
if (*regparse == RSQBRAC || *regparse == '\0') |
regc('-'); |
else |
{ |
range = (CHARBITS & *(regparse - 2)); |
rangeend = (CHARBITS & *(regparse)); |
if (range > rangeend) |
FAIL("invalid [] range"); |
for (range++; range <= rangeend; range++) |
regc((char)range); |
regparse++; |
} |
} |
else |
regc((char)(*regparse++)); |
} |
regc('\0'); |
if (*regparse != RSQBRAC) |
FAIL("unmatched []"); |
regparse++; |
*flagp |= HASWIDTH | SIMPLE; |
} |
break; |
|
case LBRAC: |
ret = reg(1, &flags); |
if (ret == NULL) |
return NULL; |
*flagp |= flags & (HASWIDTH | SPSTART); |
break; |
|
case '\0': |
case OR_OP: |
case RBRAC: |
FAIL("internal urp"); |
break; |
|
case PLUS: |
case ASTERIX: |
FAIL("*/+ follows nothing\n"); |
break; |
|
default: |
{ |
size_t len; |
short ender; |
|
regparse--; |
for (len=0; regparse[len] && |
!(regparse[len]&SPECIAL) && regparse[len] != RSQBRAC; len++) ; |
if (len <= 0) |
FAIL("internal disaster"); |
|
ender = *(regparse + len); |
if (len > 1 && ISMULT(ender)) |
len--; |
*flagp |= HASWIDTH; |
if (len == 1) |
*flagp |= SIMPLE; |
ret = regnode(EXACTLY); |
for (; len > 0; len--) |
regc((char)(*regparse++)); |
regc('\0'); |
} |
break; |
} |
|
return ret; |
} |
|
|
|
|
static char *regnode(char op) |
{ |
char *ret = regcode; |
char *ptr; |
|
if (ret == regdummy) { |
regsize += 3; |
return ret; |
} |
|
ptr = ret; |
*ptr++ = op; |
*ptr++ = '\0'; |
*ptr++ = '\0'; |
regcode = ptr; |
|
return ret; |
} |
|
|
|
|
static void regc(char b) |
{ |
if (regcode != regdummy) |
*regcode++ = b; |
else |
regsize++; |
} |
|
|
|
|
|
|
static void reginsert(char op, char *opnd) |
{ |
char *place; |
|
if (regcode == regdummy) |
{ |
regsize += 3; |
return; |
} |
|
memmove(opnd+3, opnd, (size_t)(regcode - opnd)); |
regcode += 3; |
|
place = opnd; |
*place++ = op; |
*place++ = '\0'; |
*place++ = '\0'; |
} |
|
|
|
|
static void regtail(char *p, const char *val) |
{ |
char *scan; |
char *temp; |
ptrdiff_t offset; |
|
if (p == regdummy) |
return; |
|
|
for (scan = p; (temp = regnext(scan)) != NULL; scan = temp) |
continue; |
|
if (OP(scan) == BACK) |
offset = scan - val; |
else |
offset = val - scan; |
*(scan + 1) = DO_NOT_WARN((offset >> 8) & 0377); |
*(scan + 2) = DO_NOT_WARN(offset & 0377); |
} |
|
|
|
|
static void regoptail(char *p, const char *val) |
{ |
|
if (p == NULL || p == regdummy || OP(p) != BRANCH) |
return; |
regtail(OPERAND(p), val); |
} |
|
|
|
|
|
|
|
|
static char *reginput; |
static char *regbol; |
static char **regstartp; |
static char **regendp; |
|
|
|
|
static int regtry(regexp *, char *); |
static int regmatch(char *); |
static size_t regrepeat(const char *); |
|
#ifdef PIKE_DEBUG |
int regnarrate = 0; |
void regdump(regexp *); |
static char *regprop(char *op); |
#endif |
|
|
|
|
/*
 * pike_regexec - match a compiled regexp against a string.
 *
 * prog is a program produced by pike_regcomp(), string the
 * NUL-terminated subject.  Returns 1 on a match and 0 otherwise; on a
 * match regtry() has recorded the match bounds in prog->startp[0] and
 * prog->endp[0].
 */
int pike_regexec(regexp *prog, char *string)
{
  char *s;

  /* Be paranoid. */
  if (prog == NULL || string == NULL) {
    regerror("NULL parameter");
    return 0;
  }

  /* If there is a "must appear" string, look for it. */
  if (prog->regmust != NULL && strstr(string, prog->regmust) == NULL)
    return 0;

  /* Mark beginning of line for ^ and the word-boundary checks. */
  regbol = string;

  /* Simplest case: anchored match need be tried only once. */
  if (prog->reganch)
    return regtry(prog, string);

  if (prog->regstart != '\0') {
    /*
     * We know what character a match must start with.  The start of
     * the string is always tried first; after that only positions
     * holding that character.
     */
    s = string;
    while (s != NULL) {
      if (regtry(prog, s))
        return 1;
      /*
       * Never search from beyond the terminating NUL (this is
       * reached with s at the NUL when string is empty).
       */
      if (*s == '\0')
        break;
      s = strchr(s + 1, prog->regstart);
    }
    return 0;
  }

  /* We don't know the first character: general case, try every position. */
  for (s = string; !regtry(prog, s); s++)
    if (*s == '\0')
      return 0;
  return 1;
}
|
|
|
|
/*
 * regtry - try a match starting at one position of the subject.
 *
 * Clears all NSUBEXP start/end slots of prog, then runs the program.
 * Returns 1 on success (slot 0 then holds the bounds of the whole
 * match) and 0 on failure.
 */
static int regtry(regexp *prog, char *string)
{
  int i;

  reginput = string;
  regstartp = prog->startp;
  regendp = prog->endp;

  for (i = 0; i < NSUBEXP; i++) {
    prog->startp[i] = NULL;
    prog->endp[i] = NULL;
  }

  if (!regmatch(prog->program))
    return 0;

  prog->startp[0] = string;
  prog->endp[0] = reginput;
  return 1;
}
|
|
|
|
|
|
|
|
|
|
|
static int regmatch(char *prog) |
{ |
char *scan; |
char *next; |
|
check_c_stack (4 * sizeof (void *)); |
|
scan = prog; |
#ifdef PIKE_DEBUG |
if (scan != NULL && regnarrate) |
fprintf(stderr, "%s(\n", regprop(scan)); |
#endif |
|
while (scan != NULL) |
{ |
#ifdef PIKE_DEBUG |
if (regnarrate) |
fprintf(stderr, "%s...\n", regprop(scan)); |
#endif |
next = regnext(scan); |
|
switch (OP(scan)) |
{ |
case BOL: |
if (reginput != regbol) |
return 0; |
break; |
|
case EOL: |
if (*reginput != '\0') |
return 0; |
break; |
|
case ANY: |
if (*reginput == '\0') |
return 0; |
reginput++; |
break; |
|
case WORDSTART: |
if (reginput == regbol) |
break; |
if (*reginput == '\0' || |
ISWORDPART( *((unsigned char *)reginput-1) ) || |
!ISWORDPART( *((unsigned char *)reginput) ) ) |
return 0; |
break; |
|
case WORDEND: |
if (*reginput == '\0') |
break; |
if ( reginput == regbol || |
!ISWORDPART( *((unsigned char *)reginput-1) ) || |
ISWORDPART( *((unsigned char *)reginput) ) ) |
return 0; |
break; |
|
case EXACTLY: |
{ |
ptrdiff_t len; |
char *opnd; |
|
opnd = OPERAND(scan); |
|
if (*opnd != *reginput) |
return 0; |
len = strlen(opnd); |
if (len > 1 && strncmp(opnd, reginput, len) != 0) |
return 0; |
reginput += len; |
} |
break; |
|
case ANYOF: |
if (*reginput == '\0' || |
strchr(OPERAND(scan), *reginput) == NULL) |
return 0; |
reginput++; |
break; |
|
case ANYBUT: |
if (*reginput == '\0' || |
strchr(OPERAND(scan), *reginput) != NULL) |
return 0; |
reginput++; |
break; |
|
case NOTHING: |
break; |
case BACK: |
break; |
|
case BRANCH: |
{ |
if (OP(next) != BRANCH) |
next = OPERAND(scan); |
else |
{ |
|
char *save = reginput; |
do |
{ |
if (regmatch(OPERAND(scan))) |
return 1; |
reginput = save; |
scan = regnext(scan); |
} while (scan != NULL && OP(scan) == BRANCH); |
return 0; |
} |
} |
break; |
|
case KPLUS: |
case STAR: |
{ |
char nextch = |
(OP(next) == EXACTLY) ? *OPERAND(next) : '\0'; |
size_t no; |
char *save = reginput; |
size_t minimum = (OP(scan) == STAR) ? 0 : 1; |
|
for(no = regrepeat(OPERAND(scan)) + 1; no > minimum; no--) { |
reginput = save + no -1; |
|
if (nextch == '\0' || *reginput == nextch) |
if (regmatch(next)) |
return 1; |
} |
return 0; |
} |
|
case END: |
return 1; |
|
default: |
if(OP(scan) >= OPEN && OP(scan)<OPEN+NSUBEXP) |
{ |
int no = OP(scan) - OPEN; |
char *input = reginput; |
|
if (regmatch(next)) |
{ |
|
|
|
|
if (regstartp[no] == NULL) |
regstartp[no] = input; |
return 1; |
} else |
return 0; |
} |
|
if(OP(scan) >= CLOSE && OP(scan)<CLOSE+NSUBEXP) |
{ |
int no = OP(scan) - CLOSE; |
char *input = reginput; |
|
if (regmatch(next)) |
{ |
|
|
|
|
if (regendp[no] == NULL) |
regendp[no] = input; |
return 1; |
} |
else |
return 0; |
} |
regerror("memory corruption"); |
return 0; |
} |
scan = next; |
} |
|
|
|
|
|
regerror("corrupted pointers"); |
return 0; |
} |
|
|
|
|
static size_t regrepeat(const char *node) |
{ |
switch (OP(node)) |
{ |
case ANY: |
return(strlen(reginput)); |
break; |
|
case EXACTLY: |
{ |
char *scan; |
char ch = *OPERAND(node); |
size_t count = 0; |
|
for (scan = reginput; *scan == ch; scan++) |
count++; |
return count; |
} |
break; |
|
case ANYOF: |
return strspn(reginput, OPERAND(node)); |
break; |
|
case ANYBUT: |
return strcspn(reginput, OPERAND(node)); |
break; |
|
default: |
regerror("internal foulup"); |
return 0; |
break; |
} |
} |
|
|
|
|
|
static char *regnext(char *p) |
{ |
int offset = NEXT(p); |
|
if (offset == 0) |
return NULL; |
|
if (OP(p) == BACK) |
return (p - offset); |
else |
return (p + offset); |
} |
|
#ifdef PIKE_DEBUG |
|
static char *regprop(char *); |
|
|
|
|
/*
 * regdump - print a compiled regexp to stdout in vaguely
 * comprehensible form.
 *
 * Every node is printed as "<offset><name>(<offset of next>)" followed
 * by its string operand, if it has one; the header fields regstart,
 * reganch and regmust follow on a final line.
 */
void regdump(regexp *r)
{
  char *s = r->program;
  char *next;
  char op;

  /* Arbitrary non-END op: the loop body always runs at least once. */
  do {
    op = OP(s);
    printf("%2ld%s",
           DO_NOT_WARN((long)(s - r->program)),
           regprop(s));      /* Where, what. */

    next = regnext(s);
    if (next != NULL)
      printf("(%ld)",
             DO_NOT_WARN((long)( (s - r->program) + (next - s))));
    else
      printf("(0)");         /* next ptr is null */

    s += 3;
    if (op == ANYOF || op == ANYBUT || op == EXACTLY) {
      /* Literal string, where present. */
      for (; *s != '\0'; s++)
        putchar(*s);
      s++;                   /* Skip the terminating NUL. */
    }
    putchar('\n');
  } while (op != END);

  /* Header fields of interest. */
  if (r->regstart != '\0')
    printf("start `%c' ", r->regstart);
  if (r->reganch)
    printf("anchored ");
  if (r->regmust != NULL)
    printf("must have \"%s\"", r->regmust);
  printf("\n");
}
|
|
|
|
static char *regprop(char *op) |
{ |
char *p; |
static char buf[50]; |
|
strcpy(buf, ":"); |
|
switch (OP(op)) { |
case BOL: |
p = "BOL"; |
break; |
case EOL: |
p = "EOL"; |
break; |
case ANY: |
p = "ANY"; |
break; |
case ANYOF: |
p = "ANYOF"; |
break; |
case ANYBUT: |
p = "ANYBUT"; |
break; |
case BRANCH: |
p = "BRANCH"; |
break; |
case EXACTLY: |
p = "EXACTLY"; |
break; |
case NOTHING: |
p = "NOTHING"; |
break; |
case BACK: |
p = "BACK"; |
break; |
case END: |
p = "END"; |
break; |
|
case STAR: |
p = "STAR"; |
break; |
|
case KPLUS: |
p = "KPLUS"; |
break; |
|
default: |
if(OP(op) >= OPEN && OP(op) < OPEN+NSUBEXP) |
{ |
sprintf(buf + strlen(buf), "OPEN%d", OP(op) - OPEN); |
p = NULL; |
break; |
} |
if(OP(op) >= CLOSE && OP(op) < CLOSE+NSUBEXP) |
{ |
sprintf(buf + strlen(buf), "CLOSE%d", OP(op) - CLOSE); |
p = NULL; |
break; |
} |
regerror("corrupted opcode"); |
p=NULL; |
break; |
} |
if (p != NULL) |
strcat(buf, p); |
return (buf); |
} |
#endif |
|
|