pike.git / src / modules / Regexp / pike_regexp.c

version» Context lines:

pike.git/src/modules/Regexp/pike_regexp.c:1:   /* -  * $Id: pike_regexp.c,v 1.20 2000/12/01 08:10:23 hubbe Exp $ -  * + || This file is part of Pike. For copyright information see COPYRIGHT. + || Pike is distributed under GPL, LGPL and MPL. See the file COPYING + || for more information. + */ +  + /*    * regexp.c - regular expression matching    *    * DESCRIPTION    *    * Underneath the reformatting and comment blocks which were added to    * make it consistent with the rest of the code, you will find a    * modified version of Henry Spencer's regular expression library.    * Henry's functions were modified to provide the minimal regular    * expression matching, as required by P1003. Henry's code was    * copyrighted, and a copy of the copyright message and restrictions
pike.git/src/modules/Regexp/pike_regexp.c:60:      /* Headers */   #include "global.h"   #include <ctype.h>   #ifdef HAVE_STRING_H   #include <string.h>   #endif /* HAVE_STRING_H */   #include "pike_regexp.h"   #include "pike_memory.h"   #include "pike_error.h" + #include "interpret.h"    - /* must be included last */ - #include "module_magic.h" + #undef NOTHING      /*    * The "internal use only" fields in regexp.h are present to pass info from    * compile to execute that permits the execute phase to run lots faster on    * simple cases. They are:    *    * regstart char that must begin a match; '\0' if none obvious    * reganch is the match anchored (at beginning-of-line only)?    * regmust string (pointer into program) that match must include, or NULL    * regmlen length of regmust string
pike.git/src/modules/Regexp/pike_regexp.c:117:    * string. */   #define BRANCH 6 /* node Match this alternative, or the    * nxt... */   #define BACK 7 /* no Match "", "nxt" ptr points backward. */   #define EXACTLY 8 /* str Match this string. */   #define NOTHING 9 /* no Match empty string. */   #define STAR 10 /* node Match this (simple) thing 0 or more    * times. */   #define WORDSTART 11 /* node matching a start of a word */   #define WORDEND 12 /* node matching an end of a word */ + #define KPLUS 13 /* node Match this (simple) thing 1 or more +  * times. */   #define OPEN 20 /* no Mark this point in input as start of    * #n. */    /* OPEN+1 is number 1, etc. */   #define CLOSE (OPEN+NSUBEXP) /* no Analogous to OPEN. */      /*    * Opcode notes:    *    * BRANCH The set of branches constituting a single choice are hooked    * together with their "nxt" pointers, since precedence prevents    * anything being concatenated to any individual branch. The    * "nxt" pointer of the last BRANCH in a choice points to the    * thing following the whole choice. This is also where the    * final "nxt" pointer of each individual branch points; each    * branch starts with the operand node of a BRANCH node.    *    * BACK Normal "nxt" pointers all implicitly point forward; BACK    * exists to make loop structures possible.    * -  * STAR complex '*', are implemented as circular BRANCH structures +  * STAR,KPLUS Complex cases are implemented as circular BRANCH structures    * using BACK. Simple cases (one character per match) are -  * implemented with STAR for speed and to minimize recursive -  * plunges. +  * implemented with STAR or KPLUS for speed and to minimize +  * recursive plunges.    *    * OPEN,CLOSE ...are numbered at compile time.    */      /*    * A node is one char of opcode followed by two chars of "nxt" pointer.    * "Nxt" pointers are stored as two 8-bit pieces, high order first. 
The    * value is a positive offset from the opcode of the node containing it.    * An operand, if any, simply follows the node. (Note that much of the    * code generation knows about this implicit relationship.)    *    * Using two bytes for the "nxt" pointer is vast overkill for most things,    * but allows patterns to get big without disasters.    */   #define OP(p) (*(p))   #define NEXT(p) (((*((p)+1)&0377)<<8) + (*((p)+2)&0377))   #define OPERAND(p) ((p) + 3)      /* -  * The first byte of the regexp internal "program" is actually this magic -  * number; the start node begins in the second byte. -  */ - #define MAGIC 0234 -  - /* +     * Utility definitions.    */      #define regerror(X) Pike_error("Regexp: %s\n",X);   #define SPECIAL 0x100   #define LBRAC ('('|SPECIAL)   #define RBRAC (')'|SPECIAL)   #define ASTERIX ('*'|SPECIAL)   #define PLUS ('+'|SPECIAL)   #define OR_OP ('|'|SPECIAL)
pike.git/src/modules/Regexp/pike_regexp.c:197:   #define UCHARAT(p) ((int)*(unsigned char *)(p))   #else   #define UCHARAT(p) ((int)*(p)&CHARBITS)   #endif   #define ISWORDPART(c) ( isalnum(c) || (c) == '_' )      /*    * Flags to be passed up and down.    */   #define HASWIDTH 01 /* Known never to match null string. */ - #define SIMPLE 02 /* Simple enough to be STAR operand. */ + #define SIMPLE 02 /* Simple enough to be STAR or KPLUS operand. */   #define SPSTART 04 /* Starts with * */   #define WORST 0 /* Worst case. */      /*    * Global work variables for regcomp().    */   static short *regparse; /* Input-scan pointer. */   static int regnpar; /* () count. */   static char regdummy;   static char *regcode; /* Code-emit pointer; &regdummy = don't. */
pike.git/src/modules/Regexp/pike_regexp.c:254:    register regexp *r;    register char *scan;    register char *longest;    register ptrdiff_t len;    int flags;    short *exp2,*dest,c;       if (exp == (char *)NULL)    FAIL("NULL argument");    -  exp2=(short*)xalloc( (strlen(exp)+1) * (sizeof(short[8])/sizeof(char[8])) ); +  exp2=(short*)xalloc( (strlen(exp)+1) * sizeof(short) );    for ( scan=exp,dest=exp2;( c= UCHARAT(scan++)); ) {    switch (c) {    case '(':    case ')':    *dest++ = excompat ? c : c | SPECIAL;    break;    case '.':    case '*':    case '+':    case '|':
pike.git/src/modules/Regexp/pike_regexp.c:301:    default:    *dest++ = c;    }    }    *dest=0;    /* First pass: determine size, legality. */    regparse = exp2;    regnpar = 1;    regsize = 0L;    regcode = &regdummy; -  regc(MAGIC); +     if (reg(0, &flags) == (char *)NULL)    return ((regexp *)NULL);       /* Small enough for pointer-storage convention? */    if (regsize >= 32767L) /* Probably could be 65535L. */    FAIL("regexp too big");       /* Allocate space. */    r = (regexp *) xalloc(sizeof(regexp) + (unsigned) regsize); -  if (r == (regexp *) NULL) -  FAIL("out of space"); +        /* Second pass: emit code. */    regparse = exp2;    regnpar = 1;    regcode = r->program; -  regc(MAGIC); +     if (reg(0, &flags) == NULL)    return ((regexp *) NULL);       /* Dig out information for optimizations. */    r->regstart = '\0'; /* Worst-case defaults. */    r->reganch = 0;    r->regmust = NULL;    r->regmlen = 0; -  scan = r->program + 1; /* First BRANCH. */ +  scan = r->program; /* First BRANCH. */    if (OP(regnext(scan)) == END) { /* Only one top-level choice. */    scan = OPERAND(scan);       /* Starting-point info. */    if (OP(scan) == EXACTLY)    r->regstart = *OPERAND(scan);    else if (OP(scan) == BOL)    r->reganch++;       /*
pike.git/src/modules/Regexp/pike_regexp.c:468:    regtail(chain, latest);    chain = latest;    }    if (chain == (char *)NULL) /* Loop ran zero times. */    regnode(NOTHING);       return (ret);   }      /* -  - regpiece - something followed by possible [*] +  - regpiece - something followed by possible [*] or [+]    *    * Note that the branching code sequence used for * is somewhat optimized:    * they use the same NOTHING node as both the endmarker for their branch    * list and the body of the last branch. It might seem that this node could    * be dispensed with entirely, but the endmarker role is not redundant.    */   static char *regpiece(int *flagp)   {    register char *ret;    register short op;
pike.git/src/modules/Regexp/pike_regexp.c:513:    /* Emit x* as (x&|), where & means "self". */    reginsert(BRANCH, ret); /* Either x */    regoptail(ret, regnode(BACK)); /* and loop */    regoptail(ret, ret); /* back */    regtail(ret, regnode(BRANCH)); /* or */    regtail(ret, regnode(NOTHING)); /* null. */    }    }    else if(op == PLUS)    { -  /* Emit a+ as (a&) where & means "self" /Fredrik Hubinette */ +  if (flags & SIMPLE) +  { +  reginsert(KPLUS, ret); +  } +  else +  { +  /* ret -> 1: x nxt: 2 +  * tmp -> 2: BRANCH op: 3 nxt: 4 +  * 3: BACK nxt: 1 +  * 4: BRANCH op: 5 nxt: 5 +  * 5: NOTHING +  */    char *tmp;    tmp=regnode(BACK);    reginsert(BRANCH, tmp);    regtail(ret, tmp);    regoptail(tmp, ret);    regtail(ret, regnode(BRANCH));    regtail(ret, regnode(NOTHING));    } -  +  }       regparse++;    if (ISMULT(*regparse))    FAIL("nested * or +");       return (ret);   }         /*
pike.git/src/modules/Regexp/pike_regexp.c:778:    */   int pike_regexec(regexp *prog, char *string)   {    register char *s;       /* Be paranoid... */    if (prog == (regexp *)NULL || string == (char *)NULL) {    regerror("NULL parameter");    return (0);    } -  /* Check validity of program. */ -  if (UCHARAT(prog->program) != MAGIC) { -  regerror("corrupted program"); -  return (0); -  } +     /* If there is a "must appear" string, look for it. */    if (prog->regmust != (char *)NULL) {    s = string;    while ((s = STRCHR(s, prog->regmust[0])) != (char *)NULL) {    if (strncmp(s, prog->regmust, prog->regmlen) == 0)    break; /* Found it. */    s++;    }    if (s == (char *)NULL) /* Not present. */    return (0);
pike.git/src/modules/Regexp/pike_regexp.c:850:    reginput = string;    regstartp = prog->startp;    regendp = prog->endp;       sp = prog->startp;    ep = prog->endp;    for (i = NSUBEXP; i > 0; i--) {    *sp++ = (char *)NULL;    *ep++ = (char *)NULL;    } -  if (regmatch(prog->program + 1)) { +  if (regmatch(prog->program)) {    prog->startp[0] = string;    prog->endp[0] = reginput;    return (1);    } else    return (0);   }      /*    - regmatch - main matching routine    *
pike.git/src/modules/Regexp/pike_regexp.c:882:   #else      static int regmatch(prog)   char *prog;      #endif   {    register char *scan; /* Current node. */    char *nxt; /* nxt node. */    +  check_c_stack (4 * sizeof (void *)); +     scan = prog;   #ifdef PIKE_DEBUG    if (scan != (char *)NULL && regnarrate)    fprintf(stderr, "%s(\n", regprop(scan));   #endif    while (scan != (char *)NULL) {   #ifdef PIKE_DEBUG    if (regnarrate)    fprintf(stderr, "%s...\n", regprop(scan));   #endif
pike.git/src/modules/Regexp/pike_regexp.c:973:    if (regmatch(OPERAND(scan)))    return (1);    reginput = save;    scan = regnext(scan);    } while (scan != (char *)NULL && OP(scan) == BRANCH);    return (0);    /* NOTREACHED */    }    }    break; +  case KPLUS:    case STAR:{    register char nextch;    register ptrdiff_t no;    register char *save;    register ptrdiff_t minimum;       /*    * Lookahead to avoid useless match attempts when we know    * what character comes next.    */
pike.git/src/modules/Regexp/pike_regexp.c:1165: Inside #if defined(PIKE_DEBUG)
     void regdump(r)   regexp *r;      #endif   {    register char *s;    register char op = EXACTLY; /* Arbitrary non-END op. */    register char *nxt;    -  s = r->program + 1; +  s = r->program;    while (op != END) { /* While that wasn't END last time... */    op = OP(s);    printf("%2ld%s", /* Where, what. */    DO_NOT_WARN((long)(s - r->program)),    regprop(s));    nxt = regnext(s);    if (nxt == (char *)NULL) /* nxt ptr. */    printf("(0)");    else    printf("(%ld)",
pike.git/src/modules/Regexp/pike_regexp.c:1254:    p = "BACK";    break;    case END:    p = "END";    break;       case STAR:    p = "STAR";    break;    +  case KPLUS: +  p = "KPLUS"; +  break; +     default:    if(OP(op) >= OPEN && OP(op) < OPEN+NSUBEXP)    {    sprintf(buf + strlen(buf), "OPEN%d", OP(op) - OPEN);    p = (char *)NULL;    break;    }    if(OP(op) >= CLOSE && OP(op) < CLOSE+NSUBEXP)    {    sprintf(buf + strlen(buf), "CLOSE%d", OP(op) - CLOSE);
pike.git/src/modules/Regexp/pike_regexp.c:1294:    register char *dst;    register char c;    register int no;    register ptrdiff_t len;       if (prog == (regexp *)NULL ||    source == (char *)NULL || dest == (char *)NULL) {    regerror("NULL parm to regsub");    return NULL;    } -  if (UCHARAT(prog->program) != MAGIC) { -  regerror("damaged regexp fed to regsub"); -  return NULL; -  } +     src = source;    dst = dest;    while ((c = *src++) != '\0') {    if (c == '&')    no = 0;    else if (c == '\\' && '0' <= *src && *src <= '9')    no = *src++ - '0';    else    no = -1;   
pike.git/src/modules/Regexp/pike_regexp.c:1338:    }    }    }    if (--n < 0) { /* amylaar */    regerror("line too long");    return NULL;    }    *dst = '\0';    return dst;   } -  +