2 Copyright (c) 1995-1998 by Cisco systems, Inc.
4 Permission to use, copy, modify, and distribute this software for
5 any purpose and without fee is hereby granted, provided that this
6 copyright and permission notice appear on all copies of the
7 software and supporting documentation, the name of Cisco Systems,
8 Inc. not be used in advertising or publicity pertaining to
9 distribution of the program without specific prior permission, and
10 notice be given in supporting documentation that modification,
11 copying and distribution is by permission of Cisco Systems, Inc.
13 Cisco Systems, Inc. makes no representations about the suitability
14 of this software for any purpose. THIS SOFTWARE IS PROVIDED ``AS
15 IS'' AND WITHOUT ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING,
16 WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
17 FITNESS FOR A PARTICULAR PURPOSE.
21 * regcomp and regexec -- regsub and regerror are elsewhere
22 * @(#)regexp.c 1.3 of 18 April 87
24 * Copyright (c) 1986 by University of Toronto.
25 * Written by Henry Spencer. Not derived from licensed software.
27 * Permission is granted to anyone to use this software for any
28 * purpose on any computer system, and to redistribute it freely,
29 * subject to the following restrictions:
31 * 1. The author is not responsible for the consequences of use of
32 * this software, no matter how awful, even if they arise
35 * 2. The origin of this software must not be misrepresented, either
36 * by explicit claim or by omission.
38 * 3. Altered versions must be plainly marked as such, and must not
39 * be misrepresented as being the original software.
41 * Beware that some of this code is subtly aware of the way operator
42 * precedence is structured in regular expressions. Serious changes in
43 * regular-expression syntax might require a total rethink.
50 * The "internal use only" fields in regexp.h are present to pass info from
51 * compile to execute that permits the execute phase to run lots faster on
52 * simple cases. They are:
54 * regstart char that must begin a match; '\0' if none obvious
55 * reganch is the match anchored (at beginning-of-line only)?
56 * regmust string (pointer into program) that match must include, or NULL
57 * regmlen length of regmust string
59 * Regstart and reganch permit very fast decisions on suitable starting points
60 * for a match, cutting down the work a lot. Regmust permits fast rejection
61 * of lines that cannot possibly match. The regmust tests are costly enough
62 * that regcomp() supplies a regmust only if the r.e. contains something
63 * potentially expensive (at present, the only such thing detected is * or +
64 * at the start of the r.e., which can involve a lot of backup). Regmlen is
65 * supplied because the test in regexec() needs it and regcomp() is computing
70 * Structure for regexp "program". This is essentially a linear encoding
71 * of a nondeterministic finite-state machine (aka syntax charts or
72 * "railroad normal form" in parsing technology). Each node is an opcode
73 * plus a "next" pointer, possibly plus an operand. "Next" pointers of
74 * all nodes except BRANCH implement concatenation; a "next" pointer with
75 * a BRANCH on both ends of it is connecting two alternatives. (Here we
76 * have one of the subtle syntax dependencies: an individual BRANCH (as
77 * opposed to a collection of them) is never concatenated with anything
78 * because of operator precedence.) The operand of some types of node is
79 * a literal string; for others, it is a node leading into a sub-FSM. In
80 * particular, the operand of a BRANCH node is the first node of the branch.
81 * (NB this is *not* a tree structure: the tail of the branch connects
82 * to the thing following the set of BRANCHes.) The opcodes are:
85 /* definition number opnd? meaning */
86 #define END 0 /* no End of program. */
87 #define BOL 1 /* no Match "" at beginning of line. */
88 #define EOL 2 /* no Match "" at end of line. */
89 #define ANY 3 /* no Match any one character. */
90 #define ANYOF 4 /* str Match any character in this string. */
91 #define ANYBUT 5 /* str Match any character not in this string. */
92 #define BRANCH 6 /* node Match this alternative, or the next... */
93 #define BACK 7 /* no Match "", "next" ptr points backward. */
94 #define EXACTLY 8 /* str Match this string. */
95 #define NOTHING 9 /* no Match empty string. */
96 #define STAR 10 /* node Match this (simple) thing 0 or more times. */
97 #define PLUS 11 /* node Match this (simple) thing 1 or more times. */
98 #define OPEN 20 /* no Mark this point in input as start of #n. */
99 /* OPEN+1 is number 1, etc. */
100 #define CLOSE 30 /* no Analogous to OPEN. */
105 * BRANCH The set of branches constituting a single choice are hooked
106 * together with their "next" pointers, since precedence prevents
107 * anything being concatenated to any individual branch. The
108 * "next" pointer of the last BRANCH in a choice points to the
109 * thing following the whole choice. This is also where the
110 * final "next" pointer of each individual branch points; each
111 * branch starts with the operand node of a BRANCH node.
113 * BACK Normal "next" pointers all implicitly point forward; BACK
114 * exists to make loop structures possible.
116 * STAR,PLUS '?', and complex '*' and '+', are implemented as circular
117 * BRANCH structures using BACK. Simple cases (one character
118 * per match) are implemented with STAR and PLUS for speed
119 * and to minimize recursive plunges.
121 * OPEN,CLOSE ...are numbered at compile time.
125 * A node is one char of opcode followed by two chars of "next" pointer.
126 * "Next" pointers are stored as two 8-bit pieces, high order first. The
127 * value is a positive offset from the opcode of the node containing it.
128 * An operand, if any, simply follows the node. (Note that much of the
129 * code generation knows about this implicit relationship.)
131 * Using two bytes for the "next" pointer is vast overkill for most things,
132 * but allows patterns to get big without disasters.
135 #define NEXT(p) (((*((p)+1)&0377)<<8) + (*((p)+2)&0377))
136 #define OPERAND(p) ((p) + 3)
139 * See regmagic.h for one further detail of program structure.
144 * Utility definitions.
147 #define UCHARAT(p) ((int)*(unsigned char *)(p))
149 #define UCHARAT(p) ((int)*(p)&CHARBITS)
152 #define FAIL(m) { regerror(m); return(NULL); }
153 #define ISMULT(c) ((c) == '*' || (c) == '+' || (c) == '?')
154 #define META "^$.[()|?+*\\"
157 * Flags to be passed up and down.
159 #define HASWIDTH 01 /* Known never to match null string. */
160 #define SIMPLE 02 /* Simple enough to be STAR/PLUS operand. */
161 #define SPSTART 04 /* Starts with * or +. */
162 #define WORST 0 /* Worst case. */
165 * Global work variables for regcomp().
167 static char *regparse; /* Input-scan pointer. */
168 static int regnpar; /* () count. */
169 static char regdummy;
170 static char *regcode; /* Code-emit pointer; ®dummy = don't. */
171 static long regsize; /* Code size. */
174 * Forward declarations for regcomp()'s friends.
177 #define STATIC static
180 STATIC char *regbranch();
181 STATIC char *regpiece();
182 STATIC char *regatom();
183 STATIC char *regnode();
184 STATIC char *regnext();
186 STATIC void reginsert();
187 STATIC void regtail();
188 STATIC void regoptail();
190 STATIC int strcspn();
194 - regcomp - compile a regular expression into internal code
196 * We can't allocate space until we know how big the compiled form will be,
197 * but we can't compile it (and thus know how big it is) until we've got a
198 * place to put the code. So we cheat: we compile it twice, once with code
199 * generation turned off and size counting turned on, and once "for real".
200 * This also means that we don't allocate space until we are sure that the
201 * thing really will compile successfully, and we never have to move the
202 * code and thus invalidate pointers into it. (Note that it has to be in
203 * one piece because free() must be able to free it all.)
205 * Beware that the optimization-preparation code in here knows about some
206 * of the structure of the compiled regexp.
214 register char *longest;
217 extern char *malloc();
220 FAIL("NULL argument");
222 /* First pass: determine size, legality. */
228 if (reg(0, &flags) == NULL)
231 /* Small enough for pointer-storage convention? */
232 if (regsize >= 32767L) /* Probably could be 65535L. */
233 FAIL("regexp too big");
235 /* Allocate space. */
236 r = (regexp *)malloc(sizeof(regexp) + (unsigned)regsize);
238 FAIL("out of space");
240 /* Second pass: emit code. */
243 regcode = r->program;
245 if (reg(0, &flags) == NULL)
248 /* Dig out information for optimizations. */
249 r->regstart = '\0'; /* Worst-case defaults. */
253 scan = r->program+1; /* First BRANCH. */
254 if (OP(regnext(scan)) == END) { /* Only one top-level choice. */
255 scan = OPERAND(scan);
257 /* Starting-point info. */
258 if (OP(scan) == EXACTLY)
259 r->regstart = *OPERAND(scan);
260 else if (OP(scan) == BOL)
264 * If there's something expensive in the r.e., find the
265 * longest literal string that must appear and make it the
266 * regmust. Resolve ties in favor of later strings, since
267 * the regstart check works with the beginning of the r.e.
268 * and avoiding duplication strengthens checking. Not a
269 * strong reason, but sufficient in the absence of others.
274 for (; scan != NULL; scan = regnext(scan))
275 if (OP(scan) == EXACTLY && strlen(OPERAND(scan)) >= len) {
276 longest = OPERAND(scan);
277 len = strlen(OPERAND(scan));
279 r->regmust = longest;
288 - reg - regular expression, i.e. main body or parenthesized thing
290 * Caller must absorb opening parenthesis.
292 * Combining parenthesis handling with the base level of regular expression
293 * is a trifle forced, but the need to tie the tails of the branches to what
294 * follows makes it hard to avoid.
298 int paren; /* Parenthesized? */
303 register char *ender;
307 *flagp = HASWIDTH; /* Tentatively. */
309 /* Make an OPEN node, if parenthesized. */
311 if (regnpar >= NSUBEXP)
315 ret = regnode(OPEN+parno);
319 /* Pick up the branches, linking them together. */
320 br = regbranch(&flags);
324 regtail(ret, br); /* OPEN -> first. */
327 if (!(flags&HASWIDTH))
329 *flagp |= flags&SPSTART;
330 while (*regparse == '|') {
332 br = regbranch(&flags);
335 regtail(ret, br); /* BRANCH -> BRANCH. */
336 if (!(flags&HASWIDTH))
338 *flagp |= flags&SPSTART;
341 /* Make a closing node, and hook it on the end. */
342 ender = regnode((paren) ? CLOSE+parno : END);
345 /* Hook the tails of the branches to the closing node. */
346 for (br = ret; br != NULL; br = regnext(br))
347 regoptail(br, ender);
349 /* Check for proper termination. */
350 if (paren && *regparse++ != ')') {
351 FAIL("unmatched ()");
352 } else if (!paren && *regparse != '\0') {
353 if (*regparse == ')') {
354 FAIL("unmatched ()");
356 FAIL("junk on end"); /* "Can't happen". */
364 - regbranch - one alternative of an | operator
366 * Implements the concatenation operator.
373 register char *chain;
374 register char *latest;
377 *flagp = WORST; /* Tentatively. */
379 ret = regnode(BRANCH);
381 while (*regparse != '\0' && *regparse != '|' && *regparse != ')') {
382 latest = regpiece(&flags);
385 *flagp |= flags&HASWIDTH;
386 if (chain == NULL) /* First piece. */
387 *flagp |= flags&SPSTART;
389 regtail(chain, latest);
392 if (chain == NULL) /* Loop ran zero times. */
393 (void) regnode(NOTHING);
399 - regpiece - something followed by possible [*+?]
401 * Note that the branching code sequences used for ? and the general cases
402 * of * and + are somewhat optimized: they use the same NOTHING node as
403 * both the endmarker for their branch list and the body of the last branch.
404 * It might seem that this node could be dispensed with entirely, but the
405 * endmarker role is not redundant.
416 ret = regatom(&flags);
426 if (!(flags&HASWIDTH) && op != '?')
427 FAIL("*+ operand could be empty");
428 *flagp = (op != '+') ? (WORST|SPSTART) : (WORST|HASWIDTH);
430 if (op == '*' && (flags&SIMPLE))
431 reginsert(STAR, ret);
432 else if (op == '*') {
433 /* Emit x* as (x&|), where & means "self". */
434 reginsert(BRANCH, ret); /* Either x */
435 regoptail(ret, regnode(BACK)); /* and loop */
436 regoptail(ret, ret); /* back */
437 regtail(ret, regnode(BRANCH)); /* or */
438 regtail(ret, regnode(NOTHING)); /* null. */
439 } else if (op == '+' && (flags&SIMPLE))
440 reginsert(PLUS, ret);
441 else if (op == '+') {
442 /* Emit x+ as x(&|), where & means "self". */
443 next = regnode(BRANCH); /* Either */
445 regtail(regnode(BACK), ret); /* loop back */
446 regtail(next, regnode(BRANCH)); /* or */
447 regtail(ret, regnode(NOTHING)); /* null. */
448 } else if (op == '?') {
449 /* Emit x? as (x|) */
450 reginsert(BRANCH, ret); /* Either x */
451 regtail(ret, regnode(BRANCH)); /* or */
452 next = regnode(NOTHING); /* null. */
454 regoptail(ret, next);
457 if (ISMULT(*regparse))
464 - regatom - the lowest level
466 * Optimization: gobbles an entire sequence of ordinary characters so that
467 * it can turn them into a single node, which is smaller to store and
468 * faster to run. Backslashed characters are exceptions, each becoming a
469 * separate node; the code is simpler that way and it's not worth fixing.
478 *flagp = WORST; /* Tentatively. */
480 switch (*regparse++) {
489 *flagp |= HASWIDTH|SIMPLE;
493 register int classend;
495 if (*regparse == '^') { /* Complement of range. */
496 ret = regnode(ANYBUT);
499 ret = regnode(ANYOF);
500 if (*regparse == ']' || *regparse == '-')
502 while (*regparse != '\0' && *regparse != ']') {
503 if (*regparse == '-') {
505 if (*regparse == ']' || *regparse == '\0')
508 class = UCHARAT(regparse-2)+1;
509 classend = UCHARAT(regparse);
510 if (class > classend+1)
511 FAIL("invalid [] range");
512 for (; class <= classend; class++)
520 if (*regparse != ']')
521 FAIL("unmatched []");
523 *flagp |= HASWIDTH|SIMPLE;
527 ret = reg(1, &flags);
530 *flagp |= flags&(HASWIDTH|SPSTART);
535 FAIL("internal urp"); /* Supposed to be caught earlier. */
540 FAIL("?+* follows nothing");
543 if (*regparse == '\0')
545 ret = regnode(EXACTLY);
548 *flagp |= HASWIDTH|SIMPLE;
555 len = strcspn(regparse, META);
557 FAIL("internal disaster");
558 ender = *(regparse+len);
559 if (len > 1 && ISMULT(ender))
560 len--; /* Back off clear of ?+* operand. */
564 ret = regnode(EXACTLY);
578 - regnode - emit a node
580 static char * /* Location. */
588 if (ret == ®dummy) {
595 *ptr++ = '\0'; /* Null "next" pointer. */
603 - regc - emit (if appropriate) a byte of code
609 if (regcode != ®dummy)
616 - reginsert - insert an operator in front of already-emitted operand
618 * Means relocating the operand.
627 register char *place;
629 if (regcode == ®dummy) {
640 place = opnd; /* Op node, where operand used to be. */
647 - regtail - set the next-pointer at the end of a node chain
661 /* Find last node. */
664 temp = regnext(scan);
670 if (OP(scan) == BACK)
674 *(scan+1) = (offset>>8)&0377;
675 *(scan+2) = offset&0377;
679 - regoptail - regtail on operand of first argument; nop if operandless
686 /* "Operandless" and "op != BRANCH" are synonymous in practice. */
687 if (p == NULL || p == ®dummy || OP(p) != BRANCH)
689 regtail(OPERAND(p), val);
693 * regexec and friends
697 * Global work variables for regexec().
699 static char *reginput; /* String-input pointer. */
700 static char *regbol; /* Beginning of input, for ^ check. */
701 static char **regstartp; /* Pointer to startp array. */
702 static char **regendp; /* Ditto for endp. */
708 STATIC int regmatch();
709 STATIC int regrepeat();
714 STATIC char *regprop();
718 - regexec - match a regexp against a string
721 regexec(prog, string)
722 register regexp *prog;
723 register char *string;
726 extern char *strchr();
729 if (prog == NULL || string == NULL) {
730 regerror("NULL parameter");
734 /* Check validity of program. */
735 if (UCHARAT(prog->program) != MAGIC) {
736 regerror("corrupted program");
740 /* If there is a "must appear" string, look for it. */
741 if (prog->regmust != NULL) {
743 while ((s = strchr(s, prog->regmust[0])) != NULL) {
744 if (strncmp(s, prog->regmust, prog->regmlen) == 0)
745 break; /* Found it. */
748 if (s == NULL) /* Not present. */
752 /* Mark beginning of line for ^ . */
755 /* Simplest case: anchored match need be tried only once. */
757 return(regtry(prog, string));
759 /* Messy cases: unanchored match. */
761 if (prog->regstart != '\0')
762 /* We know what char it must start with. */
763 while ((s = strchr(s, prog->regstart)) != NULL) {
769 /* We don't -- general case. */
773 } while (*s++ != '\0');
780 - regtry - try match at specific point
782 static int /* 0 failure, 1 success */
792 regstartp = prog->startp;
793 regendp = prog->endp;
797 for (i = NSUBEXP; i > 0; i--) {
801 if (regmatch(prog->program + 1)) {
802 prog->startp[0] = string;
803 prog->endp[0] = reginput;
810 - regmatch - main matching routine
812 * Conceptually the strategy is simple: check to see whether the current
813 * node matches, call self recursively to see whether the rest matches,
814 * and then act accordingly. In practice we make some effort to avoid
815 * recursion, in particular by going through "ordinary" nodes (that don't
816 * need to know whether the rest of the match failed) by a loop instead of
819 static int /* 0 failure, 1 success */
823 register char *scan; /* Current node. */
824 char *next; /* Next node. */
825 extern char *strchr();
829 if (scan != NULL && regnarrate)
830 fprintf(stderr, "%s(\n", regprop(scan));
832 while (scan != NULL) {
835 fprintf(stderr, "%s...\n", regprop(scan));
837 next = regnext(scan);
841 if (reginput != regbol)
845 if (*reginput != '\0')
849 if (*reginput == '\0')
857 opnd = OPERAND(scan);
858 /* Inline the first character, for speed. */
859 if (*opnd != *reginput)
862 if (len > 1 && strncmp(opnd, reginput, len) != 0)
868 if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) == NULL)
873 if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) != NULL)
893 no = OP(scan) - OPEN;
896 if (regmatch(next)) {
898 * Don't set startp if some later
899 * invocation of the same parentheses
902 if (regstartp[no] == NULL)
903 regstartp[no] = save;
921 no = OP(scan) - CLOSE;
924 if (regmatch(next)) {
926 * Don't set endp if some later
927 * invocation of the same parentheses
930 if (regendp[no] == NULL)
940 if (OP(next) != BRANCH) /* No choice. */
941 next = OPERAND(scan); /* Avoid recursion. */
945 if (regmatch(OPERAND(scan)))
948 scan = regnext(scan);
949 } while (scan != NULL && OP(scan) == BRANCH);
957 register char nextch;
963 * Lookahead to avoid useless match attempts
964 * when we know what character comes next.
967 if (OP(next) == EXACTLY)
968 nextch = *OPERAND(next);
969 min = (OP(scan) == STAR) ? 0 : 1;
971 no = regrepeat(OPERAND(scan));
973 /* If it could work, try it. */
974 if (nextch == '\0' || *reginput == nextch)
977 /* Couldn't or didn't -- back up. */
979 reginput = save + no;
985 return(1); /* Success! */
988 regerror("memory corruption");
997 * We get here only if there's trouble -- normally "case END" is
998 * the terminating point.
1000 regerror("corrupted pointers");
1005 - regrepeat - repeatedly match something simple, report how many
1011 register int count = 0;
1012 register char *scan;
1013 register char *opnd;
1014 extern char *strchr();
1020 count = strlen(scan);
1024 while (*opnd == *scan) {
1030 while (*scan != '\0' && strchr(opnd, *scan) != NULL) {
1036 while (*scan != '\0' && strchr(opnd, *scan) == NULL) {
1041 default: /* Oh dear. Called inappropriately. */
1042 regerror("internal foulup");
1043 count = 0; /* Best compromise. */
1052 - regnext - dig the "next" pointer out of a node
1058 register int offset;
1075 STATIC char *regprop();
1078 - regdump - dump a regexp onto stdout in vaguely comprehensible form
1085 register char op = EXACTLY; /* Arbitrary non-END op. */
1086 register char *next;
1087 extern char *strchr();
1091 while (op != END) { /* While that wasn't END last time... */
1093 printf("%2d%s", s-r->program, regprop(s)); /* Where, what. */
1095 if (next == NULL) /* Next ptr. */
1098 printf("(%d)", (s-r->program)+(next-s));
1100 if (op == ANYOF || op == ANYBUT || op == EXACTLY) {
1101 /* Literal string, where present. */
1102 while (*s != '\0') {
1111 /* Header fields of interest. */
1112 if (r->regstart != '\0')
1113 printf("start `%c' ", r->regstart);
1115 printf("anchored ");
1116 if (r->regmust != NULL)
1117 printf("must have \"%s\"", r->regmust);
1122 - regprop - printable representation of opcode
1129 static char buf[50];
1131 (void) strcpy(buf, ":");
1173 sprintf(buf+strlen(buf), "OPEN%d", OP(op)-OPEN);
1185 sprintf(buf+strlen(buf), "CLOSE%d", OP(op)-CLOSE);
1195 regerror("corrupted opcode");
1199 (void) strcat(buf, p);
1205 * The following is provided for those people who do not have strcspn() in
1206 * their C libraries. They should get off their butts and do something
1207 * about it; at least one public-domain implementation of those (highly
1208 * useful) string routines has been published on Usenet.
1212 * strcspn - find length of initial segment of s1 consisting entirely
1213 * of characters not from s2
1221 register char *scan1;
1222 register char *scan2;
1226 for (scan1 = s1; *scan1 != '\0'; scan1++) {
1227 for (scan2 = s2; *scan2 != '\0';) /* ++ moved down. */
1228 if (*scan1 == *scan2++)