1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
6 This is a library of functions to support regular expressions whose syntax
7 and semantics are as close as possible to those of the Perl 5 language. See
8 the file Tech.Notes for some information on the internals.
10 Written by: Philip Hazel <ph10@cam.ac.uk>
12 Copyright (c) 1997-2001 University of Cambridge
14 -----------------------------------------------------------------------------
15 Permission is granted to anyone to use this software for any purpose on any
16 computer system, and to redistribute it freely, subject to the following
19 1. This software is distributed in the hope that it will be useful,
20 but WITHOUT ANY WARRANTY; without even the implied warranty of
21 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
23 2. The origin of this software must not be misrepresented, either by
24 explicit claim or by omission.
26 3. Altered versions must be plainly marked as such, and must not be
27 misrepresented as being the original software.
29 4. If PCRE is embedded in any software that is released under the GNU
30 General Purpose Licence (GPL), then the terms of that licence shall
31 supersede any condition above with which it is incompatible.
32 -----------------------------------------------------------------------------
36 /* Define DEBUG to get debugging output on stdout. */
40 /* Use a macro for debugging printing, 'cause that eliminates the use of #ifdef
41 inline, and there are *still* stupid compilers about that don't like indented
42 pre-processor statements. I suppose it's only been 10 years... */
45 #define DPRINTF(p) printf p
47 #define DPRINTF(p) /*nothing*/
50 /* Include the internals header, which itself includes Standard C headers plus
51 the external pcre header. */
56 /* Allow compilation as C++ source code, should anybody want to do that. */
59 #define class pcre_class
63 /* Maximum number of items on the nested bracket stacks at compile time. This
64 applies to the nesting of all kinds of parentheses. It does not limit
65 un-nested, non-capturing parentheses. This number can be made bigger if
66 necessary - it is used to dimension one int and one unsigned char vector at
69 #define BRASTACK_SIZE 200
72 /* The number of bytes in a literal character string above which we can't add
73 any more is different when UTF-8 characters may be encountered. */
82 /* Min and max values for the common repeats; for the maxima, 0 => infinity */
84 static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };  /* minimum count for each common repeat; NOTE(review): six entries, presumably in the order * *? + +? ? ?? - confirm against the opcode list */
85 static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };  /* maximum count for each common repeat; 0 stands for "no upper limit" */
87 /* Text forms of OP_ values and things, for debugging (not all used) */
90 static const char *OP_names[] = {
91 "End", "\\A", "\\B", "\\b", "\\D", "\\d",
92 "\\S", "\\s", "\\W", "\\w", "\\Z", "\\z",
93 "Opt", "^", "$", "Any", "chars", "not",
94 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
95 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
96 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
97 "*", "*?", "+", "+?", "?", "??", "{", "{",
98 "class", "Ref", "Recurse",
99 "Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not",
100 "AssertB", "AssertB not", "Reverse", "Once", "Cond", "Cref",
101 "Brazero", "Braminzero", "Branumber", "Bra"
105 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
106 are simple data values; negative values are for special things like \d and so
107 on. Zero means further processing is needed (for things like \x), or the escape
110 static const short int escapes[] = {
111 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
112 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
113 '@', -ESC_A, -ESC_B, 0, -ESC_D, 0, 0, 0, /* @ - G */
114 0, 0, 0, 0, 0, 0, 0, 0, /* H - O */
115 0, 0, 0, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */
116 0, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
117 '`', 7, -ESC_b, 0, -ESC_d, ESC_E, ESC_F, 0, /* ` - g */
118 0, 0, 0, 0, 0, 0, ESC_N, 0, /* h - o */
119 0, 0, ESC_R, -ESC_s, ESC_T, 0, 0, -ESC_w, /* p - w */
120 0, 0, -ESC_z /* x - z */
123 /* Tables of names of POSIX character classes and their lengths. The list is
124 terminated by a zero length entry. The first three must be alpha, lower, upper,
125 as this is assumed for handling case independence. */
127 static const char *posix_names[] = {  /* the index of a name in this table is the value returned by check_posix_name() */
128 "alpha", "lower", "upper",  /* indices 0-2: the caseless handling in compile_branch() tests posix_class <= 2 */
129 "alnum", "ascii", "cntrl", "digit", "graph",
130 "print", "punct", "space", "word", "xdigit" };
132 static const uschar posix_name_lengths[] = {  /* parallel to posix_names: the length of each name */
133 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };  /* the trailing 0 ends the search loop in check_posix_name() */
135 /* Table of class bit maps for each POSIX class; up to three may be combined
136 to form the class. */
138 static const int posix_class_maps[] = {
139 cbit_lower, cbit_upper, -1, /* alpha */
140 cbit_lower, -1, -1, /* lower */
141 cbit_upper, -1, -1, /* upper */
142 cbit_digit, cbit_lower, cbit_upper, /* alnum */
143 cbit_print, cbit_cntrl, -1, /* ascii */
144 cbit_cntrl, -1, -1, /* cntrl */
145 cbit_digit, -1, -1, /* digit */
146 cbit_graph, -1, -1, /* graph */
147 cbit_print, -1, -1, /* print */
148 cbit_punct, -1, -1, /* punct */
149 cbit_space, -1, -1, /* space */
150 cbit_word, -1, -1, /* word */
151 cbit_xdigit,-1, -1 /* xdigit */
155 /* Definition to allow mutual recursion */
158 compile_regex(int, int, int *, uschar **, const uschar **, const char **,
159 BOOL, int, int *, int *, compile_data *);
161 /* Structure for building a chain of data that actually lives on the
162 stack, for holding the values of the subject pointer at the start of each
163 subpattern, so as to detect when an empty string has been matched by a
164 subpattern - to break infinite loops. */
166 typedef struct eptrblock {
167 struct eptrblock *prev;
168 const uschar *saved_eptr;
171 /* Flag bits for the match() function */
173 #define match_condassert 0x01 /* Called to check a condition assertion */
174 #define match_isgroup 0x02 /* Set if start of bracketed group */
178 /*************************************************
180 *************************************************/
182 /* PCRE is thread-clean and doesn't use any global variables in the normal
183 sense. However, it calls memory allocation and free functions via the two
184 indirections below, which can be changed by the caller, but are shared
185 between all threads. */
187 void *(*pcre_malloc)(size_t) = malloc;  /* allocation hook; initialised to the C library malloc() */
188 void (*pcre_free)(void *) = free;  /* release hook; initialised to the C library free() */
192 /*************************************************
193 * Macros and tables for character handling *
194 *************************************************/
196 /* When UTF-8 encoding is being used, a character is no longer just a single
197 byte. The macros for character handling generate simple sequences when used in
198 byte-mode, and more complicated ones for UTF-8 characters. */
201 #define GETCHARINC(c, eptr) c = *eptr++;
202 #define GETCHARLEN(c, eptr, len) c = *eptr;
203 #define BACKCHAR(eptr)
205 #else /* SUPPORT_UTF8 */
207 /* Get the next UTF-8 character, advancing the pointer */
209 #define GETCHARINC(c, eptr) \
211 if (md->utf8 && (c & 0xc0) == 0xc0) \
213 int a = utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
215 c = (c & utf8_table3[a]) << s; \
219 c |= (*eptr++ & 0x3f) << s; \
223 /* Get the next UTF-8 character, not advancing the pointer, setting length */
225 #define GETCHARLEN(c, eptr, len) \
228 if (md->utf8 && (c & 0xc0) == 0xc0) \
231 int a = utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
233 c = (c & utf8_table3[a]) << s; \
234 for (i = 1; i <= a; i++) \
237 c |= (eptr[i] & 0x3f) << s; \
242 /* If the pointer is not at the start of a character, move it back until
245 #define BACKCHAR(eptr) while((*eptr & 0xc0) == 0x80) eptr--;
251 /*************************************************
252 * Default character tables *
253 *************************************************/
255 /* A default set of character tables is included in the PCRE binary. Its source
256 is built by the maketables auxiliary program, which uses the default C ctypes
257 functions, and put in the file chartables.c. These tables are used by PCRE
258 whenever the caller of pcre_compile() does not provide an alternate set of
261 #include "chartables.c"
266 /*************************************************
267 * Tables for UTF-8 support *
268 *************************************************/
270 /* These are the breakpoints for different numbers of bytes in a UTF-8
273 static int utf8_table1[] = { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff};  /* entry i: largest value that ord2utf8() encodes with i continuation bytes */
275 /* These are the indicator bits and the mask for the data bits to set in the
276 first byte of a character, indexed by the number of additional bytes. */
278 static int utf8_table2[] = { 0, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc};  /* indicator bits OR-ed into the first byte by ord2utf8() */
279 static int utf8_table3[] = { 0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01};  /* mask that keeps only the data bits of the first byte (used by GETCHARINC/GETCHARLEN) */
281 /* Table of the number of extra bytes, indexed by the first byte
282 masked with 0x3f. The highest number for a valid UTF-8 character is in fact
285 static uschar utf8_table4[] = {  /* index is (first byte & 0x3f); callers only look up bytes whose top two bits are set */
286 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  /* first byte 0xc0-0xcf */
287 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  /* first byte 0xd0-0xdf */
288 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,  /* first byte 0xe0-0xef */
289 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 };  /* 0xf0-0xf7: 3, 0xf8-0xfb: 4, 0xfc-0xff: 5 */
292 /*************************************************
293 * Convert character value to UTF-8 *
294 *************************************************/
296 /* This function takes an integer value in the range 0 - 0x7fffffff
297 and encodes it as a UTF-8 character in 1 to 6 bytes.
300 cvalue the character value
301 buffer pointer to buffer for result - at least 6 bytes long
303 Returns: number of bytes placed in the buffer
307 ord2utf8(int cvalue, uschar *buffer)
310 for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++)  /* find the first breakpoint that can hold cvalue; i is then the number of continuation bytes */
311 if (cvalue <= utf8_table1[i]) break;
313 for (j = i; j > 0; j--)  /* one pass per continuation byte */
315 *buffer-- = 0x80 | (cvalue & 0x3f);  /* 10xxxxxx byte carrying six data bits; the pointer walks backwards through the buffer */
318 *buffer = utf8_table2[i] | cvalue;  /* first byte: length indicator bits combined with the remaining data bits */
325 /*************************************************
326 * Return version string *
327 *************************************************/
329 #define STRING(a) # a  /* stringize the argument exactly as written */
330 #define XSTRING(s) STRING(s)  /* extra level so that s is macro-expanded before being stringized (applied to PCRE_MAJOR etc. below) */
335 return XSTRING(PCRE_MAJOR) "." XSTRING(PCRE_MINOR) " " XSTRING(PCRE_DATE);
341 /*************************************************
342 * (Obsolete) Return info about compiled pattern *
343 *************************************************/
345 /* This is the original "info" function. It picks potentially useful data out
346 of the private structure, but its interface was too rigid. It remains for
347 backwards compatibility. The public options are passed back in an int - though
348 the re->options field has been expanded to a long int, all the public options
349 are at the low end of it, and so even on 16-bit systems this will still be OK.
350 Therefore, I haven't changed the API for pcre_info().
353 external_re points to compiled code
354 optptr where to pass back the options
355 first_char where to pass back the first character,
356 or -1 if multiline and all branches start ^,
359 Returns: number of capturing subpatterns
360 or negative values on error
364 pcre_info(const pcre *external_re, int *optptr, int *first_char)
366 const real_pcre *re = (const real_pcre *)external_re;  /* view the opaque handle as the internal structure */
367 if (re == NULL) return PCRE_ERROR_NULL;
368 if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;  /* reject a block that does not carry the expected magic number */
369 if (optptr != NULL) *optptr = (int)(re->options & PUBLIC_OPTIONS);  /* optptr is optional; only the public option bits are passed back */
370 if (first_char != NULL)  /* first_char is optional as well */
371 *first_char = ((re->options & PCRE_FIRSTSET) != 0)? re->first_char :  /* a first character was recorded for the pattern */
372 ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;  /* -1 when PCRE_STARTLINE is set, otherwise -2 */
373 return re->top_bracket;
378 /*************************************************
379 * Return info about compiled pattern *
380 *************************************************/
382 /* This is a newer "info" function which has an extensible interface so
383 that additional items can be added compatibly.
386 external_re points to compiled code
387 study_data points to study data, or NULL
388 what what information is required
389 where where to put the information
391 Returns: 0 if data returned, negative on error
395 pcre_fullinfo(const pcre *external_re, const pcre_extra *study_data, int what,
398 const real_pcre *re = (const real_pcre *)external_re;
399 const real_pcre_extra *study = (const real_pcre_extra *)study_data;
401 if (re == NULL || where == NULL) return PCRE_ERROR_NULL;  /* the pattern and the output pointer are both mandatory; study data may be NULL */
402 if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
406 case PCRE_INFO_OPTIONS:
407 *((unsigned long int *)where) = re->options & PUBLIC_OPTIONS;  /* where is written as an unsigned long int */
411 *((size_t *)where) = re->size;  /* where is written as a size_t */
414 case PCRE_INFO_CAPTURECOUNT:
415 *((int *)where) = re->top_bracket;  /* where is written as an int */
418 case PCRE_INFO_BACKREFMAX:
419 *((int *)where) = re->top_backref;  /* where is written as an int */
422 case PCRE_INFO_FIRSTCHAR:
424 ((re->options & PCRE_FIRSTSET) != 0)? re->first_char :
425 ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;  /* same -1 / -2 encoding as pcre_info() */
428 case PCRE_INFO_FIRSTTABLE:
429 *((const uschar **)where) =
430 (study != NULL && (study->options & PCRE_STUDY_MAPPED) != 0)?
431 study->start_bits : NULL;  /* NULL unless study data with PCRE_STUDY_MAPPED set was supplied */
434 case PCRE_INFO_LASTLITERAL:
436 ((re->options & PCRE_REQCHSET) != 0)? re->req_char : -1;  /* -1 when no required character was recorded */
439 default: return PCRE_ERROR_BADOPTION;  /* unrecognised "what" code */
448 /*************************************************
449 * Debugging function to print chars *
450 *************************************************/
452 /* Print a sequence of chars in printable format, stopping at the end of the
453 subject if requested.
456 p points to characters
457 length number to print
458 is_subject TRUE if printing from within md->start_subject
459 md pointer to matching data block, if is_subject is TRUE
465 pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
468 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;  /* clamp so that printing stops at the end of the subject */
470 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);  /* bytes that fail isprint() are shown as \xhh */
477 /*************************************************
479 *************************************************/
481 /* This function is called when a \ has been encountered. It either returns a
482 positive value for a simple escape such as \n, or a negative value which
483 encodes one of the more complicated things such as \d. When UTF-8 is enabled,
484 a positive value greater than 255 may be returned. On entry, ptr is pointing at
485 the \. On exit, it is on the final character of the escape sequence.
488 ptrptr points to the pattern position pointer
489 errorptr points to the pointer to the error message
490 bracount number of previous extracting brackets
491 options the options bits
492 isclass TRUE if inside a character class
493 cd pointer to char tables block
495 Returns: zero or positive => a data character
496 negative => a special escape sequence
497 on error, errorptr is set
501 check_escape(const uschar **ptrptr, const char **errorptr, int bracount,
502 int options, BOOL isclass, compile_data *cd)
504 const uschar *ptr = *ptrptr;
507 /* If backslash is at the end of the pattern, it's an error. */
510 if (c == 0) *errorptr = ERR1;
512 /* Digits or letters may have special meaning; all others are literals. */
514 else if (c < '0' || c > 'z') {}  /* outside the range covered by escapes[]: c is left unchanged */
516 /* Do an initial lookup in a table. A non-zero result is something that can be
517 returned immediately. Otherwise further processing may be required. */
519 else if ((i = escapes[c - '0']) != 0) c = i;  /* the table's first entry corresponds to '0' */
521 /* Escapes that need further processing, or are illegal. */
525 const uschar *oldptr;
528 /* The handling of escape sequences consisting of a string of digits
529 starting with one that is not zero is not straightforward. By experiment,
530 the way Perl works seems to be as follows:
532 Outside a character class, the digits are read as a decimal number. If the
533 number is less than 10, or if there are that many previous extracting
534 left brackets, then it is a back reference. Otherwise, up to three octal
535 digits are read to form an escaped byte. Thus \123 is likely to be octal
536 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
537 value is greater than 377, the least significant 8 bits are taken. Inside a
538 character class, \ followed by a digit is always an octal number. */
540 case '1': case '2': case '3': case '4': case '5':
541 case '6': case '7': case '8': case '9':
547 while ((cd->ctypes[ptr[1]] & ctype_digit) != 0)  /* accumulate the whole run of decimal digits */
548 c = c * 10 + *(++ptr) - '0';
549 if (c < 10 || c <= bracount)  /* the back-reference test described in the comment above */
554 ptr = oldptr; /* Put the pointer back and fall through */
557 /* Handle an octal number following \. If the first digit is 8 or 9, Perl
558 generates a binary zero byte and treats the digit as a following literal.
559 Thus we have to pull back the pointer by one. */
561 if ((c = *ptr) >= '8')
568 /* \0 always starts an octal number, but we may drop through to here with a
569 larger first octal digit. */
573 while(i++ < 2 && (cd->ctypes[ptr[1]] & ctype_digit) != 0 &&  /* the loop body runs at most twice, and only while the next char is a digit ... */
574 ptr[1] != '8' && ptr[1] != '9')  /* ... other than 8 or 9, i.e. an octal digit */
575 c = c * 8 + *(++ptr) - '0';
576 c &= 255; /* Take least significant 8 bits */
579 /* \x is complicated when UTF-8 is enabled. \x{ddd} is a character number
580 which can be greater than 0xff, but only if the ddd are hex digits. */
584 if (ptr[1] == '{' && (options & PCRE_UTF8) != 0)
586 const uschar *pt = ptr + 2;  /* first character after the opening brace */
587 register int count = 0;
589 while ((cd->ctypes[*pt] & ctype_xdigit) != 0)
592 c = c * 16 + cd->lcc[*pt] -
593 (((cd->ctypes[*pt] & ctype_digit) != 0)? '0' : 'W');  /* in ASCII 'W' == 'a' - 10, so a letter a-f taken from cd->lcc yields 10-15 */
598 if (c < 0 || count > 8) *errorptr = ERR34;  /* negative value or a count above 8 is an error */
602 /* If the sequence of hex digits does not end with '}', then we don't
603 recognize this construct; fall through to the normal \x handling. */
607 /* Read a single character, given as up to two hex digits */
610 while (i++ < 2 && (cd->ctypes[ptr[1]] & ctype_xdigit) != 0)
613 c = c * 16 + cd->lcc[*ptr] -
614 (((cd->ctypes[*ptr] & ctype_digit) != 0)? '0' : 'W');  /* same digit-value computation as in the braced form above */
618 /* Other special escapes not starting with a digit are straightforward */
628 /* A letter is upper-cased; then the 0x40 bit is flipped */
630 if (c >= 'a' && c <= 'z') c = cd->fcc[c];  /* the upper-casing step, done through the cd->fcc table */
634 /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
635 other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
636 for Perl compatibility, it is a literal. This code looks a bit odd, but
637 there used to be some cases other than the default, and there may be again
638 in future, so I haven't "optimized" it. */
641 if ((options & PCRE_EXTRA) != 0) switch(c)
657 /*************************************************
658 * Check for counted repeat *
659 *************************************************/
661 /* This function is called when a '{' is encountered in a place where it might
662 start a quantifier. It looks ahead to see if it really is a quantifier or not.
663 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
664 where the ddds are digits.
667 p pointer to the first char after '{'
668 cd pointer to char tables block
670 Returns: TRUE or FALSE
674 is_counted_repeat(const uschar *p, compile_data *cd)
676 if ((cd->ctypes[*p++] & ctype_digit) == 0) return FALSE;  /* a quantifier must start with a digit */
677 while ((cd->ctypes[*p] & ctype_digit) != 0) p++;  /* skip the rest of the first number */
678 if (*p == '}') return TRUE;  /* the {ddd} form */
680 if (*p++ != ',') return FALSE;  /* anything other than ',' here means it is not a quantifier */
681 if (*p == '}') return TRUE;  /* the {ddd,} form */
683 if ((cd->ctypes[*p++] & ctype_digit) == 0) return FALSE;  /* a second number must follow the comma */
684 while ((cd->ctypes[*p] & ctype_digit) != 0) p++;  /* skip the rest of the second number */
690 /*************************************************
691 * Read repeat counts *
692 *************************************************/
694 /* Read an item of the form {n,m} and return the values. This is called only
695 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
696 so the syntax is guaranteed to be correct, but we need to check the values.
699 p pointer to first char after '{'
700 minp pointer to int for min
701 maxp pointer to int for max
702 returned as -1 if no max
703 errorptr points to pointer to error message
704 cd pointer to character tables block
706 Returns: pointer to '}' on success;
707 current ptr on error, with errorptr set
710 static const uschar *
711 read_repeat_counts(const uschar *p, int *minp, int *maxp,
712 const char **errorptr, compile_data *cd)
717 while ((cd->ctypes[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';  /* accumulate the decimal minimum */
719 if (*p == '}') max = min; else  /* {n}: the maximum equals the minimum */
724 while((cd->ctypes[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';  /* accumulate the decimal maximum */
733 /* Do paranoid checks, then fill in the required variables, and pass back the
734 pointer to the terminating '}'. */
736 if (min > 65535 || max > 65535)  /* 65535 is the largest count that fits the two-byte form written by compile_branch() */
748 /*************************************************
749 * Find the fixed length of a pattern *
750 *************************************************/
752 /* Scan a pattern and compute the fixed length of subject that will match it,
753 if the length is fixed. This is needed for dealing with backward assertions.
756 code points to the start of the pattern (the bracket)
757 options the compiling options
759 Returns: the fixed length, or -1 if there is no fixed length
763 find_fixedlength(uschar *code, int options)
767 register int branchlength = 0;  /* length accumulated for the branch currently being scanned */
768 register uschar *cc = code + 3;  /* start 3 bytes past the bracket; NOTE(review): presumably opcode plus a two-byte link, as the (cc[1] << 8) + cc[2] reads suggest - confirm */
770 /* Scan along the opcodes for this branch. If we get to the end of the
771 branch, check the length against that of the other branches. */
776 register int op = *cc;
777 if (op >= OP_BRA) op = OP_BRA;  /* every opcode at or above OP_BRA is treated as OP_BRA */
784 d = find_fixedlength(cc, options);  /* recurse into the nested group */
785 if (d < 0) return -1;  /* a variable-length group makes the whole pattern variable */
787 do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);  /* follow the two-byte links past every alternative of the group */
791 /* Reached end of a branch; if it's a ket it is the end of a nested
792 call. If it's ALT it is an alternation in a nested call. If it is
793 END it's the end of the outer call. All can be handled by the same code. */
800 if (length < 0) length = branchlength;  /* a negative length means no branch has been recorded yet */
801 else if (length != branchlength) return -1;  /* branches of different lengths: not fixed */
802 if (*cc != OP_ALT) return length;  /* no further alternative: done */
807 /* Skip over assertive subpatterns */
812 case OP_ASSERTBACK_NOT:
813 do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);  /* same link-following skip as for a nested group */
817 /* Skip over things that don't match chars */
834 case OP_NOT_WORD_BOUNDARY:
835 case OP_WORD_BOUNDARY:
839 /* Handle char strings. In UTF-8 mode we must count characters, not bytes.
840 This requires a scan of the string, unfortunately. We assume valid UTF-8
841 strings, so all we do is reduce the length by one for each byte whose bits are
845 branchlength += *(++cc);  /* the byte after the opcode is the string's byte count */
847 for (d = 1; d <= *cc; d++)
848 if ((cc[d] & 0xc0) == 0x80) branchlength--;  /* a 10xxxxxx continuation byte does not start a new character */
853 /* Handle exact repetitions */
857 branchlength += (cc[1] << 8) + cc[2];  /* two-byte repeat count */
861 /* Handle single-char matchers */
865 case OP_NOT_WHITESPACE:
867 case OP_NOT_WORDCHAR:
875 /* Check a class for variable quantification */
890 if ((cc[1] << 8) + cc[2] != (cc[3] << 8) + cc[4]) return -1;  /* the two two-byte counts must be equal for a fixed length */
891 branchlength += (cc[1] << 8) + cc[2];
900 /* Anything else is variable length */
906 /* Control never gets here */
912 /*************************************************
913 * Check for POSIX class syntax *
914 *************************************************/
916 /* This function is called when the sequence "[:" or "[." or "[=" is
917 encountered in a character class. It checks whether this is followed by an
918 optional ^ and then a sequence of letters, terminated by a matching ":]" or
922 ptr pointer to the initial [
923 endptr where to return the end pointer
924 cd pointer to compile data
926 Returns: TRUE or FALSE
930 check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
932 int terminator; /* Don't combine these lines; the Solaris cc */
933 terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
934 if (*(++ptr) == '^') ptr++;  /* step over the optional negation */
935 while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;  /* step over the letters of the name */
936 if (*ptr == terminator && ptr[1] == ']')  /* the closing pair must repeat the character that followed the initial '[' */
947 /*************************************************
948 * Check POSIX class name *
949 *************************************************/
951 /* This function is called to check the name given in a POSIX-style class entry
955 ptr points to the first letter
956 len the length of the name
958 Returns: a value representing the name, or -1 if unknown
962 check_posix_name(const uschar *ptr, int len)
964 register int yield = 0;  /* index into posix_names / posix_name_lengths */
965 while (posix_name_lengths[yield] != 0)  /* a zero length marks the end of the table */
967 if (len == posix_name_lengths[yield] &&  /* compare lengths first, then the text */
968 strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;
977 /*************************************************
978 * Compile one branch *
979 *************************************************/
981 /* Scan the pattern, compiling it into the code vector.
984 options the option bits
985 brackets points to number of extracting brackets used
986 code points to the pointer to the current code point
987 ptrptr points to the current pattern pointer
988 errorptr points to pointer to error message
989 optchanged set to the value of the last OP_OPT item compiled
990 reqchar set to the last literal character required, else -1
991 countlits set to count of mandatory literal characters
992 cd contains pointers to tables
994 Returns: TRUE on success
995 FALSE, with *errorptr set on error
999 compile_branch(int options, int *brackets, uschar **codeptr,
1000 const uschar **ptrptr, const char **errorptr, int *optchanged,
1001 int *reqchar, int *countlits, compile_data *cd)
1003 int repeat_type, op_type;
1004 int repeat_min, repeat_max;
1005 int bravalue, length;
1006 int greedy_default, greedy_non_default;
1009 int subcountlits = 0;
1011 register uschar *code = *codeptr;
1013 const uschar *ptr = *ptrptr;
1014 const uschar *tempptr;
1015 uschar *previous = NULL;
1018 /* Set up the default and non-default settings for greediness */
1020 greedy_default = ((options & PCRE_UNGREEDY) != 0);
1021 greedy_non_default = greedy_default ^ 1;
1023 /* Initialize no required char, and count of literals */
1025 *reqchar = prevreqchar = -1;
1028 /* Switch on next character until the end of the branch */
1033 int class_charcount;
1040 if ((options & PCRE_EXTENDED) != 0)
1042 if ((cd->ctypes[c] & ctype_space) != 0) continue;
1045 /* The space before the ; is to avoid a warning on a silly compiler
1046 on the Macintosh. */
1047 while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
1054 /* The branch terminates at end of string, |, or ). */
1063 /* Handle single-character metacharacters */
1080 /* Character classes. These always build a 32-byte bitmap of the permitted
1081 characters, except in the special case where there is only one character.
1082 For negated classes, we build the map as usual, then invert it at the end.
1089 /* If the first character is '^', set the negation flag and skip it. */
1091 if ((c = *(++ptr)) == '^')
1093 negate_class = TRUE;
1096 else negate_class = FALSE;
1098 /* Keep a count of chars so that we can optimize the case of just a single
1101 class_charcount = 0;
1102 class_lastchar = -1;
1104 /* Initialize the 32-char bit map to all zeros. We have to build the
1105 map in a temporary bit of store, in case the class contains only 1
1106 character, because in that case the compiled code doesn't use the
1109 memset(class, 0, 32 * sizeof(uschar));
1111 /* Process characters until ] is reached. By writing this as a "do" it
1112 means that an initial ] is taken as a data character. */
1122 /* Handle POSIX class names. Perl allows a negation extension of the
1123 form [:^name]. A square bracket that doesn't match the syntax is
1124 treated as a literal. We also recognize the POSIX constructions
1125 [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
1129 (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
1130 check_posix_syntax(ptr, &tempptr, cd))
1132 BOOL local_negate = FALSE;
1134 register const uschar *cbits = cd->cbits;
1145 local_negate = TRUE;
1149 posix_class = check_posix_name(ptr, tempptr - ptr);
1150 if (posix_class < 0)
1156 /* If matching is caseless, upper and lower are converted to
1157 alpha. This relies on the fact that the class table starts with
1158 alpha, lower, upper as the first 3 entries. */
1160 if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
1163 /* Or into the map we are building up to 3 of the static class
1164 tables, or their negations. */
1167 for (i = 0; i < 3; i++)
1169 int taboffset = posix_class_maps[posix_class + i];
1170 if (taboffset < 0) break;
1172 for (c = 0; c < 32; c++) class[c] |= ~cbits[c+taboffset];
1174 for (c = 0; c < 32; c++) class[c] |= cbits[c+taboffset];
1178 class_charcount = 10; /* Set > 1; assumes more than 1 per class */
1182 /* Backslash may introduce a single character, or it may introduce one
1183 of the specials, which just set a flag. Escaped items are checked for
1184 validity in the pre-compiling pass. The sequence \b is a special case.
1185 Inside a class (and only there) it is treated as backspace. Elsewhere
1186 it marks a word boundary. Other escapes have preset maps ready to
1187 or into the one we are building. We assume they have more than one
1188 character in them, so set class_charcount bigger than one. */
1192 c = check_escape(&ptr, errorptr, *brackets, options, TRUE, cd);
1193 if (-c == ESC_b) c = '\b';
1196 register const uschar *cbits = cd->cbits;
1197 class_charcount = 10;
1201 for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_digit];
1205 for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_digit];
1209 for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_word];
1213 for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_word];
1217 for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_space];
1221 for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_space];
1230 /* Fall through if single character, but don't at present allow
1231 chars > 255 in UTF-8 mode. */
1242 /* A single character may be followed by '-' to form a range. However,
1243 Perl does not permit ']' to be the end of the range. A '-' character
1244 here is treated as a literal. */
1246 if (ptr[1] == '-' && ptr[2] != ']')
1258 /* The second part of a range can be a single-character escape, but
1259 not any of the other escapes. Perl 5.6 treats a hyphen as a literal
1260 in such circumstances. */
1264 const uschar *oldptr = ptr;
1265 d = check_escape(&ptr, errorptr, *brackets, options, TRUE, cd);
1274 /* \b is backspace; any other special means the '-' was literal */
1278 if (d == -ESC_b) d = '\b'; else
1281 goto SINGLE_CHARACTER; /* A few lines below */
1294 class[c/8] |= (1 << (c&7));
1295 if ((options & PCRE_CASELESS) != 0)
1297 int uc = cd->fcc[c]; /* flip case */
1298 class[uc/8] |= (1 << (uc&7));
1300 class_charcount++; /* in case a one-char range */
1303 continue; /* Go get the next char in the class */
1306 /* Handle a lone single character - we can get here for a normal
1307 non-escape char, or after \ that introduces a single character. */
1311 class [c/8] |= (1 << (c&7));
1312 if ((options & PCRE_CASELESS) != 0)
1314 c = cd->fcc[c]; /* flip case */
1315 class[c/8] |= (1 << (c&7));
1321 /* Loop until ']' reached; the check for end of string happens inside the
1322 loop. This "while" is the end of the "do" above. */
1324 while ((c = *(++ptr)) != ']');
1326 /* If class_charcount is 1 and class_lastchar is not negative, we saw
1327 precisely one character. This doesn't need the whole 32-byte bit map.
1328 We turn it into a 1-character OP_CHARS if it's positive, or OP_NOT if
1331 if (class_charcount == 1 && class_lastchar >= 0)
1339 code[-1] = OP_CHARS;
1342 *code++ = class_lastchar;
1345 /* Otherwise, negate the 32-byte map if necessary, and copy it into
1351 for (c = 0; c < 32; c++) code[c] = ~class[c];
1353 memcpy(code, class, 32);
1358 /* Various kinds of repeat */
1361 if (!is_counted_repeat(ptr+1, cd)) goto NORMAL_CHAR;
1362 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorptr, cd);
1363 if (*errorptr != NULL) goto FAILED;
1381 if (previous == NULL)
1387 /* If the next character is '?' this is a minimizing repeat, by default,
1388 but if PCRE_UNGREEDY is set, it works the other way round. Advance to the
1392 { repeat_type = greedy_non_default; ptr++; }
1393 else repeat_type = greedy_default;
1395 /* If previous was a string of characters, chop off the last one and use it
1396 as the subject of the repeat. If there was only one character, we can
1397 abolish the previous item altogether. A repeat with a zero minimum wipes
1398 out any reqchar setting, backing up to the previous value. We must also
1399 adjust the countlits value. */
1401 if (*previous == OP_CHARS)
1403 int len = previous[1];
1405 if (repeat_min == 0) *reqchar = prevreqchar;
1406 *countlits += repeat_min - 1;
1415 c = previous[len+1];
1419 op_type = 0; /* Use single-char op codes */
1420 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
1423 /* If previous was a single negated character ([^a] or similar), we use
1424 one of the special opcodes, replacing it. The code is shared with single-
1425 character repeats by adding a suitable offset into repeat_type. */
1427 else if ((int)*previous == OP_NOT)
1429 op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
1432 goto OUTPUT_SINGLE_REPEAT;
1435 /* If previous was a character type match (\d or similar), abolish it and
1436 create a suitable repeat item. The code is shared with single-character
1437 repeats by adding a suitable offset into repeat_type. */
1439 else if ((int)*previous < OP_EODN || *previous == OP_ANY)
1441 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
1445 OUTPUT_SINGLE_REPEAT:
1447 /* If the maximum is zero then the minimum must also be zero; Perl allows
1448 this case, so we do too - by simply omitting the item altogether. */
1450 if (repeat_max == 0) goto END_REPEAT;
1452 /* Combine the op_type with the repeat_type */
1454 repeat_type += op_type;
1456 /* A minimum of zero is handled either as the special case * or ?, or as
1457 an UPTO, with the maximum given. */
1459 if (repeat_min == 0)
1461 if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
1462 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
1465 *code++ = OP_UPTO + repeat_type;
1466 *code++ = repeat_max >> 8;
1467 *code++ = (repeat_max & 255);
1471 /* The case {1,} is handled as the special case + */
1473 else if (repeat_min == 1 && repeat_max == -1)
1474 *code++ = OP_PLUS + repeat_type;
1476 /* The case {n,n} is just an EXACT, while the general case {n,m} is
1477 handled as an EXACT followed by an UPTO. An EXACT of 1 is optimized. */
1481 if (repeat_min != 1)
1483 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
1484 *code++ = repeat_min >> 8;
1485 *code++ = (repeat_min & 255);
1488 /* If the minimum is 1 and the previous item was a character string,
1489 we either have to put back the item that got cancelled if the string
1490 length was 1, or add the character back onto the end of a longer
1491 string. For a character type nothing need be done; it will just get
1492 put back naturally. Note that the final character is always going to
1495 else if (*previous == OP_CHARS)
1497 if (code == previous) code += 2; else previous[1]++;
1500 /* For a single negated character we also have to put back the
1501 item that got cancelled. */
1503 else if (*previous == OP_NOT) code++;
1505 /* If the maximum is unlimited, insert an OP_STAR. */
1510 *code++ = OP_STAR + repeat_type;
1513 /* Else insert an UPTO if the max is greater than the min. */
1515 else if (repeat_max != repeat_min)
1518 repeat_max -= repeat_min;
1519 *code++ = OP_UPTO + repeat_type;
1520 *code++ = repeat_max >> 8;
1521 *code++ = (repeat_max & 255);
1525 /* The character or character type itself comes last in all cases. */
1530 /* If previous was a character class or a back reference, we put the repeat
1531 stuff after it, but just skip the item if the repeat was {0,0}. */
1533 else if (*previous == OP_CLASS || *previous == OP_REF)
1535 if (repeat_max == 0)
1540 if (repeat_min == 0 && repeat_max == -1)
1541 *code++ = OP_CRSTAR + repeat_type;
1542 else if (repeat_min == 1 && repeat_max == -1)
1543 *code++ = OP_CRPLUS + repeat_type;
1544 else if (repeat_min == 0 && repeat_max == 1)
1545 *code++ = OP_CRQUERY + repeat_type;
1548 *code++ = OP_CRRANGE + repeat_type;
1549 *code++ = repeat_min >> 8;
1550 *code++ = repeat_min & 255;
1551 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
1552 *code++ = repeat_max >> 8;
1553 *code++ = repeat_max & 255;
1557 /* If previous was a bracket group, we may have to replicate it in certain
1560 else if ((int)*previous >= OP_BRA || (int)*previous == OP_ONCE ||
1561 (int)*previous == OP_COND)
1565 int len = code - previous;
1566 uschar *bralink = NULL;
1568 /* If the maximum repeat count is unlimited, find the end of the bracket
1569 by scanning through from the start, and compute the offset back to it
1570 from the current code pointer. There may be an OP_OPT setting following
1571 the final KET, so we can't find the end just by going back from the code
1574 if (repeat_max == -1)
1576 register uschar *ket = previous;
1577 do ket += (ket[1] << 8) + ket[2]; while (*ket != OP_KET);
1578 ketoffset = code - ket;
1581 /* The case of a zero minimum is special because of the need to stick
1582 OP_BRAZERO in front of it, and because the group appears once in the
1583 data, whereas in other cases it appears the minimum number of times. For
1584 this reason, it is simplest to treat this case separately, as otherwise
1585 the code gets far too messy. There are several special subcases when the
1588 if (repeat_min == 0)
1590 /* If we set up a required char from the bracket, we must back off
1591 to the previous value and reset the countlits value too. */
1593 if (subcountlits > 0)
1595 *reqchar = prevreqchar;
1596 *countlits -= subcountlits;
1599 /* If the maximum is also zero, we just omit the group from the output
1602 if (repeat_max == 0)
1608 /* If the maximum is 1 or unlimited, we just have to stick in the
1609 BRAZERO and do no more at this point. */
1611 if (repeat_max <= 1)
1613 memmove(previous+1, previous, len);
1615 *previous++ = OP_BRAZERO + repeat_type;
1618 /* If the maximum is greater than 1 and limited, we have to replicate
1619 in a nested fashion, sticking OP_BRAZERO before each set of brackets.
1620 The first one has to be handled carefully because it's the original
1621 copy, which has to be moved up. The remainder can be handled by code
1622 that is common with the non-zero minimum case below. We just have to
1623 adjust the value of repeat_max, since one less copy is required. */
1628 memmove(previous+4, previous, len);
1630 *previous++ = OP_BRAZERO + repeat_type;
1631 *previous++ = OP_BRA;
1633 /* We chain together the bracket offset fields that have to be
1634 filled in later when the ends of the brackets are reached. */
1636 offset = (bralink == NULL)? 0 : previous - bralink;
1638 *previous++ = offset >> 8;
1639 *previous++ = offset & 255;
1645 /* If the minimum is greater than zero, replicate the group as many
1646 times as necessary, and adjust the maximum to the number of subsequent
1647 copies that we need. */
1651 for (i = 1; i < repeat_min; i++)
1653 memcpy(code, previous, len);
1656 if (repeat_max > 0) repeat_max -= repeat_min;
1659 /* This code is common to both the zero and non-zero minimum cases. If
1660 the maximum is limited, it replicates the group in a nested fashion,
1661 remembering the bracket starts on a stack. In the case of a zero minimum,
1662 the first one was set up above. In all cases the repeat_max now specifies
1663 the number of additional copies needed. */
1665 if (repeat_max >= 0)
1667 for (i = repeat_max - 1; i >= 0; i--)
1669 *code++ = OP_BRAZERO + repeat_type;
1671 /* All but the final copy start a new nesting, maintaining the
1672 chain of brackets outstanding. */
1678 offset = (bralink == NULL)? 0 : code - bralink;
1680 *code++ = offset >> 8;
1681 *code++ = offset & 255;
1684 memcpy(code, previous, len);
1688 /* Now chain through the pending brackets, and fill in their length
1689 fields (which are holding the chain links pro tem). */
1691 while (bralink != NULL)
1694 int offset = code - bralink + 1;
1695 uschar *bra = code - offset;
1696 oldlinkoffset = (bra[1] << 8) + bra[2];
1697 bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
1699 *code++ = bra[1] = offset >> 8;
1700 *code++ = bra[2] = (offset & 255);
1704 /* If the maximum is unlimited, set a repeater in the final copy. We
1705 can't just offset backwards from the current code point, because we
1706 don't know if there's been an options resetting after the ket. The
1707 correct offset was computed above. */
1709 else code[-ketoffset] = OP_KETRMAX + repeat_type;
1712 /* Else there's some kind of shambles */
1720 /* In all cases we no longer have a previous item. */
1727 /* Start of nested bracket sub-expression, or comment or lookahead or
1728 lookbehind or option setting or condition. First deal with special things
1729 that can come after a bracket; all are introduced by ?, and the appearance
1730 of any of them means that this is not a referencing group. They were
1731 checked for validity in the first pass over the string, so we don't have to
1732 check for syntax errors here. */
1735 newoptions = options;
1738 if (*(++ptr) == '?')
1745 case '#': /* Comment; skip to ket */
1747 while (*ptr != ')') ptr++;
1750 case ':': /* Non-extracting bracket */
1756 bravalue = OP_COND; /* Conditional group */
1757 if ((cd->ctypes[*(++ptr)] & ctype_digit) != 0)
1759 int condref = *ptr - '0';
1760 while (*(++ptr) != ')') condref = condref*10 + *ptr - '0';
1768 code[4] = condref >> 8;
1769 code[5] = condref & 255;
1775 case '=': /* Positive lookahead */
1776 bravalue = OP_ASSERT;
1780 case '!': /* Negative lookahead */
1781 bravalue = OP_ASSERT_NOT;
1785 case '<': /* Lookbehinds */
1788 case '=': /* Positive lookbehind */
1789 bravalue = OP_ASSERTBACK;
1793 case '!': /* Negative lookbehind */
1794 bravalue = OP_ASSERTBACK_NOT;
1798 default: /* Syntax error */
1804 case '>': /* One-time brackets */
1809 case 'R': /* Pattern recursion */
1810 *code++ = OP_RECURSE;
1814 default: /* Option setting */
1818 while (*ptr != ')' && *ptr != ':')
1822 case '-': optset = &unset; break;
1824 case 'i': *optset |= PCRE_CASELESS; break;
1825 case 'm': *optset |= PCRE_MULTILINE; break;
1826 case 's': *optset |= PCRE_DOTALL; break;
1827 case 'x': *optset |= PCRE_EXTENDED; break;
1828 case 'U': *optset |= PCRE_UNGREEDY; break;
1829 case 'X': *optset |= PCRE_EXTRA; break;
1837 /* Set up the changed option bits, but don't change anything yet. */
1839 newoptions = (options | set) & (~unset);
1841 /* If the options ended with ')' this is not the start of a nested
1842 group with option changes, so the options change at this level. At top
1843 level there is nothing else to be done (the options will in fact have
1844 been set from the start of compiling as a result of the first pass) but
1845 at an inner level we must compile code to change the ims options if
1846 necessary, and pass the new setting back so that it can be put at the
1847 start of any following branches, and when this group ends, a resetting
1848 item can be compiled. */
1852 if ((options & PCRE_INGROUP) != 0 &&
1853 (options & PCRE_IMS) != (newoptions & PCRE_IMS))
1856 *code++ = *optchanged = newoptions & PCRE_IMS;
1858 options = newoptions; /* Change options at this level */
1859 previous = NULL; /* This item can't be repeated */
1860 continue; /* It is complete */
1863 /* If the options ended with ':' we are heading into a nested group
1864 with possible change of options. Such groups are non-capturing and are
1865 not assertions of any kind. All we need to do is skip over the ':';
1866 the newoptions value is handled below. */
1873 /* Else we have a referencing group; adjust the opcode. If the bracket
1874 number is greater than EXTRACT_BASIC_MAX, we set the opcode one higher, and
1875 arrange for the true number to follow later, in an OP_BRANUMBER item. */
1879 if (++(*brackets) > EXTRACT_BASIC_MAX)
1881 bravalue = OP_BRA + EXTRACT_BASIC_MAX + 1;
1882 code[3] = OP_BRANUMBER;
1883 code[4] = *brackets >> 8;
1884 code[5] = *brackets & 255;
1887 else bravalue = OP_BRA + *brackets;
1890 /* Process nested bracketed re. Assertions may not be repeated, but other
1891 kinds can be. We copy code into a non-register variable in order to be able
1892 to pass its address because some compilers complain otherwise. Pass in a
1893 new setting for the ims options if they have changed. */
1895 previous = (bravalue >= OP_ONCE)? code : NULL;
1900 options | PCRE_INGROUP, /* Set for all nested groups */
1901 ((options & PCRE_IMS) != (newoptions & PCRE_IMS))?
1902 newoptions & PCRE_IMS : -1, /* Pass ims options if changed */
1903 brackets, /* Extracting bracket count */
1904 &tempcode, /* Where to put code (updated) */
1905 &ptr, /* Input pointer (updated) */
1906 errorptr, /* Where to put an error message */
1907 (bravalue == OP_ASSERTBACK ||
1908 bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
1909 skipbytes, /* Skip over OP_COND/OP_BRANUMBER */
1910 &subreqchar, /* For possible last char */
1911 &subcountlits, /* For literal count */
1912 cd)) /* Tables block */
1915 /* At the end of compiling, code is still pointing to the start of the
1916 group, while tempcode has been updated to point past the end of the group
1917 and any option resetting that may follow it. The pattern pointer (ptr)
1918 is on the bracket. */
1920 /* If this is a conditional bracket, check that there are no more than
1921 two branches in the group. */
1923 else if (bravalue == OP_COND)
1930 tc += (tc[1] << 8) | tc[2];
1932 while (*tc != OP_KET);
1941 /* Handle updating of the required character. If the subpattern didn't
1942 set one, leave it as it was. Otherwise, update it for normal brackets of
1943 all kinds, forward assertions, and conditions with two branches. Don't
1944 update the literal count for forward assertions, however. If the bracket
1945 is followed by a quantifier with zero repeat, we have to back off. Hence
1946 the definition of prevreqchar and subcountlits outside the main loop so
1947 that they can be accessed for the back off. */
1949 if (subreqchar > 0 &&
1950 (bravalue >= OP_BRA || bravalue == OP_ONCE || bravalue == OP_ASSERT ||
1951 (bravalue == OP_COND && condcount == 2)))
1953 prevreqchar = *reqchar;
1954 *reqchar = subreqchar;
1955 if (bravalue != OP_ASSERT) *countlits += subcountlits;
1958 /* Now update the main code pointer to the end of the group. */
1962 /* Error if hit end of pattern */
1971 /* Check \ for being a real metacharacter; if not, fall through and handle
1972 it as a data character at the start of a string. Escape items are checked
1973 for validity in the pre-compiling pass. */
1977 c = check_escape(&ptr, errorptr, *brackets, options, FALSE, cd);
1979 /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
1980 are arranged to be the negation of the corresponding OP_values. For the
1981 back references, the values are ESC_REF plus the reference number. Only
1982 back references and those types that consume a character may be repeated.
1983 We can test for values between ESC_b and ESC_Z for the latter; this may
1984 have to change if any new ones are ever created. */
1990 int number = -c - ESC_REF;
1993 *code++ = number >> 8;
1994 *code++ = number & 255;
1998 previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
2004 /* Data character: reset and fall through */
2009 /* Handle a run of data characters until a metacharacter is encountered.
2010 The first character is guaranteed not to be whitespace or # when the
2011 extended flag is set. */
2022 if ((options & PCRE_EXTENDED) != 0)
2024 if ((cd->ctypes[c] & ctype_space) != 0) continue;
2027 /* The space before the ; is to avoid a warning on a silly compiler
2028 on the Macintosh. */
2029 while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
2035 /* Backslash may introduce a data char or a metacharacter. Escaped items
2036 are checked for validity in the pre-compiling pass. Stop the string
2037 before a metaitem. */
2042 c = check_escape(&ptr, errorptr, *brackets, options, FALSE, cd);
2043 if (c < 0) { ptr = tempptr; break; }
2045 /* If a character is > 127 in UTF-8 mode, we have to turn it into
2046 two or more characters in the UTF-8 encoding. */
2049 if (c > 127 && (options & PCRE_UTF8) != 0)
2052 int len = ord2utf8(c, buffer);
2053 for (c = 0; c < len; c++) *code++ = buffer[c];
2060 /* Ordinary character or single-char escape */
2066 /* This "while" is the end of the "do" above. */
2068 while (length < MAXLIT && (cd->ctypes[c = *(++ptr)] & ctype_meta) == 0);
2070 /* Update the last character and the count of literals */
2072 prevreqchar = (length > 1)? code[-2] : *reqchar;
2073 *reqchar = code[-1];
2074 *countlits += length;
2076 /* Compute the length and set it in the data vector, and advance to
2079 previous[1] = length;
2080 if (length < MAXLIT) ptr--;
2083 } /* end of big loop */
2085 /* Control never reaches here by falling through, only by a goto for all the
2086 error states. Pass back the position in the pattern so that it can be displayed
2087 to the user for diagnosing the error. */
2097 /*************************************************
2098 * Compile sequence of alternatives *
2099 *************************************************/
2101 /* On entry, ptr is pointing past the bracket character, but on return
2102 it points to the closing bracket, or vertical bar, or end of string.
2103 The code variable is pointing at the byte into which the BRA operator has been
2104 stored. If the ims options are changed at the start (for a (?ims: group) or
2105 during any branch, we need to insert an OP_OPT item at the start of every
2106 following branch to ensure they get set correctly at run time, and also pass
2107 the new options into every subsequent branch compile.

Each branch is compiled by compile_branch(). When a branch is complete, its
length is stored, high byte first, in the two bytes after the opcode that
starts it; the length of the whole group is written in the same two-byte form
when the group is terminated. If optchanged is non-negative at that point, the
ims bits that were in force on entry (oldoptions) are written out as well.

2110 options the option bits
2111 optchanged new ims options to set as if (?ims) were at the start, or -1
 (a value >= 0 is written into the code at the start of a branch
 and replaces the PCRE_IMS bits of options)
2113 brackets -> int containing the number of extracting brackets used
2114 codeptr -> the address of the current code pointer
2115 ptrptr -> the address of the current pattern pointer
2116 errorptr -> pointer to error message
2117 lookbehind TRUE if this is a lookbehind assertion
 (an OP_REVERSE item is then written, and its two count bytes
 are filled in from the result of find_fixedlength())
2118 skipbytes skip this many bytes at start (for OP_COND, OP_BRANUMBER)
 (code generation begins 3 + skipbytes bytes past the opcode)
2119 reqchar -> place to put the last required character, or a negative number
 (-1 while unset; set to -2 when a branch supplies a character
 different from the one already recorded)
2120 countlits -> place to put the shortest literal count of any branch
 (initialised to INT_MAX and lowered as branches are compiled)
2121 cd points to the data block with tables pointers
2123 Returns: TRUE on success
2127 compile_regex(int options, int optchanged, int *brackets, uschar **codeptr,
2128 const uschar **ptrptr, const char **errorptr, BOOL lookbehind, int skipbytes,
2129 int *reqchar, int *countlits, compile_data *cd)
2131 const uschar *ptr = *ptrptr;
2132 uschar *code = *codeptr;
2133 uschar *last_branch = code;
2134 uschar *start_bracket = code;
2135 uschar *reverse_count = NULL;
2136 int oldoptions = options & PCRE_IMS;
2137 int branchreqchar, branchcountlits;
2140 *countlits = INT_MAX;
2141 code += 3 + skipbytes;
2143 /* Loop for each alternative branch */
2149 /* Handle change of options: when optchanged is non-negative its value is
written to the code and replaces the PCRE_IMS bits of options */
2151 if (optchanged >= 0)
2154 *code++ = optchanged;
2155 options = (options & ~PCRE_IMS) | optchanged;
2158 /* Set up dummy OP_REVERSE if lookbehind assertion; reverse_count notes where
the two count bytes go so that they can be filled in after the branch has been
compiled */
2162 *code++ = OP_REVERSE;
2163 reverse_count = code;
2168 /* Now compile the branch */
2170 if (!compile_branch(options, brackets, &code, &ptr, errorptr, &optchanged,
2171 &branchreqchar, &branchcountlits, cd))
2177 /* Fill in the length of the last branch (two bytes, high byte first) */
2179 length = code - last_branch;
2180 last_branch[1] = length >> 8;
2181 last_branch[2] = length & 255;
2183 /* Save the last required character if all branches have the same; a current
2184 value of -1 means unset, while -2 means "previous branch had no last required
2189 if (branchreqchar >= 0)
2191 if (*reqchar == -1) *reqchar = branchreqchar;
2192 else if (*reqchar != branchreqchar) *reqchar = -2;
2197 /* Keep the shortest literal count */
2199 if (branchcountlits < *countlits) *countlits = branchcountlits;
2200 DPRINTF(("literal count = %d min=%d\n", branchcountlits, *countlits));
2202 /* If lookbehind, check that this branch matches a fixed-length string,
2203 and put the length into the OP_REVERSE item. Temporarily mark the end of
2204 the branch with OP_END. */
2209 length = find_fixedlength(last_branch, options);
2210 DPRINTF(("fixed length = %d\n", length));
2217 reverse_count[0] = (length >> 8);
2218 reverse_count[1] = length & 255;
2221 /* Reached end of expression, either ')' or end of pattern. Insert a
2222 terminating ket and the length of the whole bracketed item, and return,
2223 leaving the pointer at the terminating char. If any of the ims options
2224 were changed inside the group, compile a resetting op-code following. */
2228 length = code - start_bracket;
2230 *code++ = length >> 8;
2231 *code++ = length & 255;
2232 if (optchanged >= 0)
2235 *code++ = oldoptions;
2242 /* Another branch follows; insert an "or" node and advance the pointer. */
2249 /* Control never reaches here */
2255 /*************************************************
2256 * Find first significant op code *
2257 *************************************************/
2259 /* This is called by several functions that scan a compiled expression looking
2260 for a fixed first character, or an anchoring op code etc. It skips over things
2261 that do not influence this. For one application, a change of caseless option is
2265 code pointer to the start of the group
2266 options pointer to external options
2267 optbit the option bit whose changing is significant, or
2269 optstop TRUE to return on option change, otherwise change the options

Notes on the code below: in the option test shown, *options is examined only
when optbit is greater than zero (is_startline passes NULL together with an
optbit of 0). When the byte at code[1] differs from *options in the optbit bit,
the function either returns the current code pointer (optstop TRUE) or stores
code[1] in *options. For the assertion case shown, the whole group is stepped
over by repeatedly adding the two-byte offset held in code[1] and code[2] for
as long as that leads to an OP_ALT.

2272 Returns: pointer to the first significant opcode
2275 static const uschar*
2276 first_significant_code(const uschar *code, int *options, int optbit,
2284 if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
2286 if (optstop) return code;
2287 *options = (int)code[1];
2297 case OP_WORD_BOUNDARY:
2298 case OP_NOT_WORD_BOUNDARY:
2304 case OP_ASSERTBACK_NOT:
2305 do code += (code[1] << 8) + code[2]; while (*code == OP_ALT);
2313 /* Control never reaches here */
2319 /*************************************************
2320 * Check for anchored expression *
2321 *************************************************/
2323 /* Try to find out if this is an anchored regular expression. Consider each
2324 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
2325 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
2326 it's anchored. However, if this is a multiline pattern, then only OP_SOD
2327 counts, since OP_CIRC can match in the middle.
2329 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
2330 because that will try the rest of the pattern at all possible matching points,
2331 so there is no point trying them again.

The first item of each branch is located with first_significant_code(),
starting three bytes past the bracket opcode, with PCRE_MULTILINE as the
significant option bit and optstop FALSE, so *options can be changed by the
scan. Groups (any opcode >= OP_BRA, and OP_ASSERT, OP_ONCE, OP_COND) are
checked by recursion. The leading .* case is recognised as OP_TYPESTAR or
OP_TYPEMINSTAR whose following byte is OP_ANY. The next branch is reached by
adding the two-byte offset held in code[1] and code[2], and the walk goes on
for as long as that lands on an OP_ALT.

2334 code points to start of expression (the bracket)
2335 options points to the options setting
2337 Returns: TRUE or FALSE
2341 is_anchored(register const uschar *code, int *options)
2344 const uschar *scode = first_significant_code(code + 3, options,
2345 PCRE_MULTILINE, FALSE);
2346 register int op = *scode;
2347 if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
2348 { if (!is_anchored(scode, options)) return FALSE; }
2349 else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR) &&
2350 (*options & PCRE_DOTALL) != 0)
2351 { if (scode[1] != OP_ANY) return FALSE; }
2352 else if (op != OP_SOD &&
2353 ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
2355 code += (code[1] << 8) + code[2];
2357 while (*code == OP_ALT);
2363 /*************************************************
2364 * Check for starting with ^ or .* *
2365 *************************************************/
2367 /* This is called to find out if every branch starts with ^ or .* so that
2368 "first char" processing can be done to speed things up in multiline
2369 matching and for non-DOTALL patterns that start with .* (which must start at
2370 the beginning or after \n).

The first item of each branch is found with first_significant_code(), called
with a NULL options pointer and an optbit of 0. A branch passes if that item
is OP_CIRC, or is OP_TYPESTAR or OP_TYPEMINSTAR followed by OP_ANY, or is a
group (any opcode >= OP_BRA, and OP_ASSERT, OP_ONCE, OP_COND) for which a
recursive call succeeds. The next branch is reached by adding the two-byte
offset held in code[1] and code[2], for as long as that lands on an OP_ALT.

2372 Argument: points to start of expression (the bracket)
2373 Returns: TRUE or FALSE
2377 is_startline(const uschar *code)
2380 const uschar *scode = first_significant_code(code + 3, NULL, 0, FALSE);
2381 register int op = *scode;
2382 if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
2383 { if (!is_startline(scode)) return FALSE; }
2384 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
2385 { if (scode[1] != OP_ANY) return FALSE; }
2386 else if (op != OP_CIRC) return FALSE;
2387 code += (code[1] << 8) + code[2];
2389 while (*code == OP_ALT);
2395 /*************************************************
2396 * Check for fixed first char *
2397 *************************************************/
2399 /* Try to find out if there is a fixed first character. This is called for
2400 unanchored expressions, as it speeds up their processing quite considerably.
2401 Consider each alternative branch. If they all start with the same char, or with
2402 a bracket all of whose alternatives start with the same char (recurse ad lib),
2403 then we return that char, otherwise -1.

The running answer c starts at -1, meaning that no character has been seen
yet; the first branch that yields a character sets it, and any later branch
that yields a different one causes -1 to be returned. The first item of each
branch is found with first_significant_code(), using PCRE_CASELESS as the
significant option bit and optstop TRUE. Every opcode >= OP_BRA is treated as
OP_BRA, and such groups are examined by a recursive call whose negative result
is passed straight back as -1.

2406 code points to start of expression (the bracket)
2407 options pointer to the options (used to check casing changes)
2409 Returns: -1 or the fixed first char
2413 find_firstchar(const uschar *code, int *options)
2415 register int c = -1;
2418 const uschar *scode = first_significant_code(code + 3, options,
2419 PCRE_CASELESS, TRUE);
2420 register int op = *scode;
2422 if (op >= OP_BRA) op = OP_BRA;
2433 if ((d = find_firstchar(scode, options)) < 0) return -1;
2434 if (c < 0) c = d; else if (c != d) return -1;
2437 case OP_EXACT: /* Fall through */
2440 case OP_CHARS: /* Fall through */
2445 if (c < 0) c = scode[1]; else if (c != scode[1]) return -1;
2449 code += (code[1] << 8) + code[2];
2451 while (*code == OP_ALT);
2459 /*************************************************
2460 * Compile a Regular Expression *
2461 *************************************************/
2463 /* This function takes a string and returns a pointer to a block of store
2464 holding a compiled version of the expression.
2467 pattern the regular expression
2468 options various option bits
2469 errorptr pointer to pointer to error text
2470 erroroffset ptr offset in pattern where error was detected
2471 tables pointer to character tables or NULL
2473 Returns: pointer to compiled data block, or NULL on error,
2474 with errorptr and erroroffset set
2478 pcre_compile(const char *pattern, int options, const char **errorptr,
2479 int *erroroffset, const unsigned char *tables)
2482 int length = 3; /* For initial BRA plus length */
2484 int c, reqchar, countlits;
2486 int top_backref = 0;
2487 int branch_extra = 0;
2488 int branch_newextra;
2489 unsigned int brastackptr = 0;
2493 compile_data compile_block;
2494 int brastack[BRASTACK_SIZE];
2495 uschar bralenstack[BRASTACK_SIZE];
2498 uschar *code_base, *code_end;
2501 /* Can't support UTF8 unless PCRE has been compiled to include the code. */
2503 #ifndef SUPPORT_UTF8
2504 if ((options & PCRE_UTF8) != 0)
2511 /* We can't pass back an error message if errorptr is NULL; I guess the best we
2512 can do is just return NULL. */
2514 if (errorptr == NULL) return NULL;
2517 /* However, we can give a message for this error */
2519 if (erroroffset == NULL)
2526 if ((options & ~PUBLIC_OPTIONS) != 0)
2532 /* Set up pointers to the individual character tables */
2534 if (tables == NULL) tables = pcre_default_tables;
2535 compile_block.lcc = tables + lcc_offset;
2536 compile_block.fcc = tables + fcc_offset;
2537 compile_block.cbits = tables + cbits_offset;
2538 compile_block.ctypes = tables + ctypes_offset;
2540 /* Reflect pattern for debugging output */
2542 DPRINTF(("------------------------------------------------------------------\n"));
2543 DPRINTF(("%s\n", pattern));
2545 /* The first thing to do is to make a pass over the pattern to compute the
2546 amount of store required to hold the compiled code. This does not have to be
2547 perfect as long as errors are overestimates. At the same time we can detect any
2548 internal flag settings. Make an attempt to correct for any counted white space
2549 if an "extended" flag setting appears late in the pattern. We can't be so
2550 clever for #-comments. */
2552 ptr = (const uschar *)(pattern - 1);
2553 while ((c = *(++ptr)) != 0)
2556 int class_charcount;
2559 if ((options & PCRE_EXTENDED) != 0)
2561 if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
2564 /* The space before the ; is to avoid a warning on a silly compiler
2565 on the Macintosh. */
2566 while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
2573 /* A backslashed item may be an escaped "normal" character or a
2574 character type. For a "normal" character, put the pointers and
2575 character back so that tests for whitespace etc. in the input
2576 are done correctly. */
2580 const uschar *save_ptr = ptr;
2581 c = check_escape(&ptr, errorptr, bracount, options, FALSE, &compile_block);
2582 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2592 /* A back reference needs an additional 2 bytes, plus either one or 5
2593 bytes for a repeat. We also need to keep the value of the highest
2598 int refnum = -c - ESC_REF;
2599 if (refnum > top_backref) top_backref = refnum;
2600 length += 2; /* For single back reference */
2601 if (ptr[1] == '{' && is_counted_repeat(ptr+2, &compile_block))
2603 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr, &compile_block);
2604 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2605 if ((min == 0 && (max == 1 || max == -1)) ||
2606 (min == 1 && max == -1))
2609 if (ptr[1] == '?') ptr++;
2617 case '*': /* These repeats won't be after brackets; */
2618 case '+': /* those are handled separately */
2623 /* This covers the cases of repeats after a single char, metachar, class,
2624 or back reference. */
2627 if (!is_counted_repeat(ptr+1, &compile_block)) goto NORMAL_CHAR;
2628 ptr = read_repeat_counts(ptr+1, &min, &max, errorptr, &compile_block);
2629 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2630 if ((min == 0 && (max == 1 || max == -1)) ||
2631 (min == 1 && max == -1))
2635 length--; /* Uncount the original char or metachar */
2636 if (min == 1) length++; else if (min > 0) length += 4;
2637 if (max > 0) length += 4; else length += 2;
2639 if (ptr[1] == '?') ptr++;
2642 /* An alternation contains an offset to the next branch or ket. If any ims
2643 options changed in the previous branch(es), and/or if we are in a
2644 lookbehind assertion, extra space will be needed at the start of the
2645 branch. This is handled by branch_extra. */
2648 length += 3 + branch_extra;
2651 /* A character class uses 33 characters. Don't worry about character types
2652 that aren't allowed in classes - they'll get picked up during the compile.
2653 A character class that contains only one character uses 2 or 3 bytes,
2654 depending on whether it is negated or not. Notice this where we can. */
2657 class_charcount = 0;
2658 if (*(++ptr) == '^') ptr++;
2663 int ch = check_escape(&ptr, errorptr, bracount, options, TRUE,
2665 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2666 if (-ch == ESC_b) class_charcount++; else class_charcount = 10;
2668 else class_charcount++;
2671 while (*ptr != 0 && *ptr != ']');
2673 /* Repeats for negated single chars are handled by the general code */
2675 if (class_charcount == 1) length += 3; else
2679 /* A repeat needs either 1 or 5 bytes. */
2681 if (*ptr != 0 && ptr[1] == '{' && is_counted_repeat(ptr+2, &compile_block))
2683 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr, &compile_block);
2684 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2685 if ((min == 0 && (max == 1 || max == -1)) ||
2686 (min == 1 && max == -1))
2689 if (ptr[1] == '?') ptr++;
2694 /* Brackets may be genuine groups or special things */
2697 branch_newextra = 0;
2700 /* Handle special forms of bracket, which all start (? */
2709 /* Skip over comments entirely */
2712 while (*ptr != 0 && *ptr != ')') ptr++;
2716 goto PCRE_ERROR_RETURN;
2720 /* Non-referencing groups and lookaheads just move the pointer on, and
2721 then behave like a non-special bracket, except that they don't increment
2722 the count of extracting brackets. Ditto for the "once only" bracket,
2723 which is in Perl from version 5.005. */
2732 /* A recursive call to the regex is an extension, to provide the
2733 facility which can be obtained by $(?p{perl-code}) in Perl 5.6. */
2739 goto PCRE_ERROR_RETURN;
2745 /* Lookbehinds are in Perl from version 5.005 */
2748 if (ptr[3] == '=' || ptr[3] == '!')
2751 branch_newextra = 3;
2752 length += 3; /* For the first branch */
2756 goto PCRE_ERROR_RETURN;
2758 /* Conditionals are in Perl from version 5.005. The bracket must either
2759 be followed by a number (for bracket reference) or by an assertion
2763 if ((compile_block.ctypes[ptr[3]] & ctype_digit) != 0)
2767 while ((compile_block.ctypes[*ptr] & ctype_digit) != 0) ptr++;
2771 goto PCRE_ERROR_RETURN;
2774 else /* An assertion must follow */
2776 ptr++; /* Can treat like ':' as far as spacing is concerned */
2777 if (ptr[2] != '?' ||
2778 (ptr[3] != '=' && ptr[3] != '!' && ptr[3] != '<') )
2780 ptr += 2; /* To get right offset in message */
2782 goto PCRE_ERROR_RETURN;
2787 /* Else loop checking valid options until ) is met. Anything else is an
2788 error. If we are without any brackets, i.e. at top level, the settings
2789 act as if specified in the options, so massage the options immediately.
2790 This is for backward compatibility with Perl 5.004. */
2803 *optset |= PCRE_CASELESS;
2807 *optset |= PCRE_MULTILINE;
2811 *optset |= PCRE_DOTALL;
2815 *optset |= PCRE_EXTENDED;
2819 *optset |= PCRE_EXTRA;
2823 *optset |= PCRE_UNGREEDY;
2830 /* A termination by ')' indicates an options-setting-only item;
2831 this is global at top level; otherwise nothing is done here and
2832 it is handled during the compiling process on a per-bracket-group
2836 if (brastackptr == 0)
2838 options = (options | set) & (~unset);
2839 set = unset = 0; /* To save length */
2843 /* A termination by ':' indicates the start of a nested group with
2844 the given options set. This is again handled at compile time, but
2845 we must allow for compiled space if any of the ims options are
2846 set. We also have to allow for resetting space at the end of
2847 the group, which is why 4 is added to the length and not just 2.
2848 If there are several changes of options within the same group, this
2849 will lead to an over-estimate on the length, but this shouldn't
2850 matter very much. We also have to allow for resetting options at
2851 the start of any alternations, which we do by setting
2852 branch_newextra to 2. Finally, we record whether the case-dependent
2853 flag ever changes within the regex. This is used by the "required
2857 if (((set|unset) & PCRE_IMS) != 0)
2860 branch_newextra = 2;
2861 if (((set|unset) & PCRE_CASELESS) != 0) options |= PCRE_ICHANGED;
2865 /* Unrecognized option character */
2869 goto PCRE_ERROR_RETURN;
2873 /* If we hit a closing bracket, that's it - this is a freestanding
2874 option-setting. We need to ensure that branch_extra is updated if
2875 necessary. The only values branch_newextra can have here are 0 or 2.
2876 If the value is 2, then branch_extra must either be 2 or 5, depending
2877 on whether this is a lookbehind group or not. */
2882 if (branch_newextra == 2 && (branch_extra == 0 || branch_extra == 3))
2883 branch_extra += branch_newextra;
2887 /* If options were terminated by ':' control comes here. Fall through
2888 to handle the group below. */
2892 /* Extracting brackets must be counted so we can process escapes in a
2893 Perlish way. If the number exceeds EXTRACT_BASIC_MAX we are going to
2894 need an additional 3 bytes of store per extracting bracket. */
2899 if (bracount > EXTRACT_BASIC_MAX) bracket_length += 3;
2902 /* Save length for computing whole length at end if there's a repeat that
2903 requires duplication of the group. Also save the current value of
2904 branch_extra, and start the new group with the new value. If non-zero, this
2905 will either be 2 for a (?imsx: group, or 3 for a lookbehind assertion. */
2907 if (brastackptr >= sizeof(brastack)/sizeof(int))
2910 goto PCRE_ERROR_RETURN;
2913 bralenstack[brastackptr] = branch_extra;
2914 branch_extra = branch_newextra;
2916 brastack[brastackptr++] = length;
2917 length += bracket_length;
2920 /* Handle ket. Look for subsequent max/min; for certain sets of values we
2921 have to replicate this bracket up to that many times. If brastackptr is
2922 0 this is an unmatched bracket which will generate an error, but take care
2923 not to try to access brastack[-1] when computing the length and restoring
2924 the branch_extra value. */
2933 if (brastackptr > 0)
2935 duplength = length - brastack[--brastackptr];
2936 branch_extra = bralenstack[brastackptr];
2940 /* Leave ptr at the final char; for read_repeat_counts this happens
2941 automatically; for the others we need an increment. */
2943 if ((c = ptr[1]) == '{' && is_counted_repeat(ptr+2, &compile_block))
2945 ptr = read_repeat_counts(ptr+2, &minval, &maxval, errorptr,
2947 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2949 else if (c == '*') { minval = 0; maxval = -1; ptr++; }
2950 else if (c == '+') { maxval = -1; ptr++; }
2951 else if (c == '?') { minval = 0; ptr++; }
2953 /* If the minimum is zero, we have to allow for an OP_BRAZERO before the
2954 group, and if the maximum is greater than zero, we have to replicate
2955 maxval-1 times; each replication acquires an OP_BRAZERO plus a nesting
2956 bracket set - hence the 7. */
2961 if (maxval > 0) length += (maxval - 1) * (duplength + 7);
2964 /* When the minimum is greater than zero, we have to replicate up to
2965 minval-1 times, with no additions required in the copies. Then, if
2966 there is a limited maximum we have to replicate up to maxval-1 times
2967 allowing for a BRAZERO item before each optional copy and nesting
2968 brackets for all but one of the optional copies. */
2972 length += (minval - 1) * duplength;
2973 if (maxval > minval) /* Need this test as maxval=-1 means no limit */
2974 length += (maxval - minval) * (duplength + 7) - 6;
2979 /* Non-special character. For a run of such characters the length required
2980 is the number of characters + 2, except that the maximum run length is 255.
2981 We won't get a skipped space or a non-data escape or the start of a #
2982 comment as the first character, so the length can't be zero. */
2990 if ((options & PCRE_EXTENDED) != 0)
2992 if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
2995 /* The space before the ; is to avoid a warning on a silly compiler
2996 on the Macintosh. */
2997 while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
3002 /* Backslash may introduce a data char or a metacharacter; stop the
3003 string before the latter. */
3007 const uschar *saveptr = ptr;
3008 c = check_escape(&ptr, errorptr, bracount, options, FALSE,
3010 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
3011 if (c < 0) { ptr = saveptr; break; }
3014 if (c > 127 && (options & PCRE_UTF8) != 0)
3017 for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++)
3018 if (c <= utf8_table1[i]) break;
3024 /* Ordinary character or single-char escape */
3029 /* This "while" is the end of the "do" above. */
3031 while (runlength < MAXLIT &&
3032 (compile_block.ctypes[c = *(++ptr)] & ctype_meta) == 0);
3035 length += runlength;
3040 length += 4; /* For final KET and END */
3048 /* Compute the size of data block needed and get it, either from malloc or
3049 externally provided function. We specify "code[0]" in the offsetof() expression
3050 rather than just "code", because it has been reported that one broken compiler
3051 fails on "code" because it is also an independent variable. It should make no
3052 difference to the value of the offsetof(). */
3054 size = length + offsetof(real_pcre, code[0]);
3055 re = (real_pcre *)(pcre_malloc)(size);
3063 /* Put in the magic number, and save the size, options, and table pointer */
3065 re->magic_number = MAGIC_NUMBER;
3067 re->options = options;
3068 re->tables = tables;
3070 /* Set up a starting, non-extracting bracket, then compile the expression. On
3071 error, *errorptr will be set non-NULL, so we don't need to look at the result
3072 of the function here. */
3074 ptr = (const uschar *)pattern;
3078 (void)compile_regex(options, -1, &bracount, &code, &ptr, errorptr, FALSE, 0,
3079 &reqchar, &countlits, &compile_block);
3080 re->top_bracket = bracount;
3081 re->top_backref = top_backref;
3083 /* If not reached end of pattern on success, there's an excess bracket. */
3085 if (*errorptr == NULL && *ptr != 0) *errorptr = ERR22;
3087 /* Fill in the terminating state and check for disastrous overflow, but
3088 if debugging, leave the test till after things are printed out. */
3093 if (code - re->code > length) *errorptr = ERR23;
3096 /* Give an error if there's back reference to a non-existent capturing
3099 if (top_backref > re->top_bracket) *errorptr = ERR15;
3101 /* Failed to compile */
3103 if (*errorptr != NULL)
3107 *erroroffset = ptr - (const uschar *)pattern;
3111 /* If the anchored option was not passed, set flag if we can determine that the
3112 pattern is anchored by virtue of ^ characters or \A or anything else (such as
3113 starting with .* when DOTALL is set).
3115 Otherwise, see if we can determine what the first character has to be, because
3116 that speeds up unanchored matches no end. If not, see if we can set the
3117 PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
3118 start with ^, and also when all branches start with .* for non-DOTALL matches.
3121 if ((options & PCRE_ANCHORED) == 0)
3123 int temp_options = options;
3124 if (is_anchored(re->code, &temp_options))
3125 re->options |= PCRE_ANCHORED;
3128 int ch = find_firstchar(re->code, &temp_options);
3131 re->first_char = ch;
3132 re->options |= PCRE_FIRSTSET;
3134 else if (is_startline(re->code))
3135 re->options |= PCRE_STARTLINE;
3139 /* Save the last required character if there are at least two literal
3140 characters on all paths, or if there is no first character setting. */
3142 if (reqchar >= 0 && (countlits > 1 || (re->options & PCRE_FIRSTSET) == 0))
3144 re->req_char = reqchar;
3145 re->options |= PCRE_REQCHSET;
3148 /* Print out the compiled data for debugging */
3152 printf("Length = %d top_bracket = %d top_backref = %d\n",
3153 length, re->top_bracket, re->top_backref);
3155 if (re->options != 0)
3157 printf("%s%s%s%s%s%s%s%s%s\n",
3158 ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
3159 ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
3160 ((re->options & PCRE_ICHANGED) != 0)? "case state changed " : "",
3161 ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
3162 ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
3163 ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
3164 ((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",
3165 ((re->options & PCRE_EXTRA) != 0)? "extra " : "",
3166 ((re->options & PCRE_UNGREEDY) != 0)? "ungreedy " : "");
3169 if ((re->options & PCRE_FIRSTSET) != 0)
3171 if (isprint(re->first_char)) printf("First char = %c\n", re->first_char);
3172 else printf("First char = \\x%02x\n", re->first_char);
3175 if ((re->options & PCRE_REQCHSET) != 0)
3177 if (isprint(re->req_char)) printf("Req char = %c\n", re->req_char);
3178 else printf("Req char = \\x%02x\n", re->req_char);
3182 code_base = code = re->code;
3184 while (code < code_end)
3188 printf("%3d ", code - code_base);
3190 if (*code >= OP_BRA)
3192 if (*code - OP_BRA > EXTRACT_BASIC_MAX)
3193 printf("%3d Bra extra", (code[1] << 8) + code[2]);
3195 printf("%3d Bra %d", (code[1] << 8) + code[2], *code - OP_BRA);
3202 printf(" %.2x %s", code[1], OP_names[*code]);
3207 charlength = *(++code);
3208 printf("%3d ", charlength);
3209 while (charlength-- > 0)
3210 if (isprint(c = *(++code))) printf("%c", c); else printf("\\x%02x", c);
3220 case OP_ASSERTBACK_NOT:
3226 printf("%3d %s", (code[1] << 8) + code[2], OP_names[*code]);
3237 case OP_TYPEMINSTAR:
3239 case OP_TYPEMINPLUS:
3241 case OP_TYPEMINQUERY:
3242 if (*code >= OP_TYPESTAR)
3243 printf(" %s", OP_names[code[1]]);
3244 else if (isprint(c = code[1])) printf(" %c", c);
3245 else printf(" \\x%02x", c);
3246 printf("%s", OP_names[*code++]);
3252 if (isprint(c = code[3])) printf(" %c{", c);
3253 else printf(" \\x%02x{", c);
3254 if (*code != OP_EXACT) printf("0,");
3255 printf("%d}", (code[1] << 8) + code[2]);
3256 if (*code == OP_MINUPTO) printf("?");
3262 case OP_TYPEMINUPTO:
3263 printf(" %s{", OP_names[code[3]]);
3264 if (*code != OP_TYPEEXACT) printf(",");
3265 printf("%d}", (code[1] << 8) + code[2]);
3266 if (*code == OP_TYPEMINUPTO) printf("?");
3271 if (isprint(c = *(++code))) printf(" [^%c]", c);
3272 else printf(" [^\\x%02x]", c);
3280 case OP_NOTMINQUERY:
3281 if (isprint(c = code[1])) printf(" [^%c]", c);
3282 else printf(" [^\\x%02x]", c);
3283 printf("%s", OP_names[*code++]);
3289 if (isprint(c = code[3])) printf(" [^%c]{", c);
3290 else printf(" [^\\x%02x]{", c);
3291 if (*code != OP_NOTEXACT) printf(",");
3292 printf("%d}", (code[1] << 8) + code[2]);
3293 if (*code == OP_NOTMINUPTO) printf("?");
3298 printf(" \\%d", (code[1] << 8) | code[2]);
3300 goto CLASS_REF_REPEAT;
3308 for (i = 0; i < 256; i++)
3310 if ((code[i/8] & (1 << (i&7))) != 0)
3313 for (j = i+1; j < 256; j++)
3314 if ((code[j/8] & (1 << (j&7))) == 0) break;
3315 if (i == '-' || i == ']') printf("\\");
3316 if (isprint(i)) printf("%c", i); else printf("\\x%02x", i);
3320 if (j == '-' || j == ']') printf("\\");
3321 if (isprint(j)) printf("%c", j); else printf("\\x%02x", j);
3339 printf("%s", OP_names[*code]);
3344 min = (code[1] << 8) + code[2];
3345 max = (code[3] << 8) + code[4];
3346 if (max == 0) printf("{%d,}", min);
3347 else printf("{%d,%d}", min, max);
3348 if (*code == OP_CRMINRANGE) printf("?");
3358 /* Anything else is just a one-node item */
3361 printf(" %s", OP_names[*code]);
3368 printf("------------------------------------------------------------------\n");
3370 /* This check is done here in the debugging case so that the code that
3371 was compiled can be seen. */
3373 if (code - re->code > length)
3377 *erroroffset = ptr - (uschar *)pattern;
3387 /*************************************************
3388 * Match a back-reference *
3389 *************************************************/
3391 /* If a back reference hasn't been set, the length that is passed is greater
3392 than the number of characters left in the string, so the match fails.
3395 offset index into the offset vector
3396 eptr points into the subject
3397 length length to be matched
3398 md points to match data block
3401 Returns: TRUE if matched
3405 match_ref(int offset, register const uschar *eptr, int length, match_data *md,
3406 unsigned long int ims)
3408 const uschar *p = md->start_subject + md->offset_vector[offset];
3411 if (eptr >= md->end_subject)
3412 printf("matching subject <null>");
3415 printf("matching subject ");
3416 pchars(eptr, length, TRUE, md);
3418 printf(" against backref ");
3419 pchars(p, length, FALSE, md);
3423 /* Always fail if not enough characters left */
3425 if (length > md->end_subject - eptr) return FALSE;
3427 /* Separate the caselesss case for speed */
3429 if ((ims & PCRE_CASELESS) != 0)
3431 while (length-- > 0)
3432 if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE;
3435 { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
3442 /*************************************************
3443 * Match from current position *
3444 *************************************************/
3446 /* On entry ecode points to the first opcode, and eptr to the first character
3447 in the subject string, while eptrb holds the value of eptr at the start of the
3448 last bracketed group - used for breaking infinite loops matching zero-length
3452 eptr pointer in subject
3453 ecode position in code
3454 offset_top current top pointer
3455 md pointer to "static" info for the match
3456 ims current /i, /m, and /s options
3457 eptrb pointer to chain of blocks containing eptr at start of
3458 brackets - for testing for empty matches
3460 match_condassert - this is an assertion condition
3461 match_isgroup - this is the start of a bracketed group
3463 Returns: TRUE if matched
3467 match(register const uschar *eptr, register const uschar *ecode,
3468 int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,
3471 unsigned long int original_ims = ims; /* Save for resetting on ')' */
3474 /* At the start of a bracketed group, add the current subject pointer to the
3475 stack of such pointers, to be re-instated at the end of the group when we hit
3476 the closing ket. When match() is called in other circumstances, we don't add to
3479 if ((flags & match_isgroup) != 0)
3481 newptrb.prev = eptrb;
3482 newptrb.saved_eptr = eptr;
3486 /* Now start processing the operations. */
3490 int op = (int)*ecode;
3491 int min, max, ctype;
3494 BOOL minimize = FALSE;
3496 /* Opening capturing bracket. If there is space in the offset vector, save
3497 the current subject position in the working slot at the top of the vector. We
3498 mustn't change the current values of the data slot, because they may be set
3499 from a previous iteration of this group, and be referred to by a reference
3502 If the bracket fails to match, we need to restore this value and also the
3503 values of the final offsets, in case they were set by a previous iteration of
3506 If there isn't enough space in the offset vector, treat this as if it were a
3507 non-capturing bracket. Don't worry about setting the flag for the error case
3508 here; that is handled in the code for KET. */
3513 int number = op - OP_BRA;
3515 /* For extended extraction brackets (large number), we have to fish out the
3516 number from a dummy opcode at the start. */
3518 if (number > EXTRACT_BASIC_MAX) number = (ecode[4] << 8) | ecode[5];
3519 offset = number << 1;
3522 printf("start bracket %d subject=", number);
3523 pchars(eptr, 16, TRUE, md);
3527 if (offset < md->offset_max)
3529 int save_offset1 = md->offset_vector[offset];
3530 int save_offset2 = md->offset_vector[offset+1];
3531 int save_offset3 = md->offset_vector[md->offset_end - number];
3533 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
3534 md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
3538 if (match(eptr, ecode+3, offset_top, md, ims, eptrb, match_isgroup))
3540 ecode += (ecode[1] << 8) + ecode[2];
3542 while (*ecode == OP_ALT);
3544 DPRINTF(("bracket %d failed\n", number));
3546 md->offset_vector[offset] = save_offset1;
3547 md->offset_vector[offset+1] = save_offset2;
3548 md->offset_vector[md->offset_end - number] = save_offset3;
3553 /* Insufficient room for saving captured contents */
3558 /* Other types of node can be handled by a switch */
3562 case OP_BRA: /* Non-capturing bracket: optimized */
3563 DPRINTF(("start bracket 0\n"));
3566 if (match(eptr, ecode+3, offset_top, md, ims, eptrb, match_isgroup))
3568 ecode += (ecode[1] << 8) + ecode[2];
3570 while (*ecode == OP_ALT);
3571 DPRINTF(("bracket 0 failed\n"));
3574 /* Conditional group: compilation checked that there are no more than
3575 two branches. If the condition is false, skipping the first branch takes us
3576 past the end if there is only one branch, but that's OK because that is
3577 exactly what going to the ket would do. */
3580 if (ecode[3] == OP_CREF) /* Condition is extraction test */
3582 int offset = (ecode[4] << 9) | (ecode[5] << 1); /* Doubled ref number */
3584 ecode + ((offset < offset_top && md->offset_vector[offset] >= 0)?
3585 6 : 3 + (ecode[1] << 8) + ecode[2]),
3586 offset_top, md, ims, eptrb, match_isgroup);
3589 /* The condition is an assertion. Call match() to evaluate it - setting
3590 the final argument TRUE causes it to stop at the end of an assertion. */
3594 if (match(eptr, ecode+3, offset_top, md, ims, NULL,
3595 match_condassert | match_isgroup))
3597 ecode += 3 + (ecode[4] << 8) + ecode[5];
3598 while (*ecode == OP_ALT) ecode += (ecode[1] << 8) + ecode[2];
3600 else ecode += (ecode[1] << 8) + ecode[2];
3601 return match(eptr, ecode+3, offset_top, md, ims, eptrb, match_isgroup);
3603 /* Control never reaches here */
3605 /* Skip over conditional reference or large extraction number data if
3613 /* End of the pattern. If PCRE_NOTEMPTY is set, fail if we have matched
3614 an empty string - recursion will then try other alternatives, if any. */
3617 if (md->notempty && eptr == md->start_match) return FALSE;
3618 md->end_match_ptr = eptr; /* Record where we ended */
3619 md->end_offset_top = offset_top; /* and how many extracts were taken */
3622 /* Change option settings */
3627 DPRINTF(("ims set to %02lx\n", ims));
3630 /* Assertion brackets. Check the alternative branches in turn - the
3631 matching won't pass the KET for an assertion. If any one branch matches,
3632 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
3633 start of each branch to move the current point backwards, so the code at
3634 this level is identical to the lookahead case. */
3640 if (match(eptr, ecode+3, offset_top, md, ims, NULL, match_isgroup)) break;
3641 ecode += (ecode[1] << 8) + ecode[2];
3643 while (*ecode == OP_ALT);
3644 if (*ecode == OP_KET) return FALSE;
3646 /* If checking an assertion for a condition, return TRUE. */
3648 if ((flags & match_condassert) != 0) return TRUE;
3650 /* Continue from after the assertion, updating the offsets high water
3651 mark, since extracts may have been taken during the assertion. */
3653 do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3655 offset_top = md->end_offset_top;
3658 /* Negative assertion: all branches must fail to match */
3661 case OP_ASSERTBACK_NOT:
3664 if (match(eptr, ecode+3, offset_top, md, ims, NULL, match_isgroup))
3666 ecode += (ecode[1] << 8) + ecode[2];
3668 while (*ecode == OP_ALT);
3670 if ((flags & match_condassert) != 0) return TRUE;
3675 /* Move the subject pointer back. This occurs only at the start of
3676 each branch of a lookbehind assertion. If we are too close to the start to
3677 move back, this match function fails. When working with UTF-8 we move
3678 back a number of characters, not bytes. */
3682 c = (ecode[1] << 8) + ecode[2];
3683 for (i = 0; i < c; i++)
3689 eptr -= (ecode[1] << 8) + ecode[2];
3692 if (eptr < md->start_subject) return FALSE;
3696 /* Recursion matches the current regex, nested. If there are any capturing
3697 brackets started but not finished, we have to save their starting points
3698 and reinstate them after the recursion. However, we don't know how many
3699 such there are (offset_top records the completed total) so we just have
3700 to save all the potential data. There may be up to 99 such values, which
3701 is a bit large to put on the stack, but using malloc for small numbers
3702 seems expensive. As a compromise, the stack is used when there are fewer
3703 than 16 values to store; otherwise malloc is used. A problem is what to do
3704 if the malloc fails ... there is no way of returning to the top level with
3705 an error. Save the top 15 values on the stack, and accept that the rest
3716 if (c < 16) save = stacksave; else
3718 save = (int *)(pcre_malloc)((c+1) * sizeof(int));
3726 for (i = 1; i <= c; i++)
3727 save[i] = md->offset_vector[md->offset_end - i];
3728 rc = match(eptr, md->start_pattern, offset_top, md, ims, eptrb,
3730 for (i = 1; i <= c; i++)
3731 md->offset_vector[md->offset_end - i] = save[i];
3732 if (save != stacksave) (pcre_free)(save);
3733 if (!rc) return FALSE;
3735 /* In case the recursion has set more capturing values, save the final
3736 number, then move along the subject till after the recursive match,
3737 and advance one byte in the pattern code. */
3739 offset_top = md->end_offset_top;
3740 eptr = md->end_match_ptr;
3745 /* "Once" brackets are like assertion brackets except that after a match,
3746 the point in the subject string is not moved back. Thus there can never be
3747 a move back into the brackets. Check the alternative branches in turn - the
3748 matching won't pass the KET for this kind of subpattern. If any one branch
3749 matches, we carry on as at the end of a normal bracket, leaving the subject
3754 const uschar *prev = ecode;
3755 const uschar *saved_eptr = eptr;
3759 if (match(eptr, ecode+3, offset_top, md, ims, eptrb, match_isgroup))
3761 ecode += (ecode[1] << 8) + ecode[2];
3763 while (*ecode == OP_ALT);
3765 /* If hit the end of the group (which could be repeated), fail */
3767 if (*ecode != OP_ONCE && *ecode != OP_ALT) return FALSE;
3769 /* Continue as from after the assertion, updating the offsets high water
3770 mark, since extracts may have been taken. */
3772 do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3774 offset_top = md->end_offset_top;
3775 eptr = md->end_match_ptr;
3777 /* For a non-repeating ket, just continue at this level. This also
3778 happens for a repeating ket if no characters were matched in the group.
3779 This is the forcible breaking of infinite loops as implemented in Perl
3780 5.005. If there is an options reset, it will get obeyed in the normal
3781 course of events. */
3783 if (*ecode == OP_KET || eptr == saved_eptr)
3789 /* The repeating kets try the rest of the pattern or restart from the
3790 preceding bracket, in the appropriate order. We need to reset any options
3791 that changed within the bracket before re-running it, so check the next
3794 if (ecode[3] == OP_OPT)
3796 ims = (ims & ~PCRE_IMS) | ecode[4];
3797 DPRINTF(("ims set to %02lx at group repeat\n", ims));
3800 if (*ecode == OP_KETRMIN)
3802 if (match(eptr, ecode+3, offset_top, md, ims, eptrb, 0) ||
3803 match(eptr, prev, offset_top, md, ims, eptrb, match_isgroup))
3806 else /* OP_KETRMAX */
3808 if (match(eptr, prev, offset_top, md, ims, eptrb, match_isgroup) ||
3809 match(eptr, ecode+3, offset_top, md, ims, eptrb, 0)) return TRUE;
3814 /* An alternation is the end of a branch; scan along to find the end of the
3815 bracketed group and go to there. */
3818 do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3821 /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
3822 that it may occur zero times. It may repeat infinitely, or not at all -
3823 i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
3824 repeat limits are compiled as a number of copies, with the optional ones
3825 preceded by BRAZERO or BRAMINZERO. */
3829 const uschar *next = ecode+1;
3830 if (match(eptr, next, offset_top, md, ims, eptrb, match_isgroup))
3832 do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
3839 const uschar *next = ecode+1;
3840 do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
3841 if (match(eptr, next+3, offset_top, md, ims, eptrb, match_isgroup))
3847 /* End of a group, repeated or non-repeating. If we are at the end of
3848 an assertion "group", stop matching and return TRUE, but record the
3849 current high water mark for use by positive assertions. Do this also
3850 for the "once" (non-backing-up) groups. */
3856 const uschar *prev = ecode - (ecode[1] << 8) - ecode[2];
3857 const uschar *saved_eptr = eptrb->saved_eptr;
3859 eptrb = eptrb->prev; /* Back up the stack of bracket start pointers */
3861 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
3862 *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
3865 md->end_match_ptr = eptr; /* For ONCE */
3866 md->end_offset_top = offset_top;
3870 /* In all other cases except a conditional group we have to check the
3871 group number back at the start and if necessary complete handling an
3872 extraction by setting the offsets and bumping the high water mark. */
3874 if (*prev != OP_COND)
3877 int number = *prev - OP_BRA;
3879 /* For extended extraction brackets (large number), we have to fish out
3880 the number from a dummy opcode at the start. */
3882 if (number > EXTRACT_BASIC_MAX) number = (prev[4] << 8) | prev[5];
3883 offset = number << 1;
3886 printf("end bracket %d", number);
3892 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
3894 md->offset_vector[offset] =
3895 md->offset_vector[md->offset_end - number];
3896 md->offset_vector[offset+1] = eptr - md->start_subject;
3897 if (offset_top <= offset) offset_top = offset + 2;
3902 /* Reset the value of the ims flags, in case they got changed during
3906 DPRINTF(("ims reset to %02lx\n", ims));
3908 /* For a non-repeating ket, just continue at this level. This also
3909 happens for a repeating ket if no characters were matched in the group.
3910 This is the forcible breaking of infinite loops as implemented in Perl
3911 5.005. If there is an options reset, it will get obeyed in the normal
3912 course of events. */
3914 if (*ecode == OP_KET || eptr == saved_eptr)
3920 /* The repeating kets try the rest of the pattern or restart from the
3921 preceding bracket, in the appropriate order. */
3923 if (*ecode == OP_KETRMIN)
3925 if (match(eptr, ecode+3, offset_top, md, ims, eptrb, 0) ||
3926 match(eptr, prev, offset_top, md, ims, eptrb, match_isgroup))
3929 else /* OP_KETRMAX */
3931 if (match(eptr, prev, offset_top, md, ims, eptrb, match_isgroup) ||
3932 match(eptr, ecode+3, offset_top, md, ims, eptrb, 0)) return TRUE;
3937 /* Start of subject unless notbol, or after internal newline if multiline */
3940 if (md->notbol && eptr == md->start_subject) return FALSE;
3941 if ((ims & PCRE_MULTILINE) != 0)
3943 if (eptr != md->start_subject && eptr[-1] != NEWLINE) return FALSE;
3947 /* ... else fall through */
3949 /* Start of subject assertion */
3952 if (eptr != md->start_subject) return FALSE;
3956 /* Assert before internal newline if multiline, or before a terminating
3957 newline unless endonly is set, else end of subject unless noteol is set. */
3960 if ((ims & PCRE_MULTILINE) != 0)
3962 if (eptr < md->end_subject) { if (*eptr != NEWLINE) return FALSE; }
3963 else { if (md->noteol) return FALSE; }
3969 if (md->noteol) return FALSE;
3972 if (eptr < md->end_subject - 1 ||
3973 (eptr == md->end_subject - 1 && *eptr != NEWLINE)) return FALSE;
3979 /* ... else fall through */
3981 /* End of subject assertion (\z) */
3984 if (eptr < md->end_subject) return FALSE;
3988 /* End of subject or ending \n assertion (\Z) */
3991 if (eptr < md->end_subject - 1 ||
3992 (eptr == md->end_subject - 1 && *eptr != NEWLINE)) return FALSE;
3996 /* Word boundary assertions */
3998 case OP_NOT_WORD_BOUNDARY:
3999 case OP_WORD_BOUNDARY:
4001 BOOL prev_is_word = (eptr != md->start_subject) &&
4002 ((md->ctypes[eptr[-1]] & ctype_word) != 0);
4003 BOOL cur_is_word = (eptr < md->end_subject) &&
4004 ((md->ctypes[*eptr] & ctype_word) != 0);
4005 if ((*ecode++ == OP_WORD_BOUNDARY)?
4006 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
4011 /* Match a single character type; inline for speed */
4014 if ((ims & PCRE_DOTALL) == 0 && eptr < md->end_subject && *eptr == NEWLINE)
4016 if (eptr++ >= md->end_subject) return FALSE;
4019 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4025 if (eptr >= md->end_subject ||
4026 (md->ctypes[*eptr++] & ctype_digit) != 0)
4032 if (eptr >= md->end_subject ||
4033 (md->ctypes[*eptr++] & ctype_digit) == 0)
4038 case OP_NOT_WHITESPACE:
4039 if (eptr >= md->end_subject ||
4040 (md->ctypes[*eptr++] & ctype_space) != 0)
4046 if (eptr >= md->end_subject ||
4047 (md->ctypes[*eptr++] & ctype_space) == 0)
4052 case OP_NOT_WORDCHAR:
4053 if (eptr >= md->end_subject ||
4054 (md->ctypes[*eptr++] & ctype_word) != 0)
4060 if (eptr >= md->end_subject ||
4061 (md->ctypes[*eptr++] & ctype_word) == 0)
4066 /* Match a back reference, possibly repeatedly. Look past the end of the
4067 item to see if there is repeat information following. The code is similar
4068 to that for character classes, but repeated for efficiency. Then obey
4069 similar code to character type repeats - written out again for speed.
4070 However, if the referenced string is the empty string, always treat
4071 it as matched, any number of times (otherwise there could be infinite
4077 int offset = (ecode[1] << 9) | (ecode[2] << 1); /* Doubled ref number */
4078 ecode += 3; /* Advance past item */
4080 /* If the reference is unset, set the length to be longer than the amount
4081 of subject left; this ensures that every attempt at a match fails. We
4082 can't just fail here, because of the possibility of quantifiers with zero
4085 length = (offset >= offset_top || md->offset_vector[offset] < 0)?
4086 md->end_subject - eptr + 1 :
4087 md->offset_vector[offset+1] - md->offset_vector[offset];
4089 /* Set up for repetition, or handle the non-repeated case */
4099 c = *ecode++ - OP_CRSTAR;
4100 minimize = (c & 1) != 0;
4101 min = rep_min[c]; /* Pick up values from tables; */
4102 max = rep_max[c]; /* zero for max => infinity */
4103 if (max == 0) max = INT_MAX;
4108 minimize = (*ecode == OP_CRMINRANGE);
4109 min = (ecode[1] << 8) + ecode[2];
4110 max = (ecode[3] << 8) + ecode[4];
4111 if (max == 0) max = INT_MAX;
4115 default: /* No repeat follows */
4116 if (!match_ref(offset, eptr, length, md, ims)) return FALSE;
4118 continue; /* With the main loop */
4121 /* If the length of the reference is zero, just continue with the
4124 if (length == 0) continue;
4126 /* First, ensure the minimum number of matches are present. We get back
4127 the length of the reference string explicitly rather than passing the
4128 address of eptr, so that eptr can be a register variable. */
4130 for (i = 1; i <= min; i++)
4132 if (!match_ref(offset, eptr, length, md, ims)) return FALSE;
4136 /* If min = max, continue at the same level without recursion.
4137 They are not both allowed to be zero. */
4139 if (min == max) continue;
4141 /* If minimizing, keep trying and advancing the pointer */
4147 if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
4149 if (i >= max || !match_ref(offset, eptr, length, md, ims))
4153 /* Control never gets here */
4156 /* If maximizing, find the longest string and work backwards */
4160 const uschar *pp = eptr;
4161 for (i = min; i < max; i++)
4163 if (!match_ref(offset, eptr, length, md, ims)) break;
4168 if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
4175 /* Control never gets here */
4179 /* Match a character class, possibly repeatedly. Look past the end of the
4180 item to see if there is repeat information following. Then obey similar
4181 code to character type repeats - written out again for speed. */
4185 const uschar *data = ecode + 1; /* Save for matching */
4186 ecode += 33; /* Advance past the item */
4196 c = *ecode++ - OP_CRSTAR;
4197 minimize = (c & 1) != 0;
4198 min = rep_min[c]; /* Pick up values from tables; */
4199 max = rep_max[c]; /* zero for max => infinity */
4200 if (max == 0) max = INT_MAX;
4205 minimize = (*ecode == OP_CRMINRANGE);
4206 min = (ecode[1] << 8) + ecode[2];
4207 max = (ecode[3] << 8) + ecode[4];
4208 if (max == 0) max = INT_MAX;
4212 default: /* No repeat follows */
4217 /* First, ensure the minimum number of matches are present. */
4219 for (i = 1; i <= min; i++)
4221 if (eptr >= md->end_subject) return FALSE;
4222 GETCHARINC(c, eptr) /* Get character; increment eptr */
4225 /* We do not yet support class members > 255 */
4226 if (c > 255) return FALSE;
4229 if ((data[c/8] & (1 << (c&7))) != 0) continue;
4233 /* If max == min we can continue with the main loop without the
4236 if (min == max) continue;
4238 /* If minimizing, keep testing the rest of the expression and advancing
4239 the pointer while it matches the class. */
4245 if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
4247 if (i >= max || eptr >= md->end_subject) return FALSE;
4248 GETCHARINC(c, eptr) /* Get character; increment eptr */
4251 /* We do not yet support class members > 255 */
4252 if (c > 255) return FALSE;
4254 if ((data[c/8] & (1 << (c&7))) != 0) continue;
4257 /* Control never gets here */
4260 /* If maximizing, find the longest possible run, then work backwards. */
4264 const uschar *pp = eptr;
4266 for (i = min; i < max; i++)
4268 if (eptr >= md->end_subject) break;
4269 GETCHARLEN(c, eptr, len) /* Get character, set length if UTF-8 */
4272 /* We do not yet support class members > 255 */
4275 if ((data[c/8] & (1 << (c&7))) == 0) break;
4281 if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0))
4291 /* Control never gets here */
4293 /* Match a run of characters */
4297 register int length = ecode[1];
4300 #ifdef DEBUG /* Sigh. Some compilers never learn. */
4301 if (eptr >= md->end_subject)
4302 printf("matching subject <null> against pattern ");
4305 printf("matching subject ");
4306 pchars(eptr, length, TRUE, md);
4307 printf(" against pattern ");
4309 pchars(ecode, length, FALSE, md);
4313 if (length > md->end_subject - eptr) return FALSE;
4314 if ((ims & PCRE_CASELESS) != 0)
4316 while (length-- > 0)
4317 if (md->lcc[*ecode++] != md->lcc[*eptr++])
4322 while (length-- > 0) if (*ecode++ != *eptr++) return FALSE;
4327 /* Match a single character repeatedly; different opcodes share code. */
4330 min = max = (ecode[1] << 8) + ecode[2];
4337 max = (ecode[1] << 8) + ecode[2];
4338 minimize = *ecode == OP_MINUPTO;
4348 c = *ecode++ - OP_STAR;
4349 minimize = (c & 1) != 0;
4350 min = rep_min[c]; /* Pick up values from tables; */
4351 max = rep_max[c]; /* zero for max => infinity */
4352 if (max == 0) max = INT_MAX;
4354 /* Common code for all repeated single-character matches. We can give
4355 up quickly if there are fewer than the minimum number of characters left in
4359 if (min > md->end_subject - eptr) return FALSE;
4362 /* The code is duplicated for the caseless and caseful cases, for speed,
4363 since matching characters is likely to be quite common. First, ensure the
4364 minimum number of matches are present. If min = max, continue at the same
4365 level without recursing. Otherwise, if minimizing, keep trying the rest of
4366 the expression and advancing one matching character if failing, up to the
4367 maximum. Alternatively, if maximizing, find the maximum number of
4368 characters and work backwards. */
4370 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", c, min, max,
4373 if ((ims & PCRE_CASELESS) != 0)
4376 for (i = 1; i <= min; i++)
4377 if (c != md->lcc[*eptr++]) return FALSE;
4378 if (min == max) continue;
4383 if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
4385 if (i >= max || eptr >= md->end_subject ||
4386 c != md->lcc[*eptr++])
4389 /* Control never gets here */
4393 const uschar *pp = eptr;
4394 for (i = min; i < max; i++)
4396 if (eptr >= md->end_subject || c != md->lcc[*eptr]) break;
4400 if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0))
4404 /* Control never gets here */
4407 /* Caseful comparisons */
4411 for (i = 1; i <= min; i++) if (c != *eptr++) return FALSE;
4412 if (min == max) continue;
4417 if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
4419 if (i >= max || eptr >= md->end_subject || c != *eptr++) return FALSE;
4421 /* Control never gets here */
4425 const uschar *pp = eptr;
4426 for (i = min; i < max; i++)
4428 if (eptr >= md->end_subject || c != *eptr) break;
4432 if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0))
4437 /* Control never gets here */
4439 /* Match a negated single character */
4442 if (eptr >= md->end_subject) return FALSE;
4444 if ((ims & PCRE_CASELESS) != 0)
4446 if (md->lcc[*ecode++] == md->lcc[*eptr++]) return FALSE;
4450 if (*ecode++ == *eptr++) return FALSE;
4454 /* Match a negated single character repeatedly. This is almost a repeat of
4455 the code for a repeated single character, but I haven't found a nice way of
4456 commoning these up that doesn't require a test of the positive/negative
4457 option for each character match. Maybe that wouldn't add very much to the
4458 time taken, but character matching *is* what this is all about... */
4461 min = max = (ecode[1] << 8) + ecode[2];
4468 max = (ecode[1] << 8) + ecode[2];
4469 minimize = *ecode == OP_NOTMINUPTO;
4478 case OP_NOTMINQUERY:
4479 c = *ecode++ - OP_NOTSTAR;
4480 minimize = (c & 1) != 0;
4481 min = rep_min[c]; /* Pick up values from tables; */
4482 max = rep_max[c]; /* zero for max => infinity */
4483 if (max == 0) max = INT_MAX;
4485 /* Common code for all repeated single-character matches. We can give
4486 up quickly if there are fewer than the minimum number of characters left in
4490 if (min > md->end_subject - eptr) return FALSE;
4493 /* The code is duplicated for the caseless and caseful cases, for speed,
4494 since matching characters is likely to be quite common. First, ensure the
4495 minimum number of matches are present. If min = max, continue at the same
4496 level without recursing. Otherwise, if minimizing, keep trying the rest of
4497 the expression and advancing one matching character if failing, up to the
4498 maximum. Alternatively, if maximizing, find the maximum number of
4499 characters and work backwards. */
4501 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", c, min, max,
4504 if ((ims & PCRE_CASELESS) != 0)
4507 for (i = 1; i <= min; i++)
4508 if (c == md->lcc[*eptr++]) return FALSE;
4509 if (min == max) continue;
4514 if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
4516 if (i >= max || eptr >= md->end_subject ||
4517 c == md->lcc[*eptr++])
4520 /* Control never gets here */
4524 const uschar *pp = eptr;
4525 for (i = min; i < max; i++)
4527 if (eptr >= md->end_subject || c == md->lcc[*eptr]) break;
4531 if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0))
4535 /* Control never gets here */
4538 /* Caseful comparisons */
4542 for (i = 1; i <= min; i++) if (c == *eptr++) return FALSE;
4543 if (min == max) continue;
4548 if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
4550 if (i >= max || eptr >= md->end_subject || c == *eptr++) return FALSE;
4552 /* Control never gets here */
4556 const uschar *pp = eptr;
4557 for (i = min; i < max; i++)
4559 if (eptr >= md->end_subject || c == *eptr) break;
4563 if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0))
4568 /* Control never gets here */
4570 /* Match a single character type repeatedly; several different opcodes
4571 share code. This is very similar to the code for single characters, but we
4572 repeat it in the interests of efficiency. */
4575 min = max = (ecode[1] << 8) + ecode[2];
4581 case OP_TYPEMINUPTO:
4583 max = (ecode[1] << 8) + ecode[2];
4584 minimize = *ecode == OP_TYPEMINUPTO;
4589 case OP_TYPEMINSTAR:
4591 case OP_TYPEMINPLUS:
4593 case OP_TYPEMINQUERY:
4594 c = *ecode++ - OP_TYPESTAR;
4595 minimize = (c & 1) != 0;
4596 min = rep_min[c]; /* Pick up values from tables; */
4597 max = rep_max[c]; /* zero for max => infinity */
4598 if (max == 0) max = INT_MAX;
4600 /* Common code for all repeated single character type matches */
4603 ctype = *ecode++; /* Code for the character type */
4605 /* First, ensure the minimum number of matches are present. Use inline
4606 code for maximizing the speed, and do the type test once at the start
4607 (i.e. keep it out of the loop). Also we can test that there are at least
4608 the minimum number of bytes before we start, except when doing '.' in
4609 UTF8 mode. Leave the test in in all cases; in the special case we have
4610 to test after each character. */
4612 if (min > md->end_subject - eptr) return FALSE;
4613 if (min > 0) switch(ctype)
4619 for (i = 1; i <= min; i++)
4621 if (eptr >= md->end_subject ||
4622 (*eptr++ == NEWLINE && (ims & PCRE_DOTALL) == 0))
4624 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4629 /* Non-UTF8 can be faster */
4630 if ((ims & PCRE_DOTALL) == 0)
4631 { for (i = 1; i <= min; i++) if (*eptr++ == NEWLINE) return FALSE; }
4636 for (i = 1; i <= min; i++)
4637 if ((md->ctypes[*eptr++] & ctype_digit) != 0) return FALSE;
4641 for (i = 1; i <= min; i++)
4642 if ((md->ctypes[*eptr++] & ctype_digit) == 0) return FALSE;
4645 case OP_NOT_WHITESPACE:
4646 for (i = 1; i <= min; i++)
4647 if ((md->ctypes[*eptr++] & ctype_space) != 0) return FALSE;
4651 for (i = 1; i <= min; i++)
4652 if ((md->ctypes[*eptr++] & ctype_space) == 0) return FALSE;
4655 case OP_NOT_WORDCHAR:
4656 for (i = 1; i <= min; i++)
4657 if ((md->ctypes[*eptr++] & ctype_word) != 0)
4662 for (i = 1; i <= min; i++)
4663 if ((md->ctypes[*eptr++] & ctype_word) == 0)
4668 /* If min = max, continue at the same level without recursing */
4670 if (min == max) continue;
4672 /* If minimizing, we have to test the rest of the pattern before each
4673 subsequent match. */
4679 if (match(eptr, ecode, offset_top, md, ims, eptrb, 0)) return TRUE;
4680 if (i >= max || eptr >= md->end_subject) return FALSE;
4686 if ((ims & PCRE_DOTALL) == 0 && c == NEWLINE) return FALSE;
4689 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4694 if ((md->ctypes[c] & ctype_digit) != 0) return FALSE;
4698 if ((md->ctypes[c] & ctype_digit) == 0) return FALSE;
4701 case OP_NOT_WHITESPACE:
4702 if ((md->ctypes[c] & ctype_space) != 0) return FALSE;
4706 if ((md->ctypes[c] & ctype_space) == 0) return FALSE;
4709 case OP_NOT_WORDCHAR:
4710 if ((md->ctypes[c] & ctype_word) != 0) return FALSE;
4714 if ((md->ctypes[c] & ctype_word) == 0) return FALSE;
4718 /* Control never gets here */
4721 /* If maximizing it is worth using inline code for speed, doing the type
4722 test once at the start (i.e. keep it out of the loop). */
4726 const uschar *pp = eptr;
4731 /* Special code is required for UTF8, but when the maximum is unlimited
4732 we don't need it. */
4735 if (md->utf8 && max < INT_MAX)
4737 if ((ims & PCRE_DOTALL) == 0)
4739 for (i = min; i < max; i++)
4741 if (eptr >= md->end_subject || *eptr++ == NEWLINE) break;
4742 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4747 for (i = min; i < max; i++)
4750 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4756 /* Non-UTF8 can be faster */
4757 if ((ims & PCRE_DOTALL) == 0)
4759 for (i = min; i < max; i++)
4761 if (eptr >= md->end_subject || *eptr == NEWLINE) break;
4768 if (c > md->end_subject - eptr) c = md->end_subject - eptr;
4774 for (i = min; i < max; i++)
4776 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) != 0)
4783 for (i = min; i < max; i++)
4785 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) == 0)
4791 case OP_NOT_WHITESPACE:
4792 for (i = min; i < max; i++)
4794 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) != 0)
4801 for (i = min; i < max; i++)
4803 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) == 0)
4809 case OP_NOT_WORDCHAR:
4810 for (i = min; i < max; i++)
4812 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) != 0)
4819 for (i = min; i < max; i++)
4821 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) == 0)
4830 if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0))
4834 while (eptr > pp && (*eptr & 0xc0) == 0x80) eptr--;
4839 /* Control never gets here */
4841 /* There's been some horrible disaster. */
4844 DPRINTF(("Unknown opcode %d\n", *ecode));
4845 md->errorcode = PCRE_ERROR_UNKNOWN_NODE;
4849 /* Do not stick any code in here without much thought; it is assumed
4850 that "continue" in the code above comes out to here to repeat the main
4853 } /* End of main loop */
4854 /* Control never reaches here */
4860 /*************************************************
4861 * Execute a Regular Expression *
4862 *************************************************/
4864 /* This function applies a compiled re to a subject string and picks out
4865 portions of the string if it matches. Two elements in the vector are set for
4866 each substring: the offsets to the start and end of the substring.

Arguments:
4869 external_re points to the compiled expression
4870 external_extra points to "hints" from pcre_study() or is NULL
4871 subject points to the subject string
4872 length length of subject string (may contain binary zeros)
4873 start_offset where to start in the subject string
options option bits for this call
4875 offsets points to a vector of ints to be filled in with offsets
4876 offsetcount the number of elements in the vector

4878 Returns: > 0 => success; value is the number of elements filled in
4879 = 0 => success, but offsets is not big enough
4880 -1 => failed to match
4881 < -1 => some kind of unexpected problem
*/
/* pcre_exec: run the compiled pattern external_re against subject, starting  */
/* the search at subject + start_offset, and report matched substrings as     */
/* pairs of offsets in offsets[].                                             */
/*                                                                            */
/* Results visible in this block:                                             */
/*   PCRE_ERROR_BADOPTION  options has bits outside PUBLIC_EXEC_OPTIONS       */
/*   PCRE_ERROR_NULL       re or subject is NULL, or offsets is NULL while    */
/*                         offsetcount > 0                                    */
/*   PCRE_ERROR_BADMAGIC   re->magic_number is not MAGIC_NUMBER               */
/*   PCRE_ERROR_NOMEMORY   pcre_malloc returned NULL for the temporary vector */
/*   rc                    after a successful attempt: 0 if the offsets       */
/*                         overflowed or offsetcount < 2, otherwise           */
/*                         end_offset_top/2                                   */
/*   match_block.errorcode otherwise (preset to PCRE_ERROR_NOMATCH)           */
/*                                                                            */
/* NOTE(review): the embedded source line numbers skip values, so some lines  */
/* (several declarations, braces, loop keywords and the statement returning   */
/* rc) are not visible in this listing. The comments added here describe only */
/* what the visible lines do.                                                 */
4885 pcre_exec(const pcre *external_re, const pcre_extra *external_extra,
4886 const char *subject, int length, int start_offset, int options, int *offsets,
4889 int resetcount, ocount;
4890 int first_char = -1;
4893 unsigned long int ims = 0;
4894 match_data match_block;
4895 const uschar *start_bits = NULL;
4896 const uschar *start_match = (const uschar *)subject + start_offset;
4897 const uschar *end_subject;
/* Starts one below start_match so that the first p > req_char_ptr test in    */
/* the required-character search succeeds. In the visible lines this pointer  */
/* is only compared, never dereferenced.                                      */
4898 const uschar *req_char_ptr = start_match - 1;
4899 const real_pcre *re = (const real_pcre *)external_re;
4900 const real_pcre_extra *extra = (const real_pcre_extra *)external_extra;
/* Becomes TRUE only when the offset vector is obtained from pcre_malloc.     */
4901 BOOL using_temporary_offsets = FALSE;
/* Refuse option bits that are not in PUBLIC_EXEC_OPTIONS.                    */
4905 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
/* offsets is allowed to be NULL only when offsetcount is not positive.       */
4907 if (re == NULL || subject == NULL ||
4908 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
/* magic_number is checked before any other field of re is read.              */
4909 if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
/* Anchoring may come from the compiled options or from the options of this   */
/* call; startline is taken from the compiled options only.                   */
4911 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
4912 startline = (re->options & PCRE_STARTLINE) != 0;
/* Record the pattern start and the subject bounds in the match block;        */
/* end_subject is also kept in a local for the scanning loops further down.   */
4914 match_block.start_pattern = re->code;
4915 match_block.start_subject = (const uschar *)subject;
4916 match_block.end_subject = match_block.start_subject + length;
4917 end_subject = match_block.end_subject;
/* endonly and utf8 are derived from the compiled options ...                 */
4919 match_block.endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
4920 match_block.utf8 = (re->options & PCRE_UTF8) != 0;
/* ... while notbol, noteol and notempty are derived from the call options.   */
4922 match_block.notbol = (options & PCRE_NOTBOL) != 0;
4923 match_block.noteol = (options & PCRE_NOTEOL) != 0;
4924 match_block.notempty = (options & PCRE_NOTEMPTY) != 0;
4926 match_block.errorcode = PCRE_ERROR_NOMATCH; /* Default error */
/* Both tables are located inside re->tables, at lcc_offset and ctypes_offset. */
4928 match_block.lcc = re->tables + lcc_offset;
4929 match_block.ctypes = re->tables + ctypes_offset;
4931 /* The ims options can vary during the matching as a result of the presence
4932 of (?ims) items in the pattern. They are kept in a local variable so that
4933 restoring at the exit of a group is easy. */
4935 ims = re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL);
/* Summary of the offset vector set-up that follows:                          */
/*  - ocount starts as offsetcount rounded down to a multiple of 3.           */
/*  - If top_backref > 0 and top_backref >= ocount/3, ocount becomes          */
/*    top_backref*3 + 3 and a vector of that many ints is taken from          */
/*    pcre_malloc (NULL result returns PCRE_ERROR_NOMEMORY); otherwise the    */
/*    offsets vector passed in is used directly.                              */
/*  - offset_end is ocount, offset_max is (2*ocount)/3, and the overflow flag */
/*    is cleared.                                                             */
/*  - resetcount is 2 + top_bracket*2, replaced by ocount when it exceeds     */
/*    offsetcount.                                                            */
4937 /* If the expression has got more back references than the offsets supplied can
4938 hold, we get a temporary bit of working store to use during the matching.
4939 Otherwise, we can use the vector supplied, rounding down its size to a multiple
4942 ocount = offsetcount - (offsetcount % 3);
4944 if (re->top_backref > 0 && re->top_backref >= ocount/3)
4946 ocount = re->top_backref * 3 + 3;
4947 match_block.offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
4948 if (match_block.offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
4949 using_temporary_offsets = TRUE;
4950 DPRINTF(("Got memory to hold back references\n"));
4952 else match_block.offset_vector = offsets;
4954 match_block.offset_end = ocount;
4955 match_block.offset_max = (2*ocount)/3;
4956 match_block.offset_overflow = FALSE;
4958 /* Compute the minimum number of offsets that we need to reset each time. Doing
4959 this makes a huge difference to execution time when there aren't many brackets
4962 resetcount = 2 + re->top_bracket * 2;
4963 if (resetcount > offsetcount) resetcount = ocount;
4965 /* Reset the working variable associated with each extraction. These should
4966 never be used unless previously set, but they get saved and restored, and so we
4967 initialize them to avoid reading uninitialized locations. */
/* Walking down from offset_vector + ocount, this stores -1 in the last       */
/* resetcount/2 - 1 ints of the vector; the NULL test guards the case where   */
/* no vector is available.                                                    */
4969 if (match_block.offset_vector != NULL)
4971 register int *iptr = match_block.offset_vector + ocount;
4972 register int *iend = iptr - resetcount/2 + 1;
4973 while (--iptr >= iend) *iptr = -1;
4976 /* Set up the first character to match, if available. The first_char value is
4977 never set for an anchored regular expression, but the anchoring may be forced
4978 at run time, so we have to test for anchoring. The first char may be unset for
4979 an unanchored pattern, of course. If there's no first char and the pattern was
4980 studied, there may be a bitmap of possible first characters. */
/* first_char is mapped through lcc when matching starts caselessly, so the   */
/* scan in the attempt loop can compare lcc[*start_match] against it.         */
4984 if ((re->options & PCRE_FIRSTSET) != 0)
4986 first_char = re->first_char;
4987 if ((ims & PCRE_CASELESS) != 0) first_char = match_block.lcc[first_char];
/* The study bitmap is picked up only when the pattern is not startline and   */
/* extra is present with PCRE_STUDY_MAPPED set.                               */
4990 if (!startline && extra != NULL &&
4991 (extra->options & PCRE_STUDY_MAPPED) != 0)
4992 start_bits = extra->start_bits;
4995 /* For anchored or unanchored matches, there may be a "last known required
4996 character" set. If the PCRE_CASELESS is set, implying that the match starts
4997 caselessly, or if there are any changes of this flag within the regex, set up
4998 both cases of the character. Otherwise set the two values the same, which will
4999 avoid duplicate testing (which takes significant time). This covers the vast
5000 majority of cases. It will be suboptimal when the case flag changes in a regex
5001 and the required character in fact is caseful. */
/* req_char2 is read from the table at fcc_offset in re->tables (presumably   */
/* the other-case table - TODO confirm) when either flag is set.              */
5003 if ((re->options & PCRE_REQCHSET) != 0)
5005 req_char = re->req_char;
5006 req_char2 = ((re->options & (PCRE_CASELESS | PCRE_ICHANGED)) != 0)?
5007 (re->tables + fcc_offset)[req_char] : req_char;
5010 /* Loop for handling unanchored repeated matching attempts; for anchored regexs
5011 the loop runs just once. */
/* Each attempt first sets the leading resetcount ints of the vector to -1.   */
5016 register int *iptr = match_block.offset_vector;
5017 register int *iend = iptr + resetcount;
5019 /* Reset the maximum number of extractions we might see. */
5021 while (iptr < iend) *iptr++ = -1;
5023 /* Advance to a unique first char if possible */
5025 if (first_char >= 0)
5027 if ((ims & PCRE_CASELESS) != 0)
5028 while (start_match < end_subject &&
5029 match_block.lcc[*start_match] != first_char)
5032 while (start_match < end_subject && *start_match != first_char)
5036 /* Or to just after \n for a multiline match if possible */
/* No scan happens on the first attempt: it applies only once start_match has */
/* moved past subject + start_offset.                                         */
5040 if (start_match > match_block.start_subject + start_offset)
5042 while (start_match < end_subject && start_match[-1] != NEWLINE)
5047 /* Or to a non-unique first char after study */
/* start_bits is read as a bitmap indexed by byte value: byte c/8, bit c&7.   */
5049 else if (start_bits != NULL)
5051 while (start_match < end_subject)
5053 register int c = *start_match;
5054 if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++; else break;
5058 #ifdef DEBUG /* Sigh. Some compilers never learn. */
5059 printf(">>>> Match against: ");
5060 pchars(start_match, end_subject - start_match, TRUE, &match_block);
5064 /* If req_char is set, we know that that character must appear in the subject
5065 for the match to succeed. If the first character is set, req_char must be
5066 later in the subject; otherwise the test starts at the match point. This
5067 optimization can save a huge amount of backtracking in patterns with nested
5068 unlimited repeats that aren't going to match. We don't know what the state of
5069 case matching may be when this character is hit, so test for it in both its
5070 cases if necessary. However, the different cased versions will not be set up
5071 unless PCRE_CASELESS was given or the casing state changes within the regex.
5072 Writing separate code makes it go faster, as does using an autoincrement and
5073 backing off on a match. */
5077 register const uschar *p = start_match + ((first_char >= 0)? 1 : 0);
5079 /* We don't need to repeat the search if we haven't yet reached the
5080 place we found it at last time. */
5082 if (p > req_char_ptr)
5084 /* Do a single test if no case difference is set up */
5086 if (req_char == req_char2)
5088 while (p < end_subject)
5090 if (*p++ == req_char) { p--; break; }
5094 /* Otherwise test for either case */
5098 while (p < end_subject)
5100 register int pp = *p++;
5101 if (pp == req_char || pp == req_char2) { p--; break; }
5105 /* If we can't find the required character, break the matching loop */
/* p reaching end_subject means neither scan above stopped on a match.        */
5107 if (p >= end_subject) break;
5109 /* If we have found the required character, save the point where we
5110 found it, so that we don't search again next time round the loop if
5111 the start hasn't passed this character yet. */
5117 /* When a match occurs, substrings will be set for all internal extractions;
5118 we just need to set up the whole thing as substring 0 before returning. If
5119 there were too many extractions, set the return code to zero. In the case
5120 where we had to get some local store to hold offsets for backreferences, copy
5121 those back references that we can. In this case there need not be overflow
5122 if certain parts of the pattern were not used. */
/* match() is entered at the start of the compiled code with offset_top = 2,  */
/* a NULL eptrb and the match_isgroup flag (argument order as in the match()  */
/* calls earlier in this file).                                               */
/* NOTE(review): the statement controlled by this if is on a line not shown.  */
5124 match_block.start_match = start_match;
5125 if (!match(start_match, re->code, 2, &match_block, ims, NULL, match_isgroup))
5128 /* Copy the offset information from temporary store if necessary */
5130 if (using_temporary_offsets)
/* Slots 0 and 1 are written explicitly further down, so only offsets[2]      */
/* onward are copied, and only when at least 4 ints were supplied.            */
5132 if (offsetcount >= 4)
5134 memcpy(offsets + 2, match_block.offset_vector + 2,
5135 (offsetcount - 2) * sizeof(int));
5136 DPRINTF(("Copied offsets from temporary memory\n"));
/* More offsets were set than the supplied vector can hold: flag overflow.    */
5138 if (match_block.end_offset_top > offsetcount)
5139 match_block.offset_overflow = TRUE;
5141 DPRINTF(("Freeing temporary memory\n"));
5142 (pcre_free)(match_block.offset_vector);
/* rc is 0 on overflow, otherwise half of end_offset_top.                     */
5145 rc = match_block.offset_overflow? 0 : match_block.end_offset_top/2;
/* With fewer than 2 ints there is no room for the whole-match pair, so rc is */
/* forced to 0; otherwise the pair is stored relative to start_subject.       */
5147 if (offsetcount < 2) rc = 0; else
5149 offsets[0] = start_match - match_block.start_subject;
5150 offsets[1] = match_block.end_match_ptr - match_block.start_subject;
5153 DPRINTF((">>>> returning %d\n", rc));
5157 /* This "while" is the end of the "do" above */
/* Visible part of the loop condition: continue only while errorcode is still */
/* PCRE_ERROR_NOMATCH and start_match, tested before its increment, is below  */
/* end_subject, so an attempt can still start at end_subject itself.          */
/* NOTE(review): the first line of this condition is not shown.               */
5160 match_block.errorcode == PCRE_ERROR_NOMATCH &&
5161 start_match++ < end_subject);
/* Failure path: release the temporary vector before returning the code.      */
5163 if (using_temporary_offsets)
5165 DPRINTF(("Freeing temporary memory\n"));
5166 (pcre_free)(match_block.offset_vector);
5169 DPRINTF((">>>> returning %d\n", match_block.errorcode));
5171 return match_block.errorcode;