pcre/pcre.c

   1 /*************************************************
   2 *      Perl-Compatible Regular Expressions       *
   3 *************************************************/
   4
   5 /*
   6 This is a library of functions to support regular expressions whose syntax
   7 and semantics are as close as possible to those of the Perl 5 language. See
   8 the file Tech.Notes for some information on the internals.
   9
  10 Written by: Philip Hazel <ph10@cam.ac.uk>
  11
  12            Copyright (c) 1997-2001 University of Cambridge
  13
  14 -----------------------------------------------------------------------------
  15 Permission is granted to anyone to use this software for any purpose on any
  16 computer system, and to redistribute it freely, subject to the following
  17 restrictions:
  18
  19 1. This software is distributed in the hope that it will be useful,
  20    but WITHOUT ANY WARRANTY; without even the implied warranty of
  21    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  22
  23 2. The origin of this software must not be misrepresented, either by
  24    explicit claim or by omission.
  25
  26 3. Altered versions must be plainly marked as such, and must not be
  27    misrepresented as being the original software.
  28
  29 4. If PCRE is embedded in any software that is released under the GNU
  30    General Purpose Licence (GPL), then the terms of that licence shall
  31    supersede any condition above with which it is incompatible.
  32 -----------------------------------------------------------------------------
  33 */
  34
  35
  36 /* Define DEBUG to get debugging output on stdout. */
  37
  38 /* #define DEBUG */
  39
/* Use a macro for debugging printing, because that eliminates the use of
#ifdef inline, and there are still compilers about that don't like indented
pre-processor statements.

The expansion is simply "printf p", so the argument must be a complete,
parenthesized printf argument list: write DPRINTF(("n=%d\n", n)) with doubled
parentheses. When DEBUG is not defined the whole call expands to nothing, so
the arguments are not evaluated at all. */

#ifdef DEBUG
#define DPRINTF(p) printf p
#else
#define DPRINTF(p) /*nothing*/
#endif
  49
  50 /* Include the internals header, which itself includes Standard C headers plus
  51 the external pcre header. */
  52
  53 #include "internal.h"
  54
  55
/* Allow compilation as C++ source code, should anybody want to do that. When
compiling as C++ the pre-processor replaces the word "class" by the ordinary
identifier pcre_class. NOTE(review): presumably this is because code later in
this file uses "class" as an identifier - confirm. */

#ifdef __cplusplus
#define class pcre_class
#endif
  61
  62
/* Maximum number of items on the nested bracket stacks at compile time. This
applies to the nesting of all kinds of parentheses. It does not limit
un-nested, non-capturing parentheses. This number can be made bigger if
necessary - it is used to dimension one int and one unsigned char vector at
compile time, so each extra level costs one int plus one byte. */

#define BRASTACK_SIZE 200
  70
  71
/* The number of bytes in a literal character string above which we can't add
any more is different when UTF-8 characters may be encountered: it is 250 when
PCRE is built with SUPPORT_UTF8 and 255 otherwise. NOTE(review): the lower
UTF-8 value presumably leaves room for a final multi-byte character - confirm
against the code that uses MAXLIT. */

#ifdef SUPPORT_UTF8
#define MAXLIT 250
#else
#define MAXLIT 255
#endif
  80
  81
/* Min and max values for the common repeats; for the maxima, 0 => infinity.
Each table has six entries. NOTE(review): the index order appears to be
*, *?, +, +?, ?, ?? (the order in which these repeats are listed in OP_names) -
confirm against the code that indexes the tables. */

static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
  86
/* Text forms of OP_ values and things, for debugging (not all used). The table
exists only when DEBUG is defined. NOTE(review): the entries are presumably
indexed by opcode value, in which case their order must be kept in step with
the OP_ definitions in internal.h - confirm before adding or removing any. */

#ifdef DEBUG
static const char *OP_names[] = {
  "End", "\\A", "\\B", "\\b", "\\D", "\\d",
  "\\S", "\\s", "\\W", "\\w", "\\Z", "\\z",
  "Opt", "^", "$", "Any", "chars", "not",
  "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
  "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
  "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
  "*", "*?", "+", "+?", "?", "??", "{", "{",
  "class", "Ref", "Recurse",
  "Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not",
  "AssertB", "AssertB not", "Reverse", "Once", "Cond", "Cref",
  "Brazero", "Braminzero", "Branumber", "Bra"
};
#endif
 104
/* Table for handling escaped characters in the range '0'-'z'. The table is
indexed by (character - '0'); the comment at the end of each row shows which
characters that row covers. Positive returns are simple data values; negative
values are for special things like \d and so on. Zero means further processing
is needed (for things like \x), or the escape is invalid. Punctuation
characters in the range map to themselves, and the entry for 'a' is 7.
NOTE(review): the positive ESC_E, ESC_F, ESC_N, ESC_R and ESC_T entries are
presumably the character codes for \e, \f, \n, \r and \t, defined in
internal.h - confirm there. */

static const short int escapes[] = {
    0,      0,      0,      0,      0,      0,      0,      0,   /* 0 - 7 */
    0,      0,    ':',    ';',    '<',    '=',    '>',    '?',   /* 8 - ? */
  '@', -ESC_A, -ESC_B,      0, -ESC_D,      0,      0,      0,   /* @ - G */
    0,      0,      0,      0,      0,      0,      0,      0,   /* H - O */
    0,      0,      0, -ESC_S,      0,      0,      0, -ESC_W,   /* P - W */
    0,      0, -ESC_Z,    '[',   '\\',    ']',    '^',    '_',   /* X - _ */
  '`',      7, -ESC_b,      0, -ESC_d,  ESC_E,  ESC_F,      0,   /* ` - g */
    0,      0,      0,      0,      0,      0,  ESC_N,      0,   /* h - o */
    0,      0,  ESC_R, -ESC_s,  ESC_T,      0,      0, -ESC_w,   /* p - w */
    0,      0, -ESC_z                                            /* x - z */
};
 122
/* Tables of names of POSIX character classes and their lengths. The two tables
run in parallel: posix_name_lengths[i] is the length of posix_names[i]. The
list is terminated by a zero length entry in posix_name_lengths (posix_names
itself has no terminator). The first three entries are alpha, lower, upper, and
these three must stay at the front, as this is assumed for handling case
independence. */

static const char *posix_names[] = {
  "alpha", "lower", "upper",
  "alnum", "ascii", "cntrl", "digit", "graph",
  "print", "punct", "space", "word",  "xdigit" };

static const uschar posix_name_lengths[] = {
  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
 134
/* Table of class bit maps for each POSIX class; up to three may be combined
to form the class. There are three entries per class, and the classes appear
in the same order as in posix_names (see the comment at the end of each row).
NOTE(review): -1 presumably marks an unused slot - confirm against the code
that reads this table. */

static const int posix_class_maps[] = {
  cbit_lower, cbit_upper, -1,             /* alpha */
  cbit_lower, -1,         -1,             /* lower */
  cbit_upper, -1,         -1,             /* upper */
  cbit_digit, cbit_lower, cbit_upper,     /* alnum */
  cbit_print, cbit_cntrl, -1,             /* ascii */
  cbit_cntrl, -1,         -1,             /* cntrl */
  cbit_digit, -1,         -1,             /* digit */
  cbit_graph, -1,         -1,             /* graph */
  cbit_print, -1,         -1,             /* print */
  cbit_punct, -1,         -1,             /* punct */
  cbit_space, -1,         -1,             /* space */
  cbit_word,  -1,         -1,             /* word */
  cbit_xdigit,-1,         -1              /* xdigit */
};
 153
 154
/* Forward declaration to allow mutual recursion: compile_regex() is declared
here, ahead of its definition, so that functions appearing earlier in the file
can call it. The prototype gives parameter types only; see the definition for
the meaning of each argument. */

static BOOL
  compile_regex(int, int, int *, uschar **, const uschar **, const char **,
    BOOL, int, int *, int *, compile_data *);
 160
/* Structure for building a chain of data that actually lives on the
stack, for holding the values of the subject pointer at the start of each
subpattern, so as to detect when an empty string has been matched by a
subpattern - to break infinite loops. */

typedef struct eptrblock {
  struct eptrblock *prev;        /* Link to the preceding block in the chain */
  const uschar *saved_eptr;      /* Subject pointer saved at subpattern start */
} eptrblock;
 170
/* Flag bits for the match() function. Each flag is a separate bit, so the
values can be combined with | and tested with &. */

#define match_condassert   0x01    /* Called to check a condition assertion */
#define match_isgroup      0x02    /* Set if start of bracketed group */
 175
 176
 177
 178 /*************************************************
 179 *               Global variables                 *
 180 *************************************************/
 181
/* PCRE is thread-clean and doesn't use any global variables in the normal
sense. However, it calls memory allocation and free functions via the two
indirections below, which can be changed by the caller, but are shared
between all threads. They are initialized to the standard malloc() and free()
functions. */

void *(*pcre_malloc)(size_t) = malloc;
void  (*pcre_free)(void *) = free;
 189
 190
 191
 192 /*************************************************
 193 *    Macros and tables for character handling    *
 194 *************************************************/
 195
/* When UTF-8 encoding is being used, a character is no longer just a single
byte. The macros for character handling generate simple sequences when used in
byte-mode, and more complicated ones for UTF-8 characters.

Notes for users of these macros:

  . Each one expands to complete statements, not to an expression, and is not
    wrapped in do { } while (0). Use it as a statement of its own inside
    braces, never as the unbraced body of an if or an else.

  . The arguments appear more than once in the expansions, so they must be
    plain variables without side effects.

  . The UTF-8 versions of GETCHARINC and GETCHARLEN refer to md->utf8, so a
    variable called md must be in scope wherever they are used.

  . In a byte-mode build GETCHARLEN does not touch len at all, and BACKCHAR
    expands to nothing. */

#ifndef SUPPORT_UTF8
#define GETCHARINC(c, eptr) c = *eptr++;
#define GETCHARLEN(c, eptr, len) c = *eptr;
#define BACKCHAR(eptr)

#else   /* SUPPORT_UTF8 */

/* Get the next UTF-8 character, advancing the pointer. When md->utf8 is set, a
first byte with its top two bits set introduces a multi-byte character:
utf8_table4 gives the number of additional bytes, utf8_table3 supplies the mask
for the data bits in the first byte, and each additional byte contributes its
low six bits. No check is made that the additional bytes are present or well
formed. */

#define GETCHARINC(c, eptr) \
  c = *eptr++; \
  if (md->utf8 && (c & 0xc0) == 0xc0) \
    { \
    int a = utf8_table4[c & 0x3f];  /* Number of additional bytes */ \
    int s = 6*a; \
    c = (c & utf8_table3[a]) << s; \
    while (a-- > 0) \
      { \
      s -= 6; \
      c |= (*eptr++ & 0x3f) << s; \
      } \
    }

/* Get the next UTF-8 character, not advancing the pointer, setting length. The
decoding is the same as for GETCHARINC; len is set to the total number of bytes
in the character (1 for a single-byte character). */

#define GETCHARLEN(c, eptr, len) \
  c = *eptr; \
  len = 1; \
  if (md->utf8 && (c & 0xc0) == 0xc0) \
    { \
    int i; \
    int a = utf8_table4[c & 0x3f];  /* Number of additional bytes */ \
    int s = 6*a; \
    c = (c & utf8_table3[a]) << s; \
    for (i = 1; i <= a; i++) \
      { \
      s -= 6; \
      c |= (eptr[i] & 0x3f) << s; \
      } \
    len += a; \
    }

/* If the pointer is not at the start of a character, move it back until
it is. Bytes of the form 10xxxxxx are treated as continuation bytes and are
stepped over; there is no check against the start of the buffer, and md->utf8
is not tested. */

#define BACKCHAR(eptr) while((*eptr & 0xc0) == 0x80) eptr--;

#endif
 248
 249
 250
 251 /*************************************************
 252 *             Default character tables           *
 253 *************************************************/
 254
 255 /* A default set of character tables is included in the PCRE binary. Its source
 256 is built by the maketables auxiliary program, which uses the default C ctypes
 257 functions, and put in the file chartables.c. These tables are used by PCRE
 258 whenever the caller of pcre_compile() does not provide an alternate set of
 259 tables. */
 260
 261 #include "chartables.c"
 262
 263
 264
 265 #ifdef SUPPORT_UTF8
 266 /*************************************************
 267 *           Tables for UTF-8 support             *
 268 *************************************************/
 269
/* These are the breakpoints for different numbers of bytes in a UTF-8
character: entry i is the largest character value that can be encoded using i
additional bytes, that is, in i+1 bytes altogether. */

static int utf8_table1[] = { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff};

/* These are the indicator bits and the mask for the data bits to set in the
first byte of a character, indexed by the number of additional bytes.
utf8_table2 holds the indicator bits that are ORed into the first byte when
encoding; utf8_table3 holds the mask that extracts the data bits from the
first byte when decoding. */

static int utf8_table2[] = { 0,    0xc0, 0xe0, 0xf0, 0xf8, 0xfc};
static int utf8_table3[] = { 0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01};

/* Table of the number of extra bytes, indexed by the first byte of a
multi-byte character masked with 0x3f (so there are 64 entries). The highest
number for a valid UTF-8 character is in fact 0x3d. */

static uschar utf8_table4[] = {
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
  3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 };
 290
 291
 292 /*************************************************
 293 *       Convert character value to UTF-8         *
 294 *************************************************/
 295
 296 /* This function takes an integer value in the range 0 - 0x7fffffff
 297 and encodes it as a UTF-8 character in 0 to 6 bytes.
 298
 299 Arguments:
 300   cvalue     the character value
 301   buffer     pointer to buffer for result - at least 6 bytes long
 302
 303 Returns:     number of characters placed in the buffer
 304 */
 305
 306 static int
 307 ord2utf8(int cvalue, uschar *buffer)
 308 {
 309 register int i, j;
 310 for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++)
 311   if (cvalue <= utf8_table1[i]) break;
 312 buffer += i;
 313 for (j = i; j > 0; j--)
 314  {
 315  *buffer-- = 0x80 | (cvalue & 0x3f);
 316  cvalue >>= 6;
 317  }
 318 *buffer = utf8_table2[i] | cvalue;
 319 return i + 1;
 320 }
 321 #endif
 322
 323
 324
 325 /*************************************************
 326 *          Return version string                 *
 327 *************************************************/
 328
 329 #define STRING(a)  # a
 330 #define XSTRING(s) STRING(s)
 331
 332 const char *
 333 pcre_version(void)
 334 {
 335 return XSTRING(PCRE_MAJOR) "." XSTRING(PCRE_MINOR) " " XSTRING(PCRE_DATE);
 336 }
 337
 338
 339
 340
 341 /*************************************************
 342 * (Obsolete) Return info about compiled pattern  *
 343 *************************************************/
 344
 345 /* This is the original "info" function. It picks potentially useful data out
 346 of the private structure, but its interface was too rigid. It remains for
 347 backwards compatibility. The public options are passed back in an int - though
 348 the re->options field has been expanded to a long int, all the public options
 349 at the low end of it, and so even on 16-bit systems this will still be OK.
 350 Therefore, I haven't changed the API for pcre_info().
 351
 352 Arguments:
 353   external_re   points to compiled code
 354   optptr        where to pass back the options
 355   first_char    where to pass back the first character,
 356                 or -1 if multiline and all branches start ^,
 357                 or -2 otherwise
 358
 359 Returns:        number of capturing subpatterns
 360                 or negative values on error
 361 */
 362
 363 int
 364 pcre_info(const pcre *external_re, int *optptr, int *first_char)
 365 {
 366 const real_pcre *re = (const real_pcre *)external_re;
 367 if (re == NULL) return PCRE_ERROR_NULL;
 368 if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
 369 if (optptr != NULL) *optptr = (int)(re->options & PUBLIC_OPTIONS);
 370 if (first_char != NULL)
 371   *first_char = ((re->options & PCRE_FIRSTSET) != 0)? re->first_char :
 372      ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
 373 return re->top_bracket;
 374 }
 375
 376
 377
 378 /*************************************************
 379 *        Return info about compiled pattern      *
 380 *************************************************/
 381
 382 /* This is a newer "info" function which has an extensible interface so
 383 that additional items can be added compatibly.
 384
 385 Arguments:
 386   external_re      points to compiled code
 387   external_study   points to study data, or NULL
 388   what             what information is required
 389   where            where to put the information
 390
 391 Returns:           0 if data returned, negative on error
 392 */
 393
 394 int
 395 pcre_fullinfo(const pcre *external_re, const pcre_extra *study_data, int what,
 396   void *where)
 397 {
 398 const real_pcre *re = (const real_pcre *)external_re;
 399 const real_pcre_extra *study = (const real_pcre_extra *)study_data;
 400
 401 if (re == NULL || where == NULL) return PCRE_ERROR_NULL;
 402 if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
 403
 404 switch (what)
 405   {
 406   case PCRE_INFO_OPTIONS:
 407   *((unsigned long int *)where) = re->options & PUBLIC_OPTIONS;
 408   break;
 409
 410   case PCRE_INFO_SIZE:
 411   *((size_t *)where) = re->size;
 412   break;
 413
 414   case PCRE_INFO_CAPTURECOUNT:
 415   *((int *)where) = re->top_bracket;
 416   break;
 417
 418   case PCRE_INFO_BACKREFMAX:
 419   *((int *)where) = re->top_backref;
 420   break;
 421
 422   case PCRE_INFO_FIRSTCHAR:
 423   *((int *)where) =
 424     ((re->options & PCRE_FIRSTSET) != 0)? re->first_char :
 425     ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
 426   break;
 427
 428   case PCRE_INFO_FIRSTTABLE:
 429   *((const uschar **)where) =
 430     (study != NULL && (study->options & PCRE_STUDY_MAPPED) != 0)?
 431       study->start_bits : NULL;
 432   break;
 433
 434   case PCRE_INFO_LASTLITERAL:
 435   *((int *)where) =
 436     ((re->options & PCRE_REQCHSET) != 0)? re->req_char : -1;
 437   break;
 438
 439   default: return PCRE_ERROR_BADOPTION;
 440   }
 441
 442 return 0;
 443 }
 444
 445
 446
 447 #ifdef DEBUG
 448 /*************************************************
 449 *        Debugging function to print chars       *
 450 *************************************************/
 451
 452 /* Print a sequence of chars in printable format, stopping at the end of the
 453 subject if the requested.
 454
 455 Arguments:
 456   p           points to characters
 457   length      number to print
 458   is_subject  TRUE if printing from within md->start_subject
 459   md          pointer to matching data block, if is_subject is TRUE
 460
 461 Returns:     nothing
 462 */
 463
 464 static void
 465 pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
 466 {
 467 int c;
 468 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
 469 while (length-- > 0)
 470   if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
 471 }
 472 #endif
 473
 474
 475
 476
 477 /*************************************************
 478 *            Handle escapes                      *
 479 *************************************************/
 480
 481 /* This function is called when a \ has been encountered. It either returns a
 482 positive value for a simple escape such as \n, or a negative value which
 483 encodes one of the more complicated things such as \d. When UTF-8 is enabled,
 484 a positive value greater than 255 may be returned. On entry, ptr is pointing at
 485 the \. On exit, it is on the final character of the escape sequence.
 486
 487 Arguments:
 488   ptrptr     points to the pattern position pointer
 489   errorptr   points to the pointer to the error message
 490   bracount   number of previous extracting brackets
 491   options    the options bits
 492   isclass    TRUE if inside a character class
 493   cd         pointer to char tables block
 494
 495 Returns:     zero or positive => a data character
 496              negative => a special escape sequence
 497              on error, errorptr is set
 498 */
 499
 500 static int
 501 check_escape(const uschar **ptrptr, const char **errorptr, int bracount,
 502   int options, BOOL isclass, compile_data *cd)
 503 {
 504 const uschar *ptr = *ptrptr;
 505 int c, i;
 506
 507 /* If backslash is at the end of the pattern, it's an error. */
 508
 509 c = *(++ptr);
 510 if (c == 0) *errorptr = ERR1;
 511
 512 /* Digits or letters may have special meaning; all others are literals. */
 513
 514 else if (c < '0' || c > 'z') {}
 515
 516 /* Do an initial lookup in a table. A non-zero result is something that can be
 517 returned immediately. Otherwise further processing may be required. */
 518
 519 else if ((i = escapes[c - '0']) != 0) c = i;
 520
 521 /* Escapes that need further processing, or are illegal. */
 522
 523 else
 524   {
 525   const uschar *oldptr;
 526   switch (c)
 527     {
 528     /* The handling of escape sequences consisting of a string of digits
 529     starting with one that is not zero is not straightforward. By experiment,
 530     the way Perl works seems to be as follows:
 531
 532     Outside a character class, the digits are read as a decimal number. If the
 533     number is less than 10, or if there are that many previous extracting
 534     left brackets, then it is a back reference. Otherwise, up to three octal
 535     digits are read to form an escaped byte. Thus \123 is likely to be octal
 536     123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
 537     value is greater than 377, the least significant 8 bits are taken. Inside a
 538     character class, \ followed by a digit is always an octal number. */
 539
 540     case '1': case '2': case '3': case '4': case '5':
 541     case '6': case '7': case '8': case '9':
 542
 543     if (!isclass)
 544       {
 545       oldptr = ptr;
 546       c -= '0';
 547       while ((cd->ctypes[ptr[1]] & ctype_digit) != 0)
 548         c = c * 10 + *(++ptr) - '0';
 549       if (c < 10 || c <= bracount)
 550         {
 551         c = -(ESC_REF + c);
 552         break;
 553         }
 554       ptr = oldptr;      /* Put the pointer back and fall through */
 555       }
 556
 557     /* Handle an octal number following \. If the first digit is 8 or 9, Perl
 558     generates a binary zero byte and treats the digit as a following literal.
 559     Thus we have to pull back the pointer by one. */
 560
 561     if ((c = *ptr) >= '8')
 562       {
 563       ptr--;
 564       c = 0;
 565       break;
 566       }
 567
 568     /* \0 always starts an octal number, but we may drop through to here with a
 569     larger first octal digit. */
 570
 571     case '0':
 572     c -= '0';
 573     while(i++ < 2 && (cd->ctypes[ptr[1]] & ctype_digit) != 0 &&
 574       ptr[1] != '8' && ptr[1] != '9')
 575         c = c * 8 + *(++ptr) - '0';
 576     c &= 255;     /* Take least significant 8 bits */
 577     break;
 578
 579     /* \x is complicated when UTF-8 is enabled. \x{ddd} is a character number
 580     which can be greater than 0xff, but only if the ddd are hex digits. */
 581
 582     case 'x':
 583 #ifdef SUPPORT_UTF8
 584     if (ptr[1] == '{' && (options & PCRE_UTF8) != 0)
 585       {
 586       const uschar *pt = ptr + 2;
 587       register int count = 0;
 588       c = 0;
 589       while ((cd->ctypes[*pt] & ctype_xdigit) != 0)
 590         {
 591         count++;
 592         c = c * 16 + cd->lcc[*pt] -
 593           (((cd->ctypes[*pt] & ctype_digit) != 0)? '0' : 'W');
 594         pt++;
 595         }
 596       if (*pt == '}')
 597         {
 598         if (c < 0 || count > 8) *errorptr = ERR34;
 599         ptr = pt;
 600         break;
 601         }
 602       /* If the sequence of hex digits does not end with '}', then we don't
 603       recognize this construct; fall through to the normal \x handling. */
 604       }
 605 #endif
 606
 607     /* Read just a single hex char */
 608
 609     c = 0;
 610     while (i++ < 2 && (cd->ctypes[ptr[1]] & ctype_xdigit) != 0)
 611       {
 612       ptr++;
 613       c = c * 16 + cd->lcc[*ptr] -
 614         (((cd->ctypes[*ptr] & ctype_digit) != 0)? '0' : 'W');
 615       }
 616     break;
 617
 618     /* Other special escapes not starting with a digit are straightforward */
 619
 620     case 'c':
 621     c = *(++ptr);
 622     if (c == 0)
 623       {
 624       *errorptr = ERR2;
 625       return 0;
 626       }
 627
 628     /* A letter is upper-cased; then the 0x40 bit is flipped */
 629
 630     if (c >= 'a' && c <= 'z') c = cd->fcc[c];
 631     c ^= 0x40;
 632     break;
 633
 634     /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
 635     other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
 636     for Perl compatibility, it is a literal. This code looks a bit odd, but
 637     there used to be some cases other than the default, and there may be again
 638     in future, so I haven't "optimized" it. */
 639
 640     default:
 641     if ((options & PCRE_EXTRA) != 0) switch(c)
 642       {
 643       default:
 644       *errorptr = ERR3;
 645       break;
 646       }
 647     break;
 648     }
 649   }
 650
 651 *ptrptr = ptr;
 652 return c;
 653 }
 654
 655
 656
 657 /*************************************************
 658 *            Check for counted repeat            *
 659 *************************************************/
 660
 661 /* This function is called when a '{' is encountered in a place where it might
 662 start a quantifier. It looks ahead to see if it really is a quantifier or not.
 663 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
 664 where the ddds are digits.
 665
 666 Arguments:
 667   p         pointer to the first char after '{'
 668   cd        pointer to char tables block
 669
 670 Returns:    TRUE or FALSE
 671 */
 672
 673 static BOOL
 674 is_counted_repeat(const uschar *p, compile_data *cd)
 675 {
 676 if ((cd->ctypes[*p++] & ctype_digit) == 0) return FALSE;
 677 while ((cd->ctypes[*p] & ctype_digit) != 0) p++;
 678 if (*p == '}') return TRUE;
 679
 680 if (*p++ != ',') return FALSE;
 681 if (*p == '}') return TRUE;
 682
 683 if ((cd->ctypes[*p++] & ctype_digit) == 0) return FALSE;
 684 while ((cd->ctypes[*p] & ctype_digit) != 0) p++;
 685 return (*p == '}');
 686 }
 687
 688
 689
 690 /*************************************************
 691 *         Read repeat counts                     *
 692 *************************************************/
 693
 694 /* Read an item of the form {n,m} and return the values. This is called only
 695 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
 696 so the syntax is guaranteed to be correct, but we need to check the values.
 697
 698 Arguments:
 699   p          pointer to first char after '{'
 700   minp       pointer to int for min
 701   maxp       pointer to int for max
 702              returned as -1 if no max
 703   errorptr   points to pointer to error message
 704   cd         pointer to character tables clock
 705
 706 Returns:     pointer to '}' on success;
 707              current ptr on error, with errorptr set
 708 */
 709
 710 static const uschar *
 711 read_repeat_counts(const uschar *p, int *minp, int *maxp,
 712   const char **errorptr, compile_data *cd)
 713 {
 714 int min = 0;
 715 int max = -1;
 716
 717 while ((cd->ctypes[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
 718
 719 if (*p == '}') max = min; else
 720   {
 721   if (*(++p) != '}')
 722     {
 723     max = 0;
 724     while((cd->ctypes[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
 725     if (max < min)
 726       {
 727       *errorptr = ERR4;
 728       return p;
 729       }
 730     }
 731   }
 732
 733 /* Do paranoid checks, then fill in the required variables, and pass back the
 734 pointer to the terminating '}'. */
 735
 736 if (min > 65535 || max > 65535)
 737   *errorptr = ERR5;
 738 else
 739   {
 740   *minp = min;
 741   *maxp = max;
 742   }
 743 return p;
 744 }
 745
 746
 747
 748 /*************************************************
 749 *        Find the fixed length of a pattern      *
 750 *************************************************/
 751
 752 /* Scan a pattern and compute the fixed length of subject that will match it,
 753 if the length is fixed. This is needed for dealing with backward assertions.
 754
 755 Arguments:
 756   code     points to the start of the pattern (the bracket)
 757   options  the compiling options
 758
 759 Returns:   the fixed length, or -1 if there is no fixed length
 760 */
 761
 762 static int
 763 find_fixedlength(uschar *code, int options)
 764 {
 765 int length = -1;
 766
 767 register int branchlength = 0;
 768 register uschar *cc = code + 3;
 769
 770 /* Scan along the opcodes for this branch. If we get to the end of the
 771 branch, check the length against that of the other branches. */
 772
 773 for (;;)
 774   {
 775   int d;
 776   register int op = *cc;
 777   if (op >= OP_BRA) op = OP_BRA;
 778
 779   switch (op)
 780     {
 781     case OP_BRA:
 782     case OP_ONCE:
 783     case OP_COND:
 784     d = find_fixedlength(cc, options);
 785     if (d < 0) return -1;
 786     branchlength += d;
 787     do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
 788     cc += 3;
 789     break;
 790
 791     /* Reached end of a branch; if it's a ket it is the end of a nested
 792     call. If it's ALT it is an alternation in a nested call. If it is
 793     END it's the end of the outer call. All can be handled by the same code. */
 794
 795     case OP_ALT:
 796     case OP_KET:
 797     case OP_KETRMAX:
 798     case OP_KETRMIN:
 799     case OP_END:
 800     if (length < 0) length = branchlength;
 801       else if (length != branchlength) return -1;
 802     if (*cc != OP_ALT) return length;
 803     cc += 3;
 804     branchlength = 0;
 805     break;
 806
 807     /* Skip over assertive subpatterns */
 808
 809     case OP_ASSERT:
 810     case OP_ASSERT_NOT:
 811     case OP_ASSERTBACK:
 812     case OP_ASSERTBACK_NOT:
 813     do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
 814     cc += 3;
 815     break;
 816
 817     /* Skip over things that don't match chars */
 818
 819     case OP_REVERSE:
 820     case OP_BRANUMBER:
 821     case OP_CREF:
 822     cc++;
 823     /* Fall through */
 824
 825     case OP_OPT:
 826     cc++;
 827     /* Fall through */
 828
 829     case OP_SOD:
 830     case OP_EOD:
 831     case OP_EODN:
 832     case OP_CIRC:
 833     case OP_DOLL:
 834     case OP_NOT_WORD_BOUNDARY:
 835     case OP_WORD_BOUNDARY:
 836     cc++;
 837     break;
 838
 839     /* Handle char strings. In UTF-8 mode we must count characters, not bytes.
 840     This requires a scan of the string, unfortunately. We assume valid UTF-8
 841     strings, so all we do is reduce the length by one for byte whose bits are
 842     10xxxxxx. */
 843
 844     case OP_CHARS:
 845     branchlength += *(++cc);
 846 #ifdef SUPPORT_UTF8
 847     for (d = 1; d <= *cc; d++)
 848       if ((cc[d] & 0xc0) == 0x80) branchlength--;
 849 #endif
 850     cc += *cc + 1;
 851     break;
 852
 853     /* Handle exact repetitions */
 854
 855     case OP_EXACT:
 856     case OP_TYPEEXACT:
 857     branchlength += (cc[1] << 8) + cc[2];
 858     cc += 4;
 859     break;
 860
 861     /* Handle single-char matchers */
 862
 863     case OP_NOT_DIGIT:
 864     case OP_DIGIT:
 865     case OP_NOT_WHITESPACE:
 866     case OP_WHITESPACE:
 867     case OP_NOT_WORDCHAR:
 868     case OP_WORDCHAR:
 869     case OP_ANY:
 870     branchlength++;
 871     cc++;
 872     break;
 873
 874
 875     /* Check a class for variable quantification */
 876
 877     case OP_CLASS:
 878     cc += 33;
 879
 880     switch (*cc)
 881       {
 882       case OP_CRSTAR:
 883       case OP_CRMINSTAR:
 884       case OP_CRQUERY:
 885       case OP_CRMINQUERY:
 886       return -1;
 887
 888       case OP_CRRANGE:
 889       case OP_CRMINRANGE:
 890       if ((cc[1] << 8) + cc[2] != (cc[3] << 8) + cc[4]) return -1;
 891       branchlength += (cc[1] << 8) + cc[2];
 892       cc += 5;
 893       break;
 894
 895       default:
 896       branchlength++;
 897       }
 898     break;
 899
 900     /* Anything else is variable length */
 901
 902     default:
 903     return -1;
 904     }
 905   }
 906 /* Control never gets here */
 907 }
 908
 909
 910
 911
 912 /*************************************************
 913 *           Check for POSIX class syntax         *
 914 *************************************************/
 915
 916 /* This function is called when the sequence "[:" or "[." or "[=" is
 917 encountered in a character class. It checks whether this is followed by an
 918 optional ^ and then a sequence of letters, terminated by a matching ":]" or
 919 ".]" or "=]".
 920
 921 Argument:
 922   ptr      pointer to the initial [
 923   endptr   where to return the end pointer
 924   cd       pointer to compile data
 925
 926 Returns:   TRUE or FALSE
 927 */
 928
 929 static BOOL
 930 check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
 931 {
 932 int terminator;          /* Don't combine these lines; the Solaris cc */
 933 terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */
 934 if (*(++ptr) == '^') ptr++;
 935 while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
 936 if (*ptr == terminator && ptr[1] == ']')
 937   {
 938   *endptr = ptr;
 939   return TRUE;
 940   }
 941 return FALSE;
 942 }
 943
 944
 945
 946
 947 /*************************************************
 948 *          Check POSIX class name                *
 949 *************************************************/
 950
 951 /* This function is called to check the name given in a POSIX-style class entry
 952 such as [:alnum:].
 953
 954 Arguments:
 955   ptr        points to the first letter
 956   len        the length of the name
 957
 958 Returns:     a value representing the name, or -1 if unknown
 959 */
 960
 961 static int
 962 check_posix_name(const uschar *ptr, int len)
 963 {
 964 register int yield = 0;
 965 while (posix_name_lengths[yield] != 0)
 966   {
 967   if (len == posix_name_lengths[yield] &&
 968     strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;
 969   yield++;
 970   }
 971 return -1;
 972 }
 973
 974
 975
 976
 977 /*************************************************
 978 *           Compile one branch                   *
 979 *************************************************/
 980
 981 /* Scan the pattern, compiling it into the code vector.
 982
 983 Arguments:
 984   options      the option bits
 985   brackets     points to number of extracting brackets used
 986   code         points to the pointer to the current code point
 987   ptrptr       points to the current pattern pointer
 988   errorptr     points to pointer to error message
 989   optchanged   set to the value of the last OP_OPT item compiled
 990   reqchar      set to the last literal character required, else -1
 991   countlits    set to count of mandatory literal characters
 992   cd           contains pointers to tables
 993
 994 Returns:       TRUE on success
 995                FALSE, with *errorptr set on error
 996 */
 997
 998 static BOOL
 999 compile_branch(int options, int *brackets, uschar **codeptr,
1000   const uschar **ptrptr, const char **errorptr, int *optchanged,
1001   int *reqchar, int *countlits, compile_data *cd)
1002 {
1003 int repeat_type, op_type;
1004 int repeat_min, repeat_max;
1005 int bravalue, length;
1006 int greedy_default, greedy_non_default;
1007 int prevreqchar;
1008 int condcount = 0;
1009 int subcountlits = 0;
1010 register int c;
1011 register uschar *code = *codeptr;
1012 uschar *tempcode;
1013 const uschar *ptr = *ptrptr;
1014 const uschar *tempptr;
1015 uschar *previous = NULL;
1016 uschar class[32];
1017
1018 /* Set up the default and non-default settings for greediness */
1019
1020 greedy_default = ((options & PCRE_UNGREEDY) != 0);
1021 greedy_non_default = greedy_default ^ 1;
1022
1023 /* Initialize no required char, and count of literals */
1024
1025 *reqchar = prevreqchar = -1;
1026 *countlits = 0;
1027
1028 /* Switch on next character until the end of the branch */
1029
1030 for (;; ptr++)
1031   {
1032   BOOL negate_class;
1033   int class_charcount;
1034   int class_lastchar;
1035   int newoptions;
1036   int skipbytes;
1037   int subreqchar;
1038
1039   c = *ptr;
1040   if ((options & PCRE_EXTENDED) != 0)
1041     {
1042     if ((cd->ctypes[c] & ctype_space) != 0) continue;
1043     if (c == '#')
1044       {
1045       /* The space before the ; is to avoid a warning on a silly compiler
1046       on the Macintosh. */
1047       while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
1048       continue;
1049       }
1050     }
1051
1052   switch(c)
1053     {
1054     /* The branch terminates at end of string, |, or ). */
1055
1056     case 0:
1057     case '|':
1058     case ')':
1059     *codeptr = code;
1060     *ptrptr = ptr;
1061     return TRUE;
1062
1063     /* Handle single-character metacharacters */
1064
1065     case '^':
1066     previous = NULL;
1067     *code++ = OP_CIRC;
1068     break;
1069
1070     case '$':
1071     previous = NULL;
1072     *code++ = OP_DOLL;
1073     break;
1074
1075     case '.':
1076     previous = code;
1077     *code++ = OP_ANY;
1078     break;
1079
1080     /* Character classes. These always build a 32-byte bitmap of the permitted
1081     characters, except in the special case where there is only one character.
1082     For negated classes, we build the map as usual, then invert it at the end.
1083     */
1084
1085     case '[':
1086     previous = code;
1087     *code++ = OP_CLASS;
1088
1089     /* If the first character is '^', set the negation flag and skip it. */
1090
1091     if ((c = *(++ptr)) == '^')
1092       {
1093       negate_class = TRUE;
1094       c = *(++ptr);
1095       }
1096     else negate_class = FALSE;
1097
1098     /* Keep a count of chars so that we can optimize the case of just a single
1099     character. */
1100
1101     class_charcount = 0;
1102     class_lastchar = -1;
1103
1104     /* Initialize the 32-char bit map to all zeros. We have to build the
1105     map in a temporary bit of store, in case the class contains only 1
1106     character, because in that case the compiled code doesn't use the
1107     bit map. */
1108
1109     memset(class, 0, 32 * sizeof(uschar));
1110
1111     /* Process characters until ] is reached. By writing this as a "do" it
1112     means that an initial ] is taken as a data character. */
1113
1114     do
1115       {
1116       if (c == 0)
1117         {
1118         *errorptr = ERR6;
1119         goto FAILED;
1120         }
1121
1122       /* Handle POSIX class names. Perl allows a negation extension of the
1123       form [:^name]. A square bracket that doesn't match the syntax is
1124       treated as a literal. We also recognize the POSIX constructions
1125       [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
1126       5.6 does. */
1127
1128       if (c == '[' &&
1129           (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
1130           check_posix_syntax(ptr, &tempptr, cd))
1131         {
1132         BOOL local_negate = FALSE;
1133         int posix_class, i;
1134         register const uschar *cbits = cd->cbits;
1135
1136         if (ptr[1] != ':')
1137           {
1138           *errorptr = ERR31;
1139           goto FAILED;
1140           }
1141
1142         ptr += 2;
1143         if (*ptr == '^')
1144           {
1145           local_negate = TRUE;
1146           ptr++;
1147           }
1148
1149         posix_class = check_posix_name(ptr, tempptr - ptr);
1150         if (posix_class < 0)
1151           {
1152           *errorptr = ERR30;
1153           goto FAILED;
1154           }
1155
1156         /* If matching is caseless, upper and lower are converted to
1157         alpha. This relies on the fact that the class table starts with
1158         alpha, lower, upper as the first 3 entries. */
1159
1160         if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
1161           posix_class = 0;
1162
1163         /* Or into the map we are building up to 3 of the static class
1164         tables, or their negations. */
1165
1166         posix_class *= 3;
1167         for (i = 0; i < 3; i++)
1168           {
1169           int taboffset = posix_class_maps[posix_class + i];
1170           if (taboffset < 0) break;
1171           if (local_negate)
1172             for (c = 0; c < 32; c++) class[c] |= ~cbits[c+taboffset];
1173           else
1174             for (c = 0; c < 32; c++) class[c] |= cbits[c+taboffset];
1175           }
1176
1177         ptr = tempptr + 1;
1178         class_charcount = 10;  /* Set > 1; assumes more than 1 per class */
1179         continue;
1180         }
1181
1182       /* Backslash may introduce a single character, or it may introduce one
1183       of the specials, which just set a flag. Escaped items are checked for
1184       validity in the pre-compiling pass. The sequence \b is a special case.
1185       Inside a class (and only there) it is treated as backspace. Elsewhere
1186       it marks a word boundary. Other escapes have preset maps ready to
1187       or into the one we are building. We assume they have more than one
1188       character in them, so set class_charcount bigger than one. */
1189
1190       if (c == '\\')
1191         {
1192         c = check_escape(&ptr, errorptr, *brackets, options, TRUE, cd);
1193         if (-c == ESC_b) c = '\b';
1194         else if (c < 0)
1195           {
1196           register const uschar *cbits = cd->cbits;
1197           class_charcount = 10;
1198           switch (-c)
1199             {
1200             case ESC_d:
1201             for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_digit];
1202             continue;
1203
1204             case ESC_D:
1205             for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_digit];
1206             continue;
1207
1208             case ESC_w:
1209             for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_word];
1210             continue;
1211
1212             case ESC_W:
1213             for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_word];
1214             continue;
1215
1216             case ESC_s:
1217             for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_space];
1218             continue;
1219
1220             case ESC_S:
1221             for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_space];
1222             continue;
1223
1224             default:
1225             *errorptr = ERR7;
1226             goto FAILED;
1227             }
1228           }
1229
1230         /* Fall through if single character, but don't at present allow
1231         chars > 255 in UTF-8 mode. */
1232
1233 #ifdef SUPPORT_UTF8
1234         if (c > 255)
1235           {
1236           *errorptr = ERR33;
1237           goto FAILED;
1238           }
1239 #endif
1240         }
1241
1242       /* A single character may be followed by '-' to form a range. However,
1243       Perl does not permit ']' to be the end of the range. A '-' character
1244       here is treated as a literal. */
1245
1246       if (ptr[1] == '-' && ptr[2] != ']')
1247         {
1248         int d;
1249         ptr += 2;
1250         d = *ptr;
1251
1252         if (d == 0)
1253           {
1254           *errorptr = ERR6;
1255           goto FAILED;
1256           }
1257
1258         /* The second part of a range can be a single-character escape, but
1259         not any of the other escapes. Perl 5.6 treats a hyphen as a literal
1260         in such circumstances. */
1261
1262         if (d == '\\')
1263           {
1264           const uschar *oldptr = ptr;
1265           d = check_escape(&ptr, errorptr, *brackets, options, TRUE, cd);
1266
1267 #ifdef SUPPORT_UTF8
1268           if (d > 255)
1269             {
1270             *errorptr = ERR33;
1271             goto FAILED;
1272             }
1273 #endif
1274           /* \b is backspace; any other special means the '-' was literal */
1275
1276           if (d < 0)
1277             {
1278             if (d == -ESC_b) d = '\b'; else
1279               {
1280               ptr = oldptr - 2;
1281               goto SINGLE_CHARACTER;  /* A few lines below */
1282               }
1283             }
1284           }
1285
1286         if (d < c)
1287           {
1288           *errorptr = ERR8;
1289           goto FAILED;
1290           }
1291
1292         for (; c <= d; c++)
1293           {
1294           class[c/8] |= (1 << (c&7));
1295           if ((options & PCRE_CASELESS) != 0)
1296             {
1297             int uc = cd->fcc[c];           /* flip case */
1298             class[uc/8] |= (1 << (uc&7));
1299             }
1300           class_charcount++;                /* in case a one-char range */
1301           class_lastchar = c;
1302           }
1303         continue;   /* Go get the next char in the class */
1304         }
1305
1306       /* Handle a lone single character - we can get here for a normal
1307       non-escape char, or after \ that introduces a single character. */
1308
1309       SINGLE_CHARACTER:
1310
1311       class [c/8] |= (1 << (c&7));
1312       if ((options & PCRE_CASELESS) != 0)
1313         {
1314         c = cd->fcc[c];   /* flip case */
1315         class[c/8] |= (1 << (c&7));
1316         }
1317       class_charcount++;
1318       class_lastchar = c;
1319       }
1320
1321     /* Loop until ']' reached; the check for end of string happens inside the
1322     loop. This "while" is the end of the "do" above. */
1323
1324     while ((c = *(++ptr)) != ']');
1325
1326     /* If class_charcount is 1 and class_lastchar is not negative, we saw
1327     precisely one character. This doesn't need the whole 32-byte bit map.
1328     We turn it into a 1-character OP_CHAR if it's positive, or OP_NOT if
1329     it's negative. */
1330
1331     if (class_charcount == 1 && class_lastchar >= 0)
1332       {
1333       if (negate_class)
1334         {
1335         code[-1] = OP_NOT;
1336         }
1337       else
1338         {
1339         code[-1] = OP_CHARS;
1340         *code++ = 1;
1341         }
1342       *code++ = class_lastchar;
1343       }
1344
1345     /* Otherwise, negate the 32-byte map if necessary, and copy it into
1346     the code vector. */
1347
1348     else
1349       {
1350       if (negate_class)
1351         for (c = 0; c < 32; c++) code[c] = ~class[c];
1352       else
1353         memcpy(code, class, 32);
1354       code += 32;
1355       }
1356     break;
1357
1358     /* Various kinds of repeat */
1359
1360     case '{':
1361     if (!is_counted_repeat(ptr+1, cd)) goto NORMAL_CHAR;
1362     ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorptr, cd);
1363     if (*errorptr != NULL) goto FAILED;
1364     goto REPEAT;
1365
1366     case '*':
1367     repeat_min = 0;
1368     repeat_max = -1;
1369     goto REPEAT;
1370
1371     case '+':
1372     repeat_min = 1;
1373     repeat_max = -1;
1374     goto REPEAT;
1375
1376     case '?':
1377     repeat_min = 0;
1378     repeat_max = 1;
1379
1380     REPEAT:
1381     if (previous == NULL)
1382       {
1383       *errorptr = ERR9;
1384       goto FAILED;
1385       }
1386
1387     /* If the next character is '?' this is a minimizing repeat, by default,
1388     but if PCRE_UNGREEDY is set, it works the other way round. Advance to the
1389     next character. */
1390
1391     if (ptr[1] == '?')
1392       { repeat_type = greedy_non_default; ptr++; }
1393     else repeat_type = greedy_default;
1394
1395     /* If previous was a string of characters, chop off the last one and use it
1396     as the subject of the repeat. If there was only one character, we can
1397     abolish the previous item altogether. A repeat with a zero minimum wipes
1398     out any reqchar setting, backing up to the previous value. We must also
1399     adjust the countlits value. */
1400
1401     if (*previous == OP_CHARS)
1402       {
1403       int len = previous[1];
1404
1405       if (repeat_min == 0) *reqchar = prevreqchar;
1406       *countlits += repeat_min - 1;
1407
1408       if (len == 1)
1409         {
1410         c = previous[2];
1411         code = previous;
1412         }
1413       else
1414         {
1415         c = previous[len+1];
1416         previous[1]--;
1417         code--;
1418         }
1419       op_type = 0;                 /* Use single-char op codes */
1420       goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */
1421       }
1422
1423     /* If previous was a single negated character ([^a] or similar), we use
1424     one of the special opcodes, replacing it. The code is shared with single-
1425     character repeats by adding a suitable offset into repeat_type. */
1426
1427     else if ((int)*previous == OP_NOT)
1428       {
1429       op_type = OP_NOTSTAR - OP_STAR;  /* Use "not" opcodes */
1430       c = previous[1];
1431       code = previous;
1432       goto OUTPUT_SINGLE_REPEAT;
1433       }
1434
1435     /* If previous was a character type match (\d or similar), abolish it and
1436     create a suitable repeat item. The code is shared with single-character
1437     repeats by adding a suitable offset into repeat_type. */
1438
1439     else if ((int)*previous < OP_EODN || *previous == OP_ANY)
1440       {
1441       op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */
1442       c = *previous;
1443       code = previous;
1444
1445       OUTPUT_SINGLE_REPEAT:
1446
1447       /* If the maximum is zero then the minimum must also be zero; Perl allows
1448       this case, so we do too - by simply omitting the item altogether. */
1449
1450       if (repeat_max == 0) goto END_REPEAT;
1451
1452       /* Combine the op_type with the repeat_type */
1453
1454       repeat_type += op_type;
1455
1456       /* A minimum of zero is handled either as the special case * or ?, or as
1457       an UPTO, with the maximum given. */
1458
1459       if (repeat_min == 0)
1460         {
1461         if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
1462           else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
1463         else
1464           {
1465           *code++ = OP_UPTO + repeat_type;
1466           *code++ = repeat_max >> 8;
1467           *code++ = (repeat_max & 255);
1468           }
1469         }
1470
1471       /* The case {1,} is handled as the special case + */
1472
1473       else if (repeat_min == 1 && repeat_max == -1)
1474         *code++ = OP_PLUS + repeat_type;
1475
1476       /* The case {n,n} is just an EXACT, while the general case {n,m} is
1477       handled as an EXACT followed by an UPTO. An EXACT of 1 is optimized. */
1478
1479       else
1480         {
1481         if (repeat_min != 1)
1482           {
1483           *code++ = OP_EXACT + op_type;  /* NB EXACT doesn't have repeat_type */
1484           *code++ = repeat_min >> 8;
1485           *code++ = (repeat_min & 255);
1486           }
1487
1488         /* If the minimum is 1 and the previous item was a character string,
1489         we either have to put back the item that got cancelled if the string
1490         length was 1, or add the character back onto the end of a longer
1491         string. For a character type nothing need be done; it will just get
1492         put back naturally. Note that the final character is always going to
1493         get added below. */
1494
1495         else if (*previous == OP_CHARS)
1496           {
1497           if (code == previous) code += 2; else previous[1]++;
1498           }
1499
1500         /*  For a single negated character we also have to put back the
1501         item that got cancelled. */
1502
1503         else if (*previous == OP_NOT) code++;
1504
1505         /* If the maximum is unlimited, insert an OP_STAR. */
1506
1507         if (repeat_max < 0)
1508           {
1509           *code++ = c;
1510           *code++ = OP_STAR + repeat_type;
1511           }
1512
1513         /* Else insert an UPTO if the max is greater than the min. */
1514
1515         else if (repeat_max != repeat_min)
1516           {
1517           *code++ = c;
1518           repeat_max -= repeat_min;
1519           *code++ = OP_UPTO + repeat_type;
1520           *code++ = repeat_max >> 8;
1521           *code++ = (repeat_max & 255);
1522           }
1523         }
1524
1525       /* The character or character type itself comes last in all cases. */
1526
1527       *code++ = c;
1528       }
1529
1530     /* If previous was a character class or a back reference, we put the repeat
1531     stuff after it, but just skip the item if the repeat was {0,0}. */
1532
1533     else if (*previous == OP_CLASS || *previous == OP_REF)
1534       {
1535       if (repeat_max == 0)
1536         {
1537         code = previous;
1538         goto END_REPEAT;
1539         }
1540       if (repeat_min == 0 && repeat_max == -1)
1541         *code++ = OP_CRSTAR + repeat_type;
1542       else if (repeat_min == 1 && repeat_max == -1)
1543         *code++ = OP_CRPLUS + repeat_type;
1544       else if (repeat_min == 0 && repeat_max == 1)
1545         *code++ = OP_CRQUERY + repeat_type;
1546       else
1547         {
1548         *code++ = OP_CRRANGE + repeat_type;
1549         *code++ = repeat_min >> 8;
1550         *code++ = repeat_min & 255;
1551         if (repeat_max == -1) repeat_max = 0;  /* 2-byte encoding for max */
1552         *code++ = repeat_max >> 8;
1553         *code++ = repeat_max & 255;
1554         }
1555       }
1556
1557     /* If previous was a bracket group, we may have to replicate it in certain
1558     cases. */
1559
1560     else if ((int)*previous >= OP_BRA || (int)*previous == OP_ONCE ||
1561              (int)*previous == OP_COND)
1562       {
1563       register int i;
1564       int ketoffset = 0;
1565       int len = code - previous;
1566       uschar *bralink = NULL;
1567
1568       /* If the maximum repeat count is unlimited, find the end of the bracket
1569       by scanning through from the start, and compute the offset back to it
1570       from the current code pointer. There may be an OP_OPT setting following
1571       the final KET, so we can't find the end just by going back from the code
1572       pointer. */
1573
1574       if (repeat_max == -1)
1575         {
1576         register uschar *ket = previous;
1577         do ket += (ket[1] << 8) + ket[2]; while (*ket != OP_KET);
1578         ketoffset = code - ket;
1579         }
1580
1581       /* The case of a zero minimum is special because of the need to stick
1582       OP_BRAZERO in front of it, and because the group appears once in the
1583       data, whereas in other cases it appears the minimum number of times. For
1584       this reason, it is simplest to treat this case separately, as otherwise
1585       the code gets far too messy. There are several special subcases when the
1586       minimum is zero. */
1587
1588       if (repeat_min == 0)
1589         {
1590         /* If we set up a required char from the bracket, we must back off
1591         to the previous value and reset the countlits value too. */
1592
1593         if (subcountlits > 0)
1594           {
1595           *reqchar = prevreqchar;
1596           *countlits -= subcountlits;
1597           }
1598
1599         /* If the maximum is also zero, we just omit the group from the output
1600         altogether. */
1601
1602         if (repeat_max == 0)
1603           {
1604           code = previous;
1605           goto END_REPEAT;
1606           }
1607
1608         /* If the maximum is 1 or unlimited, we just have to stick in the
1609         BRAZERO and do no more at this point. */
1610
1611         if (repeat_max <= 1)
1612           {
1613           memmove(previous+1, previous, len);
1614           code++;
1615           *previous++ = OP_BRAZERO + repeat_type;
1616           }
1617
1618         /* If the maximum is greater than 1 and limited, we have to replicate
1619         in a nested fashion, sticking OP_BRAZERO before each set of brackets.
1620         The first one has to be handled carefully because it's the original
1621         copy, which has to be moved up. The remainder can be handled by code
1622         that is common with the non-zero minimum case below. We just have to
1623         adjust the value of repeat_max, since one less copy is required. */
1624
1625         else
1626           {
1627           int offset;
1628           memmove(previous+4, previous, len);
1629           code += 4;
1630           *previous++ = OP_BRAZERO + repeat_type;
1631           *previous++ = OP_BRA;
1632
1633           /* We chain together the bracket offset fields that have to be
1634           filled in later when the ends of the brackets are reached. */
1635
1636           offset = (bralink == NULL)? 0 : previous - bralink;
1637           bralink = previous;
1638           *previous++ = offset >> 8;
1639           *previous++ = offset & 255;
1640           }
1641
1642         repeat_max--;
1643         }
1644
1645       /* If the minimum is greater than zero, replicate the group as many
1646       times as necessary, and adjust the maximum to the number of subsequent
1647       copies that we need. */
1648
1649       else
1650         {
1651         for (i = 1; i < repeat_min; i++)
1652           {
1653           memcpy(code, previous, len);
1654           code += len;
1655           }
1656         if (repeat_max > 0) repeat_max -= repeat_min;
1657         }
1658
1659       /* This code is common to both the zero and non-zero minimum cases. If
1660       the maximum is limited, it replicates the group in a nested fashion,
1661       remembering the bracket starts on a stack. In the case of a zero minimum,
1662       the first one was set up above. In all cases the repeat_max now specifies
1663       the number of additional copies needed. */
1664
1665       if (repeat_max >= 0)
1666         {
1667         for (i = repeat_max - 1; i >= 0; i--)
1668           {
1669           *code++ = OP_BRAZERO + repeat_type;
1670
1671           /* All but the final copy start a new nesting, maintaining the
1672           chain of brackets outstanding. */
1673
1674           if (i != 0)
1675             {
1676             int offset;
1677             *code++ = OP_BRA;
1678             offset = (bralink == NULL)? 0 : code - bralink;
1679             bralink = code;
1680             *code++ = offset >> 8;
1681             *code++ = offset & 255;
1682             }
1683
1684           memcpy(code, previous, len);
1685           code += len;
1686           }
1687
1688         /* Now chain through the pending brackets, and fill in their length
1689         fields (which are holding the chain links pro tem). */
1690
1691         while (bralink != NULL)
1692           {
1693           int oldlinkoffset;
1694           int offset = code - bralink + 1;
1695           uschar *bra = code - offset;
1696           oldlinkoffset = (bra[1] << 8) + bra[2];
1697           bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
1698           *code++ = OP_KET;
1699           *code++ = bra[1] = offset >> 8;
1700           *code++ = bra[2] = (offset & 255);
1701           }
1702         }
1703
1704       /* If the maximum is unlimited, set a repeater in the final copy. We
1705       can't just offset backwards from the current code point, because we
1706       don't know if there's been an options resetting after the ket. The
1707       correct offset was computed above. */
1708
1709       else code[-ketoffset] = OP_KETRMAX + repeat_type;
1710       }
1711
1712     /* Else there's some kind of shambles */
1713
1714     else
1715       {
1716       *errorptr = ERR11;
1717       goto FAILED;
1718       }
1719
1720     /* In all cases we no longer have a previous item. */
1721
1722     END_REPEAT:
1723     previous = NULL;
1724     break;
1725
1726
1727     /* Start of nested bracket sub-expression, or comment or lookahead or
1728     lookbehind or option setting or condition. First deal with special things
1729     that can come after a bracket; all are introduced by ?, and the appearance
1730     of any of them means that this is not a referencing group. They were
1731     checked for validity in the first pass over the string, so we don't have to
1732     check for syntax errors here.  */
1733
1734     case '(':
1735     newoptions = options;
1736     skipbytes = 0;
1737
1738     if (*(++ptr) == '?')
1739       {
1740       int set, unset;
1741       int *optset;
1742
1743       switch (*(++ptr))
1744         {
1745         case '#':                 /* Comment; skip to ket */
1746         ptr++;
1747         while (*ptr != ')') ptr++;
1748         continue;
1749
1750         case ':':                 /* Non-extracting bracket */
1751         bravalue = OP_BRA;
1752         ptr++;
1753         break;
1754
1755         case '(':
1756         bravalue = OP_COND;       /* Conditional group */
1757         if ((cd->ctypes[*(++ptr)] & ctype_digit) != 0)
1758           {
1759           int condref = *ptr - '0';
1760           while (*(++ptr) != ')') condref = condref*10 + *ptr - '0';
1761           if (condref == 0)
1762             {
1763             *errorptr = ERR35;
1764             goto FAILED;
1765             }
1766           ptr++;
1767           code[3] = OP_CREF;
1768           code[4] = condref >> 8;
1769           code[5] = condref & 255;
1770           skipbytes = 3;
1771           }
1772         else ptr--;
1773         break;
1774
1775         case '=':                 /* Positive lookahead */
1776         bravalue = OP_ASSERT;
1777         ptr++;
1778         break;
1779
1780         case '!':                 /* Negative lookahead */
1781         bravalue = OP_ASSERT_NOT;
1782         ptr++;
1783         break;
1784
1785         case '<':                 /* Lookbehinds */
1786         switch (*(++ptr))
1787           {
1788           case '=':               /* Positive lookbehind */
1789           bravalue = OP_ASSERTBACK;
1790           ptr++;
1791           break;
1792
1793           case '!':               /* Negative lookbehind */
1794           bravalue = OP_ASSERTBACK_NOT;
1795           ptr++;
1796           break;
1797
1798           default:                /* Syntax error */
1799           *errorptr = ERR24;
1800           goto FAILED;
1801           }
1802         break;
1803
1804         case '>':                 /* One-time brackets */
1805         bravalue = OP_ONCE;
1806         ptr++;
1807         break;
1808
1809         case 'R':                 /* Pattern recursion */
1810         *code++ = OP_RECURSE;
1811         ptr++;
1812         continue;
1813
1814         default:                  /* Option setting */
1815         set = unset = 0;
1816         optset = &set;
1817
1818         while (*ptr != ')' && *ptr != ':')
1819           {
1820           switch (*ptr++)
1821             {
1822             case '-': optset = &unset; break;
1823
1824             case 'i': *optset |= PCRE_CASELESS; break;
1825             case 'm': *optset |= PCRE_MULTILINE; break;
1826             case 's': *optset |= PCRE_DOTALL; break;
1827             case 'x': *optset |= PCRE_EXTENDED; break;
1828             case 'U': *optset |= PCRE_UNGREEDY; break;
1829             case 'X': *optset |= PCRE_EXTRA; break;
1830
1831             default:
1832             *errorptr = ERR12;
1833             goto FAILED;
1834             }
1835           }
1836
1837         /* Set up the changed option bits, but don't change anything yet. */
1838
1839         newoptions = (options | set) & (~unset);
1840
1841         /* If the options ended with ')' this is not the start of a nested
1842         group with option changes, so the options change at this level. At top
1843         level there is nothing else to be done (the options will in fact have
1844         been set from the start of compiling as a result of the first pass) but
1845         at an inner level we must compile code to change the ims options if
1846         necessary, and pass the new setting back so that it can be put at the
1847         start of any following branches, and when this group ends, a resetting
1848         item can be compiled. */
1849
1850         if (*ptr == ')')
1851           {
1852           if ((options & PCRE_INGROUP) != 0 &&
1853               (options & PCRE_IMS) != (newoptions & PCRE_IMS))
1854             {
1855             *code++ = OP_OPT;
1856             *code++ = *optchanged = newoptions & PCRE_IMS;
1857             }
1858           options = newoptions;  /* Change options at this level */
1859           previous = NULL;       /* This item can't be repeated */
1860           continue;              /* It is complete */
1861           }
1862
1863         /* If the options ended with ':' we are heading into a nested group
1864         with possible change of options. Such groups are non-capturing and are
1865         not assertions of any kind. All we need to do is skip over the ':';
1866         the newoptions value is handled below. */
1867
1868         bravalue = OP_BRA;
1869         ptr++;
1870         }
1871       }
1872
1873     /* Else we have a referencing group; adjust the opcode. If the bracket
1874     number is greater than EXTRACT_BASIC_MAX, we set the opcode one higher, and
1875     arrange for the true number to follow later, in an OP_BRANUMBER item. */
1876
1877     else
1878       {
1879       if (++(*brackets) > EXTRACT_BASIC_MAX)
1880         {
1881         bravalue = OP_BRA + EXTRACT_BASIC_MAX + 1;
1882         code[3] = OP_BRANUMBER;
1883         code[4] = *brackets >> 8;
1884         code[5] = *brackets & 255;
1885         skipbytes = 3;
1886         }
1887       else bravalue = OP_BRA + *brackets;
1888       }
1889
1890     /* Process nested bracketed re. Assertions may not be repeated, but other
1891     kinds can be. We copy code into a non-register variable in order to be able
1892     to pass its address because some compilers complain otherwise. Pass in a
1893     new setting for the ims options if they have changed. */
1894
1895     previous = (bravalue >= OP_ONCE)? code : NULL;
1896     *code = bravalue;
1897     tempcode = code;
1898
1899     if (!compile_regex(
1900          options | PCRE_INGROUP,       /* Set for all nested groups */
1901          ((options & PCRE_IMS) != (newoptions & PCRE_IMS))?
1902            newoptions & PCRE_IMS : -1, /* Pass ims options if changed */
1903          brackets,                     /* Extracting bracket count */
1904          &tempcode,                    /* Where to put code (updated) */
1905          &ptr,                         /* Input pointer (updated) */
1906          errorptr,                     /* Where to put an error message */
1907          (bravalue == OP_ASSERTBACK ||
1908           bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
1909          skipbytes,                    /* Skip over OP_COND/OP_BRANUMBER */
1910          &subreqchar,                  /* For possible last char */
1911          &subcountlits,                /* For literal count */
1912          cd))                          /* Tables block */
1913       goto FAILED;
1914
1915     /* At the end of compiling, code is still pointing to the start of the
1916     group, while tempcode has been updated to point past the end of the group
1917     and any option resetting that may follow it. The pattern pointer (ptr)
1918     is on the bracket. */
1919
1920     /* If this is a conditional bracket, check that there are no more than
1921     two branches in the group. */
1922
1923     else if (bravalue == OP_COND)
1924       {
1925       uschar *tc = code;
1926       condcount = 0;
1927
1928       do {
1929          condcount++;
1930          tc += (tc[1] << 8) | tc[2];
1931          }
1932       while (*tc != OP_KET);
1933
1934       if (condcount > 2)
1935         {
1936         *errorptr = ERR27;
1937         goto FAILED;
1938         }
1939       }
1940
1941     /* Handle updating of the required character. If the subpattern didn't
1942     set one, leave it as it was. Otherwise, update it for normal brackets of
1943     all kinds, forward assertions, and conditions with two branches. Don't
1944     update the literal count for forward assertions, however. If the bracket
1945     is followed by a quantifier with zero repeat, we have to back off. Hence
1946     the definition of prevreqchar and subcountlits outside the main loop so
1947     that they can be accessed for the back off. */
1948
1949     if (subreqchar > 0 &&
1950          (bravalue >= OP_BRA || bravalue == OP_ONCE || bravalue == OP_ASSERT ||
1951          (bravalue == OP_COND && condcount == 2)))
1952       {
1953       prevreqchar = *reqchar;
1954       *reqchar = subreqchar;
1955       if (bravalue != OP_ASSERT) *countlits += subcountlits;
1956       }
1957
1958     /* Now update the main code pointer to the end of the group. */
1959
1960     code = tempcode;
1961
1962     /* Error if hit end of pattern */
1963
1964     if (*ptr != ')')
1965       {
1966       *errorptr = ERR14;
1967       goto FAILED;
1968       }
1969     break;
1970
1971     /* Check \ for being a real metacharacter; if not, fall through and handle
1972     it as a data character at the start of a string. Escape items are checked
1973     for validity in the pre-compiling pass. */
1974
1975     case '\\':
1976     tempptr = ptr;
1977     c = check_escape(&ptr, errorptr, *brackets, options, FALSE, cd);
1978
1979     /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
1980     are arranged to be the negation of the corresponding OP_values. For the
1981     back references, the values are ESC_REF plus the reference number. Only
1982     back references and those types that consume a character may be repeated.
1983     We can test for values between ESC_b and ESC_Z for the latter; this may
1984     have to change if any new ones are ever created. */
1985
1986     if (c < 0)
1987       {
1988       if (-c >= ESC_REF)
1989         {
1990         int number = -c - ESC_REF;
1991         previous = code;
1992         *code++ = OP_REF;
1993         *code++ = number >> 8;
1994         *code++ = number & 255;
1995         }
1996       else
1997         {
1998         previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
1999         *code++ = -c;
2000         }
2001       continue;
2002       }
2003
2004     /* Data character: reset and fall through */
2005
2006     ptr = tempptr;
2007     c = '\\';
2008
2009     /* Handle a run of data characters until a metacharacter is encountered.
2010     The first character is guaranteed not to be whitespace or # when the
2011     extended flag is set. */
2012
2013     NORMAL_CHAR:
2014     default:
2015     previous = code;
2016     *code = OP_CHARS;
2017     code += 2;
2018     length = 0;
2019
2020     do
2021       {
2022       if ((options & PCRE_EXTENDED) != 0)
2023         {
2024         if ((cd->ctypes[c] & ctype_space) != 0) continue;
2025         if (c == '#')
2026           {
2027           /* The space before the ; is to avoid a warning on a silly compiler
2028           on the Macintosh. */
2029           while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
2030           if (c == 0) break;
2031           continue;
2032           }
2033         }
2034
2035       /* Backslash may introduce a data char or a metacharacter. Escaped items
2036       are checked for validity in the pre-compiling pass. Stop the string
2037       before a metaitem. */
2038
2039       if (c == '\\')
2040         {
2041         tempptr = ptr;
2042         c = check_escape(&ptr, errorptr, *brackets, options, FALSE, cd);
2043         if (c < 0) { ptr = tempptr; break; }
2044
2045         /* If a character is > 127 in UTF-8 mode, we have to turn it into
2046         two or more characters in the UTF-8 encoding. */
2047
2048 #ifdef SUPPORT_UTF8
2049         if (c > 127 && (options & PCRE_UTF8) != 0)
2050           {
2051           uschar buffer[8];
2052           int len = ord2utf8(c, buffer);
2053           for (c = 0; c < len; c++) *code++ = buffer[c];
2054           length += len;
2055           continue;
2056           }
2057 #endif
2058         }
2059
2060       /* Ordinary character or single-char escape */
2061
2062       *code++ = c;
2063       length++;
2064       }
2065
2066     /* This "while" is the end of the "do" above. */
2067
2068     while (length < MAXLIT && (cd->ctypes[c = *(++ptr)] & ctype_meta) == 0);
2069
2070     /* Update the last character and the count of literals */
2071
2072     prevreqchar = (length > 1)? code[-2] : *reqchar;
2073     *reqchar = code[-1];
2074     *countlits += length;
2075
2076     /* Compute the length and set it in the data vector, and advance to
2077     the next state. */
2078
2079     previous[1] = length;
2080     if (length < MAXLIT) ptr--;
2081     break;
2082     }
2083   }                   /* end of big loop */
2084
2085 /* Control never reaches here by falling through, only by a goto for all the
2086 error states. Pass back the position in the pattern so that it can be displayed
2087 to the user for diagnosing the error. */
2088
2089 FAILED:
2090 *ptrptr = ptr;
2091 return FALSE;
2092 }
2093
2094
2095
2096
2097 /*************************************************
2098 *     Compile sequence of alternatives           *
2099 *************************************************/
2100
2101 /* On entry, ptr is pointing past the bracket character, but on return
2102 it points to the closing bracket, or vertical bar, or end of string.
2103 The code variable is pointing at the byte into which the BRA operator has been
2104 stored. If the ims options are changed at the start (for a (?ims: group) or
2105 during any branch, we need to insert an OP_OPT item at the start of every
2106 following branch to ensure they get set correctly at run time, and also pass
2107 the new options into every subsequent branch compile.
2108
2109 Argument:
2110   options     the option bits
2111   optchanged  new ims options to set as if (?ims) were at the start, or -1
2112                for no change
2113   brackets    -> int containing the number of extracting brackets used
2114   codeptr     -> the address of the current code pointer
2115   ptrptr      -> the address of the current pattern pointer
2116   errorptr    -> pointer to error message
2117   lookbehind  TRUE if this is a lookbehind assertion
2118   skipbytes   skip this many bytes at start (for OP_COND, OP_BRANUMBER)
2119   reqchar     -> place to put the last required character, or a negative number
2120   countlits   -> place to put the shortest literal count of any branch
2121   cd          points to the data block with tables pointers
2122
2123 Returns:      TRUE on success
2124 */
2125
2126 static BOOL
2127 compile_regex(int options, int optchanged, int *brackets, uschar **codeptr,
2128   const uschar **ptrptr, const char **errorptr, BOOL lookbehind, int skipbytes,
2129   int *reqchar, int *countlits, compile_data *cd)
2130 {
2131 const uschar *ptr = *ptrptr;
2132 uschar *code = *codeptr;
2133 uschar *last_branch = code;
2134 uschar *start_bracket = code;
2135 uschar *reverse_count = NULL;
2136 int oldoptions = options & PCRE_IMS;
2137 int branchreqchar, branchcountlits;
2138
2139 *reqchar = -1;
2140 *countlits = INT_MAX;
2141 code += 3 + skipbytes;
2142
2143 /* Loop for each alternative branch */
2144
2145 for (;;)
2146   {
2147   int length;
2148
2149   /* Handle change of options */
2150
2151   if (optchanged >= 0)
2152     {
2153     *code++ = OP_OPT;
2154     *code++ = optchanged;
2155     options = (options & ~PCRE_IMS) | optchanged;
2156     }
2157
2158   /* Set up dummy OP_REVERSE if lookbehind assertion */
2159
2160   if (lookbehind)
2161     {
2162     *code++ = OP_REVERSE;
2163     reverse_count = code;
2164     *code++ = 0;
2165     *code++ = 0;
2166     }
2167
2168   /* Now compile the branch */
2169
2170   if (!compile_branch(options, brackets, &code, &ptr, errorptr, &optchanged,
2171       &branchreqchar, &branchcountlits, cd))
2172     {
2173     *ptrptr = ptr;
2174     return FALSE;
2175     }
2176
2177   /* Fill in the length of the last branch */
2178
2179   length = code - last_branch;
2180   last_branch[1] = length >> 8;
2181   last_branch[2] = length & 255;
2182
2183   /* Save the last required character if all branches have the same; a current
2184   value of -1 means unset, while -2 means "previous branch had no last required
2185   char".  */
2186
2187   if (*reqchar != -2)
2188     {
2189     if (branchreqchar >= 0)
2190       {
2191       if (*reqchar == -1) *reqchar = branchreqchar;
2192       else if (*reqchar != branchreqchar) *reqchar = -2;
2193       }
2194     else *reqchar = -2;
2195     }
2196
2197   /* Keep the shortest literal count */
2198
2199   if (branchcountlits < *countlits) *countlits = branchcountlits;
2200   DPRINTF(("literal count = %d min=%d\n", branchcountlits, *countlits));
2201
2202   /* If lookbehind, check that this branch matches a fixed-length string,
2203   and put the length into the OP_REVERSE item. Temporarily mark the end of
2204   the branch with OP_END. */
2205
2206   if (lookbehind)
2207     {
2208     *code = OP_END;
2209     length = find_fixedlength(last_branch, options);
2210     DPRINTF(("fixed length = %d\n", length));
2211     if (length < 0)
2212       {
2213       *errorptr = ERR25;
2214       *ptrptr = ptr;
2215       return FALSE;
2216       }
2217     reverse_count[0] = (length >> 8);
2218     reverse_count[1] = length & 255;
2219     }
2220
2221   /* Reached end of expression, either ')' or end of pattern. Insert a
2222   terminating ket and the length of the whole bracketed item, and return,
2223   leaving the pointer at the terminating char. If any of the ims options
2224   were changed inside the group, compile a resetting op-code following. */
2225
2226   if (*ptr != '|')
2227     {
2228     length = code - start_bracket;
2229     *code++ = OP_KET;
2230     *code++ = length >> 8;
2231     *code++ = length & 255;
2232     if (optchanged >= 0)
2233       {
2234       *code++ = OP_OPT;
2235       *code++ = oldoptions;
2236       }
2237     *codeptr = code;
2238     *ptrptr = ptr;
2239     return TRUE;
2240     }
2241
2242   /* Another branch follows; insert an "or" node and advance the pointer. */
2243
2244   *code = OP_ALT;
2245   last_branch = code;
2246   code += 3;
2247   ptr++;
2248   }
2249 /* Control never reaches here */
2250 }
2251
2252
2253
2254
2255 /*************************************************
2256 *      Find first significant op code            *
2257 *************************************************/
2258
2259 /* This is called by several functions that scan a compiled expression looking
2260 for a fixed first character, or an anchoring op code etc. It skips over things
2261 that do not influence this. For one application, a change of caseless option is
2262 important.
2263
2264 Arguments:
2265   code       pointer to the start of the group
2266   options    pointer to external options
2267   optbit     the option bit whose changing is significant, or
2268              zero if none are
2269   optstop    TRUE to return on option change, otherwise change the options
2270                value and continue
2271
2272 Returns:     pointer to the first significant opcode
2273 */
2274
2275 static const uschar*
2276 first_significant_code(const uschar *code, int *options, int optbit,
2277   BOOL optstop)
2278 {
2279 for (;;)
2280   {
2281   switch ((int)*code)
2282     {
2283     case OP_OPT:
2284     if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
2285       {
2286       if (optstop) return code;
2287       *options = (int)code[1];
2288       }
2289     code += 2;
2290     break;
2291
2292     case OP_CREF:
2293     case OP_BRANUMBER:
2294     code += 3;
2295     break;
2296
2297     case OP_WORD_BOUNDARY:
2298     case OP_NOT_WORD_BOUNDARY:
2299     code++;
2300     break;
2301
2302     case OP_ASSERT_NOT:
2303     case OP_ASSERTBACK:
2304     case OP_ASSERTBACK_NOT:
2305     do code += (code[1] << 8) + code[2]; while (*code == OP_ALT);
2306     code += 3;
2307     break;
2308
2309     default:
2310     return code;
2311     }
2312   }
2313 /* Control never reaches here */
2314 }
2315
2316
2317
2318
2319 /*************************************************
2320 *          Check for anchored expression         *
2321 *************************************************/
2322
2323 /* Try to find out if this is an anchored regular expression. Consider each
2324 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
2325 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
2326 it's anchored. However, if this is a multiline pattern, then only OP_SOD
2327 counts, since OP_CIRC can match in the middle.
2328
2329 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
2330 because that will try the rest of the pattern at all possible matching points,
2331 so there is no point trying them again.
2332
2333 Arguments:
2334   code       points to start of expression (the bracket)
2335   options    points to the options setting
2336
2337 Returns:     TRUE or FALSE
2338 */
2339
2340 static BOOL
2341 is_anchored(register const uschar *code, int *options)
2342 {
2343 do {
2344    const uschar *scode = first_significant_code(code + 3, options,
2345      PCRE_MULTILINE, FALSE);
2346    register int op = *scode;
2347    if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
2348      { if (!is_anchored(scode, options)) return FALSE; }
2349    else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR) &&
2350             (*options & PCRE_DOTALL) != 0)
2351      { if (scode[1] != OP_ANY) return FALSE; }
2352    else if (op != OP_SOD &&
2353            ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
2354      return FALSE;
2355    code += (code[1] << 8) + code[2];
2356    }
2357 while (*code == OP_ALT);
2358 return TRUE;
2359 }
2360
2361
2362
2363 /*************************************************
2364 *         Check for starting with ^ or .*        *
2365 *************************************************/
2366
2367 /* This is called to find out if every branch starts with ^ or .* so that
2368 "first char" processing can be done to speed things up in multiline
2369 matching and for non-DOTALL patterns that start with .* (which must start at
2370 the beginning or after \n).
2371
2372 Argument:  points to start of expression (the bracket)
2373 Returns:   TRUE or FALSE
2374 */
2375
2376 static BOOL
2377 is_startline(const uschar *code)
2378 {
2379 do {
2380    const uschar *scode = first_significant_code(code + 3, NULL, 0, FALSE);
2381    register int op = *scode;
2382    if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
2383      { if (!is_startline(scode)) return FALSE; }
2384    else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
2385      { if (scode[1] != OP_ANY) return FALSE; }
2386    else if (op != OP_CIRC) return FALSE;
2387    code += (code[1] << 8) + code[2];
2388    }
2389 while (*code == OP_ALT);
2390 return TRUE;
2391 }
2392
2393
2394
2395 /*************************************************
2396 *          Check for fixed first char            *
2397 *************************************************/
2398
2399 /* Try to find out if there is a fixed first character. This is called for
2400 unanchored expressions, as it speeds up their processing quite considerably.
2401 Consider each alternative branch. If they all start with the same char, or with
2402 a bracket all of whose alternatives start with the same char (recurse ad lib),
2403 then we return that char, otherwise -1.
2404
2405 Arguments:
2406   code       points to start of expression (the bracket)
2407   options    pointer to the options (used to check casing changes)
2408
2409 Returns:     -1 or the fixed first char
2410 */
2411
2412 static int
2413 find_firstchar(const uschar *code, int *options)
2414 {
2415 register int c = -1;
2416 do {
2417    int d;
2418    const uschar *scode = first_significant_code(code + 3, options,
2419      PCRE_CASELESS, TRUE);
2420    register int op = *scode;
2421
2422    if (op >= OP_BRA) op = OP_BRA;
2423
2424    switch(op)
2425      {
2426      default:
2427      return -1;
2428
2429      case OP_BRA:
2430      case OP_ASSERT:
2431      case OP_ONCE:
2432      case OP_COND:
2433      if ((d = find_firstchar(scode, options)) < 0) return -1;
2434      if (c < 0) c = d; else if (c != d) return -1;
2435      break;
2436
2437      case OP_EXACT:       /* Fall through */
2438      scode++;
2439
2440      case OP_CHARS:       /* Fall through */
2441      scode++;
2442
2443      case OP_PLUS:
2444      case OP_MINPLUS:
2445      if (c < 0) c = scode[1]; else if (c != scode[1]) return -1;
2446      break;
2447      }
2448
2449    code += (code[1] << 8) + code[2];
2450    }
2451 while (*code == OP_ALT);
2452 return c;
2453 }
2454
2455
2456
2457
2458
2459 /*************************************************
2460 *        Compile a Regular Expression            *
2461 *************************************************/
2462
2463 /* This function takes a string and returns a pointer to a block of store
2464 holding a compiled version of the expression.
2465
2466 Arguments:
2467   pattern      the regular expression
2468   options      various option bits
2469   errorptr     pointer to pointer to error text
2470   erroroffset  ptr offset in pattern where error was detected
2471   tables       pointer to character tables or NULL
2472
2473 Returns:       pointer to compiled data block, or NULL on error,
2474                with errorptr and erroroffset set
2475 */
2476
2477 pcre *
2478 pcre_compile(const char *pattern, int options, const char **errorptr,
2479   int *erroroffset, const unsigned char *tables)
2480 {
2481 real_pcre *re;
2482 int length = 3;      /* For initial BRA plus length */
2483 int runlength;
2484 int c, reqchar, countlits;
2485 int bracount = 0;
2486 int top_backref = 0;
2487 int branch_extra = 0;
2488 int branch_newextra;
2489 unsigned int brastackptr = 0;
2490 size_t size;
2491 uschar *code;
2492 const uschar *ptr;
2493 compile_data compile_block;
2494 int brastack[BRASTACK_SIZE];
2495 uschar bralenstack[BRASTACK_SIZE];
2496
2497 #ifdef DEBUG
2498 uschar *code_base, *code_end;
2499 #endif
2500
2501 /* Can't support UTF8 unless PCRE has been compiled to include the code. */
2502
2503 #ifndef SUPPORT_UTF8
2504 if ((options & PCRE_UTF8) != 0)
2505   {
2506   *errorptr = ERR32;
2507   return NULL;
2508   }
2509 #endif
2510
2511 /* We can't pass back an error message if errorptr is NULL; I guess the best we
2512 can do is just return NULL. */
2513
2514 if (errorptr == NULL) return NULL;
2515 *errorptr = NULL;
2516
2517 /* However, we can give a message for this error */
2518
2519 if (erroroffset == NULL)
2520   {
2521   *errorptr = ERR16;
2522   return NULL;
2523   }
2524 *erroroffset = 0;
2525
2526 if ((options & ~PUBLIC_OPTIONS) != 0)
2527   {
2528   *errorptr = ERR17;
2529   return NULL;
2530   }
2531
2532 /* Set up pointers to the individual character tables */
2533
2534 if (tables == NULL) tables = pcre_default_tables;
2535 compile_block.lcc = tables + lcc_offset;
2536 compile_block.fcc = tables + fcc_offset;
2537 compile_block.cbits = tables + cbits_offset;
2538 compile_block.ctypes = tables + ctypes_offset;
2539
2540 /* Reflect pattern for debugging output */
2541
2542 DPRINTF(("------------------------------------------------------------------\n"));
2543 DPRINTF(("%s\n", pattern));
2544
2545 /* The first thing to do is to make a pass over the pattern to compute the
2546 amount of store required to hold the compiled code. This does not have to be
2547 perfect as long as errors are overestimates. At the same time we can detect any
2548 internal flag settings. Make an attempt to correct for any counted white space
2549 if an "extended" flag setting appears late in the pattern. We can't be so
2550 clever for #-comments. */
2551
2552 ptr = (const uschar *)(pattern - 1);
2553 while ((c = *(++ptr)) != 0)
2554   {
2555   int min, max;
2556   int class_charcount;
2557   int bracket_length;
2558
2559   if ((options & PCRE_EXTENDED) != 0)
2560     {
2561     if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
2562     if (c == '#')
2563       {
2564       /* The space before the ; is to avoid a warning on a silly compiler
2565       on the Macintosh. */
2566       while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
2567       continue;
2568       }
2569     }
2570
2571   switch(c)
2572     {
2573     /* A backslashed item may be an escaped "normal" character or a
2574     character type. For a "normal" character, put the pointers and
2575     character back so that tests for whitespace etc. in the input
2576     are done correctly. */
2577
2578     case '\\':
2579       {
2580       const uschar *save_ptr = ptr;
2581       c = check_escape(&ptr, errorptr, bracount, options, FALSE, &compile_block);
2582       if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2583       if (c >= 0)
2584         {
2585         ptr = save_ptr;
2586         c = '\\';
2587         goto NORMAL_CHAR;
2588         }
2589       }
2590     length++;
2591
2592     /* A back reference needs an additional 2 bytes, plus either one or 5
2593     bytes for a repeat. We also need to keep the value of the highest
2594     back reference. */
2595
2596     if (c <= -ESC_REF)
2597       {
2598       int refnum = -c - ESC_REF;
2599       if (refnum > top_backref) top_backref = refnum;
2600       length += 2;   /* For single back reference */
2601       if (ptr[1] == '{' && is_counted_repeat(ptr+2, &compile_block))
2602         {
2603         ptr = read_repeat_counts(ptr+2, &min, &max, errorptr, &compile_block);
2604         if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2605         if ((min == 0 && (max == 1 || max == -1)) ||
2606           (min == 1 && max == -1))
2607             length++;
2608         else length += 5;
2609         if (ptr[1] == '?') ptr++;
2610         }
2611       }
2612     continue;
2613
2614     case '^':
2615     case '.':
2616     case '$':
2617     case '*':     /* These repeats won't be after brackets; */
2618     case '+':     /* those are handled separately */
2619     case '?':
2620     length++;
2621     continue;
2622
2623     /* This covers the cases of repeats after a single char, metachar, class,
2624     or back reference. */
2625
2626     case '{':
2627     if (!is_counted_repeat(ptr+1, &compile_block)) goto NORMAL_CHAR;
2628     ptr = read_repeat_counts(ptr+1, &min, &max, errorptr, &compile_block);
2629     if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2630     if ((min == 0 && (max == 1 || max == -1)) ||
2631       (min == 1 && max == -1))
2632         length++;
2633     else
2634       {
2635       length--;   /* Uncount the original char or metachar */
2636       if (min == 1) length++; else if (min > 0) length += 4;
2637       if (max > 0) length += 4; else length += 2;
2638       }
2639     if (ptr[1] == '?') ptr++;
2640     continue;
2641
2642     /* An alternation contains an offset to the next branch or ket. If any ims
2643     options changed in the previous branch(es), and/or if we are in a
2644     lookbehind assertion, extra space will be needed at the start of the
2645     branch. This is handled by branch_extra. */
2646
2647     case '|':
2648     length += 3 + branch_extra;
2649     continue;
2650
2651     /* A character class uses 33 characters. Don't worry about character types
2652     that aren't allowed in classes - they'll get picked up during the compile.
2653     A character class that contains only one character uses 2 or 3 bytes,
2654     depending on whether it is negated or not. Notice this where we can. */
2655
2656     case '[':
2657     class_charcount = 0;
2658     if (*(++ptr) == '^') ptr++;
2659     do
2660       {
2661       if (*ptr == '\\')
2662         {
2663         int ch = check_escape(&ptr, errorptr, bracount, options, TRUE,
2664           &compile_block);
2665         if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2666         if (-ch == ESC_b) class_charcount++; else class_charcount = 10;
2667         }
2668       else class_charcount++;
2669       ptr++;
2670       }
2671     while (*ptr != 0 && *ptr != ']');
2672
2673     /* Repeats for negated single chars are handled by the general code */
2674
2675     if (class_charcount == 1) length += 3; else
2676       {
2677       length += 33;
2678
2679       /* A repeat needs either 1 or 5 bytes. */
2680
2681       if (*ptr != 0 && ptr[1] == '{' && is_counted_repeat(ptr+2, &compile_block))
2682         {
2683         ptr = read_repeat_counts(ptr+2, &min, &max, errorptr, &compile_block);
2684         if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2685         if ((min == 0 && (max == 1 || max == -1)) ||
2686           (min == 1 && max == -1))
2687             length++;
2688         else length += 5;
2689         if (ptr[1] == '?') ptr++;
2690         }
2691       }
2692     continue;
2693
2694     /* Brackets may be genuine groups or special things */
2695
2696     case '(':
2697     branch_newextra = 0;
2698     bracket_length = 3;
2699
2700     /* Handle special forms of bracket, which all start (? */
2701
2702     if (ptr[1] == '?')
2703       {
2704       int set, unset;
2705       int *optset;
2706
2707       switch (c = ptr[2])
2708         {
2709         /* Skip over comments entirely */
2710         case '#':
2711         ptr += 3;
2712         while (*ptr != 0 && *ptr != ')') ptr++;
2713         if (*ptr == 0)
2714           {
2715           *errorptr = ERR18;
2716           goto PCRE_ERROR_RETURN;
2717           }
2718         continue;
2719
2720         /* Non-referencing groups and lookaheads just move the pointer on, and
2721         then behave like a non-special bracket, except that they don't increment
2722         the count of extracting brackets. Ditto for the "once only" bracket,
2723         which is in Perl from version 5.005. */
2724
2725         case ':':
2726         case '=':
2727         case '!':
2728         case '>':
2729         ptr += 2;
2730         break;
2731
2732         /* A recursive call to the regex is an extension, to provide the
2733         facility which can be obtained by $(?p{perl-code}) in Perl 5.6. */
2734
2735         case 'R':
2736         if (ptr[3] != ')')
2737           {
2738           *errorptr = ERR29;
2739           goto PCRE_ERROR_RETURN;
2740           }
2741         ptr += 3;
2742         length += 1;
2743         break;
2744
2745         /* Lookbehinds are in Perl from version 5.005 */
2746
2747         case '<':
2748         if (ptr[3] == '=' || ptr[3] == '!')
2749           {
2750           ptr += 3;
2751           branch_newextra = 3;
2752           length += 3;         /* For the first branch */
2753           break;
2754           }
2755         *errorptr = ERR24;
2756         goto PCRE_ERROR_RETURN;
2757
2758         /* Conditionals are in Perl from version 5.005. The bracket must either
2759         be followed by a number (for bracket reference) or by an assertion
2760         group. */
2761
2762         case '(':
2763         if ((compile_block.ctypes[ptr[3]] & ctype_digit) != 0)
2764           {
2765           ptr += 4;
2766           length += 3;
2767           while ((compile_block.ctypes[*ptr] & ctype_digit) != 0) ptr++;
2768           if (*ptr != ')')
2769             {
2770             *errorptr = ERR26;
2771             goto PCRE_ERROR_RETURN;
2772             }
2773           }
2774         else   /* An assertion must follow */
2775           {
2776           ptr++;   /* Can treat like ':' as far as spacing is concerned */
2777           if (ptr[2] != '?' ||
2778              (ptr[3] != '=' && ptr[3] != '!' && ptr[3] != '<') )
2779             {
2780             ptr += 2;    /* To get right offset in message */
2781             *errorptr = ERR28;
2782             goto PCRE_ERROR_RETURN;
2783             }
2784           }
2785         break;
2786
2787         /* Else loop checking valid options until ) is met. Anything else is an
2788         error. If we are without any brackets, i.e. at top level, the settings
2789         act as if specified in the options, so massage the options immediately.
2790         This is for backward compatibility with Perl 5.004. */
2791
2792         default:
2793         set = unset = 0;
2794         optset = &set;
2795         ptr += 2;
2796
2797         for (;; ptr++)
2798           {
2799           c = *ptr;
2800           switch (c)
2801             {
2802             case 'i':
2803             *optset |= PCRE_CASELESS;
2804             continue;
2805
2806             case 'm':
2807             *optset |= PCRE_MULTILINE;
2808             continue;
2809
2810             case 's':
2811             *optset |= PCRE_DOTALL;
2812             continue;
2813
2814             case 'x':
2815             *optset |= PCRE_EXTENDED;
2816             continue;
2817
2818             case 'X':
2819             *optset |= PCRE_EXTRA;
2820             continue;
2821
2822             case 'U':
2823             *optset |= PCRE_UNGREEDY;
2824             continue;
2825
2826             case '-':
2827             optset = &unset;
2828             continue;
2829
2830             /* A termination by ')' indicates an options-setting-only item;
2831             this is global at top level; otherwise nothing is done here and
2832             it is handled during the compiling process on a per-bracket-group
2833             basis. */
2834
2835             case ')':
2836             if (brastackptr == 0)
2837               {
2838               options = (options | set) & (~unset);
2839               set = unset = 0;     /* To save length */
2840               }
2841             /* Fall through */
2842
2843             /* A termination by ':' indicates the start of a nested group with
2844             the given options set. This is again handled at compile time, but
2845             we must allow for compiled space if any of the ims options are
2846             set. We also have to allow for resetting space at the end of
2847             the group, which is why 4 is added to the length and not just 2.
2848             If there are several changes of options within the same group, this
2849             will lead to an over-estimate on the length, but this shouldn't
2850             matter very much. We also have to allow for resetting options at
2851             the start of any alternations, which we do by setting
2852             branch_newextra to 2. Finally, we record whether the case-dependent
2853             flag ever changes within the regex. This is used by the "required
2854             character" code. */
2855
2856             case ':':
2857             if (((set|unset) & PCRE_IMS) != 0)
2858               {
2859               length += 4;
2860               branch_newextra = 2;
2861               if (((set|unset) & PCRE_CASELESS) != 0) options |= PCRE_ICHANGED;
2862               }
2863             goto END_OPTIONS;
2864
2865             /* Unrecognized option character */
2866
2867             default:
2868             *errorptr = ERR12;
2869             goto PCRE_ERROR_RETURN;
2870             }
2871           }
2872
2873         /* If we hit a closing bracket, that's it - this is a freestanding
2874         option-setting. We need to ensure that branch_extra is updated if
2875         necessary. The only values branch_newextra can have here are 0 or 2.
2876         If the value is 2, then branch_extra must either be 2 or 5, depending
2877         on whether this is a lookbehind group or not. */
2878
2879         END_OPTIONS:
2880         if (c == ')')
2881           {
2882           if (branch_newextra == 2 && (branch_extra == 0 || branch_extra == 3))
2883             branch_extra += branch_newextra;
2884           continue;
2885           }
2886
2887         /* If options were terminated by ':' control comes here. Fall through
2888         to handle the group below. */
2889         }
2890       }
2891
2892     /* Extracting brackets must be counted so we can process escapes in a
2893     Perlish way. If the number exceeds EXTRACT_BASIC_MAX we are going to
2894     need an additional 3 bytes of store per extracting bracket. */
2895
2896     else
2897       {
2898       bracount++;
2899       if (bracount > EXTRACT_BASIC_MAX) bracket_length += 3;
2900       }
2901
2902     /* Save length for computing whole length at end if there's a repeat that
2903     requires duplication of the group. Also save the current value of
2904     branch_extra, and start the new group with the new value. If non-zero, this
2905     will either be 2 for a (?imsx: group, or 3 for a lookbehind assertion. */
2906
2907     if (brastackptr >= sizeof(brastack)/sizeof(int))
2908       {
2909       *errorptr = ERR19;
2910       goto PCRE_ERROR_RETURN;
2911       }
2912
2913     bralenstack[brastackptr] = branch_extra;
2914     branch_extra = branch_newextra;
2915
2916     brastack[brastackptr++] = length;
2917     length += bracket_length;
2918     continue;
2919
2920     /* Handle ket. Look for subsequent max/min; for certain sets of values we
2921     have to replicate this bracket up to that many times. If brastackptr is
2922     0 this is an unmatched bracket which will generate an error, but take care
2923     not to try to access brastack[-1] when computing the length and restoring
2924     the branch_extra value. */
2925
2926     case ')':
2927     length += 3;
2928       {
2929       int minval = 1;
2930       int maxval = 1;
2931       int duplength;
2932
2933       if (brastackptr > 0)
2934         {
2935         duplength = length - brastack[--brastackptr];
2936         branch_extra = bralenstack[brastackptr];
2937         }
2938       else duplength = 0;
2939
2940       /* Leave ptr at the final char; for read_repeat_counts this happens
2941       automatically; for the others we need an increment. */
2942
2943       if ((c = ptr[1]) == '{' && is_counted_repeat(ptr+2, &compile_block))
2944         {
2945         ptr = read_repeat_counts(ptr+2, &minval, &maxval, errorptr,
2946           &compile_block);
2947         if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2948         }
2949       else if (c == '*') { minval = 0; maxval = -1; ptr++; }
2950       else if (c == '+') { maxval = -1; ptr++; }
2951       else if (c == '?') { minval = 0; ptr++; }
2952
2953       /* If the minimum is zero, we have to allow for an OP_BRAZERO before the
2954       group, and if the maximum is greater than zero, we have to replicate
2955       maxval-1 times; each replication acquires an OP_BRAZERO plus a nesting
2956       bracket set - hence the 7. */
2957
2958       if (minval == 0)
2959         {
2960         length++;
2961         if (maxval > 0) length += (maxval - 1) * (duplength + 7);
2962         }
2963
2964       /* When the minimum is greater than zero, we have to replicate up to
2965       minval-1 times, with no additions required in the copies. Then, if
2966       there is a limited maximum we have to replicate up to maxval-1 times
2967       allowing for a BRAZERO item before each optional copy and nesting
2968       brackets for all but one of the optional copies. */
2969
2970       else
2971         {
2972         length += (minval - 1) * duplength;
2973         if (maxval > minval)   /* Need this test as maxval=-1 means no limit */
2974           length += (maxval - minval) * (duplength + 7) - 6;
2975         }
2976       }
2977     continue;
2978
2979     /* Non-special character. For a run of such characters the length required
2980     is the number of characters + 2, except that the maximum run length is 255.
2981     We won't get a skipped space or a non-data escape or the start of a #
2982     comment as the first character, so the length can't be zero. */
2983
2984     NORMAL_CHAR:
2985     default:
2986     length += 2;
2987     runlength = 0;
2988     do
2989       {
2990       if ((options & PCRE_EXTENDED) != 0)
2991         {
2992         if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
2993         if (c == '#')
2994           {
2995           /* The space before the ; is to avoid a warning on a silly compiler
2996           on the Macintosh. */
2997           while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
2998           continue;
2999           }
3000         }
3001
3002       /* Backslash may introduce a data char or a metacharacter; stop the
3003       string before the latter. */
3004
3005       if (c == '\\')
3006         {
3007         const uschar *saveptr = ptr;
3008         c = check_escape(&ptr, errorptr, bracount, options, FALSE,
3009           &compile_block);
3010         if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
3011         if (c < 0) { ptr = saveptr; break; }
3012
3013 #ifdef SUPPORT_UTF8
3014         if (c > 127 && (options & PCRE_UTF8) != 0)
3015           {
3016           int i;
3017           for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++)
3018             if (c <= utf8_table1[i]) break;
3019           runlength += i;
3020           }
3021 #endif
3022         }
3023
3024       /* Ordinary character or single-char escape */
3025
3026       runlength++;
3027       }
3028
3029     /* This "while" is the end of the "do" above. */
3030
3031     while (runlength < MAXLIT &&
3032       (compile_block.ctypes[c = *(++ptr)] & ctype_meta) == 0);
3033
3034     ptr--;
3035     length += runlength;
3036     continue;
3037     }
3038   }
3039
3040 length += 4;    /* For final KET and END */
3041
3042 if (length > 65539)
3043   {
3044   *errorptr = ERR20;
3045   return NULL;
3046   }
3047
3048 /* Compute the size of data block needed and get it, either from malloc or
3049 externally provided function. We specify "code[0]" in the offsetof() expression
3050 rather than just "code", because it has been reported that one broken compiler
3051 fails on "code" because it is also an independent variable. It should make no
3052 difference to the value of the offsetof(). */
3053
3054 size = length + offsetof(real_pcre, code[0]);
3055 re = (real_pcre *)(pcre_malloc)(size);
3056
3057 if (re == NULL)
3058   {
3059   *errorptr = ERR21;
3060   return NULL;
3061   }
3062
3063 /* Put in the magic number, and save the size, options, and table pointer */
3064
3065 re->magic_number = MAGIC_NUMBER;
3066 re->size = size;
3067 re->options = options;
3068 re->tables = tables;
3069
3070 /* Set up a starting, non-extracting bracket, then compile the expression. On
3071 error, *errorptr will be set non-NULL, so we don't need to look at the result
3072 of the function here. */
3073
3074 ptr = (const uschar *)pattern;
3075 code = re->code;
3076 *code = OP_BRA;
3077 bracount = 0;
3078 (void)compile_regex(options, -1, &bracount, &code, &ptr, errorptr, FALSE, 0,
3079   &reqchar, &countlits, &compile_block);
3080 re->top_bracket = bracount;
3081 re->top_backref = top_backref;
3082
3083 /* If not reached end of pattern on success, there's an excess bracket. */
3084
3085 if (*errorptr == NULL && *ptr != 0) *errorptr = ERR22;
3086
3087 /* Fill in the terminating state and check for disastrous overflow, but
3088 if debugging, leave the test till after things are printed out. */
3089
3090 *code++ = OP_END;
3091
3092 #ifndef DEBUG
3093 if (code - re->code > length) *errorptr = ERR23;
3094 #endif
3095
3096 /* Give an error if there's a back reference to a non-existent capturing
3097 subpattern. */
3098
3099 if (top_backref > re->top_bracket) *errorptr = ERR15;
3100
3101 /* Failed to compile */
3102
3103 if (*errorptr != NULL)
3104   {
3105   (pcre_free)(re);
3106   PCRE_ERROR_RETURN:
3107   *erroroffset = ptr - (const uschar *)pattern;
3108   return NULL;
3109   }
3110
3111 /* If the anchored option was not passed, set flag if we can determine that the
3112 pattern is anchored by virtue of ^ characters or \A or anything else (such as
3113 starting with .* when DOTALL is set).
3114
3115 Otherwise, see if we can determine what the first character has to be, because
3116 that speeds up unanchored matches no end. If not, see if we can set the
3117 PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
3118 start with ^, and also when all branches start with .* for non-DOTALL matches.
3119 */
3120
3121 if ((options & PCRE_ANCHORED) == 0)
3122   {
3123   int temp_options = options;
3124   if (is_anchored(re->code, &temp_options))
3125     re->options |= PCRE_ANCHORED;
3126   else
3127     {
3128     int ch = find_firstchar(re->code, &temp_options);
3129     if (ch >= 0)
3130       {
3131       re->first_char = ch;
3132       re->options |= PCRE_FIRSTSET;
3133       }
3134     else if (is_startline(re->code))
3135       re->options |= PCRE_STARTLINE;
3136     }
3137   }
3138
3139 /* Save the last required character if there are at least two literal
3140 characters on all paths, or if there is no first character setting. */
3141
3142 if (reqchar >= 0 && (countlits > 1 || (re->options & PCRE_FIRSTSET) == 0))
3143   {
3144   re->req_char = reqchar;
3145   re->options |= PCRE_REQCHSET;
3146   }
3147
3148 /* Print out the compiled data for debugging */
3149
3150 #ifdef DEBUG
3151
3152 printf("Length = %d top_bracket = %d top_backref = %d\n",
3153   length, re->top_bracket, re->top_backref);
3154
3155 if (re->options != 0)
3156   {
3157   printf("%s%s%s%s%s%s%s%s%s\n",
3158     ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
3159     ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
3160     ((re->options & PCRE_ICHANGED) != 0)? "case state changed " : "",
3161     ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
3162     ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
3163     ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
3164     ((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",
3165     ((re->options & PCRE_EXTRA) != 0)? "extra " : "",
3166     ((re->options & PCRE_UNGREEDY) != 0)? "ungreedy " : "");
3167   }
3168
3169 if ((re->options & PCRE_FIRSTSET) != 0)
3170   {
3171   if (isprint(re->first_char)) printf("First char = %c\n", re->first_char);
3172     else printf("First char = \\x%02x\n", re->first_char);
3173   }
3174
3175 if ((re->options & PCRE_REQCHSET) != 0)
3176   {
3177   if (isprint(re->req_char)) printf("Req char = %c\n", re->req_char);
3178     else printf("Req char = \\x%02x\n", re->req_char);
3179   }
3180
3181 code_end = code;
3182 code_base = code = re->code;
3183
3184 while (code < code_end)
3185   {
3186   int charlength;
3187
3188   printf("%3d ", code - code_base);
3189
3190   if (*code >= OP_BRA)
3191     {
3192     if (*code - OP_BRA > EXTRACT_BASIC_MAX)
3193       printf("%3d Bra extra", (code[1] << 8) + code[2]);
3194     else
3195       printf("%3d Bra %d", (code[1] << 8) + code[2], *code - OP_BRA);
3196     code += 2;
3197     }
3198
3199   else switch(*code)
3200     {
3201     case OP_OPT:
3202     printf(" %.2x %s", code[1], OP_names[*code]);
3203     code++;
3204     break;
3205
3206     case OP_CHARS:
3207     charlength = *(++code);
3208     printf("%3d ", charlength);
3209     while (charlength-- > 0)
3210       if (isprint(c = *(++code))) printf("%c", c); else printf("\\x%02x", c);
3211     break;
3212
3213     case OP_KETRMAX:
3214     case OP_KETRMIN:
3215     case OP_ALT:
3216     case OP_KET:
3217     case OP_ASSERT:
3218     case OP_ASSERT_NOT:
3219     case OP_ASSERTBACK:
3220     case OP_ASSERTBACK_NOT:
3221     case OP_ONCE:
3222     case OP_REVERSE:
3223     case OP_BRANUMBER:
3224     case OP_COND:
3225     case OP_CREF:
3226     printf("%3d %s", (code[1] << 8) + code[2], OP_names[*code]);
3227     code += 2;
3228     break;
3229
3230     case OP_STAR:
3231     case OP_MINSTAR:
3232     case OP_PLUS:
3233     case OP_MINPLUS:
3234     case OP_QUERY:
3235     case OP_MINQUERY:
3236     case OP_TYPESTAR:
3237     case OP_TYPEMINSTAR:
3238     case OP_TYPEPLUS:
3239     case OP_TYPEMINPLUS:
3240     case OP_TYPEQUERY:
3241     case OP_TYPEMINQUERY:
3242     if (*code >= OP_TYPESTAR)
3243       printf("    %s", OP_names[code[1]]);
3244     else if (isprint(c = code[1])) printf("    %c", c);
3245       else printf("    \\x%02x", c);
3246     printf("%s", OP_names[*code++]);
3247     break;
3248
3249     case OP_EXACT:
3250     case OP_UPTO:
3251     case OP_MINUPTO:
3252     if (isprint(c = code[3])) printf("    %c{", c);
3253       else printf("    \\x%02x{", c);
3254     if (*code != OP_EXACT) printf("0,");
3255     printf("%d}", (code[1] << 8) + code[2]);
3256     if (*code == OP_MINUPTO) printf("?");
3257     code += 3;
3258     break;
3259
3260     case OP_TYPEEXACT:
3261     case OP_TYPEUPTO:
3262     case OP_TYPEMINUPTO:
3263     printf("    %s{", OP_names[code[3]]);
3264     if (*code != OP_TYPEEXACT) printf(",");
3265     printf("%d}", (code[1] << 8) + code[2]);
3266     if (*code == OP_TYPEMINUPTO) printf("?");
3267     code += 3;
3268     break;
3269
3270     case OP_NOT:
3271     if (isprint(c = *(++code))) printf("    [^%c]", c);
3272       else printf("    [^\\x%02x]", c);
3273     break;
3274
3275     case OP_NOTSTAR:
3276     case OP_NOTMINSTAR:
3277     case OP_NOTPLUS:
3278     case OP_NOTMINPLUS:
3279     case OP_NOTQUERY:
3280     case OP_NOTMINQUERY:
3281     if (isprint(c = code[1])) printf("    [^%c]", c);
3282       else printf("    [^\\x%02x]", c);
3283     printf("%s", OP_names[*code++]);
3284     break;
3285
3286     case OP_NOTEXACT:
3287     case OP_NOTUPTO:
3288     case OP_NOTMINUPTO:
3289     if (isprint(c = code[3])) printf("    [^%c]{", c);
3290       else printf("    [^\\x%02x]{", c);
3291     if (*code != OP_NOTEXACT) printf(",");
3292     printf("%d}", (code[1] << 8) + code[2]);
3293     if (*code == OP_NOTMINUPTO) printf("?");
3294     code += 3;
3295     break;
3296
3297     case OP_REF:
3298     printf("    \\%d", (code[1] << 8) | code[2]);
3299     code += 3;
3300     goto CLASS_REF_REPEAT;
3301
3302     case OP_CLASS:
3303       {
3304       int i, min, max;
3305       code++;
3306       printf("    [");
3307
3308       for (i = 0; i < 256; i++)
3309         {
3310         if ((code[i/8] & (1 << (i&7))) != 0)
3311           {
3312           int j;
3313           for (j = i+1; j < 256; j++)
3314             if ((code[j/8] & (1 << (j&7))) == 0) break;
3315           if (i == '-' || i == ']') printf("\\");
3316           if (isprint(i)) printf("%c", i); else printf("\\x%02x", i);
3317           if (--j > i)
3318             {
3319             printf("-");
3320             if (j == '-' || j == ']') printf("\\");
3321             if (isprint(j)) printf("%c", j); else printf("\\x%02x", j);
3322             }
3323           i = j;
3324           }
3325         }
3326       printf("]");
3327       code += 32;
3328
3329       CLASS_REF_REPEAT:
3330
3331       switch(*code)
3332         {
3333         case OP_CRSTAR:
3334         case OP_CRMINSTAR:
3335         case OP_CRPLUS:
3336         case OP_CRMINPLUS:
3337         case OP_CRQUERY:
3338         case OP_CRMINQUERY:
3339         printf("%s", OP_names[*code]);
3340         break;
3341
3342         case OP_CRRANGE:
3343         case OP_CRMINRANGE:
3344         min = (code[1] << 8) + code[2];
3345         max = (code[3] << 8) + code[4];
3346         if (max == 0) printf("{%d,}", min);
3347         else printf("{%d,%d}", min, max);
3348         if (*code == OP_CRMINRANGE) printf("?");
3349         code += 4;
3350         break;
3351
3352         default:
3353         code--;
3354         }
3355       }
3356     break;
3357
3358     /* Anything else is just a one-node item */
3359
3360     default:
3361     printf("    %s", OP_names[*code]);
3362     break;
3363     }
3364
3365   code++;
3366   printf("\n");
3367   }
3368 printf("------------------------------------------------------------------\n");
3369
3370 /* This check is done here in the debugging case so that the code that
3371 was compiled can be seen. */
3372
3373 if (code - re->code > length)
3374   {
3375   *errorptr = ERR23;
3376   (pcre_free)(re);
3377   *erroroffset = ptr - (uschar *)pattern;
3378   return NULL;
3379   }
3380 #endif
3381
3382 return (pcre *)re;
3383 }
3384
3385
3386
3387 /*************************************************
3388 *          Match a back-reference                *
3389 *************************************************/
3390
3391 /* If a back reference hasn't been set, the length that is passed is greater
3392 than the number of characters left in the string, so the match fails.
3393
3394 Arguments:
3395   offset      index into the offset vector
3396   eptr        points into the subject
3397   length      length to be matched
3398   md          points to match data block
3399   ims         the ims flags
3400
3401 Returns:      TRUE if matched
3402 */
3403
3404 static BOOL
3405 match_ref(int offset, register const uschar *eptr, int length, match_data *md,
3406   unsigned long int ims)
3407 {
3408 const uschar *p = md->start_subject + md->offset_vector[offset];
3409
3410 #ifdef DEBUG
3411 if (eptr >= md->end_subject)
3412   printf("matching subject <null>");
3413 else
3414   {
3415   printf("matching subject ");
3416   pchars(eptr, length, TRUE, md);
3417   }
3418 printf(" against backref ");
3419 pchars(p, length, FALSE, md);
3420 printf("\n");
3421 #endif
3422
3423 /* Always fail if not enough characters left */
3424
3425 if (length > md->end_subject - eptr) return FALSE;
3426
3427 /* Separate the caselesss case for speed */
3428
3429 if ((ims & PCRE_CASELESS) != 0)
3430   {
3431   while (length-- > 0)
3432     if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE;
3433   }
3434 else
3435   { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
3436
3437 return TRUE;
3438 }
3439
3440
3441
3442 /*************************************************
3443 *         Match from current position            *
3444 *************************************************/
3445
3446 /* On entry ecode points to the first opcode, and eptr to the first character
3447 in the subject string, while eptrb holds the value of eptr at the start of the
3448 last bracketed group - used for breaking infinite loops matching zero-length
3449 strings.
3450
3451 Arguments:
3452    eptr        pointer in subject
3453    ecode       position in code
3454    offset_top  current top pointer
3455    md          pointer to "static" info for the match
3456    ims         current /i, /m, and /s options
3457    eptrb       pointer to chain of blocks containing eptr at start of
3458                  brackets - for testing for empty matches
3459    flags       can contain
3460                  match_condassert - this is an assertion condition
3461                  match_isgroup - this is the start of a bracketed group
3462
3463 Returns:       TRUE if matched
3464 */
3465
3466 static BOOL
3467 match(register const uschar *eptr, register const uschar *ecode,
3468   int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,
3469   int flags)
3470 {
3471 unsigned long int original_ims = ims;   /* Save for resetting on ')' */
3472 eptrblock newptrb;
3473
3474 /* At the start of a bracketed group, add the current subject pointer to the
3475 stack of such pointers, to be re-instated at the end of the group when we hit
3476 the closing ket. When match() is called in other circumstances, we don't add to
3477 the stack. */
3478
3479 if ((flags & match_isgroup) != 0)
3480   {
3481   newptrb.prev = eptrb;
3482   newptrb.saved_eptr = eptr;
3483   eptrb = &newptrb;
3484   }
3485
3486 /* Now start processing the operations. */
3487
3488 for (;;)
3489   {
3490   int op = (int)*ecode;
3491   int min, max, ctype;
3492   register int i;
3493   register int c;
3494   BOOL minimize = FALSE;
3495
3496   /* Opening capturing bracket. If there is space in the offset vector, save
3497   the current subject position in the working slot at the top of the vector. We
3498   mustn't change the current values of the data slot, because they may be set
3499   from a previous iteration of this group, and be referred to by a reference
3500   inside the group.
3501
3502   If the bracket fails to match, we need to restore this value and also the
3503   values of the final offsets, in case they were set by a previous iteration of
3504   the same bracket.
3505
3506   If there isn't enough space in the offset vector, treat this as if it were a
3507   non-capturing bracket. Don't worry about setting the flag for the error case
3508   here; that is handled in the code for KET. */
3509
3510   if (op > OP_BRA)
3511     {
3512     int offset;
3513     int number = op - OP_BRA;
3514
3515     /* For extended extraction brackets (large number), we have to fish out the
3516     number from a dummy opcode at the start. */
3517
3518     if (number > EXTRACT_BASIC_MAX) number = (ecode[4] << 8) | ecode[5];
3519     offset = number << 1;
3520
3521 #ifdef DEBUG
3522     printf("start bracket %d subject=", number);
3523     pchars(eptr, 16, TRUE, md);
3524     printf("\n");
3525 #endif
3526
3527     if (offset < md->offset_max)
3528       {
3529       int save_offset1 = md->offset_vector[offset];
3530       int save_offset2 = md->offset_vector[offset+1];
3531       int save_offset3 = md->offset_vector[md->offset_end - number];
3532
3533       DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
3534       md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
3535
3536       do
3537         {
3538         if (match(eptr, ecode+3, offset_top, md, ims, eptrb, match_isgroup))
3539           return TRUE;
3540         ecode += (ecode[1] << 8) + ecode[2];
3541         }
3542       while (*ecode == OP_ALT);
3543
3544       DPRINTF(("bracket %d failed\n", number));
3545
3546       md->offset_vector[offset] = save_offset1;
3547       md->offset_vector[offset+1] = save_offset2;
3548       md->offset_vector[md->offset_end - number] = save_offset3;
3549
3550       return FALSE;
3551       }
3552
3553     /* Insufficient room for saving captured contents */
3554
3555     else op = OP_BRA;
3556     }
3557
3558   /* Other types of node can be handled by a switch */
3559
3560   switch(op)
3561     {
3562     case OP_BRA:     /* Non-capturing bracket: optimized */
3563     DPRINTF(("start bracket 0\n"));
3564     do
3565       {
3566       if (match(eptr, ecode+3, offset_top, md, ims, eptrb, match_isgroup))
3567         return TRUE;
3568       ecode += (ecode[1] << 8) + ecode[2];
3569       }
3570     while (*ecode == OP_ALT);
3571     DPRINTF(("bracket 0 failed\n"));
3572     return FALSE;
3573
3574     /* Conditional group: compilation checked that there are no more than
3575     two branches. If the condition is false, skipping the first branch takes us
3576     past the end if there is only one branch, but that's OK because that is
3577     exactly what going to the ket would do. */
3578
3579     case OP_COND:
3580     if (ecode[3] == OP_CREF)         /* Condition is extraction test */
3581       {
3582       int offset = (ecode[4] << 9) | (ecode[5] << 1); /* Doubled ref number */
3583       return match(eptr,
3584         ecode + ((offset < offset_top && md->offset_vector[offset] >= 0)?
3585           6 : 3 + (ecode[1] << 8) + ecode[2]),
3586         offset_top, md, ims, eptrb, match_isgroup);
3587       }
3588
3589     /* The condition is an assertion. Call match() to evaluate it - setting
3590     the match_condassert flag causes it to stop at the end of an assertion. */
3591
3592     else
3593       {
3594       if (match(eptr, ecode+3, offset_top, md, ims, NULL,
3595           match_condassert | match_isgroup))
3596         {
3597         ecode += 3 + (ecode[4] << 8) + ecode[5];
3598         while (*ecode == OP_ALT) ecode += (ecode[1] << 8) + ecode[2];
3599         }
3600       else ecode += (ecode[1] << 8) + ecode[2];
3601       return match(eptr, ecode+3, offset_top, md, ims, eptrb, match_isgroup);
3602       }
3603     /* Control never reaches here */
3604
3605     /* Skip over conditional reference or large extraction number data if
3606     encountered. */
3607
3608     case OP_CREF:
3609     case OP_BRANUMBER:
3610     ecode += 3;
3611     break;
3612
3613     /* End of the pattern. If PCRE_NOTEMPTY is set, fail if we have matched
3614     an empty string - recursion will then try other alternatives, if any. */
3615
3616     case OP_END:
3617     if (md->notempty && eptr == md->start_match) return FALSE;
3618     md->end_match_ptr = eptr;          /* Record where we ended */
3619     md->end_offset_top = offset_top;   /* and how many extracts were taken */
3620     return TRUE;
3621
3622     /* Change option settings */
3623
3624     case OP_OPT:
3625     ims = ecode[1];
3626     ecode += 2;
3627     DPRINTF(("ims set to %02lx\n", ims));
3628     break;
3629
3630     /* Assertion brackets. Check the alternative branches in turn - the
3631     matching won't pass the KET for an assertion. If any one branch matches,
3632     the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
3633     start of each branch to move the current point backwards, so the code at
3634     this level is identical to the lookahead case. */
3635
3636     case OP_ASSERT:
3637     case OP_ASSERTBACK:
3638     do
3639       {
3640       if (match(eptr, ecode+3, offset_top, md, ims, NULL, match_isgroup)) break;
3641       ecode += (ecode[1] << 8) + ecode[2];
3642       }
3643     while (*ecode == OP_ALT);
3644     if (*ecode == OP_KET) return FALSE;
3645
3646     /* If checking an assertion for a condition, return TRUE. */
3647
3648     if ((flags & match_condassert) != 0) return TRUE;
3649
3650     /* Continue from after the assertion, updating the offsets high water
3651     mark, since extracts may have been taken during the assertion. */
3652
3653     do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3654     ecode += 3;
3655     offset_top = md->end_offset_top;
3656     continue;
3657
3658     /* Negative assertion: all branches must fail to match */
3659
3660     case OP_ASSERT_NOT:
3661     case OP_ASSERTBACK_NOT:
3662     do
3663       {
3664       if (match(eptr, ecode+3, offset_top, md, ims, NULL, match_isgroup))
3665         return FALSE;
3666       ecode += (ecode[1] << 8) + ecode[2];
3667       }
3668     while (*ecode == OP_ALT);
3669
3670     if ((flags & match_condassert) != 0) return TRUE;
3671
3672     ecode += 3;
3673     continue;
3674
3675     /* Move the subject pointer back. This occurs only at the start of
3676     each branch of a lookbehind assertion. If we are too close to the start to
3677     move back, this match function fails. When working with UTF-8 we move
3678     back a number of characters, not bytes. */
3679
3680     case OP_REVERSE:
3681 #ifdef SUPPORT_UTF8
3682     c = (ecode[1] << 8) + ecode[2];
3683     for (i = 0; i < c; i++)
3684       {
3685       eptr--;
3686       BACKCHAR(eptr)
3687       }
3688 #else
3689     eptr -= (ecode[1] << 8) + ecode[2];
3690 #endif
3691
3692     if (eptr < md->start_subject) return FALSE;
3693     ecode += 3;
3694     break;
3695
3696     /* Recursion matches the current regex, nested. If there are any capturing
3697     brackets started but not finished, we have to save their starting points
3698     and reinstate them after the recursion. However, we don't know how many
3699     such there are (offset_top records the completed total) so we just have
3700     to save all the potential data. There may be up to 99 such values, which
3701     is a bit large to put on the stack, but using malloc for small numbers
3702     seems expensive. As a compromise, the stack is used when there are fewer
3703     than 16 values to store; otherwise malloc is used. A problem is what to do
3704     if the malloc fails ... there is no way of returning to the top level with
3705     an error. Save the top 15 values on the stack, and accept that the rest
3706     may be wrong. */
3707
3708     case OP_RECURSE:
3709       {
3710       BOOL rc;
3711       int *save;
3712       int stacksave[15];
3713
3714       c = md->offset_max;
3715
3716       if (c < 16) save = stacksave; else
3717         {
3718         save = (int *)(pcre_malloc)((c+1) * sizeof(int));
3719         if (save == NULL)
3720           {
3721           save = stacksave;
3722           c = 15;
3723           }
3724         }
3725
3726       for (i = 1; i <= c; i++)
3727         save[i] = md->offset_vector[md->offset_end - i];
3728       rc = match(eptr, md->start_pattern, offset_top, md, ims, eptrb,
3729         match_isgroup);
3730       for (i = 1; i <= c; i++)
3731         md->offset_vector[md->offset_end - i] = save[i];
3732       if (save != stacksave) (pcre_free)(save);
3733       if (!rc) return FALSE;
3734
3735       /* In case the recursion has set more capturing values, save the final
3736       number, then move along the subject till after the recursive match,
3737       and advance one byte in the pattern code. */
3738
3739       offset_top = md->end_offset_top;
3740       eptr = md->end_match_ptr;
3741       ecode++;
3742       }
3743     break;
3744
3745     /* "Once" brackets are like assertion brackets except that after a match,
3746     the point in the subject string is not moved back. Thus there can never be
3747     a move back into the brackets. Check the alternative branches in turn - the
3748     matching won't pass the KET for this kind of subpattern. If any one branch
3749     matches, we carry on as at the end of a normal bracket, leaving the subject
3750     pointer. */
3751
3752     case OP_ONCE:
3753       {
3754       const uschar *prev = ecode;
3755       const uschar *saved_eptr = eptr;
3756
3757       do
3758         {
3759         if (match(eptr, ecode+3, offset_top, md, ims, eptrb, match_isgroup))
3760           break;
3761         ecode += (ecode[1] << 8) + ecode[2];
3762         }
3763       while (*ecode == OP_ALT);
3764
3765       /* If we hit the end of the group (which could be repeated), fail */
3766
3767       if (*ecode != OP_ONCE && *ecode != OP_ALT) return FALSE;
3768
3769       /* Continue as from after the assertion, updating the offsets high water
3770       mark, since extracts may have been taken. */
3771
3772       do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3773
3774       offset_top = md->end_offset_top;
3775       eptr = md->end_match_ptr;
3776
3777       /* For a non-repeating ket, just continue at this level. This also
3778       happens for a repeating ket if no characters were matched in the group.
3779       This is the forcible breaking of infinite loops as implemented in Perl
3780       5.005. If there is an options reset, it will get obeyed in the normal
3781       course of events. */
3782
3783       if (*ecode == OP_KET || eptr == saved_eptr)
3784         {
3785         ecode += 3;
3786         break;
3787         }
3788
3789       /* The repeating kets try the rest of the pattern or restart from the
3790       preceding bracket, in the appropriate order. We need to reset any options
3791       that changed within the bracket before re-running it, so check the next
3792       opcode. */
3793
3794       if (ecode[3] == OP_OPT)
3795         {
3796         ims = (ims & ~PCRE_IMS) | ecode[4];
3797         DPRINTF(("ims set to %02lx at group repeat\n", ims));
3798         }
3799
3800       if (*ecode == OP_KETRMIN)
3801         {
3802         if (match(eptr, ecode+3, offset_top, md, ims, eptrb, 0) ||
3803             match(eptr, prev, offset_top, md, ims, eptrb, match_isgroup))
3804               return TRUE;
3805         }
3806       else  /* OP_KETRMAX */
3807         {
3808         if (match(eptr, prev, offset_top, md, ims, eptrb, match_isgroup) ||
3809             match(eptr, ecode+3, offset_top, md, ims, eptrb, 0)) return TRUE;
3810         }
3811       }
3812     return FALSE;
3813
3814     /* An alternation is the end of a branch; scan along to find the end of the
3815     bracketed group and go to there. */
3816
3817     case OP_ALT:
3818     do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3819     break;
3820
3821     /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
3822     that it may occur zero times. It may repeat infinitely, or not at all -
3823     i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
3824     repeat limits are compiled as a number of copies, with the optional ones
3825     preceded by BRAZERO or BRAMINZERO. */
3826
3827     case OP_BRAZERO:
3828       {
3829       const uschar *next = ecode+1;
3830       if (match(eptr, next, offset_top, md, ims, eptrb, match_isgroup))
3831         return TRUE;
3832       do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
3833       ecode = next + 3;
3834       }
3835     break;
3836
3837     case OP_BRAMINZERO:
3838       {
3839       const uschar *next = ecode+1;
3840       do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
3841       if (match(eptr, next+3, offset_top, md, ims, eptrb, match_isgroup))
3842         return TRUE;
3843       ecode++;
3844       }
3845     break;
3846
3847     /* End of a group, repeated or non-repeating. If we are at the end of
3848     an assertion "group", stop matching and return TRUE, but record the
3849     current high water mark for use by positive assertions. Do this also
3850     for the "once" (not backing up) groups. */
3851
3852     case OP_KET:
3853     case OP_KETRMIN:
3854     case OP_KETRMAX:
3855       {
3856       const uschar *prev = ecode - (ecode[1] << 8) - ecode[2];
3857       const uschar *saved_eptr = eptrb->saved_eptr;
3858
3859       eptrb = eptrb->prev;    /* Back up the stack of bracket start pointers */
3860
3861       if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
3862           *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
3863           *prev == OP_ONCE)
3864         {
3865         md->end_match_ptr = eptr;      /* For ONCE */
3866         md->end_offset_top = offset_top;
3867         return TRUE;
3868         }
3869
3870       /* In all other cases except a conditional group we have to check the
3871       group number back at the start and if necessary complete handling an
3872       extraction by setting the offsets and bumping the high water mark. */
3873
3874       if (*prev != OP_COND)
3875         {
3876         int offset;
3877         int number = *prev - OP_BRA;
3878
3879         /* For extended extraction brackets (large number), we have to fish out
3880         the number from a dummy opcode at the start. */
3881
3882         if (number > EXTRACT_BASIC_MAX) number = (prev[4] << 8) | prev[5];
3883         offset = number << 1;
3884
3885 #ifdef DEBUG
3886         printf("end bracket %d", number);
3887         printf("\n");
3888 #endif
3889
3890         if (number > 0)
3891           {
3892           if (offset >= md->offset_max) md->offset_overflow = TRUE; else
3893             {
3894             md->offset_vector[offset] =
3895               md->offset_vector[md->offset_end - number];
3896             md->offset_vector[offset+1] = eptr - md->start_subject;
3897             if (offset_top <= offset) offset_top = offset + 2;
3898             }
3899           }
3900         }
3901
3902       /* Reset the value of the ims flags, in case they got changed during
3903       the group. */
3904
3905       ims = original_ims;
3906       DPRINTF(("ims reset to %02lx\n", ims));
3907
3908       /* For a non-repeating ket, just continue at this level. This also
3909       happens for a repeating ket if no characters were matched in the group.
3910       This is the forcible breaking of infinite loops as implemented in Perl
3911       5.005. If there is an options reset, it will get obeyed in the normal
3912       course of events. */
3913
3914       if (*ecode == OP_KET || eptr == saved_eptr)
3915         {
3916         ecode += 3;
3917         break;
3918         }
3919
3920       /* The repeating kets try the rest of the pattern or restart from the
3921       preceding bracket, in the appropriate order. */
3922
3923       if (*ecode == OP_KETRMIN)
3924         {
3925         if (match(eptr, ecode+3, offset_top, md, ims, eptrb, 0) ||
3926             match(eptr, prev, offset_top, md, ims, eptrb, match_isgroup))
3927               return TRUE;
3928         }
3929       else  /* OP_KETRMAX */
3930         {
3931         if (match(eptr, prev, offset_top, md, ims, eptrb, match_isgroup) ||
3932             match(eptr, ecode+3, offset_top, md, ims, eptrb, 0)) return TRUE;
3933         }
3934       }
3935     return FALSE;
3936
3937     /* Start of subject unless notbol, or after internal newline if multiline */
3938
3939     case OP_CIRC:
3940     if (md->notbol && eptr == md->start_subject) return FALSE;
3941     if ((ims & PCRE_MULTILINE) != 0)
3942       {
3943       if (eptr != md->start_subject && eptr[-1] != NEWLINE) return FALSE;
3944       ecode++;
3945       break;
3946       }
3947     /* ... else fall through */
3948
3949     /* Start of subject assertion */
3950
3951     case OP_SOD:
3952     if (eptr != md->start_subject) return FALSE;
3953     ecode++;
3954     break;
3955
3956     /* Assert before internal newline if multiline, or before a terminating
3957     newline unless endonly is set, else end of subject unless noteol is set. */
3958
3959     case OP_DOLL:
3960     if ((ims & PCRE_MULTILINE) != 0)
3961       {
3962       if (eptr < md->end_subject) { if (*eptr != NEWLINE) return FALSE; }
3963         else { if (md->noteol) return FALSE; }
3964       ecode++;
3965       break;
3966       }
3967     else
3968       {
3969       if (md->noteol) return FALSE;
3970       if (!md->endonly)
3971         {
3972         if (eptr < md->end_subject - 1 ||
3973            (eptr == md->end_subject - 1 && *eptr != NEWLINE)) return FALSE;
3974
3975         ecode++;
3976         break;
3977         }
3978       }
3979     /* ... else fall through */
3980
3981     /* End of subject assertion (\z) */
3982
3983     case OP_EOD:
3984     if (eptr < md->end_subject) return FALSE;
3985     ecode++;
3986     break;
3987
3988     /* End of subject or ending \n assertion (\Z) */
3989
3990     case OP_EODN:
3991     if (eptr < md->end_subject - 1 ||
3992        (eptr == md->end_subject - 1 && *eptr != NEWLINE)) return FALSE;
3993     ecode++;
3994     break;
3995
3996     /* Word boundary assertions */
3997
3998     case OP_NOT_WORD_BOUNDARY:
3999     case OP_WORD_BOUNDARY:
4000       {
4001       BOOL prev_is_word = (eptr != md->start_subject) &&
4002         ((md->ctypes[eptr[-1]] & ctype_word) != 0);
4003       BOOL cur_is_word = (eptr < md->end_subject) &&
4004         ((md->ctypes[*eptr] & ctype_word) != 0);
4005       if ((*ecode++ == OP_WORD_BOUNDARY)?
4006            cur_is_word == prev_is_word : cur_is_word != prev_is_word)
4007         return FALSE;
4008       }
4009     break;
4010
4011     /* Match a single character type; inline for speed */
4012
4013     case OP_ANY:
4014     if ((ims & PCRE_DOTALL) == 0 && eptr < md->end_subject && *eptr == NEWLINE)
4015       return FALSE;
4016     if (eptr++ >= md->end_subject) return FALSE;
4017 #ifdef SUPPORT_UTF8
4018     if (md->utf8)
4019       while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4020 #endif
4021     ecode++;
4022     break;
4023
4024     case OP_NOT_DIGIT:
4025     if (eptr >= md->end_subject ||
4026        (md->ctypes[*eptr++] & ctype_digit) != 0)
4027       return FALSE;
4028     ecode++;
4029     break;
4030
4031     case OP_DIGIT:
4032     if (eptr >= md->end_subject ||
4033        (md->ctypes[*eptr++] & ctype_digit) == 0)
4034       return FALSE;
4035     ecode++;
4036     break;
4037
4038     case OP_NOT_WHITESPACE:
4039     if (eptr >= md->end_subject ||
4040        (md->ctypes[*eptr++] & ctype_space) != 0)
4041       return FALSE;
4042     ecode++;
4043     break;
4044
4045     case OP_WHITESPACE:
4046     if (eptr >= md->end_subject ||
4047        (md->ctypes[*eptr++] & ctype_space) == 0)
4048       return FALSE;
4049     ecode++;
4050     break;
4051
4052     case OP_NOT_WORDCHAR:
4053     if (eptr >= md->end_subject ||
4054        (md->ctypes[*eptr++] & ctype_word) != 0)
4055       return FALSE;
4056     ecode++;
4057     break;
4058
4059     case OP_WORDCHAR:
4060     if (eptr >= md->end_subject ||
4061        (md->ctypes[*eptr++] & ctype_word) == 0)
4062       return FALSE;
4063     ecode++;
4064     break;
4065
4066     /* Match a back reference, possibly repeatedly. Look past the end of the
4067     item to see if there is repeat information following. The code is similar
4068     to that for character classes, but repeated for efficiency. Then obey
4069     similar code to character type repeats - written out again for speed.
4070     However, if the referenced string is the empty string, always treat
4071     it as matched, any number of times (otherwise there could be infinite
4072     loops). */
4073
4074     case OP_REF:
4075       {
4076       int length;
4077       int offset = (ecode[1] << 9) | (ecode[2] << 1); /* Doubled ref number */
4078       ecode += 3;                                     /* Advance past item */
4079
4080       /* If the reference is unset, set the length to be longer than the amount
4081       of subject left; this ensures that every attempt at a match fails. We
4082       can't just fail here, because of the possibility of quantifiers with zero
4083       minima. */
4084
4085       length = (offset >= offset_top || md->offset_vector[offset] < 0)?
4086         md->end_subject - eptr + 1 :
4087         md->offset_vector[offset+1] - md->offset_vector[offset];
4088
4089       /* Set up for repetition, or handle the non-repeated case */
4090
4091       switch (*ecode)
4092         {
4093         case OP_CRSTAR:
4094         case OP_CRMINSTAR:
4095         case OP_CRPLUS:
4096         case OP_CRMINPLUS:
4097         case OP_CRQUERY:
4098         case OP_CRMINQUERY:
4099         c = *ecode++ - OP_CRSTAR;
4100         minimize = (c & 1) != 0;
4101         min = rep_min[c];                 /* Pick up values from tables; */
4102         max = rep_max[c];                 /* zero for max => infinity */
4103         if (max == 0) max = INT_MAX;
4104         break;
4105
4106         case OP_CRRANGE:
4107         case OP_CRMINRANGE:
4108         minimize = (*ecode == OP_CRMINRANGE);
4109         min = (ecode[1] << 8) + ecode[2];
4110         max = (ecode[3] << 8) + ecode[4];
4111         if (max == 0) max = INT_MAX;
4112         ecode += 5;
4113         break;
4114
4115         default:               /* No repeat follows */
4116         if (!match_ref(offset, eptr, length, md, ims)) return FALSE;
4117         eptr += length;
4118         continue;              /* With the main loop */
4119         }
4120
4121       /* If the length of the reference is zero, just continue with the
4122       main loop. */
4123
4124       if (length == 0) continue;
4125
4126       /* First, ensure the minimum number of matches are present. We get back
4127       the length of the reference string explicitly rather than passing the
4128       address of eptr, so that eptr can be a register variable. */
4129
4130       for (i = 1; i <= min; i++)
4131         {
4132         if (!match_ref(offset, eptr, length, md, ims)) return FALSE;
4133         eptr += length;
4134         }
4135
4136       /* If min = max, continue at the same level without recursion.
4137       They are not both allowed to be zero. */
4138
4139       if (min == max) continue;
4140
4141       /* If minimizing, keep trying and advancing the pointer */
4142
4143       if (minimize)
4144         {
4145         for (i = min;; i++)
4146           {
4147           if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
4148             return TRUE;
4149           if (i >= max || !match_ref(offset, eptr, length, md, ims))
4150             return FALSE;
4151           eptr += length;
4152           }
4153         /* Control never gets here */
4154         }
4155
4156       /* If maximizing, find the longest string and work backwards */
4157
4158       else
4159         {
4160         const uschar *pp = eptr;
4161         for (i = min; i < max; i++)
4162           {
4163           if (!match_ref(offset, eptr, length, md, ims)) break;
4164           eptr += length;
4165           }
4166         while (eptr >= pp)
4167           {
4168           if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
4169             return TRUE;
4170           eptr -= length;
4171           }
4172         return FALSE;
4173         }
4174       }
4175     /* Control never gets here */
4176
4177
4178
4179     /* Match a character class, possibly repeatedly. Look past the end of the
4180     item to see if there is repeat information following. Then obey similar
4181     code to character type repeats - written out again for speed. */
4182
4183     case OP_CLASS:
4184       {
4185       const uschar *data = ecode + 1;  /* Save for matching */
4186       ecode += 33;                     /* Advance past the item */
4187
4188       switch (*ecode)
4189         {
4190         case OP_CRSTAR:
4191         case OP_CRMINSTAR:
4192         case OP_CRPLUS:
4193         case OP_CRMINPLUS:
4194         case OP_CRQUERY:
4195         case OP_CRMINQUERY:
4196         c = *ecode++ - OP_CRSTAR;
4197         minimize = (c & 1) != 0;
4198         min = rep_min[c];                 /* Pick up values from tables; */
4199         max = rep_max[c];                 /* zero for max => infinity */
4200         if (max == 0) max = INT_MAX;
4201         break;
4202
4203         case OP_CRRANGE:
4204         case OP_CRMINRANGE:
4205         minimize = (*ecode == OP_CRMINRANGE);
4206         min = (ecode[1] << 8) + ecode[2];
4207         max = (ecode[3] << 8) + ecode[4];
4208         if (max == 0) max = INT_MAX;
4209         ecode += 5;
4210         break;
4211
4212         default:               /* No repeat follows */
4213         min = max = 1;
4214         break;
4215         }
4216
4217       /* First, ensure the minimum number of matches are present. */
4218
4219       for (i = 1; i <= min; i++)
4220         {
4221         if (eptr >= md->end_subject) return FALSE;
4222         GETCHARINC(c, eptr)         /* Get character; increment eptr */
4223
4224 #ifdef SUPPORT_UTF8
4225         /* We do not yet support class members > 255 */
4226         if (c > 255) return FALSE;
4227 #endif
4228
4229         if ((data[c/8] & (1 << (c&7))) != 0) continue;
4230         return FALSE;
4231         }
4232
4233       /* If max == min we can continue with the main loop without the
4234       need to recurse. */
4235
4236       if (min == max) continue;
4237
4238       /* If minimizing, keep testing the rest of the expression and advancing
4239       the pointer while it matches the class. */
4240
4241       if (minimize)
4242         {
4243         for (i = min;; i++)
4244           {
4245           if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
4246             return TRUE;
4247           if (i >= max || eptr >= md->end_subject) return FALSE;
4248           GETCHARINC(c, eptr)       /* Get character; increment eptr */
4249
4250 #ifdef SUPPORT_UTF8
4251           /* We do not yet support class members > 255 */
4252           if (c > 255) return FALSE;
4253 #endif
4254           if ((data[c/8] & (1 << (c&7))) != 0) continue;
4255           return FALSE;
4256           }
4257         /* Control never gets here */
4258         }
4259
4260       /* If maximizing, find the longest possible run, then work backwards. */
4261
4262       else
4263         {
4264         const uschar *pp = eptr;
4265         int len = 1;
4266         for (i = min; i < max; i++)
4267           {
4268           if (eptr >= md->end_subject) break;
4269           GETCHARLEN(c, eptr, len)  /* Get character, set length if UTF-8 */
4270
4271 #ifdef SUPPORT_UTF8
4272           /* We do not yet support class members > 255 */
4273           if (c > 255) break;
4274 #endif
4275           if ((data[c/8] & (1 << (c&7))) == 0) break;
4276           eptr += len;
4277           }
4278
4279         while (eptr >= pp)
4280           {
4281           if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0))
4282             return TRUE;
4283
4284 #ifdef SUPPORT_UTF8
4285           BACKCHAR(eptr)
4286 #endif
4287           }
4288         return FALSE;
4289         }
4290       }
4291     /* Control never gets here */
4292
4293     /* Match a run of characters */
4294
4295     case OP_CHARS:
4296       {
4297       register int length = ecode[1];
4298       ecode += 2;
4299
4300 #ifdef DEBUG    /* Sigh. Some compilers never learn. */
4301       if (eptr >= md->end_subject)
4302         printf("matching subject <null> against pattern ");
4303       else
4304         {
4305         printf("matching subject ");
4306         pchars(eptr, length, TRUE, md);
4307         printf(" against pattern ");
4308         }
4309       pchars(ecode, length, FALSE, md);
4310       printf("\n");
4311 #endif
4312
4313       if (length > md->end_subject - eptr) return FALSE;
4314       if ((ims & PCRE_CASELESS) != 0)
4315         {
4316         while (length-- > 0)
4317           if (md->lcc[*ecode++] != md->lcc[*eptr++])
4318             return FALSE;
4319         }
4320       else
4321         {
4322         while (length-- > 0) if (*ecode++ != *eptr++) return FALSE;
4323         }
4324       }
4325     break;
4326
4327     /* Match a single character repeatedly; different opcodes share code. */
4328
4329     case OP_EXACT:
4330     min = max = (ecode[1] << 8) + ecode[2];
4331     ecode += 3;
4332     goto REPEATCHAR;
4333
4334     case OP_UPTO:
4335     case OP_MINUPTO:
4336     min = 0;
4337     max = (ecode[1] << 8) + ecode[2];
4338     minimize = *ecode == OP_MINUPTO;
4339     ecode += 3;
4340     goto REPEATCHAR;
4341
4342     case OP_STAR:
4343     case OP_MINSTAR:
4344     case OP_PLUS:
4345     case OP_MINPLUS:
4346     case OP_QUERY:
4347     case OP_MINQUERY:
4348     c = *ecode++ - OP_STAR;
4349     minimize = (c & 1) != 0;
4350     min = rep_min[c];                 /* Pick up values from tables; */
4351     max = rep_max[c];                 /* zero for max => infinity */
4352     if (max == 0) max = INT_MAX;
4353
4354     /* Common code for all repeated single-character matches. We can give
4355     up quickly if there are fewer than the minimum number of characters left in
4356     the subject. */
4357
4358     REPEATCHAR:
4359     if (min > md->end_subject - eptr) return FALSE;
4360     c = *ecode++;
4361
4362     /* The code is duplicated for the caseless and caseful cases, for speed,
4363     since matching characters is likely to be quite common. First, ensure the
4364     minimum number of matches are present. If min = max, continue at the same
4365     level without recursing. Otherwise, if minimizing, keep trying the rest of
4366     the expression and advancing one matching character if failing, up to the
4367     maximum. Alternatively, if maximizing, find the maximum number of
4368     characters and work backwards. */
4369
4370     DPRINTF(("matching %c{%d,%d} against subject %.*s\n", c, min, max,
4371       max, eptr));
4372
4373     if ((ims & PCRE_CASELESS) != 0)
4374       {
4375       c = md->lcc[c];
4376       for (i = 1; i <= min; i++)
4377         if (c != md->lcc[*eptr++]) return FALSE;
4378       if (min == max) continue;
4379       if (minimize)
4380         {
4381         for (i = min;; i++)
4382           {
4383           if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
4384             return TRUE;
4385           if (i >= max || eptr >= md->end_subject ||
4386               c != md->lcc[*eptr++])
4387             return FALSE;
4388           }
4389         /* Control never gets here */
4390         }
4391       else
4392         {
4393         const uschar *pp = eptr;
4394         for (i = min; i < max; i++)
4395           {
4396           if (eptr >= md->end_subject || c != md->lcc[*eptr]) break;
4397           eptr++;
4398           }
4399         while (eptr >= pp)
4400           if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0))
4401             return TRUE;
4402         return FALSE;
4403         }
4404       /* Control never gets here */
4405       }
4406
4407     /* Caseful comparisons */
4408
4409     else
4410       {
4411       for (i = 1; i <= min; i++) if (c != *eptr++) return FALSE;
4412       if (min == max) continue;
4413       if (minimize)
4414         {
4415         for (i = min;; i++)
4416           {
4417           if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
4418             return TRUE;
4419           if (i >= max || eptr >= md->end_subject || c != *eptr++) return FALSE;
4420           }
4421         /* Control never gets here */
4422         }
4423       else
4424         {
4425         const uschar *pp = eptr;
4426         for (i = min; i < max; i++)
4427           {
4428           if (eptr >= md->end_subject || c != *eptr) break;
4429           eptr++;
4430           }
4431         while (eptr >= pp)
4432          if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0))
4433            return TRUE;
4434         return FALSE;
4435         }
4436       }
4437     /* Control never gets here */
4438
4439     /* Match a negated single character */
4440
4441     case OP_NOT:
4442     if (eptr >= md->end_subject) return FALSE;
4443     ecode++;
4444     if ((ims & PCRE_CASELESS) != 0)
4445       {
4446       if (md->lcc[*ecode++] == md->lcc[*eptr++]) return FALSE;
4447       }
4448     else
4449       {
4450       if (*ecode++ == *eptr++) return FALSE;
4451       }
4452     break;
4453
4454     /* Match a negated single character repeatedly. This is almost a repeat of
4455     the code for a repeated single character, but I haven't found a nice way of
4456     commoning these up that doesn't require a test of the positive/negative
4457     option for each character match. Maybe that wouldn't add very much to the
4458     time taken, but character matching *is* what this is all about... */
4459
4460     case OP_NOTEXACT:
4461     min = max = (ecode[1] << 8) + ecode[2];
4462     ecode += 3;
4463     goto REPEATNOTCHAR;
4464
4465     case OP_NOTUPTO:
4466     case OP_NOTMINUPTO:
4467     min = 0;
4468     max = (ecode[1] << 8) + ecode[2];
4469     minimize = *ecode == OP_NOTMINUPTO;
4470     ecode += 3;
4471     goto REPEATNOTCHAR;
4472
4473     case OP_NOTSTAR:
4474     case OP_NOTMINSTAR:
4475     case OP_NOTPLUS:
4476     case OP_NOTMINPLUS:
4477     case OP_NOTQUERY:
4478     case OP_NOTMINQUERY:
4479     c = *ecode++ - OP_NOTSTAR;
4480     minimize = (c & 1) != 0;
4481     min = rep_min[c];                 /* Pick up values from tables; */
4482     max = rep_max[c];                 /* zero for max => infinity */
4483     if (max == 0) max = INT_MAX;
4484
4485     /* Common code for all repeated negated single-character matches. We can
4486     give up quickly if there are fewer than the minimum number of characters
4487     left in the subject. */
4488
4489     REPEATNOTCHAR:
4490     if (min > md->end_subject - eptr) return FALSE;
4491     c = *ecode++;
4492
4493     /* The code is duplicated for the caseless and caseful cases, for speed,
4494     since matching characters is likely to be quite common. First, ensure the
4495     minimum number of matches are present. If min = max, continue at the same
4496     level without recursing. Otherwise, if minimizing, keep trying the rest of
4497     the expression and advancing one matching character if failing, up to the
4498     maximum. Alternatively, if maximizing, find the maximum number of
4499     characters and work backwards. */
4500
4501     DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", c, min, max,
4502       max, eptr));
4503
4504     if ((ims & PCRE_CASELESS) != 0)
4505       {
4506       c = md->lcc[c];
4507       for (i = 1; i <= min; i++)
4508         if (c == md->lcc[*eptr++]) return FALSE;
4509       if (min == max) continue;
4510       if (minimize)
4511         {
4512         for (i = min;; i++)
4513           {
4514           if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
4515             return TRUE;
4516           if (i >= max || eptr >= md->end_subject ||
4517               c == md->lcc[*eptr++])
4518             return FALSE;
4519           }
4520         /* Control never gets here */
4521         }
4522       else
4523         {
4524         const uschar *pp = eptr;
4525         for (i = min; i < max; i++)
4526           {
4527           if (eptr >= md->end_subject || c == md->lcc[*eptr]) break;
4528           eptr++;
4529           }
4530         while (eptr >= pp)
4531           if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0))
4532             return TRUE;
4533         return FALSE;
4534         }
4535       /* Control never gets here */
4536       }
4537
4538     /* Caseful comparisons */
4539
4540     else
4541       {
4542       for (i = 1; i <= min; i++) if (c == *eptr++) return FALSE;
4543       if (min == max) continue;
4544       if (minimize)
4545         {
4546         for (i = min;; i++)
4547           {
4548           if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
4549             return TRUE;
4550           if (i >= max || eptr >= md->end_subject || c == *eptr++) return FALSE;
4551           }
4552         /* Control never gets here */
4553         }
4554       else
4555         {
4556         const uschar *pp = eptr;
4557         for (i = min; i < max; i++)
4558           {
4559           if (eptr >= md->end_subject || c == *eptr) break;
4560           eptr++;
4561           }
4562         while (eptr >= pp)
4563          if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0))
4564            return TRUE;
4565         return FALSE;
4566         }
4567       }
4568     /* Control never gets here */
4569
4570     /* Match a single character type repeatedly; several different opcodes
4571     share code. This is very similar to the code for single characters, but we
4572     repeat it in the interests of efficiency. */
4573
4574     case OP_TYPEEXACT:
4575     min = max = (ecode[1] << 8) + ecode[2];
4576     minimize = TRUE;
4577     ecode += 3;
4578     goto REPEATTYPE;
4579
4580     case OP_TYPEUPTO:
4581     case OP_TYPEMINUPTO:
4582     min = 0;
4583     max = (ecode[1] << 8) + ecode[2];
4584     minimize = *ecode == OP_TYPEMINUPTO;
4585     ecode += 3;
4586     goto REPEATTYPE;
4587
4588     case OP_TYPESTAR:
4589     case OP_TYPEMINSTAR:
4590     case OP_TYPEPLUS:
4591     case OP_TYPEMINPLUS:
4592     case OP_TYPEQUERY:
4593     case OP_TYPEMINQUERY:
4594     c = *ecode++ - OP_TYPESTAR;
4595     minimize = (c & 1) != 0;
4596     min = rep_min[c];                 /* Pick up values from tables; */
4597     max = rep_max[c];                 /* zero for max => infinity */
4598     if (max == 0) max = INT_MAX;
4599
4600     /* Common code for all repeated single character type matches */
4601
4602     REPEATTYPE:
4603     ctype = *ecode++;      /* Code for the character type */
4604
4605     /* First, ensure the minimum number of matches are present. Use inline
4606     code for maximizing the speed, and do the type test once at the start
4607     (i.e. keep it out of the loop). Also we can test that there are at least
4608     the minimum number of bytes before we start, except when doing '.' in
4609     UTF8 mode. Leave the test in in all cases; in the special case we have
4610     to test after each character. */
4611
4612     if (min > md->end_subject - eptr) return FALSE;
4613     if (min > 0) switch(ctype)
4614       {
4615       case OP_ANY:
4616 #ifdef SUPPORT_UTF8
4617       if (md->utf8)
4618         {
4619         for (i = 1; i <= min; i++)
4620           {
4621           if (eptr >= md->end_subject ||
4622              (*eptr++ == NEWLINE && (ims & PCRE_DOTALL) == 0))
4623             return FALSE;
4624           while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4625           }
4626         break;
4627         }
4628 #endif
4629       /* Non-UTF8 can be faster */
4630       if ((ims & PCRE_DOTALL) == 0)
4631         { for (i = 1; i <= min; i++) if (*eptr++ == NEWLINE) return FALSE; }
4632       else eptr += min;
4633       break;
4634
4635       case OP_NOT_DIGIT:
4636       for (i = 1; i <= min; i++)
4637         if ((md->ctypes[*eptr++] & ctype_digit) != 0) return FALSE;
4638       break;
4639
4640       case OP_DIGIT:
4641       for (i = 1; i <= min; i++)
4642         if ((md->ctypes[*eptr++] & ctype_digit) == 0) return FALSE;
4643       break;
4644
4645       case OP_NOT_WHITESPACE:
4646       for (i = 1; i <= min; i++)
4647         if ((md->ctypes[*eptr++] & ctype_space) != 0) return FALSE;
4648       break;
4649
4650       case OP_WHITESPACE:
4651       for (i = 1; i <= min; i++)
4652         if ((md->ctypes[*eptr++] & ctype_space) == 0) return FALSE;
4653       break;
4654
4655       case OP_NOT_WORDCHAR:
4656       for (i = 1; i <= min; i++)
4657         if ((md->ctypes[*eptr++] & ctype_word) != 0)
4658           return FALSE;
4659       break;
4660
4661       case OP_WORDCHAR:
4662       for (i = 1; i <= min; i++)
4663         if ((md->ctypes[*eptr++] & ctype_word) == 0)
4664           return FALSE;
4665       break;
4666       }
4667
4668     /* If min = max, continue at the same level without recursing */
4669
4670     if (min == max) continue;
4671
4672     /* If minimizing, we have to test the rest of the pattern before each
4673     subsequent match. */
4674
4675     if (minimize)
4676       {
4677       for (i = min;; i++)
4678         {
4679         if (match(eptr, ecode, offset_top, md, ims, eptrb, 0)) return TRUE;
4680         if (i >= max || eptr >= md->end_subject) return FALSE;
4681
4682         c = *eptr++;
4683         switch(ctype)
4684           {
4685           case OP_ANY:
4686           if ((ims & PCRE_DOTALL) == 0 && c == NEWLINE) return FALSE;
4687 #ifdef SUPPORT_UTF8
4688           if (md->utf8)
4689             while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4690 #endif
4691           break;
4692
4693           case OP_NOT_DIGIT:
4694           if ((md->ctypes[c] & ctype_digit) != 0) return FALSE;
4695           break;
4696
4697           case OP_DIGIT:
4698           if ((md->ctypes[c] & ctype_digit) == 0) return FALSE;
4699           break;
4700
4701           case OP_NOT_WHITESPACE:
4702           if ((md->ctypes[c] & ctype_space) != 0) return FALSE;
4703           break;
4704
4705           case OP_WHITESPACE:
4706           if  ((md->ctypes[c] & ctype_space) == 0) return FALSE;
4707           break;
4708
4709           case OP_NOT_WORDCHAR:
4710           if ((md->ctypes[c] & ctype_word) != 0) return FALSE;
4711           break;
4712
4713           case OP_WORDCHAR:
4714           if ((md->ctypes[c] & ctype_word) == 0) return FALSE;
4715           break;
4716           }
4717         }
4718       /* Control never gets here */
4719       }
4720
4721     /* If maximizing it is worth using inline code for speed, doing the type
4722     test once at the start (i.e. keep it out of the loop). */
4723
4724     else
4725       {
4726       const uschar *pp = eptr;
4727       switch(ctype)
4728         {
4729         case OP_ANY:
4730
4731         /* Special code is required for UTF8, but when the maximum is unlimited
4732         we don't need it. */
4733
4734 #ifdef SUPPORT_UTF8
4735         if (md->utf8 && max < INT_MAX)
4736           {
4737           if ((ims & PCRE_DOTALL) == 0)
4738             {
4739             for (i = min; i < max; i++)
4740               {
4741               if (eptr >= md->end_subject || *eptr++ == NEWLINE) break;
4742               while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4743               }
4744             }
4745           else
4746             {
4747             for (i = min; i < max; i++)
4748               {
4749               eptr++;
4750               while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4751               }
4752             }
4753           break;
4754           }
4755 #endif
4756         /* Non-UTF8 can be faster */
4757         if ((ims & PCRE_DOTALL) == 0)
4758           {
4759           for (i = min; i < max; i++)
4760             {
4761             if (eptr >= md->end_subject || *eptr == NEWLINE) break;
4762             eptr++;
4763             }
4764           }
4765         else
4766           {
4767           c = max - min;
4768           if (c > md->end_subject - eptr) c = md->end_subject - eptr;
4769           eptr += c;
4770           }
4771         break;
4772
4773         case OP_NOT_DIGIT:
4774         for (i = min; i < max; i++)
4775           {
4776           if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) != 0)
4777             break;
4778           eptr++;
4779           }
4780         break;
4781
4782         case OP_DIGIT:
4783         for (i = min; i < max; i++)
4784           {
4785           if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) == 0)
4786             break;
4787           eptr++;
4788           }
4789         break;
4790
4791         case OP_NOT_WHITESPACE:
4792         for (i = min; i < max; i++)
4793           {
4794           if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) != 0)
4795             break;
4796           eptr++;
4797           }
4798         break;
4799
4800         case OP_WHITESPACE:
4801         for (i = min; i < max; i++)
4802           {
4803           if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) == 0)
4804             break;
4805           eptr++;
4806           }
4807         break;
4808
4809         case OP_NOT_WORDCHAR:
4810         for (i = min; i < max; i++)
4811           {
4812           if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) != 0)
4813             break;
4814           eptr++;
4815           }
4816         break;
4817
4818         case OP_WORDCHAR:
4819         for (i = min; i < max; i++)
4820           {
4821           if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) == 0)
4822             break;
4823           eptr++;
4824           }
4825         break;
4826         }
4827
4828       while (eptr >= pp)
4829         {
4830         if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0))
4831           return TRUE;
4832 #ifdef SUPPORT_UTF8
4833         if (md->utf8)
4834           while (eptr > pp && (*eptr & 0xc0) == 0x80) eptr--;
4835 #endif
4836         }
4837       return FALSE;
4838       }
4839     /* Control never gets here */
4840
4841     /* There's been some horrible disaster. */
4842
4843     default:
4844     DPRINTF(("Unknown opcode %d\n", *ecode));
4845     md->errorcode = PCRE_ERROR_UNKNOWN_NODE;
4846     return FALSE;
4847     }
4848
4849   /* Do not stick any code in here without much thought; it is assumed
4850   that "continue" in the code above comes out to here to repeat the main
4851   loop. */
4852
4853   }             /* End of main loop */
4854 /* Control never reaches here */
4855 }
4856
4857
4858
4859
4860 /*************************************************
4861 *         Execute a Regular Expression           *
4862 *************************************************/
4863
4864 /* This function applies a compiled re to a subject string and picks out
4865 portions of the string if it matches. Two elements in the vector are set for
4866 each substring: the offsets to the start and end of the substring.
4867
4868 Arguments:
4869   external_re     points to the compiled expression
4870   external_extra  points to "hints" from pcre_study() or is NULL
4871   subject         points to the subject string
4872   length          length of subject string (may contain binary zeros)
4873   start_offset    where to start in the subject string
4874   options         option bits
4875   offsets         points to a vector of ints to be filled in with offsets
4876   offsetcount     the number of elements in the vector
4877
4878 Returns:          > 0 => success; value is the number of elements filled in
4879                   = 0 => success, but offsets is not big enough
4880                    -1 => failed to match
4881                  < -1 => some kind of unexpected problem
4882 */
     /* Return paths that can be seen in the body below:
          PCRE_ERROR_BADOPTION  options has a bit outside PUBLIC_EXEC_OPTIONS
          PCRE_ERROR_NULL       the compiled expression or subject is NULL, or
                                offsets is NULL while offsetcount > 0
          PCRE_ERROR_BADMAGIC   re->magic_number is not MAGIC_NUMBER
          PCRE_ERROR_NOMEMORY   the temporary offset vector could not be obtained
          match_block.errorcode when no attempt succeeds; it is preset to
                                PCRE_ERROR_NOMATCH before the matching loop
        On success the value is 0 if the offsets overflowed or offsetcount < 2,
        otherwise match_block.end_offset_top/2.

        NOTE(review): start_offset is added to subject without being compared
        with 0 or with length anywhere in this function - confirm that callers
        are expected to guarantee that it lies within the subject. */
4883
4884 int
4885 pcre_exec(const pcre *external_re, const pcre_extra *external_extra,
4886   const char *subject, int length, int start_offset, int options, int *offsets,
4887   int offsetcount)
4888 {
4889 int resetcount, ocount;
4890 int first_char = -1;
4891 int req_char = -1;
4892 int req_char2 = -1;
4893 unsigned long int ims = 0;
4894 match_data match_block;
4895 const uschar *start_bits = NULL;
4896 const uschar *start_match = (const uschar *)subject + start_offset;
4897 const uschar *end_subject;
4898 const uschar *req_char_ptr = start_match - 1;
     /* req_char_ptr begins one below start_match so that the first
     "p > req_char_ptr" test in the main loop is true and the search for the
     required character is actually made. It is only compared and assigned in
     this function, never dereferenced. NOTE(review): when start_offset is 0
     this forms a pointer one below the subject - confirm this is acceptable
     on the target compilers. */
4899 const real_pcre *re = (const real_pcre *)external_re;
4900 const real_pcre_extra *extra = (const real_pcre_extra *)external_extra;
4901 BOOL using_temporary_offsets = FALSE;
4902 BOOL anchored;
4903 BOOL startline;
4904
     /* Check the arguments before re is dereferenced. */
4905 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
4906
4907 if (re == NULL || subject == NULL ||
4908    (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
4909 if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
4910
     /* Anchoring may come from the compiled options or from the options passed
     to this call; the start-of-line flag is read from the compiled options
     only. */
4911 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
4912 startline = (re->options & PCRE_STARTLINE) != 0;
4913
     /* Fill in the block of data whose address is passed to match(). */
4914 match_block.start_pattern = re->code;
4915 match_block.start_subject = (const uschar *)subject;
4916 match_block.end_subject = match_block.start_subject + length;
4917 end_subject = match_block.end_subject;
4918
4919 match_block.endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
4920 match_block.utf8 = (re->options & PCRE_UTF8) != 0;
4921
4922 match_block.notbol = (options & PCRE_NOTBOL) != 0;
4923 match_block.noteol = (options & PCRE_NOTEOL) != 0;
4924 match_block.notempty = (options & PCRE_NOTEMPTY) != 0;
4925
4926 match_block.errorcode = PCRE_ERROR_NOMATCH;     /* Default error */
4927
4928 match_block.lcc = re->tables + lcc_offset;
4929 match_block.ctypes = re->tables + ctypes_offset;
4930
4931 /* The ims options can vary during the matching as a result of the presence
4932 of (?ims) items in the pattern. They are kept in a local variable so that
4933 restoring at the exit of a group is easy. */
4934
4935 ims = re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL);
4936
4937 /* If the expression has got more back references than the offsets supplied can
4938 hold, we get a temporary bit of working store to use during the matching.
4939 Otherwise, we can use the vector supplied, rounding down its size to a multiple
4940 of 3. */
4941
4942 ocount = offsetcount - (offsetcount % 3);
4943
4944 if (re->top_backref > 0 && re->top_backref >= ocount/3)
4945   {
       /* From here on ocount describes the temporary vector: three ints for
       each back reference plus three more. The vector is released on both the
       success path inside the main loop and the failure path after it. */
4946   ocount = re->top_backref * 3 + 3;
4947   match_block.offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
4948   if (match_block.offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
4949   using_temporary_offsets = TRUE;
4950   DPRINTF(("Got memory to hold back references\n"));
4951   }
4952 else match_block.offset_vector = offsets;
4953
     /* Record the size of the vector in use; offset_max is two thirds of that
     size, and the overflow flag starts clear. */
4954 match_block.offset_end = ocount;
4955 match_block.offset_max = (2*ocount)/3;
4956 match_block.offset_overflow = FALSE;
4957
4958 /* Compute the minimum number of offsets that we need to reset each time. Doing
4959 this makes a huge difference to execution time when there aren't many brackets
4960 in the pattern. */
4961
     /* That is 2 + 2*top_bracket ints; when this is more than the caller's
     offsetcount, ocount is used instead. */
4962 resetcount = 2 + re->top_bracket * 2;
4963 if (resetcount > offsetcount) resetcount = ocount;
4964
4965 /* Reset the working variable associated with each extraction. These should
4966 never be used unless previously set, but they get saved and restored, and so we
4967 initialize them to avoid reading uninitialized locations. */
4968
4969 if (match_block.offset_vector != NULL)
4970   {
       /* Work downwards from the last element of the vector, setting
       resetcount/2 - 1 entries to -1. */
4971   register int *iptr = match_block.offset_vector + ocount;
4972   register int *iend = iptr - resetcount/2 + 1;
4973   while (--iptr >= iend) *iptr = -1;
4974   }
4975
4976 /* Set up the first character to match, if available. The first_char value is
4977 never set for an anchored regular expression, but the anchoring may be forced
4978 at run time, so we have to test for anchoring. The first char may be unset for
4979 an unanchored pattern, of course. If there's no first char and the pattern was
4980 studied, there may be a bitmap of possible first characters. */
4981
4982 if (!anchored)
4983   {
4984   if ((re->options & PCRE_FIRSTSET) != 0)
4985     {
         /* For a caseless start, first_char is translated through the lcc
         table here; the scan in the main loop translates each subject
         character through the same table before comparing. */
4986     first_char = re->first_char;
4987     if ((ims & PCRE_CASELESS) != 0) first_char = match_block.lcc[first_char];
4988     }
4989   else
4990     if (!startline && extra != NULL &&
4991       (extra->options & PCRE_STUDY_MAPPED) != 0)
4992         start_bits = extra->start_bits;
4993   }
4994
4995 /* For anchored or unanchored matches, there may be a "last known required
4996 character" set. If the PCRE_CASELESS is set, implying that the match starts
4997 caselessly, or if there are any changes of this flag within the regex, set up
4998 both cases of the character. Otherwise set the two values the same, which will
4999 avoid duplicate testing (which takes significant time). This covers the vast
5000 majority of cases. It will be suboptimal when the case flag changes in a regex
5001 and the required character in fact is caseful. */
5002
5003 if ((re->options & PCRE_REQCHSET) != 0)
5004   {
5005   req_char = re->req_char;
5006   req_char2 = ((re->options & (PCRE_CASELESS | PCRE_ICHANGED)) != 0)?
5007     (re->tables + fcc_offset)[req_char] : req_char;
5008   }
5009
5010 /* Loop for handling unanchored repeated matching attempts; for anchored regexs
5011 the loop runs just once. */
5012
5013 do
5014   {
5015   int rc;
5016   register int *iptr = match_block.offset_vector;
5017   register int *iend = iptr + resetcount;
5018
5019   /* Reset the maximum number of extractions we might see. */
5020
5021   while (iptr < iend) *iptr++ = -1;
5022
5023   /* Advance to a unique first char if possible */
5024
5025   if (first_char >= 0)
5026     {
5027     if ((ims & PCRE_CASELESS) != 0)
5028       while (start_match < end_subject &&
5029              match_block.lcc[*start_match] != first_char)
5030         start_match++;
5031     else
5032       while (start_match < end_subject && *start_match != first_char)
5033         start_match++;
5034     }
5035
5036   /* Or to just after \n for a multiline match if possible */
5037
5038   else if (startline)
5039     {
         /* Nothing is skipped while start_match is still at the starting
         offset. On later attempts, move forward until the preceding character
         is a newline or the end of the subject is reached. */
5040     if (start_match > match_block.start_subject + start_offset)
5041       {
5042       while (start_match < end_subject && start_match[-1] != NEWLINE)
5043         start_match++;
5044       }
5045     }
5046
5047   /* Or to a non-unique first char after study */
5048
5049   else if (start_bits != NULL)
5050     {
         /* start_bits is read as a bit map: byte c/8, bit c&7. Characters
         whose bit is clear are skipped. */
5051     while (start_match < end_subject)
5052       {
5053       register int c = *start_match;
5054       if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++; else break;
5055       }
5056     }
5057
5058 #ifdef DEBUG  /* Sigh. Some compilers never learn. */
5059   printf(">>>> Match against: ");
5060   pchars(start_match, end_subject - start_match, TRUE, &match_block);
5061   printf("\n");
5062 #endif
5063
5064   /* If req_char is set, we know that that character must appear in the subject
5065   for the match to succeed. If the first character is set, req_char must be
5066   later in the subject; otherwise the test starts at the match point. This
5067   optimization can save a huge amount of backtracking in patterns with nested
5068   unlimited repeats that aren't going to match. We don't know what the state of
5069   case matching may be when this character is hit, so test for it in both its
5070   cases if necessary. However, the different cased versions will not be set up
5071   unless PCRE_CASELESS was given or the casing state changes within the regex.
5072   Writing separate code makes it go faster, as does using an autoincrement and
5073   backing off on a match. */
5074
5075   if (req_char >= 0)
5076     {
5077     register const uschar *p = start_match + ((first_char >= 0)? 1 : 0);
5078
5079     /* We don't need to repeat the search if we haven't yet reached the
5080     place we found it at last time. */
5081
5082     if (p > req_char_ptr)
5083       {
5084       /* Do a single test if no case difference is set up */
5085
5086       if (req_char == req_char2)
5087         {
5088         while (p < end_subject)
5089           {
5090           if (*p++ == req_char) { p--; break; }
5091           }
5092         }
5093
5094       /* Otherwise test for either case */
5095
5096       else
5097         {
5098         while (p < end_subject)
5099           {
5100           register int pp = *p++;
5101           if (pp == req_char || pp == req_char2) { p--; break; }
5102           }
5103         }
5104
5105       /* If we can't find the required character, break the matching loop */
5106
           /* This "break" leaves the do-loop altogether, so control passes to
           the code after the loop, which frees any temporary vector and
           returns match_block.errorcode. */
5107       if (p >= end_subject) break;
5108
5109       /* If we have found the required character, save the point where we
5110       found it, so that we don't search again next time round the loop if
5111       the start hasn't passed this character yet. */
5112
5113       req_char_ptr = p;
5114       }
5115     }
5116
5117   /* When a match occurs, substrings will be set for all internal extractions;
5118   we just need to set up the whole thing as substring 0 before returning. If
5119   there were too many extractions, set the return code to zero. In the case
5120   where we had to get some local store to hold offsets for backreferences, copy
5121   those back references that we can. In this case there need not be overflow
5122   if certain parts of the pattern were not used. */
5123
       /* A "continue" inside a do-while transfers to the loop condition at the
       bottom, which is where start_match is moved on for the next attempt. */
5124   match_block.start_match = start_match;
5125   if (!match(start_match, re->code, 2, &match_block, ims, NULL, match_isgroup))
5126     continue;
5127
5128   /* Copy the offset information from temporary store if necessary */
5129
5130   if (using_temporary_offsets)
5131     {
         /* Elements 0 and 1 are not copied; they are written from start_match
         and end_match_ptr further down. Overflow is flagged when the matcher
         set more offsets than the caller's vector can hold. */
5132     if (offsetcount >= 4)
5133       {
5134       memcpy(offsets + 2, match_block.offset_vector + 2,
5135         (offsetcount - 2) * sizeof(int));
5136       DPRINTF(("Copied offsets from temporary memory\n"));
5137       }
5138     if (match_block.end_offset_top > offsetcount)
5139       match_block.offset_overflow = TRUE;
5140
5141     DPRINTF(("Freeing temporary memory\n"));
5142     (pcre_free)(match_block.offset_vector);
5143     }
5144
       /* The result is zero when the offsets overflowed or when the caller's
       vector has fewer than two elements; otherwise it is half of
       end_offset_top. */
5145   rc = match_block.offset_overflow? 0 : match_block.end_offset_top/2;
5146
5147   if (offsetcount < 2) rc = 0; else
5148     {
5149     offsets[0] = start_match - match_block.start_subject;
5150     offsets[1] = match_block.end_match_ptr - match_block.start_subject;
5151     }
5152
5153   DPRINTF((">>>> returning %d\n", rc));
5154   return rc;
5155   }
5156
5157 /* This "while" is the end of the "do" above */
5158
     /* start_match is post-incremented, so the final attempt is made with
     start_match equal to end_subject. The loop also stops as soon as
     match_block.errorcode holds anything other than PCRE_ERROR_NOMATCH. */
5159 while (!anchored &&
5160        match_block.errorcode == PCRE_ERROR_NOMATCH &&
5161        start_match++ < end_subject);
5162
     /* No attempt succeeded: release the temporary vector, if one was obtained,
     and hand back the error code held in the match block. */
5163 if (using_temporary_offsets)
5164   {
5165   DPRINTF(("Freeing temporary memory\n"));
5166   (pcre_free)(match_block.offset_vector);
5167   }
5168
5169 DPRINTF((">>>> returning %d\n", match_block.errorcode));
5170
5171 return match_block.errorcode;
5172 }
5173
5174 /* End of pcre.c */
5173
5174 /* End of pcre.c */