From 22d43ce4a44ca138823a3494a8465c5431f19ab3 Mon Sep 17 00:00:00 2001 From: Zoltan Herczeg Date: Thu, 26 Sep 2024 12:31:17 +0000 Subject: [PATCH] Simplify class range processing --- src/pcre2_compile.c | 336 ++++++++++++++------------------------ src/pcre2_compile.h | 10 ++ src/pcre2_compile_class.c | 64 +++++--- src/pcre2_intmodedep.h | 2 - testdata/testinput5 | 8 +- testdata/testoutput5 | 24 ++- 6 files changed, 206 insertions(+), 238 deletions(-) diff --git a/src/pcre2_compile.c b/src/pcre2_compile.c index 1efd1e1b4..b354ca9be 100644 --- a/src/pcre2_compile.c +++ b/src/pcre2_compile.c @@ -5251,19 +5251,14 @@ for (;;) /************************************************* -* Add a character or range to a class (internal) * +* External entry point for add range to class * *************************************************/ -/* This function packages up the logic of adding a character or range of -characters to a class. The character values in the arguments will be within the -valid values for the current mode (8-bit, 16-bit, UTF, etc). This function is -called only from within the "add to class" group of functions, some of which -are recursive and mutually recursive. The external entry point is -add_to_class(). +/* This function sets the overall range for characters < 256. +It also handles non-utf case folding. Arguments: classbits the bit map for characters < 256 - uchardptr points to the pointer for extra data options the options bits cb compile data start start of range character @@ -5274,8 +5269,8 @@ Returns: the number of < 256 characters added */ static unsigned int -add_to_class_internal(uint8_t *classbits, PCRE2_UCHAR **uchardptr, - uint32_t options, compile_block *cb, uint32_t start, uint32_t end) +add_to_class(uint8_t *classbits, uint32_t options, compile_block *cb, + uint32_t start, uint32_t end) { uint32_t c; uint32_t classbits_end = (end <= 0xff ? end : 0xff); @@ -5299,15 +5294,6 @@ if ((options & PCRE2_CASELESS) != 0) } } -/* Now handle the originally supplied range. Adjust the final value according -to the bit length - this means that the same lists of (e.g.) horizontal spaces -can be used in all cases. */ - -if ((options & PCRE2_UTF) == 0 && end > MAX_NON_UTF_CHAR) - end = MAX_NON_UTF_CHAR; - -if (start > cb->class_range_start && end < cb->class_range_end) return n8; - /* Use the bitmap for characters < 256. Otherwise use extra data.*/ for (c = start; c <= classbits_end; c++) @@ -5317,89 +5303,10 @@ for (c = start; c <= classbits_end; c++) n8++; } -#ifdef SUPPORT_WIDE_CHARS -if (start <= 0xff) start = 0xff + 1; - -if (end >= start) - { - PCRE2_UCHAR *uchardata = *uchardptr; - -#ifdef SUPPORT_UNICODE - if ((options & PCRE2_UTF) != 0) - { - if (start < end) - { - *uchardata++ = XCL_RANGE; - uchardata += PRIV(ord2utf)(start, uchardata); - uchardata += PRIV(ord2utf)(end, uchardata); - } - else if (start == end) - { - *uchardata++ = XCL_SINGLE; - uchardata += PRIV(ord2utf)(start, uchardata); - } - } - else -#endif /* SUPPORT_UNICODE */ - - /* Without UTF support, character values are constrained by the bit length, - and can only be > 256 for 16-bit and 32-bit libraries. */ - -#if PCRE2_CODE_UNIT_WIDTH == 8 - {} -#else - if (start < end) - { - *uchardata++ = XCL_RANGE; - *uchardata++ = start; - *uchardata++ = end; - } - else if (start == end) - { - *uchardata++ = XCL_SINGLE; - *uchardata++ = start; - } -#endif /* PCRE2_CODE_UNIT_WIDTH == 8 */ - *uchardptr = uchardata; /* Updata extra data pointer */ - } -#else /* SUPPORT_WIDE_CHARS */ - (void)uchardptr; /* Avoid compiler warning */ -#endif /* SUPPORT_WIDE_CHARS */ - return n8; /* Number of 8-bit characters */ } - -/************************************************* -* External entry point for add range to class * -*************************************************/ - -/* This function sets the overall range so that the internal functions can try -to avoid duplication when handling case-independence. - -Arguments: - classbits the bit map for characters < 256 - uchardptr points to the pointer for extra data - options the options bits - cb compile data - start start of range character - end end of range character - -Returns: the number of < 256 characters added - the pointer to extra data is updated -*/ - -static unsigned int -add_to_class(uint8_t *classbits, PCRE2_UCHAR **uchardptr, uint32_t options, - compile_block *cb, uint32_t start, uint32_t end) -{ -cb->class_range_start = start; -cb->class_range_end = end; -return add_to_class_internal(classbits, uchardptr, options, cb, start, end); -} - - #if PCRE2_CODE_UNIT_WIDTH == 8 /************************************************* * External entry point for add list to class * @@ -5413,7 +5320,6 @@ case-independence. Arguments: classbits the bit map for characters < 256 - uchardptr points to the pointer for extra data options the options bits cb contains pointers to tables etc. p points to row of 32-bit values, terminated by NOTACHAR @@ -5423,8 +5329,8 @@ Returns: the number of < 256 characters added */ static unsigned int -add_list_to_class(uint8_t *classbits, PCRE2_UCHAR **uchardptr, - uint32_t options, compile_block *cb, const uint32_t *p) +add_list_to_class(uint8_t *classbits, uint32_t options, compile_block *cb, + const uint32_t *p) { unsigned int n8 = 0; while (p[0] < 256) @@ -5432,9 +5338,7 @@ while (p[0] < 256) unsigned int n = 0; while(p[n+1] == p[0] + n + 1) n++; - cb->class_range_start = p[0]; - cb->class_range_end = p[n]; - n8 += add_to_class_internal(classbits, uchardptr, options, cb, p[0], p[n]); + n8 += add_to_class(classbits, options, cb, p[0], p[n]); p += n + 1; } @@ -5454,7 +5358,6 @@ vertical whitespace to a class. The list must be in order. Arguments: classbits the bit map for characters < 256 - uchardptr points to the pointer for extra data options the options bits xoptions the extra options bits cb contains pointers to tables etc. @@ -5465,16 +5368,16 @@ Returns: the number of < 256 characters added */ static unsigned int -add_not_list_to_class(uint8_t *classbits, PCRE2_UCHAR **uchardptr, - uint32_t options, compile_block *cb, const uint32_t *p) +add_not_list_to_class(uint8_t *classbits, uint32_t options, compile_block *cb, + const uint32_t *p) { unsigned int n8 = 0; if (p[0] > 0) - n8 += add_to_class(classbits, uchardptr, options, cb, 0, p[0] - 1); + n8 += add_to_class(classbits, options, cb, 0, p[0] - 1); while (p[0] < 256) { while (p[1] == p[0] + 1) p++; - n8 += add_to_class(classbits, uchardptr, options, cb, + n8 += add_to_class(classbits, options, cb, p[0] + 1, (p[1] > 255) ? 255 : p[1] - 1); p++; } @@ -5635,15 +5538,11 @@ BOOL ucp = (options & PCRE2_UCP) != 0; BOOL utf = FALSE; #endif -/* Helper variables for OP_XCLASS opcode (for characters > 255). We define -class_uchardata always so that it can be passed to add_to_class() always, -though it will not be used in non-UTF 8-bit cases. This avoids having to supply -alternative calls for the different cases. */ +/* Helper variables for OP_XCLASS opcode (for characters > 255). */ -PCRE2_UCHAR *class_uchardata; #ifdef SUPPORT_WIDE_CHARS +PCRE2_UCHAR *class_uchardata; BOOL xclass; -PCRE2_UCHAR *class_uchardata_base; class_ranges* cranges; #endif @@ -5681,7 +5580,6 @@ for (;; pptr++) #endif BOOL negate_class; BOOL should_flip_negation; - BOOL match_all_or_no_wide_chars; BOOL possessive_quantifier; BOOL note_group_empty; int class_has_8bitchar; @@ -5951,7 +5849,7 @@ for (;; pptr++) need to insert specific matching or non-matching code for wide characters. */ - should_flip_negation = match_all_or_no_wide_chars = FALSE; + should_flip_negation = FALSE; /* Extended class (xclass) will be used when characters > 255 might match. */ @@ -5992,7 +5890,6 @@ for (;; pptr++) xclass = FALSE; class_uchardata = code + LINK_SIZE + 2; /* For XCLASS items */ - class_uchardata_base = class_uchardata; /* Save the start */ #endif /* For optimization purposes, we track some properties of the class: @@ -6048,13 +5945,20 @@ for (;; pptr++) case PC_GRAPH: case PC_PRINT: case PC_PUNCT: - *class_uchardata++ = local_negate? XCL_NOTPROP : XCL_PROP; - *class_uchardata++ = (PCRE2_UCHAR) - ((posix_class == PC_GRAPH)? PT_PXGRAPH : - (posix_class == PC_PRINT)? PT_PXPRINT : PT_PXPUNCT); - *class_uchardata++ = 0; + + if (lengthptr != NULL) + *lengthptr += 3; + else + { + *class_uchardata++ = local_negate? XCL_NOTPROP : XCL_PROP; + *class_uchardata++ = (PCRE2_UCHAR) + ((posix_class == PC_GRAPH)? PT_PXGRAPH : + (posix_class == PC_PRINT)? PT_PXPRINT : PT_PXPUNCT); + *class_uchardata++ = 0; + } + xclass = TRUE; xclass_has_prop = TRUE; - goto CONTINUE_CLASS; + continue; /* For the other POSIX classes (ex: ascii) we are going to fall through to the non-UCP case and build a bit map for @@ -6073,10 +5977,6 @@ for (;; pptr++) utf mode, since no wide characters can exist otherwise. */ default: -#if PCRE2_CODE_UNIT_WIDTH == 8 - if (utf) -#endif - match_all_or_no_wide_chars |= local_negate; break; } } @@ -6126,7 +6026,7 @@ for (;; pptr++) /* Every class contains at least one < 256 character. */ class_has_8bitchar = 1; - goto CONTINUE_CLASS; /* End of POSIX handling */ + continue; /* End of POSIX handling */ } /* Other than POSIX classes, the only items we should encounter are @@ -6207,8 +6107,8 @@ for (;; pptr++) #ifdef SUPPORT_UNICODE if (cranges != NULL) break; #endif - (void)add_list_to_class(classbits, &class_uchardata, - options & ~PCRE2_CASELESS, cb, PRIV(hspace_list)); + (void)add_list_to_class(classbits, options & ~PCRE2_CASELESS, + cb, PRIV(hspace_list)); #else PCRE2_ASSERT(cranges != NULL); #endif @@ -6219,8 +6119,8 @@ for (;; pptr++) #ifdef SUPPORT_UNICODE if (cranges != NULL) break; #endif - (void)add_not_list_to_class(classbits, &class_uchardata, - options & ~PCRE2_CASELESS, cb, PRIV(hspace_list)); + (void)add_not_list_to_class(classbits, options & ~PCRE2_CASELESS, + cb, PRIV(hspace_list)); #else PCRE2_ASSERT(cranges != NULL); #endif @@ -6231,8 +6131,8 @@ for (;; pptr++) #ifdef SUPPORT_UNICODE if (cranges != NULL) break; #endif - (void)add_list_to_class(classbits, &class_uchardata, - options & ~PCRE2_CASELESS, cb, PRIV(vspace_list)); + (void)add_list_to_class(classbits, options & ~PCRE2_CASELESS, + cb, PRIV(vspace_list)); #else PCRE2_ASSERT(cranges != NULL); #endif @@ -6243,8 +6143,8 @@ for (;; pptr++) #ifdef SUPPORT_UNICODE if (cranges != NULL) break; #endif - (void)add_not_list_to_class(classbits, &class_uchardata, - options & ~PCRE2_CASELESS, cb, PRIV(vspace_list)); + (void)add_not_list_to_class(classbits, options & ~PCRE2_CASELESS, + cb, PRIV(vspace_list)); #else PCRE2_ASSERT(cranges != NULL); #endif @@ -6271,9 +6171,15 @@ for (;; pptr++) pdata = 0; } - *class_uchardata++ = (escape == ESC_p)? XCL_PROP : XCL_NOTPROP; - *class_uchardata++ = ptype; - *class_uchardata++ = pdata; + if (lengthptr != NULL) + *lengthptr += 3; + else + { + *class_uchardata++ = (escape == ESC_p)? XCL_PROP : XCL_NOTPROP; + *class_uchardata++ = ptype; + *class_uchardata++ = pdata; + } + xclass = TRUE; xclass_has_prop = TRUE; class_has_8bitchar--; /* Undo! */ } @@ -6281,7 +6187,7 @@ for (;; pptr++) #endif } - goto CONTINUE_CLASS; + continue; } /* End handling \d-type escapes */ /* A literal character may be followed by a range meta. At parse time @@ -6338,37 +6244,34 @@ for (;; pptr++) if (C <= CHAR_i) { class_has_8bitchar += - add_to_class(classbits, &class_uchardata, options, - cb, C + uc, ((D < CHAR_i)? D : CHAR_i) + uc); + add_to_class(classbits, options, cb, C + uc, + ((D < CHAR_i)? D : CHAR_i) + uc); C = CHAR_j; } if (C <= D && C <= CHAR_r) { class_has_8bitchar += - add_to_class(classbits, &class_uchardata, options, - cb, C + uc, ((D < CHAR_r)? D : CHAR_r) + uc); + add_to_class(classbits, options, cb, C + uc, + ((D < CHAR_r)? D : CHAR_r) + uc); C = CHAR_s; } if (C <= D) { class_has_8bitchar += - add_to_class(classbits, &class_uchardata, options, - cb, C + uc, D + uc); + add_to_class(classbits, options, cb, C + uc, D + uc); } } else #endif /* Not an EBCDIC special range */ - class_has_8bitchar += add_to_class(classbits, &class_uchardata, - options, cb, c, d); - goto CONTINUE_CLASS; /* Go get the next char in the class */ + class_has_8bitchar += add_to_class(classbits, options, cb, c, d); #else PCRE2_ASSERT(cranges != NULL); - continue; #endif + continue; } /* End of range handling */ /* Character ranges are ignored when class_ranges is present. */ @@ -6378,62 +6281,96 @@ for (;; pptr++) #endif /* Handle a single character. */ - class_has_8bitchar += - add_to_class(classbits, &class_uchardata, options, cb, meta, meta); + class_has_8bitchar += add_to_class(classbits, options, cb, meta, meta); #else PCRE2_ASSERT(cranges != NULL); continue; #endif } + } /* End of main class-processing loop */ - /* Continue to the next item in the class. */ +#ifdef SUPPORT_WIDE_CHARS + if (cranges != NULL) + { + uint32_t *range = (uint32_t*)(cranges + 1); + uint32_t *end = range + cranges->range_list_size; - CONTINUE_CLASS: + while (range < end && range[0] < 256) + { + /* Add range to bitset. */ + class_has_8bitchar += + add_to_class(classbits, options, cb, range[0], range[1]); -#ifdef SUPPORT_WIDE_CHARS - /* If any wide characters or Unicode properties have been encountered, - set xclass = TRUE. Then, in the pre-compile phase, accumulate the length - of the extra data and reset the pointer. This is so that very large - classes that contain a zillion wide characters or Unicode property tests - do not overwrite the workspace (which is on the stack). */ + if (range[1] > 255) break; + range += 2; + } - if (class_uchardata > class_uchardata_base) + if (!xclass_has_prop && range < end && range[0] <= 256 && + range[1] >= (utf ? MAX_UTF_CODE_POINT : MAX_UCHAR_VALUE)) + { + PCRE2_ASSERT(range + 2 == end); + should_flip_negation = TRUE; + range = end; + } + + while (range < end) { + uint32_t range_start = range[0]; + uint32_t range_end = range[1]; + + range += 2; xclass = TRUE; + + if (range_start < 256) range_start = 256; + if (lengthptr != NULL) { - *lengthptr += class_uchardata - class_uchardata_base; - class_uchardata = class_uchardata_base; - } - } -#endif +#ifdef SUPPORT_UNICODE + if (utf) + { + *lengthptr += 1; - continue; /* Needed to avoid error when not supporting wide chars */ - } /* End of main class-processing loop */ + if (range_start < range_end) + *lengthptr += PRIV(ord2utf)(range_start, class_uchardata); -#ifdef SUPPORT_WIDE_CHARS - if (cranges != NULL) - { - uint32_t *range = (uint32_t*)(cranges + 1); - uint32_t *end = range + cranges->range_list_size; + *lengthptr += PRIV(ord2utf)(range_end, class_uchardata); + continue; + } +#endif /* SUPPORT_UNICODE */ - while (range < end) - { - class_has_8bitchar += - add_to_class(classbits, &class_uchardata, options, cb, - range[0], range[1]); + *lengthptr += range_start < range_end ? 3 : 2; + continue; + } - if (class_uchardata > class_uchardata_base) +#ifdef SUPPORT_UNICODE + if (utf) { - xclass = TRUE; - if (lengthptr != NULL) + if (range_start < range_end) { - *lengthptr += class_uchardata - class_uchardata_base; - class_uchardata = class_uchardata_base; + *class_uchardata++ = XCL_RANGE; + class_uchardata += PRIV(ord2utf)(range_start, class_uchardata); } + else + *class_uchardata++ = XCL_SINGLE; + + class_uchardata += PRIV(ord2utf)(range_end, class_uchardata); + continue; } +#endif /* SUPPORT_UNICODE */ - range += 2; + /* Without UTF support, character values are constrained by the bit length, + and can only be > 256 for 16-bit and 32-bit libraries. */ +#if PCRE2_CODE_UNIT_WIDTH != 8 + if (range_start < range_end) + { + *class_uchardata++ = XCL_RANGE; + *class_uchardata++ = range_start; + } + else + *class_uchardata++ = XCL_SINGLE; + + *class_uchardata++ = range_end; +#endif /* PCRE2_CODE_UNIT_WIDTH == 8 */ } if (lengthptr == NULL) @@ -6475,35 +6412,8 @@ for (;; pptr++) the bitmap in the actual compiled code. */ #ifdef SUPPORT_WIDE_CHARS /* Defined for 16/32 bits, or 8-bit with Unicode */ - if (xclass && ( -#ifdef SUPPORT_UNICODE - (options & PCRE2_UCP) != 0 || -#endif - xclass_has_prop || !should_flip_negation)) + if (xclass) { - if (match_all_or_no_wide_chars || ( -#if PCRE2_CODE_UNIT_WIDTH == 8 - utf && -#endif - should_flip_negation && !negate_class && (options & PCRE2_UCP) == 0)) - { - *class_uchardata++ = XCL_RANGE; - if (utf) /* Will always be utf in the 8-bit library */ - { - class_uchardata += PRIV(ord2utf)(0x100, class_uchardata); - class_uchardata += PRIV(ord2utf)(MAX_UTF_CODE_POINT, class_uchardata); - } - else /* Can only happen for the 16-bit & 32-bit libraries */ - { -#if PCRE2_CODE_UNIT_WIDTH == 16 - *class_uchardata++ = 0x100; - *class_uchardata++ = 0xffffu; -#elif PCRE2_CODE_UNIT_WIDTH == 32 - *class_uchardata++ = 0x100; - *class_uchardata++ = 0xffffffffu; -#endif - } - } *class_uchardata++ = XCL_END; /* Marks the end of extra data */ *code++ = OP_XCLASS; code += LINK_SIZE; diff --git a/src/pcre2_compile.h b/src/pcre2_compile.h index 3858cba97..9bb48168c 100644 --- a/src/pcre2_compile.h +++ b/src/pcre2_compile.h @@ -173,6 +173,16 @@ therefore no need for it to have a length entry, so use a high value. */ #define META_DATA(x) (x & 0x0000ffffu) #define META_DIFF(x,y) ((x-y)>>16) +/* Macro for the highest character value. */ + +#if PCRE2_CODE_UNIT_WIDTH == 8 +#define MAX_UCHAR_VALUE 0xff +#elif PCRE2_CODE_UNIT_WIDTH == 16 +#define MAX_UCHAR_VALUE 0xffff +#else +#define MAX_UCHAR_VALUE 0xffffffff +#endif + /* Merge intersecting ranges of classes. */ class_ranges *PRIV(optimize_class)(uint32_t *start_ptr, diff --git a/src/pcre2_compile_class.c b/src/pcre2_compile_class.c index c01a8c756..db4412637 100644 --- a/src/pcre2_compile_class.c +++ b/src/pcre2_compile_class.c @@ -241,6 +241,22 @@ while (*p != NOTACHAR) return result; } +static uint32_t +get_highest_char(uint32_t options) +{ +(void)options; /* Avoid compiler warning. */ + +#if PCRE2_CODE_UNIT_WIDTH == 8 +return MAX_UTF_CODE_POINT; +#else +#ifdef SUPPORT_UNICODE +return (options & PARSE_CLASS_UTF) ? MAX_UTF_CODE_POINT : MAX_UCHAR_VALUE; +#else +return MAX_UCHAR_VALUE; +#endif +#endif +} + /* Add a negated character list to a buffer. */ static size_t append_negated_char_list(const uint32_t *p, uint32_t options, uint32_t *buffer) @@ -249,7 +265,6 @@ const uint32_t *n; uint32_t start = 0; size_t result = 2; -(void)options; /* Avoid compiler warning. */ PCRE2_ASSERT(*p > 0); while (*p != NOTACHAR) @@ -274,27 +289,23 @@ while (*p != NOTACHAR) if (buffer != NULL) { buffer[0] = start; -#if PCRE2_CODE_UNIT_WIDTH == 8 - buffer[1] = MAX_UTF_CODE_POINT; -#elif PCRE2_CODE_UNIT_WIDTH == 16 -#ifdef SUPPORT_UNICODE - buffer[1] = (options & PARSE_CLASS_UTF) ? MAX_UTF_CODE_POINT : 0xffff; -#else - buffer[1] = 0xffff; -#endif -#elif PCRE2_CODE_UNIT_WIDTH == 32 -#ifdef SUPPORT_UNICODE - buffer[1] = (options & PARSE_CLASS_UTF) ? MAX_UTF_CODE_POINT : 0xffffffff; -#else - buffer[1] = 0xffffffff; -#endif -#endif + buffer[1] = get_highest_char(options); buffer += 2; } return result; } +static uint32_t * +append_non_ascii_range(uint32_t options, uint32_t *buffer) +{ + if (buffer == NULL) return NULL; + + buffer[0] = 0x100; + buffer[1] = get_highest_char(options); + return buffer + 2; +} + static size_t parse_class(uint32_t *ptr, uint32_t options, uint32_t *buffer) { @@ -311,6 +322,13 @@ while (*ptr != META_CLASS_END) meta_arg = META_DATA(*ptr); switch (meta_arg) { + case ESC_D: + case ESC_W: + case ESC_S: + buffer = append_non_ascii_range(options, buffer); + total_size += 2; + break; + case ESC_h: size = append_char_list(PRIV(hspace_list), buffer); total_size += size; @@ -342,8 +360,12 @@ while (*ptr != META_CLASS_END) } ptr++; continue; - case META_POSIX: case META_POSIX_NEG: + buffer = append_non_ascii_range(options, buffer); + total_size += 2; + ptr += 2; + continue; + case META_POSIX: ptr += 2; continue; case META_BIGVALUE: @@ -372,9 +394,9 @@ while (*ptr != META_CLASS_END) #ifdef SUPPORT_UNICODE if (options & PARSE_CLASS_CASELESS_UTF) { - size_t usize = utf_caseless_extend(start_char, *ptr++, options, buffer); - if (buffer != NULL) buffer += usize; - total_size += usize; + size = utf_caseless_extend(start_char, *ptr++, options, buffer); + if (buffer != NULL) buffer += size; + total_size += size; continue; } #endif @@ -487,6 +509,8 @@ while (range_list_size > 0 && dst[1] != ~(uint32_t)0) } cranges->range_list_size = (size_t)(dst + 2 - buffer); + +PCRE2_ASSERT(dst[1] <= get_highest_char(class_options)); return cranges; } diff --git a/src/pcre2_intmodedep.h b/src/pcre2_intmodedep.h index 519118c0a..c1009f38d 100644 --- a/src/pcre2_intmodedep.h +++ b/src/pcre2_intmodedep.h @@ -766,8 +766,6 @@ typedef struct compile_block { uint32_t backref_map; /* Bitmap of low back refs */ uint32_t nltype; /* Newline type */ uint32_t nllen; /* Newline string length */ - uint32_t class_range_start; /* Overall class range start */ - uint32_t class_range_end; /* Overall class range end */ PCRE2_UCHAR nl[4]; /* Newline string when fixed length */ uint32_t req_varyopt; /* "After variable item" flag for reqbyte */ uint32_t max_varlookbehind; /* Limit for variable lookbehinds */ diff --git a/testdata/testinput5 b/testdata/testinput5 index b74bf68e6..0c9e167a0 100644 --- a/testdata/testinput5 +++ b/testdata/testinput5 @@ -1709,8 +1709,8 @@ /[^\D\P{Nd}]/utf a9b - \x{1d7cf} \= Expect no match + \x{1d7cf} \x{10000} # Hex uses pattern length, not zero-terminated. This tests for overrunning @@ -2567,6 +2567,12 @@ /[\x{16e49}-\x{16e4f}\x{20000}\x{16e40}-\x{16e48}\pN]/Bi,utf +/[\x80-\x{4000}\x90\x{400}-\x{f000}\xa0\x{4000}-\x{10ffff}]++/B,utf + \x{7f}\x{80}\x{100}\x{10fffe}\x{10ffff}\x00 + +/[\x80-\x{4000}\x90\x{400}-\x{f000}\xa0\pN\x{4000}-\x{10ffff}]++/B,utf + \x{7f}\x{80}\x{100}090\x{10fffe}\x{10ffff}\x00 + # Freeing memory on error test /[\x{100}-\x{400}][\x{100}-\x{300}][\x{100}-\x{200}]\8/i,utf diff --git a/testdata/testoutput5 b/testdata/testoutput5 index 1480a292a..6b05679da 100644 --- a/testdata/testoutput5 +++ b/testdata/testoutput5 @@ -4149,9 +4149,9 @@ No match /[^\D\P{Nd}]/utf a9b 0: 9 - \x{1d7cf} - 0: \x{1d7cf} \= Expect no match + \x{1d7cf} +No match \x{10000} No match @@ -5634,6 +5634,26 @@ Failed: error -57 at offset 4 in replacement: bad escape sequence in replacement End ------------------------------------------------------------------ +/[\x80-\x{4000}\x90\x{400}-\x{f000}\xa0\x{4000}-\x{10ffff}]++/B,utf +------------------------------------------------------------------ + Bra + [\x80-\xff] (neg)++ + Ket + End +------------------------------------------------------------------ + \x{7f}\x{80}\x{100}\x{10fffe}\x{10ffff}\x00 + 0: \x{80}\x{100}\x{10fffe}\x{10ffff} + +/[\x80-\x{4000}\x90\x{400}-\x{f000}\xa0\pN\x{4000}-\x{10ffff}]++/B,utf +------------------------------------------------------------------ + Bra + [\x80-\xff\p{N}\x{100}-\x{10ffff}]++ + Ket + End +------------------------------------------------------------------ + \x{7f}\x{80}\x{100}090\x{10fffe}\x{10ffff}\x00 + 0: \x{80}\x{100}090\x{10fffe}\x{10ffff} + # Freeing memory on error test /[\x{100}-\x{400}][\x{100}-\x{300}][\x{100}-\x{200}]\8/i,utf Failed: error 115 at offset 52: reference to non-existent subpattern