diff --git a/BUILD.bazel b/BUILD.bazel index 30d53a401..cd509a0d2 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -28,6 +28,7 @@ cc_library( "src/pcre2_auto_possess.c", "src/pcre2_chkdint.c", "src/pcre2_compile.c", + "src/pcre2_compile_class.c", "src/pcre2_config.c", "src/pcre2_context.c", "src/pcre2_convert.c", diff --git a/CMakeLists.txt b/CMakeLists.txt index f3d6c3c2f..dd0fff469 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -618,6 +618,7 @@ SET(PCRE2_SOURCES ${PROJECT_BINARY_DIR}/pcre2_chartables.c src/pcre2_chkdint.c src/pcre2_compile.c + src/pcre2_compile_class.c src/pcre2_config.c src/pcre2_context.c src/pcre2_convert.c diff --git a/Makefile.am b/Makefile.am index 1595d00da..466edd8b8 100644 --- a/Makefile.am +++ b/Makefile.am @@ -374,6 +374,8 @@ COMMON_SOURCES = \ src/pcre2_auto_possess.c \ src/pcre2_chkdint.c \ src/pcre2_compile.c \ + src/pcre2_compile.h \ + src/pcre2_compile_class.c \ src/pcre2_config.c \ src/pcre2_context.c \ src/pcre2_convert.c \ diff --git a/NON-AUTOTOOLS-BUILD b/NON-AUTOTOOLS-BUILD index d7becc974..0026606c1 100644 --- a/NON-AUTOTOOLS-BUILD +++ b/NON-AUTOTOOLS-BUILD @@ -105,6 +105,7 @@ example. pcre2_chkdint.c pcre2_chartables.c pcre2_compile.c + pcre2_compile_class.c pcre2_config.c pcre2_context.c pcre2_convert.c diff --git a/PrepareRelease b/PrepareRelease index be5989de5..dcbb8ca45 100755 --- a/PrepareRelease +++ b/PrepareRelease @@ -217,6 +217,8 @@ files="\ src/pcre2_auto_possess.c \ src/pcre2_chkdint.c \ src/pcre2_compile.c \ + src/pcre2_compile.h \ + src/pcre2_compile_class.c \ src/pcre2_config.c \ src/pcre2_context.c \ src/pcre2_convert.c \ diff --git a/README b/README index 47d879d7f..75dcc2809 100644 --- a/README +++ b/README @@ -821,37 +821,38 @@ The distribution should contain the files listed below. 
ASCII coding; unless --enable-rebuild-chartables is specified, used by copying to pcre2_chartables.c - src/pcre2posix.c ) - src/pcre2_auto_possess.c ) - src/pcre2_chkdint.c ) - src/pcre2_compile.c ) - src/pcre2_config.c ) - src/pcre2_context.c ) - src/pcre2_convert.c ) - src/pcre2_dfa_match.c ) - src/pcre2_error.c ) - src/pcre2_extuni.c ) - src/pcre2_find_bracket.c ) - src/pcre2_jit_compile.c ) - src/pcre2_jit_match.c ) sources for the functions in the library, - src/pcre2_jit_misc.c ) and some internal functions that they use - src/pcre2_maketables.c ) - src/pcre2_match.c ) - src/pcre2_match_data.c ) - src/pcre2_newline.c ) - src/pcre2_ord2utf.c ) - src/pcre2_pattern_info.c ) - src/pcre2_script_run.c ) - src/pcre2_serialize.c ) - src/pcre2_string_utils.c ) - src/pcre2_study.c ) - src/pcre2_substitute.c ) - src/pcre2_substring.c ) - src/pcre2_tables.c ) - src/pcre2_ucd.c ) - src/pcre2_ucptables.c ) - src/pcre2_valid_utf.c ) - src/pcre2_xclass.c ) + src/pcre2posix.c ) + src/pcre2_auto_possess.c ) + src/pcre2_chkdint.c ) + src/pcre2_compile.c ) + src/pcre2_compile_class.c ) + src/pcre2_config.c ) + src/pcre2_context.c ) + src/pcre2_convert.c ) + src/pcre2_dfa_match.c ) + src/pcre2_error.c ) + src/pcre2_extuni.c ) + src/pcre2_find_bracket.c ) + src/pcre2_jit_compile.c ) + src/pcre2_jit_match.c ) sources for the functions in the library, + src/pcre2_jit_misc.c ) and some internal functions that they use + src/pcre2_maketables.c ) + src/pcre2_match.c ) + src/pcre2_match_data.c ) + src/pcre2_newline.c ) + src/pcre2_ord2utf.c ) + src/pcre2_pattern_info.c ) + src/pcre2_script_run.c ) + src/pcre2_serialize.c ) + src/pcre2_string_utils.c ) + src/pcre2_study.c ) + src/pcre2_substitute.c ) + src/pcre2_substring.c ) + src/pcre2_tables.c ) + src/pcre2_ucd.c ) + src/pcre2_ucptables.c ) + src/pcre2_valid_utf.c ) + src/pcre2_xclass.c ) src/pcre2_printint.c debugging function that is used by pcre2test, src/pcre2_fuzzsupport.c function for (optional) fuzzing support @@ -859,6 +860,7 
@@ The distribution should contain the files listed below. src/config.h.in template for config.h, when built by "configure" src/pcre2.h.in template for pcre2.h when built by "configure" src/pcre2posix.h header for the external POSIX wrapper API + src/pcre2_compile.h header for internal use src/pcre2_internal.h header for internal use src/pcre2_intmodedep.h a mode-specific internal header src/pcre2_jit_neon_inc.h header used by JIT diff --git a/build.zig b/build.zig index 1af8b6b54..7ccc0fc2b 100644 --- a/build.zig +++ b/build.zig @@ -48,6 +48,7 @@ pub fn build(b: *std.Build) !void { "src/pcre2_auto_possess.c", "src/pcre2_chkdint.c", "src/pcre2_compile.c", + "src/pcre2_compile_class.c", "src/pcre2_config.c", "src/pcre2_context.c", "src/pcre2_convert.c", diff --git a/doc/html/NON-AUTOTOOLS-BUILD.txt b/doc/html/NON-AUTOTOOLS-BUILD.txt index d7becc974..0026606c1 100644 --- a/doc/html/NON-AUTOTOOLS-BUILD.txt +++ b/doc/html/NON-AUTOTOOLS-BUILD.txt @@ -105,6 +105,7 @@ example. pcre2_chkdint.c pcre2_chartables.c pcre2_compile.c + pcre2_compile_class.c pcre2_config.c pcre2_context.c pcre2_convert.c diff --git a/doc/html/README.txt b/doc/html/README.txt index 47d879d7f..6a15c1254 100644 --- a/doc/html/README.txt +++ b/doc/html/README.txt @@ -821,37 +821,38 @@ The distribution should contain the files listed below. 
ASCII coding; unless --enable-rebuild-chartables is specified, used by copying to pcre2_chartables.c - src/pcre2posix.c ) - src/pcre2_auto_possess.c ) - src/pcre2_chkdint.c ) - src/pcre2_compile.c ) - src/pcre2_config.c ) - src/pcre2_context.c ) - src/pcre2_convert.c ) - src/pcre2_dfa_match.c ) - src/pcre2_error.c ) - src/pcre2_extuni.c ) - src/pcre2_find_bracket.c ) - src/pcre2_jit_compile.c ) - src/pcre2_jit_match.c ) sources for the functions in the library, - src/pcre2_jit_misc.c ) and some internal functions that they use - src/pcre2_maketables.c ) - src/pcre2_match.c ) - src/pcre2_match_data.c ) - src/pcre2_newline.c ) - src/pcre2_ord2utf.c ) - src/pcre2_pattern_info.c ) - src/pcre2_script_run.c ) - src/pcre2_serialize.c ) - src/pcre2_string_utils.c ) - src/pcre2_study.c ) - src/pcre2_substitute.c ) - src/pcre2_substring.c ) - src/pcre2_tables.c ) - src/pcre2_ucd.c ) - src/pcre2_ucptables.c ) - src/pcre2_valid_utf.c ) - src/pcre2_xclass.c ) + src/pcre2posix.c ) + src/pcre2_auto_possess.c ) + src/pcre2_chkdint.c ) + src/pcre2_compile.c ) + src/pcre2_compile_class.c ) + src/pcre2_config.c ) + src/pcre2_context.c ) + src/pcre2_convert.c ) + src/pcre2_dfa_match.c ) + src/pcre2_error.c ) + src/pcre2_extuni.c ) + src/pcre2_find_bracket.c ) + src/pcre2_jit_compile.c ) + src/pcre2_jit_match.c ) sources for the functions in the library, + src/pcre2_jit_misc.c ) and some internal functions that they use + src/pcre2_maketables.c ) + src/pcre2_match.c ) + src/pcre2_match_data.c ) + src/pcre2_newline.c ) + src/pcre2_ord2utf.c ) + src/pcre2_pattern_info.c ) + src/pcre2_script_run.c ) + src/pcre2_serialize.c ) + src/pcre2_string_utils.c ) + src/pcre2_study.c ) + src/pcre2_substitute.c ) + src/pcre2_substring.c ) + src/pcre2_tables.c ) + src/pcre2_ucd.c ) + src/pcre2_ucptables.c ) + src/pcre2_valid_utf.c ) + src/pcre2_xclass.c ) src/pcre2_printint.c debugging function that is used by pcre2test, src/pcre2_fuzzsupport.c function for (optional) fuzzing support diff --git 
a/src/pcre2_compile.c b/src/pcre2_compile.c index bdf2e8c45..29b251c4e 100644 --- a/src/pcre2_compile.c +++ b/src/pcre2_compile.c @@ -47,7 +47,7 @@ POSSIBILITY OF SUCH DAMAGE. #define PSSTART start_pattern /* Field containing processed string start */ #define PSEND end_pattern /* Field containing processed string end */ -#include "pcre2_internal.h" +#include "pcre2_compile.h" /* In rare error cases debugging might require calling pcre2_printint(). */ @@ -108,20 +108,8 @@ them will be able to (i.e. assume a 64-bit world). */ #define SIZEOFFSET 2 #endif -/* Macros for manipulating elements of the parsed pattern vector. */ - -#define META_CODE(x) (x & 0xffff0000u) -#define META_DATA(x) (x & 0x0000ffffu) -#define META_DIFF(x,y) ((x-y)>>16) - /* Function definitions to allow mutual recursion */ -#ifdef SUPPORT_UNICODE -static unsigned int - add_list_to_class_internal(uint8_t *, PCRE2_UCHAR **, uint32_t, uint32_t, - compile_block *, const uint32_t *, unsigned int); -#endif - static int compile_regex(uint32_t, uint32_t, PCRE2_UCHAR **, uint32_t **, int *, uint32_t, uint32_t *, uint32_t *, uint32_t *, uint32_t *, branch_chain *, @@ -199,110 +187,6 @@ don't have to check them every time. */ #define OFLOW_MAX (INT_MAX - 20) -/* Code values for parsed patterns, which are stored in a vector of 32-bit -unsigned ints. Values less than META_END are literal data values. The coding -for identifying the item is in the top 16-bits, leaving 16 bits for the -additional data that some of them need. The META_CODE, META_DATA, and META_DIFF -macros are used to manipulate parsed pattern elements. - -NOTE: When these definitions are changed, the table of extra lengths for each -code (meta_extra_lengths, just below) must be updated to remain in step. 
*/ - -#define META_END 0x80000000u /* End of pattern */ - -#define META_ALT 0x80010000u /* alternation */ -#define META_ATOMIC 0x80020000u /* atomic group */ -#define META_BACKREF 0x80030000u /* Back ref */ -#define META_BACKREF_BYNAME 0x80040000u /* \k'name' */ -#define META_BIGVALUE 0x80050000u /* Next is a literal > META_END */ -#define META_CALLOUT_NUMBER 0x80060000u /* (?C with numerical argument */ -#define META_CALLOUT_STRING 0x80070000u /* (?C with string argument */ -#define META_CAPTURE 0x80080000u /* Capturing parenthesis */ -#define META_CIRCUMFLEX 0x80090000u /* ^ metacharacter */ -#define META_CLASS 0x800a0000u /* start non-empty class */ -#define META_CLASS_EMPTY 0x800b0000u /* empty class */ -#define META_CLASS_EMPTY_NOT 0x800c0000u /* negative empty class */ -#define META_CLASS_END 0x800d0000u /* end of non-empty class */ -#define META_CLASS_NOT 0x800e0000u /* start non-empty negative class */ -#define META_COND_ASSERT 0x800f0000u /* (?(?assertion)... */ -#define META_COND_DEFINE 0x80100000u /* (?(DEFINE)... */ -#define META_COND_NAME 0x80110000u /* (?()... */ -#define META_COND_NUMBER 0x80120000u /* (?(digits)... */ -#define META_COND_RNAME 0x80130000u /* (?(R&name)... */ -#define META_COND_RNUMBER 0x80140000u /* (?(Rdigits)... */ -#define META_COND_VERSION 0x80150000u /* (?(VERSIONx.y)... */ -#define META_SCS_NAME 0x80160000u /* (*scan_substring:()... */ -#define META_SCS_NUMBER 0x80170000u /* (*scan_substring:(digits)... */ -#define META_SCS_NEXT_NAME 0x80180000u /* Next of scan_substring */ -#define META_SCS_NEXT_NUMBER 0x80190000u /* Next digits of scan_substring */ -#define META_DOLLAR 0x801a0000u /* $ metacharacter */ -#define META_DOT 0x801b0000u /* . 
metacharacter */ -#define META_ESCAPE 0x801c0000u /* \d and friends */ -#define META_KET 0x801d0000u /* closing parenthesis */ -#define META_NOCAPTURE 0x801e0000u /* no capture parens */ -#define META_OPTIONS 0x801f0000u /* (?i) and friends */ -#define META_POSIX 0x80200000u /* POSIX class item */ -#define META_POSIX_NEG 0x80210000u /* negative POSIX class item */ -#define META_RANGE_ESCAPED 0x80220000u /* range with at least one escape */ -#define META_RANGE_LITERAL 0x80230000u /* range defined literally */ -#define META_RECURSE 0x80240000u /* Recursion */ -#define META_RECURSE_BYNAME 0x80250000u /* (?&name) */ -#define META_SCRIPT_RUN 0x80260000u /* (*script_run:...) */ - -/* These must be kept together to make it easy to check that an assertion -is present where expected in a conditional group. */ - -#define META_LOOKAHEAD 0x80270000u /* (?= */ -#define META_LOOKAHEADNOT 0x80280000u /* (?! */ -#define META_LOOKBEHIND 0x80290000u /* (?<= */ -#define META_LOOKBEHINDNOT 0x802a0000u /* (?0 the CASESET offset for char with multiple other cases; - for this return, *ocptr contains the original -*/ - -static int -get_othercase_range(uint32_t *cptr, uint32_t d, uint32_t *ocptr, - uint32_t *odptr, BOOL restricted) -{ -uint32_t c, othercase, next; -unsigned int co; - -/* Find the first character that has an other case. If it has multiple other -cases, return its case offset value. When CASELESS_RESTRICT is set, ignore the -multi-case entries that begin with ASCII values. In 32-bit mode, a value -greater than the Unicode maximum ends the range. */ - -for (c = *cptr; c <= d; c++) - { -#if PCRE2_CODE_UNIT_WIDTH == 32 - if (c > MAX_UTF_CODE_POINT) return -1; -#endif - if ((co = UCD_CASESET(c)) != 0 && - (!restricted || PRIV(ucd_caseless_sets)[co] > 127)) - { - *ocptr = c++; /* Character that has the set */ - *cptr = c; /* Rest of input range */ - return (int)co; - } - - /* This is not a valid multiple-case character. 
Check that the single other - case is different to the original. We don't need to check "restricted" here - because the non-ASCII characters with multiple cases that include an ASCII - character don't have a different "othercase". */ - - if ((othercase = UCD_OTHERCASE(c)) != c) break; - } - -if (c > d) return -1; /* Reached end of range */ - -/* Found a character that has a single other case. Search for the end of the -range, which is either the end of the input range, or a character that has zero -or more than one other cases. */ - -*ocptr = othercase; -next = othercase + 1; - -for (++c; c <= d; c++) - { - if ((co = UCD_CASESET(c)) != 0 || UCD_OTHERCASE(c) != next) break; - next++; - } - -*odptr = next - 1; /* End of othercase range */ -*cptr = c; /* Rest of input range */ -return 0; -} - - - -/************************************************* -* Get nocase ranges * -*************************************************/ - -/* This function returns the next nocase range after a character -using binary search. The character might be included in the range. - -Arguments: - c current character - -Yield: range (start/end pair) -*/ - -static const uint32_t* -get_nocase_range(uint32_t c) -{ -uint32_t left = 0; -uint32_t right = PRIV(ucd_nocase_ranges_size); -uint32_t middle; - -if (c > MAX_UTF_CODE_POINT) return PRIV(ucd_nocase_ranges) + right; - -while (TRUE) - { - /* Range end of the middle element. 
*/ - middle = ((left + right) >> 1) | 0x1; - - if (PRIV(ucd_nocase_ranges)[middle] <= c) - left = middle + 1; - else if (middle > 1 && PRIV(ucd_nocase_ranges)[middle - 2] > c) - right = middle - 1; - else - return PRIV(ucd_nocase_ranges) + (middle - 1); - } -} -#endif /* SUPPORT_UNICODE */ - - - /************************************************* * Add a character or range to a class (internal) * *************************************************/ @@ -5526,8 +5272,7 @@ Returns: the number of < 256 characters added static unsigned int add_to_class_internal(uint8_t *classbits, PCRE2_UCHAR **uchardptr, - uint32_t options, uint32_t xoptions, compile_block *cb, uint32_t start, - uint32_t end) + uint32_t options, compile_block *cb, uint32_t start, uint32_t end) { uint32_t c; uint32_t classbits_end = (end <= 0xff ? end : 0xff); @@ -5540,86 +5285,15 @@ restriction is in force). Sometimes we can just extend the original range. */ if ((options & PCRE2_CASELESS) != 0) { -#ifdef SUPPORT_UNICODE - if ((options & (PCRE2_UTF|PCRE2_UCP)) != 0) - { - int rc; - uint32_t oc, od, skip_start; - const uint32_t *skip_range; - - options &= ~PCRE2_CASELESS; /* Remove for recursive calls */ - c = start; - skip_range = get_nocase_range(c); - skip_start = skip_range[0]; - if (c > skip_start) - { - c = skip_range[1]; - skip_range += 2; - skip_start = skip_range[0]; - } - - while ((rc = get_othercase_range(&c, end, &oc, &od, - (xoptions & PCRE2_EXTRA_CASELESS_RESTRICT) != 0)) >= 0) +#ifndef SUPPORT_UNICODE + if ((options & (PCRE2_UTF|PCRE2_UCP)) == 0) +#endif /* SUPPORT_UNICODE */ + /* Not UTF mode */ + for (c = start; c <= classbits_end; c++) { - if (c > skip_start) - { - if (c < skip_range[1]) - { - c = skip_range[1]; - skip_range += 2; - skip_start = skip_range[0]; - } - else - { - skip_range = get_nocase_range(c); - skip_start = skip_range[0]; - - if (c > skip_start) - { - c = skip_range[1]; - skip_range += 2; - skip_start = skip_range[0]; - } - } - } - - /* Handle a single character that has 
more than one other case. */ - - if (rc > 0) n8 += add_list_to_class_internal(classbits, uchardptr, - options, xoptions, cb, PRIV(ucd_caseless_sets) + rc, oc); - - /* Do nothing if the other case range is within the original range. */ - - else if (oc >= cb->class_range_start && od <= cb->class_range_end) - continue; - - /* Extend the original range if there is overlap, noting that if oc < c, - we can't have od > end because a subrange is always shorter than the - basic range. Otherwise, use a recursive call to add the additional range. - */ - - else if (oc < start && od >= start - 1) start = oc; /* Extend downwards */ - else if (od > end && oc <= end + 1) - { - end = od; /* Extend upwards */ - if (end > classbits_end) classbits_end = (end <= 0xff ? end : 0xff); - } - else n8 += add_to_class_internal(classbits, uchardptr, options, xoptions, - cb, oc, od); + SETBIT(classbits, cb->fcc[c]); + n8++; } - } - else -#else - (void)xoptions; /* Avoid compiler warning */ -#endif /* SUPPORT_UNICODE */ - - /* Not UTF mode */ - - for (c = start; c <= classbits_end; c++) - { - SETBIT(classbits, cb->fcc[c]); - n8++; - } } /* Now handle the originally supplied range. Adjust the final value according @@ -5694,53 +5368,6 @@ return n8; /* Number of 8-bit characters */ -#ifdef SUPPORT_UNICODE -/************************************************* -* Add a list of characters to a class (internal) * -*************************************************/ - -/* This function is used for adding a list of case-equivalent characters to a -class when in UTF mode. This function is called only from within -add_to_class_internal(), with which it is mutually recursive. - -Arguments: - classbits the bit map for characters < 256 - uchardptr points to the pointer for extra data - options the options bits - xoptions the extra options bits - cb contains pointers to tables etc. 
- p points to row of 32-bit values, terminated by NOTACHAR - except character to omit; this is used when adding lists of - case-equivalent characters to avoid including the one we - already know about - -Returns: the number of < 256 characters added - the pointer to extra data is updated -*/ - -static unsigned int -add_list_to_class_internal(uint8_t *classbits, PCRE2_UCHAR **uchardptr, - uint32_t options, uint32_t xoptions, compile_block *cb, const uint32_t *p, - unsigned int except) -{ -unsigned int n8 = 0; -while (p[0] < NOTACHAR) - { - unsigned int n = 0; - if (p[0] != except) - { - while(p[n+1] == p[0] + n + 1) n++; - n8 += add_to_class_internal(classbits, uchardptr, options, xoptions, cb, - p[0], p[n]); - } - p += n + 1; - } -return n8; -} -#endif - - - /************************************************* * External entry point for add range to class * *************************************************/ @@ -5763,12 +5390,11 @@ Returns: the number of < 256 characters added static unsigned int add_to_class(uint8_t *classbits, PCRE2_UCHAR **uchardptr, uint32_t options, - uint32_t xoptions, compile_block *cb, uint32_t start, uint32_t end) + compile_block *cb, uint32_t start, uint32_t end) { cb->class_range_start = start; cb->class_range_end = end; -return add_to_class_internal(classbits, uchardptr, options, xoptions, cb, - start, end); +return add_to_class_internal(classbits, uchardptr, options, cb, start, end); } @@ -5798,8 +5424,8 @@ Returns: the number of < 256 characters added */ static unsigned int -add_list_to_class(uint8_t *classbits, PCRE2_UCHAR **uchardptr, uint32_t options, - uint32_t xoptions, compile_block *cb, const uint32_t *p, unsigned int except) +add_list_to_class(uint8_t *classbits, PCRE2_UCHAR **uchardptr, + uint32_t options, compile_block *cb, const uint32_t *p, unsigned int except) { unsigned int n8 = 0; while (p[0] < NOTACHAR) @@ -5810,8 +5436,7 @@ while (p[0] < NOTACHAR) while(p[n+1] == p[0] + n + 1) n++; cb->class_range_start = p[0]; 
cb->class_range_end = p[n]; - n8 += add_to_class_internal(classbits, uchardptr, options, xoptions, cb, - p[0], p[n]); + n8 += add_to_class_internal(classbits, uchardptr, options, cb, p[0], p[n]); } p += n + 1; } @@ -5841,16 +5466,16 @@ Returns: the number of < 256 characters added static unsigned int add_not_list_to_class(uint8_t *classbits, PCRE2_UCHAR **uchardptr, - uint32_t options, uint32_t xoptions, compile_block *cb, const uint32_t *p) + uint32_t options, compile_block *cb, const uint32_t *p) { BOOL utf = (options & PCRE2_UTF) != 0; unsigned int n8 = 0; if (p[0] > 0) - n8 += add_to_class(classbits, uchardptr, options, xoptions, cb, 0, p[0] - 1); + n8 += add_to_class(classbits, uchardptr, options, cb, 0, p[0] - 1); while (p[0] < NOTACHAR) { while (p[1] == p[0] + 1) p++; - n8 += add_to_class(classbits, uchardptr, options, xoptions, cb, p[0] + 1, + n8 += add_to_class(classbits, uchardptr, options, cb, p[0] + 1, (p[1] == NOTACHAR) ? (utf ? 0x10ffffu : 0xffffffffu) : p[1] - 1); p++; } @@ -6019,6 +5644,8 @@ PCRE2_UCHAR *class_uchardata; #ifdef SUPPORT_WIDE_CHARS BOOL xclass; PCRE2_UCHAR *class_uchardata_base; +uint32_t* class_ranges; +size_t class_ranges_size; #endif /* Set up the default and non-default settings for greediness */ @@ -6331,6 +5958,22 @@ for (;; pptr++) might match. 
*/ #ifdef SUPPORT_WIDE_CHARS +#if PCRE2_CODE_UNIT_WIDTH == 8 + class_ranges = NULL; + class_ranges_size = 0; + + if (utf) +#endif + { + class_ranges = PRIV(optimize_class)(pptr, options, &class_ranges_size, cb); + + if (class_ranges == NULL && class_ranges_size != 0) + { + *errorcodeptr = ERR21; + return 0; + } + } + xclass = FALSE; class_uchardata = code + LINK_SIZE + 2; /* For XCLASS items */ class_uchardata_base = class_uchardata; /* Save the start */ @@ -6545,24 +6188,24 @@ for (;; pptr++) case ESC_h: (void)add_list_to_class(classbits, &class_uchardata, - options & ~PCRE2_CASELESS, xoptions, cb, PRIV(hspace_list), + options & ~PCRE2_CASELESS, cb, PRIV(hspace_list), NOTACHAR); break; case ESC_H: (void)add_not_list_to_class(classbits, &class_uchardata, - options & ~PCRE2_CASELESS, xoptions, cb, PRIV(hspace_list)); + options & ~PCRE2_CASELESS, cb, PRIV(hspace_list)); break; case ESC_v: (void)add_list_to_class(classbits, &class_uchardata, - options & ~PCRE2_CASELESS, xoptions, cb, PRIV(vspace_list), + options & ~PCRE2_CASELESS, cb, PRIV(vspace_list), NOTACHAR); break; case ESC_V: (void)add_not_list_to_class(classbits, &class_uchardata, - options & ~PCRE2_CASELESS, xoptions, cb, PRIV(vspace_list)); + options & ~PCRE2_CASELESS, cb, PRIV(vspace_list)); break; /* If Unicode is not supported, \P and \p are not allowed and are @@ -6630,6 +6273,11 @@ for (;; pptr++) if (d == CHAR_CR || d == CHAR_NL) cb->external_flags |= PCRE2_HASCRORLF; +#ifdef SUPPORT_WIDE_CHARS + /* Character ranges are ignored when class_ranges is present. */ + if (class_ranges != NULL) continue; +#endif + /* In an EBCDIC environment, Perl treats alphabetic ranges specially because there are holes in the encoding, and simply using the range A-Z (for example) would include the characters in the holes. 
This @@ -6648,7 +6296,7 @@ for (;; pptr++) if (C <= CHAR_i) { class_has_8bitchar += - add_to_class(classbits, &class_uchardata, options, xoptions, + add_to_class(classbits, &class_uchardata, options, cb, C + uc, ((D < CHAR_i)? D : CHAR_i) + uc); C = CHAR_j; } @@ -6656,7 +6304,7 @@ for (;; pptr++) if (C <= D && C <= CHAR_r) { class_has_8bitchar += - add_to_class(classbits, &class_uchardata, options, xoptions, + add_to_class(classbits, &class_uchardata, options, cb, C + uc, ((D < CHAR_r)? D : CHAR_r) + uc); C = CHAR_s; } @@ -6664,7 +6312,7 @@ for (;; pptr++) if (C <= D) { class_has_8bitchar += - add_to_class(classbits, &class_uchardata, options, xoptions, + add_to_class(classbits, &class_uchardata, options, cb, C + uc, D + uc); } } @@ -6673,16 +6321,19 @@ for (;; pptr++) /* Not an EBCDIC special range */ class_has_8bitchar += add_to_class(classbits, &class_uchardata, - options, xoptions, cb, c, d); + options, cb, c, d); goto CONTINUE_CLASS; /* Go get the next char in the class */ } /* End of range handling */ +#ifdef SUPPORT_WIDE_CHARS + /* Character ranges are ignored when class_ranges is present. */ + if (class_ranges != NULL) continue; +#endif /* Handle a single character. */ class_has_8bitchar += - add_to_class(classbits, &class_uchardata, options, xoptions, cb, - meta, meta); + add_to_class(classbits, &class_uchardata, options, cb, meta, meta); } /* Continue to the next item in the class. 
*/ @@ -6710,6 +6361,36 @@ for (;; pptr++) continue; /* Needed to avoid error when not supporting wide chars */ } /* End of main class-processing loop */ +#ifdef SUPPORT_WIDE_CHARS + if (class_ranges != NULL) + { + uint32_t *range = class_ranges; + uint32_t *end = class_ranges + class_ranges_size; + + do + { + class_has_8bitchar += + add_to_class(classbits, &class_uchardata, options, cb, + range[0], range[1]); + + if (class_uchardata > class_uchardata_base) + { + xclass = TRUE; + if (lengthptr != NULL) + { + *lengthptr += class_uchardata - class_uchardata_base; + class_uchardata = class_uchardata_base; + } + } + + range += 2; + } + while (range < end); + + cb->cx->memctl.free(class_ranges, cb->cx->memctl.memory_data); + } +#endif + /* If this class is the first thing in the branch, there can be no first char setting, whatever the repeat count. Any reqcu setting must remain unchanged after any kind of repeat. */ diff --git a/src/pcre2_compile.h b/src/pcre2_compile.h new file mode 100644 index 000000000..3fff760bc --- /dev/null +++ b/src/pcre2_compile.h @@ -0,0 +1,183 @@ +/************************************************* +* Perl-Compatible Regular Expressions * +*************************************************/ + +/* PCRE2 is a library of functions to support regular expressions whose syntax +and semantics are as close as possible to those of the Perl 5 language. + + Written by Philip Hazel + Original API code Copyright (c) 1997-2012 University of Cambridge + New API code Copyright (c) 2016-2024 University of Cambridge + +----------------------------------------------------------------------------- +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. 
+ + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of the University of Cambridge nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +----------------------------------------------------------------------------- +*/ + +#ifndef PCRE2_COMPILE_H_IDEMPOTENT_GUARD +#define PCRE2_COMPILE_H_IDEMPOTENT_GUARD + +#include "pcre2_internal.h" + +/* Compile time error code numbers. They are given names so that they can more +easily be tracked. When a new number is added, the tables called eint1 and +eint2 in pcre2posix.c may need to be updated, and a new error text must be +added to compile_error_texts in pcre2_error.c. Also, the error codes in +pcre2.h.in must be updated - their values are exactly 100 greater than these +values. 
*/ + +enum { ERR0 = COMPILE_ERROR_BASE, + ERR1, ERR2, ERR3, ERR4, ERR5, ERR6, ERR7, ERR8, ERR9, ERR10, + ERR11, ERR12, ERR13, ERR14, ERR15, ERR16, ERR17, ERR18, ERR19, ERR20, + ERR21, ERR22, ERR23, ERR24, ERR25, ERR26, ERR27, ERR28, ERR29, ERR30, + ERR31, ERR32, ERR33, ERR34, ERR35, ERR36, ERR37, ERR38, ERR39, ERR40, + ERR41, ERR42, ERR43, ERR44, ERR45, ERR46, ERR47, ERR48, ERR49, ERR50, + ERR51, ERR52, ERR53, ERR54, ERR55, ERR56, ERR57, ERR58, ERR59, ERR60, + ERR61, ERR62, ERR63, ERR64, ERR65, ERR66, ERR67, ERR68, ERR69, ERR70, + ERR71, ERR72, ERR73, ERR74, ERR75, ERR76, ERR77, ERR78, ERR79, ERR80, + ERR81, ERR82, ERR83, ERR84, ERR85, ERR86, ERR87, ERR88, ERR89, ERR90, + ERR91, ERR92, ERR93, ERR94, ERR95, ERR96, ERR97, ERR98, ERR99, ERR100, + ERR101 }; + +/* Code values for parsed patterns, which are stored in a vector of 32-bit +unsigned ints. Values less than META_END are literal data values. The coding +for identifying the item is in the top 16-bits, leaving 16 bits for the +additional data that some of them need. The META_CODE, META_DATA, and META_DIFF +macros are used to manipulate parsed pattern elements. + +NOTE: When these definitions are changed, the table of extra lengths for each +code (meta_extra_lengths, in pcre2_compile.c) must be updated to remain in step. 
*/ + +#define META_END 0x80000000u /* End of pattern */ + +#define META_ALT 0x80010000u /* alternation */ +#define META_ATOMIC 0x80020000u /* atomic group */ +#define META_BACKREF 0x80030000u /* Back ref */ +#define META_BACKREF_BYNAME 0x80040000u /* \k'name' */ +#define META_BIGVALUE 0x80050000u /* Next is a literal > META_END */ +#define META_CALLOUT_NUMBER 0x80060000u /* (?C with numerical argument */ +#define META_CALLOUT_STRING 0x80070000u /* (?C with string argument */ +#define META_CAPTURE 0x80080000u /* Capturing parenthesis */ +#define META_CIRCUMFLEX 0x80090000u /* ^ metacharacter */ +#define META_CLASS 0x800a0000u /* start non-empty class */ +#define META_CLASS_EMPTY 0x800b0000u /* empty class */ +#define META_CLASS_EMPTY_NOT 0x800c0000u /* negative empty class */ +#define META_CLASS_END 0x800d0000u /* end of non-empty class */ +#define META_CLASS_NOT 0x800e0000u /* start non-empty negative class */ +#define META_COND_ASSERT 0x800f0000u /* (?(?assertion)... */ +#define META_COND_DEFINE 0x80100000u /* (?(DEFINE)... */ +#define META_COND_NAME 0x80110000u /* (?()... */ +#define META_COND_NUMBER 0x80120000u /* (?(digits)... */ +#define META_COND_RNAME 0x80130000u /* (?(R&name)... */ +#define META_COND_RNUMBER 0x80140000u /* (?(Rdigits)... */ +#define META_COND_VERSION 0x80150000u /* (?(VERSIONx.y)... */ +#define META_SCS_NAME 0x80160000u /* (*scan_substring:()... */ +#define META_SCS_NUMBER 0x80170000u /* (*scan_substring:(digits)... */ +#define META_SCS_NEXT_NAME 0x80180000u /* Next of scan_substring */ +#define META_SCS_NEXT_NUMBER 0x80190000u /* Next digits of scan_substring */ +#define META_DOLLAR 0x801a0000u /* $ metacharacter */ +#define META_DOT 0x801b0000u /* . 
metacharacter */ +#define META_ESCAPE 0x801c0000u /* \d and friends */ +#define META_KET 0x801d0000u /* closing parenthesis */ +#define META_NOCAPTURE 0x801e0000u /* no capture parens */ +#define META_OPTIONS 0x801f0000u /* (?i) and friends */ +#define META_POSIX 0x80200000u /* POSIX class item */ +#define META_POSIX_NEG 0x80210000u /* negative POSIX class item */ +#define META_RANGE_ESCAPED 0x80220000u /* range with at least one escape */ +#define META_RANGE_LITERAL 0x80230000u /* range defined literally */ +#define META_RECURSE 0x80240000u /* Recursion */ +#define META_RECURSE_BYNAME 0x80250000u /* (?&name) */ +#define META_SCRIPT_RUN 0x80260000u /* (*script_run:...) */ + +/* These must be kept together to make it easy to check that an assertion +is present where expected in a conditional group. */ + +#define META_LOOKAHEAD 0x80270000u /* (?= */ +#define META_LOOKAHEADNOT 0x80280000u /* (?! */ +#define META_LOOKBEHIND 0x80290000u /* (?<= */ +#define META_LOOKBEHINDNOT 0x802a0000u /* (?>16) + +/* Merge intersecting ranges of classes. */ + +uint32_t *PRIV(optimize_class)(uint32_t *start_ptr, uint32_t options, + size_t *buffer_size, compile_block* cb); + +#endif /* PCRE2_COMPILE_H_IDEMPOTENT_GUARD */ + +/* End of pcre2_compile.h */ diff --git a/src/pcre2_compile_class.c b/src/pcre2_compile_class.c new file mode 100644 index 000000000..bbeae23fe --- /dev/null +++ b/src/pcre2_compile_class.c @@ -0,0 +1,355 @@ +/************************************************* +* Perl-Compatible Regular Expressions * +*************************************************/ + +/* PCRE is a library of functions to support regular expressions whose syntax +and semantics are as close as possible to those of the Perl 5 language. 
+ + Written by Philip Hazel + Original API code Copyright (c) 1997-2012 University of Cambridge + New API code Copyright (c) 2016-2024 University of Cambridge + +----------------------------------------------------------------------------- +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of the University of Cambridge nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +----------------------------------------------------------------------------- +*/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "pcre2_compile.h" + +#ifdef SUPPORT_WIDE_CHARS + +/* Heapsort algorithm. 
*/ + +static void do_heapify(uint32_t *buffer, size_t size, size_t i) /* Sift-down step of the heapsort. buffer holds two-item entries, size and i are item offsets, and only the first item of an entry is compared. */ +{ +size_t max; +size_t left; +size_t right; +uint32_t tmp1, tmp2; + +while (TRUE) + { + max = i; + left = (i << 1) + 2; /* Left child entry. */ + right = left + 2; /* Right child entry. */ + + if (left < size && buffer[left] > buffer[max]) max = left; + if (right < size && buffer[right] > buffer[max]) max = right; + if (i == max) return; /* Neither child is larger: done. */ + + /* Swap both items of entry i with the larger child and continue from there. */ + tmp1 = buffer[i]; + tmp2 = buffer[i + 1]; + buffer[i] = buffer[max]; + buffer[i + 1] = buffer[max + 1]; + buffer[max] = tmp1; + buffer[max + 1] = tmp2; + i = max; + } +} + +#ifdef SUPPORT_UNICODE + +#define PARSE_CLASS_CASELESS_UTF 0x1 /* Extend ranges with other-case characters. */ +#define PARSE_CLASS_RESTRICTED_UTF 0x2 /* Set when PCRE2_EXTRA_CASELESS_RESTRICT is requested. */ + +static const uint32_t* +get_nocase_range(uint32_t c) /* Binary search in PRIV(ucd_nocase_ranges). Returns the pair whose second slot is greater than c while the second slot of the preceding pair (if any) is not. NOTE(review): the table layout is defined elsewhere; presumably sorted pairs followed by a terminator pair - confirm. */ +{ +uint32_t left = 0; +uint32_t right = PRIV(ucd_nocase_ranges_size); +uint32_t middle; + +if (c > MAX_UTF_CODE_POINT) return PRIV(ucd_nocase_ranges) + right; /* The caller reads slot 0 of this result, so an entry is presumably stored at this position - TODO confirm. */ + +while (TRUE) + { + /* Range end of the middle element (forcing the low bit selects the second slot of a pair). */ + middle = ((left + right) >> 1) | 0x1; + + if (PRIV(ucd_nocase_ranges)[middle] <= c) + left = middle + 1; + else if (middle > 1 && PRIV(ucd_nocase_ranges)[middle - 2] > c) + right = middle - 1; + else + return PRIV(ucd_nocase_ranges) + (middle - 1); + } +} + +static size_t +utf_caseless_extend(uint32_t start, uint32_t end, uint32_t options, + uint32_t *buffer) /* Adds the other-case forms of the characters in [start, end]. A form adjacent to the range widens it, a form inside it is ignored, and any other form is emitted as a one-character range. The (possibly widened) input range is emitted last. Returns the number of items emitted; when buffer is NULL nothing is stored and only the count is computed. */ +{ +uint32_t new_start = start; +uint32_t new_end = end; +uint32_t c = start; +const uint32_t *list; +uint32_t tmp[3]; +size_t result = 2; /* Accounts for the final [new_start, new_end] pair. */ +const uint32_t *skip_range = get_nocase_range(c); +uint32_t skip_start = skip_range[0]; + +#if PCRE2_CODE_UNIT_WIDTH == 32 +if (end > MAX_UTF_CODE_POINT) end = MAX_UTF_CODE_POINT; +#endif + +while (c <= end) + { + if (c > skip_start) /* NOTE(review): c appears to lie inside a range of characters without case forms; jump to the value stored in the second slot of the pair and move on to the next pair - confirm against the table generator. */ + { + c = skip_range[1]; + skip_range += 2; + skip_start = skip_range[0]; + continue; + } + + /* Compute caseless set. Use the PRIV(ucd_caseless_sets) entry selected by UCD_CASESET(c) when it is nonzero (in restricted mode only if its first member is above 127); otherwise build a temporary list from c and its UCD_OTHERCASE value. 
*/ + uint32_t co = UCD_CASESET(c); + + if (co != 0 && (!(options & PARSE_CLASS_RESTRICTED_UTF) + || PRIV(ucd_caseless_sets)[co] > 127)) + list = PRIV(ucd_caseless_sets) + co; + else + { + co = UCD_OTHERCASE(c); + list = tmp; + tmp[0] = c; + tmp[1] = NOTACHAR; + + if (co != c) + { + tmp[1] = co; + tmp[2] = NOTACHAR; + } + } + c++; + + /* Add characters. Each list member either widens the range by one, is already covered, or becomes a separate one-character range. */ + do + { + if (*list < new_start) + { + if (*list + 1 == new_start) + { + new_start--; + continue; + } + } + else if (*list > new_end) + { + if (*list - 1 == new_end) + { + new_end++; + continue; + } + } + else continue; + + result += 2; + if (buffer != NULL) + { + buffer[0] = *list; + buffer[1] = *list; + buffer += 2; + } + } + while (*(++list) != NOTACHAR); + } + + if (buffer != NULL) + { + buffer[0] = new_start; + buffer[1] = new_end; + buffer += 2; + } + return result; +} + +#endif + +static size_t +parse_class(uint32_t *ptr, uint32_t options, uint32_t *buffer) /* Walks the parsed class items up to META_CLASS_END and emits every literal character or range as a (start, end) pair; META_ESCAPE and POSIX items are skipped. Returns the number of items emitted; with a NULL buffer only the count is computed. */ +{ +size_t total_size = 0; +uint32_t meta_arg; +uint32_t start_char; + +(void)options; /* Avoid compiler warning. The only other use of options is inside the SUPPORT_UNICODE section below. 
*/ + +while (*ptr != META_CLASS_END) + { + switch (META_CODE(*ptr)) + { + case META_ESCAPE: + meta_arg = META_DATA(*ptr); + if (meta_arg == ESC_P || meta_arg == ESC_p) ptr++; /* Skip the extra item that follows these two escapes. */ + ptr++; + continue; + case META_POSIX: + case META_POSIX_NEG: + ptr += 2; + continue; + case META_BIGVALUE: + /* Character literal: the value is stored in the next item. */ + ptr++; + break; + default: + PCRE2_ASSERT(*ptr < META_END); + break; + } + + start_char = *ptr; + + if (ptr[1] == META_RANGE_LITERAL || ptr[1] == META_RANGE_ESCAPED) /* A range: step over the range marker to its end character. */ + { + ptr += 2; + PCRE2_ASSERT(*ptr < META_END || *ptr == META_BIGVALUE); + + if (*ptr == META_BIGVALUE) ptr++; + } + +#ifdef SUPPORT_UNICODE + if (options & PARSE_CLASS_CASELESS_UTF) + { + size_t size = utf_caseless_extend(start_char, *ptr++, options, buffer); + if (buffer != NULL) buffer += size; + total_size += size; + continue; + } +#endif + + if (buffer != NULL) + { + buffer[0] = start_char; + buffer[1] = *ptr; + buffer += 2; + } + + ptr++; + total_size += 2; + } + + return total_size; +} + +uint32_t *PRIV(optimize_class)(uint32_t *start_ptr, uint32_t options, + size_t *buffer_size, compile_block* cb) /* Collects the literal characters and ranges of the class that starts at start_ptr, sorts them by start character and merges overlapping or adjacent ones. Returns the allocated list of (start, end) pairs and stores its length in items in *buffer_size. Returns NULL with *buffer_size set to 0 when nothing was collected, and NULL with a nonzero *buffer_size when the allocation fails. NOTE(review): the buffer comes from cb->cx->memctl.malloc; presumably the caller releases it - confirm. */ +{ +uint32_t *ptr = start_ptr + 1; +uint32_t *buffer; +uint32_t *dst; +size_t size = 0, i; +uint32_t tmp1, tmp2; + +PCRE2_ASSERT(*start_ptr == META_CLASS || *start_ptr == META_CLASS_NOT); + +#ifdef SUPPORT_UNICODE +if ((options & PCRE2_CASELESS) && (options & (PCRE2_UTF|PCRE2_UCP))) + options = PARSE_CLASS_CASELESS_UTF; +else + options = 0; + +if (cb->cx->extra_options & PCRE2_EXTRA_CASELESS_RESTRICT) + options |= PARSE_CLASS_RESTRICTED_UTF; +#endif + +/* Compute the required space for the ranges (counting pass without an output buffer). */ + +size = parse_class(start_ptr + 1, options, NULL); + +*buffer_size = size; +if (size == 0) return NULL; + +/* Allocate the buffer and fill it with the ranges. */ + +buffer = (uint32_t*) + cb->cx->memctl.malloc(size * sizeof(uint32_t), cb->cx->memctl.memory_data); + +if (buffer == NULL) return NULL; + +parse_class(start_ptr + 1, options, buffer); + +if (size == 2) return buffer; + +/* In-place sorting of ranges. First build a max-heap starting from the last parent entry, then repeatedly move the largest entry to the end of the shrinking heap. 
*/ + +i = (((size >> 2) - 1) << 1); +while (TRUE) + { + do_heapify(buffer, size, i); + if (i == 0) break; + i -= 2; + } + +i = size - 2; +while (TRUE) + { + tmp1 = buffer[i]; + tmp2 = buffer[i + 1]; + buffer[i] = buffer[0]; + buffer[i + 1] = buffer[1]; + buffer[0] = tmp1; + buffer[1] = tmp2; + + do_heapify(buffer, i, 0); + if (i == 0) break; + i -= 2; + } + +/* Merge ranges whenever possible. dst is the last merged range, ptr is the next sorted range. */ +dst = buffer; +ptr = buffer + 2; +size -= 2; + +/* The second condition is a very rare corner case, where the end of the last +range is the maximum character. This range cannot be extended further. */ + +while (size > 0 && dst[1] != ~(uint32_t)0) + { + if (dst[1] + 1 < ptr[0]) + { + dst += 2; + dst[0] = ptr[0]; + dst[1] = ptr[1]; + } + else if (dst[1] < ptr[1]) dst[1] = ptr[1]; + + ptr += 2; + size -= 2; + } + +*buffer_size = (size_t)(dst + 2 - buffer); +return buffer; +} + +#endif /* SUPPORT_WIDE_CHARS */ + +/* End of pcre2_compile_class.c */ diff --git a/src/pcre2_internal.h b/src/pcre2_internal.h index dd251ef8b..3cbb56bb3 100644 --- a/src/pcre2_internal.h +++ b/src/pcre2_internal.h @@ -2050,6 +2050,7 @@ is available. 
*/ #define _pcre2_valid_utf PCRE2_SUFFIX(_pcre2_valid_utf_) #define _pcre2_was_newline PCRE2_SUFFIX(_pcre2_was_newline_) #define _pcre2_xclass PCRE2_SUFFIX(_pcre2_xclass_) +#define _pcre2_optimize_class PCRE2_SUFFIX(_pcre2_optimize_class_) extern int _pcre2_auto_possessify(PCRE2_UCHAR *, const compile_block *); diff --git a/src/pcre2_intmodedep.h b/src/pcre2_intmodedep.h index a798cdd4f..f2ed042b7 100644 --- a/src/pcre2_intmodedep.h +++ b/src/pcre2_intmodedep.h @@ -731,7 +731,7 @@ typedef struct compile_block { const uint8_t *cbits; /* Points to character type table */ const uint8_t *ctypes; /* Points to table of type maps */ PCRE2_UCHAR *start_workspace; /* The start of working space */ - PCRE2_UCHAR * start_code; /* The start of the compiled code */ + PCRE2_UCHAR *start_code; /* The start of the compiled code */ PCRE2_SPTR start_pattern; /* The start of the pattern */ PCRE2_SPTR end_pattern; /* The end of the pattern */ PCRE2_UCHAR *name_table; /* The name/number table */ diff --git a/testdata/testinput5 b/testdata/testinput5 index 8a48cec6f..b240c7e41 100644 --- a/testdata/testinput5 +++ b/testdata/testinput5 @@ -2539,4 +2539,20 @@ /abc/utf,substitute_extended,python_octal abc\=replace=\400 +# Character range merging tests + +/[\x{1200}\s\x{1202}\d\x{1201}]+/B,utf,ucp + \x{11ff}\x{1200}\x{1201}\x{1202}\x{1203} + +/[\x{2000}-\x{2500}\x{2100}-\x{2600}\d\x{1800}-\x{1fff}]+/B,utf,ucp + \x{17ff}\x{1800}\x{2600}\x{2601} + +/[\x{10008}\x{10003}\x{10006}\x{10004}\x{10007}]+/B,utf + \x{10002}\x{10005}\x{10003}\x{10004}\x{10006}\x{10007}\x{10008}\x{10009} + +/[\x{100}-\x{400}]+/Bi,utf + qS\x{ff}\x{100}\x{a7c5}\x{401} + \x{2c63}\x{2c64}\x{2c65}\x{2c66}\x{2c67} + \x{a7af}\x{a7b0}\x{a7b1}\x{a7b2}\x{a7b3} + # End of testinput5 diff --git a/testdata/testoutput10 b/testdata/testoutput10 index 1d1b7f09d..ddc8e3e37 100644 --- a/testdata/testoutput10 +++ b/testdata/testoutput10 @@ -1443,7 +1443,7 @@ No match /[z-\x{100}]/IBi,utf 
------------------------------------------------------------------ Bra - [Zz-\xff\x{39c}\x{3bc}\x{212b}\x{1e9e}\x{212b}\x{178}\x{100}-\x{101}] + [Zz-\xff\x{100}-\x{101}\x{178}\x{39c}\x{3bc}\x{1e9e}\x{212b}] Ket End ------------------------------------------------------------------ @@ -1480,7 +1480,7 @@ No match /[z-\x{100}]/IBi,utf ------------------------------------------------------------------ Bra - [Zz-\xff\x{39c}\x{3bc}\x{212b}\x{1e9e}\x{212b}\x{178}\x{100}-\x{101}] + [Zz-\xff\x{100}-\x{101}\x{178}\x{39c}\x{3bc}\x{1e9e}\x{212b}] Ket End ------------------------------------------------------------------ diff --git a/testdata/testoutput12-16 b/testdata/testoutput12-16 index e7a63167d..11eda4cda 100644 --- a/testdata/testoutput12-16 +++ b/testdata/testoutput12-16 @@ -1285,7 +1285,7 @@ No match /[z-\x{100}]/IBi,utf ------------------------------------------------------------------ Bra - [Zz-\xff\x{39c}\x{3bc}\x{212b}\x{1e9e}\x{212b}\x{178}\x{100}-\x{101}] + [Zz-\xff\x{100}-\x{101}\x{178}\x{39c}\x{3bc}\x{1e9e}\x{212b}] Ket End ------------------------------------------------------------------ @@ -1331,7 +1331,7 @@ No match /[z-\x{100}]/IBi,utf ------------------------------------------------------------------ Bra - [Zz-\xff\x{39c}\x{3bc}\x{212b}\x{1e9e}\x{212b}\x{178}\x{100}-\x{101}] + [Zz-\xff\x{100}-\x{101}\x{178}\x{39c}\x{3bc}\x{1e9e}\x{212b}] Ket End ------------------------------------------------------------------ diff --git a/testdata/testoutput12-32 b/testdata/testoutput12-32 index ad01800ac..b4a64eab9 100644 --- a/testdata/testoutput12-32 +++ b/testdata/testoutput12-32 @@ -1279,7 +1279,7 @@ No match /[z-\x{100}]/IBi,utf ------------------------------------------------------------------ Bra - [Zz-\xff\x{39c}\x{3bc}\x{212b}\x{1e9e}\x{212b}\x{178}\x{100}-\x{101}] + [Zz-\xff\x{100}-\x{101}\x{178}\x{39c}\x{3bc}\x{1e9e}\x{212b}] Ket End ------------------------------------------------------------------ @@ -1325,7 +1325,7 @@ No match /[z-\x{100}]/IBi,utf 
------------------------------------------------------------------ Bra - [Zz-\xff\x{39c}\x{3bc}\x{212b}\x{1e9e}\x{212b}\x{178}\x{100}-\x{101}] + [Zz-\xff\x{100}-\x{101}\x{178}\x{39c}\x{3bc}\x{1e9e}\x{212b}] Ket End ------------------------------------------------------------------ diff --git a/testdata/testoutput5 b/testdata/testoutput5 index dd8710424..f8ccaa654 100644 --- a/testdata/testoutput5 +++ b/testdata/testoutput5 @@ -3950,7 +3950,7 @@ Subject length lower bound = 1 /[A-`]/iB,utf ------------------------------------------------------------------ Bra - [A-z\x{212a}\x{17f}] + [A-z\x{17f}\x{212a}] Ket End ------------------------------------------------------------------ @@ -5103,7 +5103,7 @@ No match /[Ss]+/iB,utf ------------------------------------------------------------------ Bra - [Ss\x{17f}\x{17f}]++ + [Ss\x{17f}]++ Ket End ------------------------------------------------------------------ @@ -5121,7 +5121,7 @@ No match /[S\x{17f}]/iB,utf ------------------------------------------------------------------ Bra - [Ss\x{17f}\x{17f}] + [Ss\x{17f}] Ket End ------------------------------------------------------------------ @@ -5137,7 +5137,7 @@ No match /[\x{17f}s]/iB,utf ------------------------------------------------------------------ Bra - [Ss\x{17f}\x{17f}] + [Ss\x{17f}] Ket End ------------------------------------------------------------------ @@ -5153,7 +5153,7 @@ No match /[\x{4b}\x{6b}]/iB,utf ------------------------------------------------------------------ Bra - [Kk\x{212a}\x{212a}] + [Kk\x{212a}] Ket End ------------------------------------------------------------------ @@ -5537,4 +5537,50 @@ Failed: error 151 at offset 4: octal value is greater than \377 in 8-bit non-UTF abc\=replace=\400 Failed: error -57 at offset 4 in replacement: bad escape sequence in replacement string +# Character range merging tests + +/[\x{1200}\s\x{1202}\d\x{1201}]+/B,utf,ucp +------------------------------------------------------------------ + Bra + 
[\p{Xsp}\p{Nd}\x{1200}-\x{1202}]++ + Ket + End +------------------------------------------------------------------ + \x{11ff}\x{1200}\x{1201}\x{1202}\x{1203} + 0: \x{1200}\x{1201}\x{1202} + +/[\x{2000}-\x{2500}\x{2100}-\x{2600}\d\x{1800}-\x{1fff}]+/B,utf,ucp +------------------------------------------------------------------ + Bra + [\p{Nd}\x{1800}-\x{2600}]++ + Ket + End +------------------------------------------------------------------ + \x{17ff}\x{1800}\x{2600}\x{2601} + 0: \x{1800}\x{2600} + +/[\x{10008}\x{10003}\x{10006}\x{10004}\x{10007}]+/B,utf +------------------------------------------------------------------ + Bra + [\x{10003}-\x{10004}\x{10006}-\x{10008}]++ + Ket + End +------------------------------------------------------------------ + \x{10002}\x{10005}\x{10003}\x{10004}\x{10006}\x{10007}\x{10008}\x{10009} + 0: \x{10003}\x{10004}\x{10006}\x{10007}\x{10008} + +/[\x{100}-\x{400}]+/Bi,utf +------------------------------------------------------------------ + Bra + [Ss\xb5\xff\x{100}-\x{400}\x{450}\x{1fbe}\x{2126}\x{2c62}\x{2c64}-\x{2c66}\x{2c6d}-\x{2c70}\x{2c7e}-\x{2c7f}\x{a78d}\x{a7aa}-\x{a7ae}\x{a7b0}-\x{a7b2}\x{a7c5}]++ + Ket + End +------------------------------------------------------------------ + qS\x{ff}\x{100}\x{a7c5}\x{401} + 0: S\x{ff}\x{100}\x{a7c5} + \x{2c63}\x{2c64}\x{2c65}\x{2c66}\x{2c67} + 0: \x{2c64}\x{2c65}\x{2c66} + \x{a7af}\x{a7b0}\x{a7b1}\x{a7b2}\x{a7b3} + 0: \x{a7b0}\x{a7b1}\x{a7b2} + # End of testinput5 diff --git a/testdata/testoutput7 b/testdata/testoutput7 index 36b47e3c1..e1c45e559 100644 --- a/testdata/testoutput7 +++ b/testdata/testoutput7 @@ -3816,7 +3816,7 @@ No match /[Ss]+/iB,utf ------------------------------------------------------------------ Bra - [Ss\x{17f}\x{17f}]++ + [Ss\x{17f}]++ Ket End ------------------------------------------------------------------ @@ -3834,7 +3834,7 @@ No match /[S\x{17f}]/iB,utf ------------------------------------------------------------------ Bra - [Ss\x{17f}\x{17f}] + [Ss\x{17f}] Ket End 
------------------------------------------------------------------ @@ -3850,7 +3850,7 @@ No match /[\x{17f}s]/iB,utf ------------------------------------------------------------------ Bra - [Ss\x{17f}\x{17f}] + [Ss\x{17f}] Ket End ------------------------------------------------------------------ @@ -3866,7 +3866,7 @@ No match /[\x{4b}\x{6b}]/iB,utf ------------------------------------------------------------------ Bra - [Kk\x{212a}\x{212a}] + [Kk\x{212a}] Ket End ------------------------------------------------------------------ diff --git a/vms/configure.com b/vms/configure.com index c6024e87c..7e3bf1495 100644 --- a/vms/configure.com +++ b/vms/configure.com @@ -1040,6 +1040,9 @@ PCRE2_CHKDINT.OBJ : PCRE2_CHKDINT.C PCRE2_COMPILE.OBJ : PCRE2_COMPILE.C $(CC) $(CFLAGS) $(MMS$SOURCE) /OBJ=$(MMS$TARGET) +PCRE2_COMPILE_CLASS.OBJ : PCRE2_COMPILE_CLASS.C + $(CC) $(CFLAGS) $(MMS$SOURCE) /OBJ=$(MMS$TARGET) + PCRE2_CONFIG.OBJ : PCRE2_CONFIG.C $(CC) $(CFLAGS) $(MMS$SOURCE) /OBJ=$(MMS$TARGET)