Even in 8-bit mode, perform range computation for char classes if UCP…

… flag is set. When testing another patch, I discovered that PCRE2Project#474 caused a small change in the behavior of character classes when caseless mode and UCP were enabled. Thank you to Zoltan Herczeg for suggesting a fix. Closes PCRE2ProjectGH-526.
alexdowad · Oct 15, 2024 · bd74ff7 · bd74ff7
1 parent c9bf833
commit bd74ff7
Show file tree

Hide file tree

Showing 4 changed files with 22 additions and 2 deletions.
diff --git a/src/pcre2_compile.c b/src/pcre2_compile.c
@@ -5891,7 +5891,7 @@ for (;; pptr++)
 #if PCRE2_CODE_UNIT_WIDTH == 8
  cranges = NULL;
 
- if (utf)
+ if (utf || ucp)
 #endif
  {
  if (lengthptr != NULL)
@@ -6388,6 +6388,12 @@ for (;; pptr++)
  range = end;
  }
 
+#if PCRE2_CODE_UNIT_WIDTH == 8
+ /* If code unit width is 8 bits, and UCP flag is set, but UTF flag is not, we still
+ * generate cranges, but in that case we should not process any crange > 0xFF,
+ * because it's impossible to encounter code points > 0xFF in the subject string */
+ if (utf)
+#endif
  while (range < end)
  {
  uint32_t range_start = range[0];
@@ -6479,6 +6485,9 @@ for (;; pptr++)
 #ifdef SUPPORT_WIDE_CHARS /* Defined for 16/32 bits, or 8-bit with Unicode */
  if ((xclass_props & XCLASS_REQUIRED) != 0)
  {
+ /* We should never generate a (useless) xclass in 8-bit library when UTF flag is false */
+ PCRE2_ASSERT(PCRE2_CODE_UNIT_WIDTH != 8 || utf);
+
  *class_uchardata++ = XCL_END; /* Marks the end of extra data */
  *code++ = OP_XCLASS;
  code += LINK_SIZE;

diff --git a/src/pcre2_compile_class.c b/src/pcre2_compile_class.c
@@ -125,7 +125,7 @@ const uint32_t *skip_range = get_nocase_range(c);
 uint32_t skip_start = skip_range[0];
 
 #if PCRE2_CODE_UNIT_WIDTH == 8
-PCRE2_ASSERT(options & PARSE_CLASS_UTF);
+PCRE2_ASSERT(options & (PARSE_CLASS_UTF | PARSE_CLASS_CASELESS_UTF));
 #endif
 
 #if PCRE2_CODE_UNIT_WIDTH == 32

diff --git a/testdata/testinput10 b/testdata/testinput10
@@ -623,6 +623,9 @@
 /X(\x{e1})Y/i,ucp,replace=>\L$1<,substitute_extended
  X\x{c1}Y
 
+/[a\x{c1}]/iI,ucp
+ \x{e1}
+
 # Without UTF or UCP characters > 127 have only one case in the default locale.
 
 /X(\x{e1})Y/replace=>\U$1<,substitute_extended

diff --git a/testdata/testoutput10 b/testdata/testoutput10
@@ -1883,6 +1883,14 @@ Subject length lower bound = 1
  X\x{c1}Y
  1: >\xe1<
 
+/[a\x{c1}]/iI,ucp
+Capture group count = 0
+Options: caseless ucp
+Starting code units: A a \xc1 \xe1
+Subject length lower bound = 1
+ \x{e1}
+ 0: \xe1
+
 # Without UTF or UCP characters > 127 have only one case in the default locale.
 
 /X(\x{e1})Y/replace=>\U$1<,substitute_extended