diff --git a/ext/mbstring/php_mbregex.c b/ext/mbstring/php_mbregex.c index fd103abf195aa..acbcd24c47eb2 100644 --- a/ext/mbstring/php_mbregex.c +++ b/ext/mbstring/php_mbregex.c @@ -649,6 +649,180 @@ _php_mb_regex_init_options(const char *parg, int narg, OnigOptionType *option, O } /* }}} */ + +/* + * Callbacks for named subpatterns + */ + +/* {{{ struct mb_regex_groups_iter_args: state handed to mb_regex_groups_iter through its parg pointer */ +typedef struct mb_regex_groups_iter_args { + zval *groups; + char *search_str; + int search_len; + OnigRegion *region; +} mb_regex_groups_iter_args; +/* }}} */ + +/* {{{ mb_regex_groups_iter: callback passed to onig_foreach_name(); stores the matched text of each named group in args->groups under the group name, or boolean false when the region holds no non-empty in-range match */ +static int +mb_regex_groups_iter(const OnigUChar* name, const OnigUChar* name_end, int ngroup_num, int* group_nums, regex_t* reg, void* parg) +{ + mb_regex_groups_iter_args *args = (mb_regex_groups_iter_args *) parg; + int i, gn, ref, beg, end; + + for (i = 0; i < ngroup_num; i++) { + gn = group_nums[i]; + ref = onig_name_to_backref_number(reg, name, name_end, args->region); + if (ref != gn) { + /* + * In case of duplicate groups, keep only the last succeeding one + * to be consistent with preg_match with the PCRE_DUPNAMES option. 
+ */ + continue; + } + beg = args->region->beg[gn]; + end = args->region->end[gn]; + if (beg >= 0 && beg < end && end <= args->search_len) { + add_assoc_stringl_ex(args->groups, (char *)name, name_end - name, &args->search_str[beg], end - beg); + } else { + add_assoc_bool_ex(args->groups, (char *)name, name_end - name, 0); + } + } + + return 0; +} +/* }}} */ + +/* + * Helper for _php_mb_regex_ereg_replace_exec + */ +/* {{{ mb_regex_substitute: appends replace to pbuf, substituting backslash-digit and backslash-k (named or numbered) backreferences with the text captured in regs */ +static inline void mb_regex_substitute( + smart_str *pbuf, + const char *subject, + size_t subject_len, + char *replace, + size_t replace_len, + php_mb_regex_t *regexp, + OnigRegion *regs, + const mbfl_encoding *enc +) { + char *p, *sp, *eos; + int no; /* backreference group number */ + int clen; /* byte-length of the current character */ + + p = replace; + eos = replace + replace_len; + + while (p < eos) { + clen = (int) php_mb_mbchar_bytes_ex(p, enc); + if (clen != 1 || p == eos || p[0] != '\\') { + /* skip anything that's not an ascii backslash */ + smart_str_appendl(pbuf, p, clen); + p += clen; + continue; + } + sp = p; /* save position */ + clen = (int) php_mb_mbchar_bytes_ex(++p, enc); + if (clen != 1 || p == eos) { + /* skip escaped multibyte char; NOTE(review): also taken when the backslash is the last byte (p == eos), in which case p moves past eos and the copy below appears to include the byte at eos - confirm */ + p += clen; + smart_str_appendl(pbuf, sp, p - sp); + continue; + } + no = -1; + switch (p[0]) { + case '0': + no = 0; + p++; + break; + case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + if (!onig_noname_group_capture_is_active(regexp)) { + /* + * FIXME: + * Oniguruma throws a compile error if numbered backrefs are used with named groups in the pattern. + * For now we just ignore them, but in the future we might want to raise a warning + * and abort the whole replace operation. 
+ */ + p++; + smart_str_appendl(pbuf, sp, p - sp); + continue; + } + no = p[0] - '0'; + p++; + break; + case 'k': + clen = (int) php_mb_mbchar_bytes_ex(++p, enc); + if (clen != 1 || p == eos || (p[0] != '<' && p[0] != '\'')) { + /* not a backref delimiter; NOTE(review): p == eos also lands here and p += clen then steps past eos - confirm */ + p += clen; + smart_str_appendl(pbuf, sp, p - sp); + continue; + } + /* try to consume everything until next delimiter */ + char delim = p[0] == '<' ? '>' : '\''; + char *name, *name_end; + char maybe_num = 1; + name_end = name = p + 1; + while (name_end < eos) { + clen = (int) php_mb_mbchar_bytes_ex(name_end, enc); + if (clen != 1) { + name_end += clen; + maybe_num = 0; + continue; + } + if (name_end[0] == delim) break; + if (maybe_num && !isdigit(name_end[0])) maybe_num = 0; + name_end++; + } + p = name_end + 1; + if (name_end - name < 1 || name_end >= eos) { + /* the backref was empty or we failed to find the end delimiter; NOTE(review): p is already name_end + 1 here, so in the missing-delimiter case (name_end >= eos) the copy below appears to extend past eos - confirm */ + smart_str_appendl(pbuf, sp, p - sp); + continue; + } + /* we have either a name or a number */ + if (maybe_num) { + if (!onig_noname_group_capture_is_active(regexp)) { + /* see above note on mixing numbered & named backrefs */ + smart_str_appendl(pbuf, sp, p - sp); + continue; + } + if (name_end - name == 1) { + no = name[0] - '0'; + break; + } + if (name[0] == '0') { + /* 01 is not a valid number */ + break; + } + no = (int) strtoul(name, NULL, 10); + break; + } + no = onig_name_to_backref_number(regexp, (OnigUChar *)name, (OnigUChar *)name_end, regs); + break; + default: + p += clen; + smart_str_appendl(pbuf, sp, p - sp); + continue; + } + if (no < 0 || no >= regs->num_regs) { + /* invalid group number reference, keep the escape sequence in the output */ + smart_str_appendl(pbuf, sp, p - sp); + continue; + } + if (regs->beg[no] >= 0 && regs->beg[no] < regs->end[no] && (size_t)regs->end[no] <= subject_len) { + smart_str_appendl(pbuf, subject + regs->beg[no], regs->end[no] - regs->beg[no]); + } + } + + if (p < eos) { /* NOTE(review): the while loop above exits only once p >= eos, so this branch looks unreachable */ + smart_str_appendl(pbuf, p, eos - p); + } +} +/* }}} */ + /* * php 
functions */ @@ -764,6 +938,11 @@ static void _php_mb_regex_ereg_exec(INTERNAL_FUNCTION_PARAMETERS, int icase) add_index_bool(array, i, 0); } } + + if (onig_number_of_names(re) > 0) { + mb_regex_groups_iter_args args = {array, string, string_len, regs}; + onig_foreach_name(re, mb_regex_groups_iter, &args); + } } if (match_len == 0) { @@ -810,14 +989,12 @@ static void _php_mb_regex_ereg_replace_exec(INTERNAL_FUNCTION_PARAMETERS, OnigOp char *string; size_t string_len; - char *p; php_mb_regex_t *re; OnigSyntaxType *syntax; OnigRegion *regs = NULL; smart_str out_buf = {0}; smart_str eval_buf = {0}; smart_str *pbuf; - size_t i; int err, eval, n; OnigUChar *pos; OnigUChar *string_lim; @@ -927,38 +1104,11 @@ static void _php_mb_regex_ereg_replace_exec(INTERNAL_FUNCTION_PARAMETERS, OnigOp break; } if (err >= 0) { -#if moriyoshi_0 - if (regs->beg[0] == regs->end[0]) { - php_error_docref(NULL, E_WARNING, "Empty regular expression"); - break; - } -#endif /* copy the part of the string before the match */ smart_str_appendl(&out_buf, (char *)pos, (size_t)((OnigUChar *)(string + regs->beg[0]) - pos)); if (!is_callable) { - /* copy replacement and backrefs */ - i = 0; - p = replace; - while (i < replace_len) { - int fwd = (int) php_mb_mbchar_bytes_ex(p, enc); - n = -1; - if ((replace_len - i) >= 2 && fwd == 1 && - p[0] == '\\' && p[1] >= '0' && p[1] <= '9') { - n = p[1] - '0'; - } - if (n >= 0 && n < regs->num_regs) { - if (regs->beg[n] >= 0 && regs->beg[n] < regs->end[n] && (size_t)regs->end[n] <= string_len) { - smart_str_appendl(pbuf, string + regs->beg[n], regs->end[n] - regs->beg[n]); - } - p += 2; - i += 2; - } else { - smart_str_appendl(pbuf, p, fwd); - p += fwd; - i += fwd; - } - } + mb_regex_substitute(pbuf, string, string_len, replace, replace_len, re, regs, enc); } if (eval) { @@ -998,6 +1148,10 @@ static void _php_mb_regex_ereg_replace_exec(INTERNAL_FUNCTION_PARAMETERS, OnigOp for (i = 0; i < regs->num_regs; i++) { add_next_index_stringl(&subpats, string + 
regs->beg[i], regs->end[i] - regs->beg[i]); } + if (onig_number_of_names(re) > 0) { + mb_regex_groups_iter_args args = {&subpats, string, string_len, regs}; + onig_foreach_name(re, mb_regex_groups_iter, &args); + } ZVAL_COPY_VALUE(&args[0], &subpats); /* null terminate buffer */ @@ -1291,6 +1445,15 @@ _php_mb_regex_ereg_search_exec(INTERNAL_FUNCTION_PARAMETERS, int mode) add_index_bool(return_value, i, 0); } } + if (onig_number_of_names(MBREX(search_re)) > 0) { + mb_regex_groups_iter_args args = { + return_value, + Z_STRVAL(MBREX(search_str)), + Z_STRLEN(MBREX(search_str)), + MBREX(search_regs) + }; + onig_foreach_name(MBREX(search_re), mb_regex_groups_iter, &args); + } break; default: RETVAL_TRUE; @@ -1417,6 +1580,15 @@ PHP_FUNCTION(mb_ereg_search_getregs) add_index_bool(return_value, i, 0); } } + if (onig_number_of_names(MBREX(search_re)) > 0) { + mb_regex_groups_iter_args args = { + return_value, + Z_STRVAL(MBREX(search_str)), + len, + MBREX(search_regs) + }; + onig_foreach_name(MBREX(search_re), mb_regex_groups_iter, &args); + } } else { RETVAL_FALSE; } @@ -1445,7 +1617,7 @@ PHP_FUNCTION(mb_ereg_search_setpos) if ((position < 0) && (!Z_ISUNDEF(MBREX(search_str))) && (Z_TYPE(MBREX(search_str)) == IS_STRING)) { position += Z_STRLEN(MBREX(search_str)); } - + if (position < 0 || (!Z_ISUNDEF(MBREX(search_str)) && Z_TYPE(MBREX(search_str)) == IS_STRING && (size_t)position > Z_STRLEN(MBREX(search_str)))) { php_error_docref(NULL, E_WARNING, "Position is out of range"); MBREX(search_pos) = 0; diff --git a/ext/mbstring/tests/mb_ereg_dupnames.phpt b/ext/mbstring/tests/mb_ereg_dupnames.phpt new file mode 100644 index 0000000000000..fcc428c3604ef --- /dev/null +++ b/ext/mbstring/tests/mb_ereg_dupnames.phpt @@ -0,0 +1,37 @@ +--TEST-- +Testing mb_ereg() duplicate named groups +--SKIPIF-- + +--FILE-- +?)|(?!))'; + mb_ereg($pattern, '中?', $m); + var_dump($m); + mb_ereg($pattern, '中!', $m); + var_dump($m); +?> +--EXPECT-- +array(4) { + [0]=> + string(6) "中?" 
+ [1]=> + string(3) "?" + [2]=> + bool(false) + ["punct"]=> + string(3) "?" +} +array(4) { + [0]=> + string(6) "中!" + [1]=> + bool(false) + [2]=> + string(3) "!" + ["punct"]=> + string(3) "!" +} diff --git a/ext/mbstring/tests/mb_ereg_named_subpatterns.phpt b/ext/mbstring/tests/mb_ereg_named_subpatterns.phpt new file mode 100644 index 0000000000000..ed0f85baa3150 --- /dev/null +++ b/ext/mbstring/tests/mb_ereg_named_subpatterns.phpt @@ -0,0 +1,50 @@ +--TEST-- +Testing mb_ereg() named subpatterns +--SKIPIF-- + +--FILE-- +\s*)(?\w+)', ' 中国', $m); + var_dump($m); + mb_ereg('(?\s*)(?\w+)', '国', $m); + var_dump($m); + mb_ereg('(\s*)(?\w+)', ' 中国', $m); + var_dump($m); +?> +--EXPECT-- +array(5) { + [0]=> + string(8) " 中国" + [1]=> + string(2) " " + [2]=> + string(6) "中国" + ["wsp"]=> + string(2) " " + ["word"]=> + string(6) "中国" +} +array(5) { + [0]=> + string(3) "国" + [1]=> + bool(false) + [2]=> + string(3) "国" + ["wsp"]=> + bool(false) + ["word"]=> + string(3) "国" +} +array(3) { + [0]=> + string(8) " 中国" + [1]=> + string(6) "中国" + ["word"]=> + string(6) "中国" +} diff --git a/ext/mbstring/tests/mb_ereg_replace_callback.phpt b/ext/mbstring/tests/mb_ereg_replace_callback.phpt index 98a380957479f..1e15dcc433dbb 100644 --- a/ext/mbstring/tests/mb_ereg_replace_callback.phpt +++ b/ext/mbstring/tests/mb_ereg_replace_callback.phpt @@ -8,8 +8,16 @@ function_exists('mb_ereg_replace_callback') or die("skip mb_ereg_replace_callbac --FILE-- \w+) (?\d+).*', function ($m) { + return sprintf("%s-%s", $m['digit'], $m['word']); +}, $str), "\n"; ?> --EXPECT-- abc(3) 123(3) #",;(4) $foo(4) +123-abc diff --git a/ext/mbstring/tests/mb_ereg_replace_named_subpatterns.phpt b/ext/mbstring/tests/mb_ereg_replace_named_subpatterns.phpt new file mode 100644 index 0000000000000..1bf8a2714c709 --- /dev/null +++ b/ext/mbstring/tests/mb_ereg_replace_named_subpatterns.phpt @@ -0,0 +1,37 @@ +--TEST-- +mb_ereg_replace() with named subpatterns +--SKIPIF-- + +--FILE-- + syntax + echo 
mb_ereg_replace('(?\s*)(?\w+)(?\s*)', '\k_\k_\k', 'a b c d e' ), "\n"; + // \k'word' syntax + echo mb_ereg_replace('(?[a-z]+)',"<\k'word'>", 'abc def ghi'), PHP_EOL; + // numbered captures with \k syntax + echo mb_ereg_replace('(1)(2)(3)(4)(5)(6)(7)(8)(9)(a)(\10)', '\k<0>-\k<10>-', '123456789aa'), PHP_EOL; + // numbered captures with \k'n' syntax + echo mb_ereg_replace('(1)(2)(3)(4)(5)(6)(7)(8)(9)(a)(\10)', "\k'0'-\k'10'-", '123456789aa'), PHP_EOL; + // backref 0 works, but 01 is ignored + echo mb_ereg_replace('a', "\k'0'_\k<01>", 'a'), PHP_EOL; + // Numbered backref is ignored if named backrefs are present + echo mb_ereg_replace('(?A)\k', '-\1-', 'AA'), PHP_EOL; + // An empty backref is ignored + echo mb_ereg_replace('(\w)\1', '-\k<>-', 'AA'), PHP_EOL; + // An unclosed backref is ignored + echo mb_ereg_replace('(?\w+)', '-\k + +--EXPECT-- +_a_ _b_ _c_ _d_ _e_ + +123456789aa-a- +123456789aa-a- +a_\k<01> +-\1- +-\k<>- +-\k +--FILE-- +\s*)(?\w+)(?[?!])'); + var_dump(mb_ereg_search_getregs()); +?> +--EXPECT-- +array(7) { + [0]=> + string(11) " 中国?" + [1]=> + string(2) " " + [2]=> + string(6) "中国" + [3]=> + string(3) "?" + ["punct"]=> + string(3) "?" + ["wsp"]=> + string(2) " " + ["word"]=> + string(6) "中国" +}