Skip to content

Commit 0595f37

Browse files
author
ju1ius
committed
adds support for named subpatterns to mb_ereg_replace
Named subpatterns are now passed to `mb_ereg_replace_callback`. This commit also adds a subset of the oniguruma back-reference syntax for replacements: * `\k<name>` and `\k'name'` for named subpatterns; * `\k<n>` and `\k'n'` for numbered subpatterns. These last two notations allow referencing numbered groups where n > 9.
1 parent 4ba0e9c commit 0595f37

File tree

3 files changed

+181
-31
lines changed

3 files changed

+181
-31
lines changed

ext/mbstring/php_mbregex.c

Lines changed: 135 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -693,6 +693,136 @@ mb_regex_groups_iter(const OnigUChar* name, const OnigUChar* name_end, int ngrou
693693
}
694694
/* }}} */
695695

696+
/*
697+
* Helper for _php_mb_regex_ereg_replace_exec
698+
*/
699+
/* {{{ mb_regex_substitute */
700+
static inline void mb_regex_substitute(
701+
smart_str *pbuf,
702+
char *subject,
703+
size_t subject_len,
704+
char *replace,
705+
size_t replace_len,
706+
php_mb_regex_t *regexp,
707+
OnigRegion *regs,
708+
const mbfl_encoding *enc
709+
) {
710+
char *p, *sp, *eos;
711+
int no; /* bakreference group number */
712+
int clen; /* byte-length of the current character */
713+
714+
p = replace;
715+
eos = replace + replace_len;
716+
717+
while (p < eos) {
718+
clen = (int) php_mb_mbchar_bytes_ex(p, enc);
719+
if (clen != 1 || p == eos || p[0] != '\\') {
720+
/* skip anything that's not an ascii backslash */
721+
smart_str_appendl(pbuf, p, clen);
722+
p += clen;
723+
continue;
724+
}
725+
sp = p; /* save position */
726+
clen = (int) php_mb_mbchar_bytes_ex(++p, enc);
727+
if (clen != 1 || p == eos) {
728+
/* skip escaped multibyte char */
729+
p += clen;
730+
smart_str_appendl(pbuf, sp, p - sp);
731+
continue;
732+
}
733+
no = -1;
734+
switch (p[0]) {
735+
case '0':
736+
no = 0;
737+
p++;
738+
break;
739+
case '1': case '2': case '3': case '4':
740+
case '5': case '6': case '7': case '8': case '9':
741+
if (!onig_noname_group_capture_is_active(regexp)) {
742+
/*
743+
* FIXME:
744+
* Oniguruma throws a compile error if numbered backrefs are used with named groups in the pattern.
745+
* For now we just ignore them, but in the future we might want to raise a warning
746+
* and abort the whole replace operation.
747+
*/
748+
p++;
749+
smart_str_appendl(pbuf, sp, p - sp);
750+
continue;
751+
}
752+
no = p[0] - '0';
753+
p++;
754+
break;
755+
case 'k':
756+
clen = (int) php_mb_mbchar_bytes_ex(++p, enc);
757+
if (clen != 1 || p == eos || (p[0] != '<' && p[0] != '\'')) {
758+
/* not a backref delimiter */
759+
p += clen;
760+
smart_str_appendl(pbuf, sp, p - sp);
761+
continue;
762+
}
763+
/* try to consume everything until next delimiter */
764+
char delim = p[0] == '<' ? '>' : '\'';
765+
char *name, *name_end;
766+
int maybe_num = 1;
767+
name_end = name = p + 1;
768+
while (name_end < eos) {
769+
clen = (int) php_mb_mbchar_bytes_ex(name_end, enc);
770+
if (clen != 1) {
771+
name_end += clen;
772+
maybe_num = 0;
773+
continue;
774+
}
775+
if (name_end[0] == delim) break;
776+
if (maybe_num && !isdigit(name_end[0])) maybe_num = 0;
777+
name_end++;
778+
}
779+
p = name_end + 1;
780+
if (name_end - name < 1 || name_end >= eos) {
781+
/* the backref was empty or we failed to find the end delimiter */
782+
smart_str_appendl(pbuf, sp, p - sp);
783+
continue;
784+
}
785+
/* we have either a name or a number */
786+
if (maybe_num) {
787+
if (!onig_noname_group_capture_is_active(regexp)) {
788+
/* see above note on mixing numbered & named backrefs */
789+
smart_str_appendl(pbuf, sp, p - sp);
790+
continue;
791+
}
792+
if (name_end - name == 1) {
793+
no = name[0] - '0';
794+
break;
795+
}
796+
if (name[0] == '0') {
797+
/* 01 is not a valid number */
798+
break;
799+
}
800+
no = (int) strtoul(name, NULL, 10);
801+
break;
802+
}
803+
no = onig_name_to_backref_number(regexp, (OnigUChar *)name, (OnigUChar *)name_end, regs);
804+
break;
805+
default:
806+
p += clen;
807+
smart_str_appendl(pbuf, sp, p - sp);
808+
continue;
809+
}
810+
if (no < 0 || no >= regs->num_regs) {
811+
/* invalid group number reference, keep the escape sequence in the output */
812+
smart_str_appendl(pbuf, sp, p - sp);
813+
continue;
814+
}
815+
if (regs->beg[no] >= 0 && regs->beg[no] < regs->end[no] && (size_t)regs->end[no] <= subject_len) {
816+
smart_str_appendl(pbuf, subject + regs->beg[no], regs->end[no] - regs->beg[no]);
817+
}
818+
}
819+
820+
if (p < eos) {
821+
smart_str_appendl(pbuf, p, eos - p);
822+
}
823+
}
824+
/* }}} */
825+
696826
/*
697827
* php functions
698828
*/
@@ -859,14 +989,12 @@ static void _php_mb_regex_ereg_replace_exec(INTERNAL_FUNCTION_PARAMETERS, OnigOp
859989
char *string;
860990
size_t string_len;
861991

862-
char *p;
863992
php_mb_regex_t *re;
864993
OnigSyntaxType *syntax;
865994
OnigRegion *regs = NULL;
866995
smart_str out_buf = {0};
867996
smart_str eval_buf = {0};
868997
smart_str *pbuf;
869-
size_t i;
870998
int err, eval, n;
871999
OnigUChar *pos;
8721000
OnigUChar *string_lim;
@@ -976,38 +1104,11 @@ static void _php_mb_regex_ereg_replace_exec(INTERNAL_FUNCTION_PARAMETERS, OnigOp
9761104
break;
9771105
}
9781106
if (err >= 0) {
979-
#if moriyoshi_0
980-
if (regs->beg[0] == regs->end[0]) {
981-
php_error_docref(NULL, E_WARNING, "Empty regular expression");
982-
break;
983-
}
984-
#endif
9851107
/* copy the part of the string before the match */
9861108
smart_str_appendl(&out_buf, (char *)pos, (size_t)((OnigUChar *)(string + regs->beg[0]) - pos));
9871109

9881110
if (!is_callable) {
989-
/* copy replacement and backrefs */
990-
i = 0;
991-
p = replace;
992-
while (i < replace_len) {
993-
int fwd = (int) php_mb_mbchar_bytes_ex(p, enc);
994-
n = -1;
995-
if ((replace_len - i) >= 2 && fwd == 1 &&
996-
p[0] == '\\' && p[1] >= '0' && p[1] <= '9') {
997-
n = p[1] - '0';
998-
}
999-
if (n >= 0 && n < regs->num_regs) {
1000-
if (regs->beg[n] >= 0 && regs->beg[n] < regs->end[n] && (size_t)regs->end[n] <= string_len) {
1001-
smart_str_appendl(pbuf, string + regs->beg[n], regs->end[n] - regs->beg[n]);
1002-
}
1003-
p += 2;
1004-
i += 2;
1005-
} else {
1006-
smart_str_appendl(pbuf, p, fwd);
1007-
p += fwd;
1008-
i += fwd;
1009-
}
1010-
}
1111+
mb_regex_substitute(pbuf, string, string_len, replace, replace_len, re, regs, enc);
10111112
}
10121113

10131114
if (eval) {
@@ -1047,6 +1148,10 @@ static void _php_mb_regex_ereg_replace_exec(INTERNAL_FUNCTION_PARAMETERS, OnigOp
10471148
for (i = 0; i < regs->num_regs; i++) {
10481149
add_next_index_stringl(&subpats, string + regs->beg[i], regs->end[i] - regs->beg[i]);
10491150
}
1151+
if (onig_number_of_names(re) > 0) {
1152+
mb_regex_groups_iter_args args = {&subpats, string, string_len, regs};
1153+
onig_foreach_name(re, mb_regex_groups_iter, &args);
1154+
}
10501155

10511156
ZVAL_COPY_VALUE(&args[0], &subpats);
10521157
/* null terminate buffer */

ext/mbstring/tests/mb_ereg_replace_callback.phpt

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,16 @@ function_exists('mb_ereg_replace_callback') or die("skip mb_ereg_replace_callbac
88
--FILE--
99
<?php
1010
$str = 'abc 123 #",; $foo';
11-
echo mb_ereg_replace_callback('(\S+)', function($m){return $m[1].'('.strlen($m[1]).')';}, $str);
11+
12+
echo mb_ereg_replace_callback('(\S+)', function ($m) {
13+
return $m[1].'('.strlen($m[1]).')';
14+
}, $str), "\n";
15+
16+
echo mb_ereg_replace_callback('(?<word>\w+) (?<digit>\d+).*', function ($m) {
17+
return sprintf("%s-%s", $m['digit'], $m['word']);
18+
}, $str), "\n";
1219
?>
1320
--EXPECT--
1421
abc(3) 123(3) #",;(4) $foo(4)
22+
123-abc
1523

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
--TEST--
2+
mb_ereg_replace() with named subpatterns
3+
--SKIPIF--
4+
<?php
5+
extension_loaded('mbstring') or die('skip mbstring not available');
6+
function_exists('mb_ereg_replace') or die("skip mb_ereg_replace() is not available in this build");
7+
?>
8+
--FILE--
9+
<?php
10+
mb_regex_set_options('');
11+
// \k<word> syntax
12+
echo mb_ereg_replace('(?<a>\s*)(?<b>\w+)(?<c>\s*)', '\k<a>_\k<b>_\k<c>', 'a b c d e' ), "\n";
13+
// \k'word' syntax
14+
echo mb_ereg_replace('(?<word>[a-z]+)',"<\k'word'>", 'abc def ghi'), PHP_EOL;
15+
// numbered captures with \k<n> syntax
16+
echo mb_ereg_replace('(1)(2)(3)(4)(5)(6)(7)(8)(9)(a)(\10)', '\k<0>-\k<10>-', '123456789aa'), PHP_EOL;
17+
// numbered captures with \k'n' syntax
18+
echo mb_ereg_replace('(1)(2)(3)(4)(5)(6)(7)(8)(9)(a)(\10)', "\k'0'-\k'10'-", '123456789aa'), PHP_EOL;
19+
// backref 0 works, but 01 is ignored
20+
echo mb_ereg_replace('a', "\k'0'_\k<01>", 'a'), PHP_EOL;
21+
// Numbered backref is ignored if named backrefs are present
22+
echo mb_ereg_replace('(?<a>A)\k<a>', '-\1-', 'AA'), PHP_EOL;
23+
// An empty backref is ignored
24+
echo mb_ereg_replace('(\w)\1', '-\k<>-', 'AA'), PHP_EOL;
25+
// An unclosed backref is ignored
26+
echo mb_ereg_replace('(?<a>\w+)', '-\k<a', 'AA'), PHP_EOL;
27+
?>
28+
29+
--EXPECT--
30+
_a_ _b_ _c_ _d_ _e_
31+
<abc> <def> <ghi>
32+
123456789aa-a-
33+
123456789aa-a-
34+
a_\k<01>
35+
-\1-
36+
-\k<>-
37+
-\k<a

0 commit comments

Comments
 (0)