Skip to content

Commit af8b549

Browse files
author
ju1ius
committed
adds support for named subpatterns to mb_ereg_replace
Named subpatterns are now passed to `mb_ereg_replace_callback`. This commit also adds a subset of the Oniguruma back-reference syntax for replacement strings: `\k<name>` and `\k'name'` reference named subpatterns, while `\k<n>` and `\k'n'` reference numbered subpatterns. The last two notations make it possible to reference numbered groups where n > 9.
1 parent 4ba0e9c commit af8b549

File tree

3 files changed

+164
-31
lines changed

3 files changed

+164
-31
lines changed

ext/mbstring/php_mbregex.c

Lines changed: 128 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -693,6 +693,129 @@ mb_regex_groups_iter(const OnigUChar* name, const OnigUChar* name_end, int ngrou
693693
}
694694
/* }}} */
695695

696+
/*
697+
* Helper for _php_mb_regex_ereg_replace_exec
698+
*/
699+
/* {{{ mb_regex_substitute */
700+
static inline void mb_regex_substitute(
701+
smart_str *pbuf,
702+
char *subject,
703+
size_t subject_len,
704+
char *replace,
705+
size_t replace_len,
706+
php_mb_regex_t *regexp,
707+
OnigRegion *regs,
708+
const mbfl_encoding *enc
709+
) {
710+
char *p, *sp, *eos;
711+
int no; /* bakreference group number */
712+
int clen; /* byte-length of the current character */
713+
714+
p = replace;
715+
eos = replace + replace_len;
716+
717+
while (p < eos) {
718+
clen = (int) php_mb_mbchar_bytes_ex(p, enc);
719+
if (clen != 1 || p == eos || p[0] != '\\') {
720+
/* skip anything that's not an ascii backslash */
721+
smart_str_appendl(pbuf, p, clen);
722+
p += clen;
723+
continue;
724+
}
725+
sp = p; /* save position */
726+
clen = (int) php_mb_mbchar_bytes_ex(++p, enc);
727+
if (clen != 1 || p == eos) {
728+
/* skip escaped multibyte char */
729+
p += clen;
730+
smart_str_appendl(pbuf, sp, p - sp);
731+
continue;
732+
}
733+
no = -1;
734+
switch (p[0]) {
735+
case '0':
736+
no = 0;
737+
p++;
738+
break;
739+
case '1': case '2': case '3': case '4':
740+
case '5': case '6': case '7': case '8': case '9':
741+
if (!onig_noname_group_capture_is_active(regexp)) {
742+
/*
743+
* FIXME:
744+
* Oniguruma throws a compile error if numbered backrefs are used with named groups in the pattern.
745+
* For now we just ignore them, but in the future we might want to raise a warning
746+
* and abort the whole replace operation.
747+
*/
748+
p++;
749+
smart_str_appendl(pbuf, sp, p - sp);
750+
continue;
751+
}
752+
no = p[0] - '0';
753+
p++;
754+
break;
755+
case 'k':
756+
clen = (int) php_mb_mbchar_bytes_ex(++p, enc);
757+
if (clen != 1 || p == eos || (p[0] != '<' && p[0] != '\'')) {
758+
/* not a backref delimiter */
759+
p += clen;
760+
smart_str_appendl(pbuf, sp, p - sp);
761+
continue;
762+
}
763+
/* try to consume everything until next delimiter */
764+
char delim = p[0] == '<' ? '>' : '\'';
765+
char *name, *name_end;
766+
int maybe_num = 1;
767+
name_end = name = p + 1;
768+
while (name_end < eos) {
769+
clen = (int) php_mb_mbchar_bytes_ex(name_end, enc);
770+
if (clen != 1) {
771+
name_end += clen;
772+
maybe_num = 0;
773+
continue;
774+
}
775+
if (name_end[0] == delim) break;
776+
if (maybe_num && !isdigit(name_end[0])) {
777+
maybe_num = 0;
778+
}
779+
name_end++;
780+
}
781+
if (name_end < eos) {
782+
p = name_end + 1;
783+
/* we have either a name or a number */
784+
if (maybe_num) {
785+
/* check for NaN */
786+
if (name[0] == '0' && name_end - name > 1) {
787+
break;
788+
}
789+
no = (int) strtoul(name, NULL, 10);
790+
break;
791+
}
792+
no = onig_name_to_backref_number(regexp, (OnigUChar *)name, (OnigUChar *)name_end, regs);
793+
break;
794+
}
795+
/* we failed to find a '>' */
796+
smart_str_appendl(pbuf, sp, p - sp);
797+
continue;
798+
default:
799+
p += clen;
800+
smart_str_appendl(pbuf, sp, p - sp);
801+
continue;
802+
}
803+
if (no < 0 || no >= regs->num_regs) {
804+
/* invalid group number reference, keep the escape sequence in the output */
805+
smart_str_appendl(pbuf, sp, p - sp);
806+
continue;
807+
}
808+
if (regs->beg[no] >= 0 && regs->beg[no] < regs->end[no] && (size_t)regs->end[no] <= subject_len) {
809+
smart_str_appendl(pbuf, subject + regs->beg[no], regs->end[no] - regs->beg[no]);
810+
}
811+
}
812+
813+
if (p < eos) {
814+
smart_str_appendl(pbuf, p, eos - p);
815+
}
816+
}
817+
/* }}} */
818+
696819
/*
697820
* php functions
698821
*/
@@ -859,14 +982,12 @@ static void _php_mb_regex_ereg_replace_exec(INTERNAL_FUNCTION_PARAMETERS, OnigOp
859982
char *string;
860983
size_t string_len;
861984

862-
char *p;
863985
php_mb_regex_t *re;
864986
OnigSyntaxType *syntax;
865987
OnigRegion *regs = NULL;
866988
smart_str out_buf = {0};
867989
smart_str eval_buf = {0};
868990
smart_str *pbuf;
869-
size_t i;
870991
int err, eval, n;
871992
OnigUChar *pos;
872993
OnigUChar *string_lim;
@@ -976,38 +1097,11 @@ static void _php_mb_regex_ereg_replace_exec(INTERNAL_FUNCTION_PARAMETERS, OnigOp
9761097
break;
9771098
}
9781099
if (err >= 0) {
979-
#if moriyoshi_0
980-
if (regs->beg[0] == regs->end[0]) {
981-
php_error_docref(NULL, E_WARNING, "Empty regular expression");
982-
break;
983-
}
984-
#endif
9851100
/* copy the part of the string before the match */
9861101
smart_str_appendl(&out_buf, (char *)pos, (size_t)((OnigUChar *)(string + regs->beg[0]) - pos));
9871102

9881103
if (!is_callable) {
989-
/* copy replacement and backrefs */
990-
i = 0;
991-
p = replace;
992-
while (i < replace_len) {
993-
int fwd = (int) php_mb_mbchar_bytes_ex(p, enc);
994-
n = -1;
995-
if ((replace_len - i) >= 2 && fwd == 1 &&
996-
p[0] == '\\' && p[1] >= '0' && p[1] <= '9') {
997-
n = p[1] - '0';
998-
}
999-
if (n >= 0 && n < regs->num_regs) {
1000-
if (regs->beg[n] >= 0 && regs->beg[n] < regs->end[n] && (size_t)regs->end[n] <= string_len) {
1001-
smart_str_appendl(pbuf, string + regs->beg[n], regs->end[n] - regs->beg[n]);
1002-
}
1003-
p += 2;
1004-
i += 2;
1005-
} else {
1006-
smart_str_appendl(pbuf, p, fwd);
1007-
p += fwd;
1008-
i += fwd;
1009-
}
1010-
}
1104+
mb_regex_substitute(pbuf, string, string_len, replace, replace_len, re, regs, enc);
10111105
}
10121106

10131107
if (eval) {
@@ -1047,6 +1141,10 @@ static void _php_mb_regex_ereg_replace_exec(INTERNAL_FUNCTION_PARAMETERS, OnigOp
10471141
for (i = 0; i < regs->num_regs; i++) {
10481142
add_next_index_stringl(&subpats, string + regs->beg[i], regs->end[i] - regs->beg[i]);
10491143
}
1144+
if (onig_number_of_names(re) > 0) {
1145+
mb_regex_groups_iter_args args = {&subpats, string, string_len, regs};
1146+
onig_foreach_name(re, mb_regex_groups_iter, &args);
1147+
}
10501148

10511149
ZVAL_COPY_VALUE(&args[0], &subpats);
10521150
/* null terminate buffer */

ext/mbstring/tests/mb_ereg_replace_callback.phpt

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,16 @@ function_exists('mb_ereg_replace_callback') or die("skip mb_ereg_replace_callbac
88
--FILE--
99
<?php
1010
$str = 'abc 123 #",; $foo';
11-
echo mb_ereg_replace_callback('(\S+)', function($m){return $m[1].'('.strlen($m[1]).')';}, $str);
11+
12+
echo mb_ereg_replace_callback('(\S+)', function ($m) {
13+
return $m[1].'('.strlen($m[1]).')';
14+
}, $str), "\n";
15+
16+
echo mb_ereg_replace_callback('(?<word>\w+) (?<digit>\d+).*', function ($m) {
17+
return sprintf("%s-%s", $m['digit'], $m['word']);
18+
}, $str), "\n";
1219
?>
1320
--EXPECT--
1421
abc(3) 123(3) #",;(4) $foo(4)
22+
123-abc
1523

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
--TEST--
2+
mb_ereg_replace() with named subpatterns
3+
--SKIPIF--
4+
<?php
5+
extension_loaded('mbstring') or die('skip mbstring not available');
6+
function_exists('mb_ereg_replace') or die("skip mb_ereg_replace() is not available in this build");
7+
?>
8+
--FILE--
9+
<?php
10+
// NOTE(review): presumably resets the mbregex options so the patterns below run with defaults - confirm
mb_regex_set_options('');
11+
// \k<name> syntax: each named group is re-emitted, separated by underscores
echo mb_ereg_replace('(?<a>\s*)(?<b>\w+)(?<c>\s*)', '\k<a>_\k<b>_\k<c>', 'a b c d e' ), "\n";
12+
// \k'name' syntax: same reference, quoted with single quotes instead of angle brackets
13+
echo mb_ereg_replace('(?<word>[a-z]+)',"<\k'word'>", 'abc def ghi'), PHP_EOL;
14+
// numbered captures with \k<n> syntax: \k<10> reaches group 10, which a single-digit \n escape cannot express
15+
echo mb_ereg_replace('(1)(2)(3)(4)(5)(6)(7)(8)(9)(a)(\10)', '\0-\k<10>-', '123456789aa'), PHP_EOL;
16+
// \k'0' expands to the whole match, but a zero-padded number such as \k<01> is not a valid reference and is left as-is in the output
17+
echo mb_ereg_replace('a', "\k'0'_\k<01>", 'a'), PHP_EOL;
18+
// a plain numbered backref (\1) is left untouched in the output when the pattern uses named groups
19+
echo mb_ereg_replace('(?<a>A)\k<a>', '-\1-', 'AA'), PHP_EOL;
20+
?>
21+
22+
--EXPECT--
23+
_a_ _b_ _c_ _d_ _e_
24+
<abc> <def> <ghi>
25+
123456789aa-a-
26+
a_\k<01>
27+
-\1-

0 commit comments

Comments
 (0)