Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
234 changes: 203 additions & 31 deletions ext/mbstring/php_mbregex.c
Original file line number Diff line number Diff line change
Expand Up @@ -649,6 +649,180 @@ _php_mb_regex_init_options(const char *parg, int narg, OnigOptionType *option, O
}
/* }}} */


/*
* Callbacks for named subpatterns
*/

/* {{{ struct mb_regex_groups_iter_args */
/*
 * State handed to mb_regex_groups_iter() through its opaque `parg` argument.
 * Callers fill this struct with a positional initializer, so the field order
 * below must not be changed.
 */
typedef struct mb_regex_groups_iter_args {
zval *groups; /* array that receives one entry per group name */
char *search_str; /* subject string the region offsets index into */
int search_len; /* byte length of search_str; captures ending beyond it are stored as false */
OnigRegion *region; /* match region supplying beg[]/end[] for every group */
} mb_regex_groups_iter_args;
/* }}} */

/* {{{ mb_regex_groups_iter */
/*
 * Callback passed to onig_foreach_name(): invoked once per group name.
 *
 *   name, name_end  - the group name (not NUL-terminated)
 *   ngroup_num      - number of entries in group_nums
 *   group_nums      - group numbers that carry this name
 *   reg             - the compiled pattern
 *   parg            - a mb_regex_groups_iter_args *
 *
 * Adds exactly one entry for the name to args->groups: the captured substring,
 * or false when the group did not take part in the match, matched the empty
 * string, or its offsets do not fit inside the subject.
 *
 * Always returns 0.
 */
static int
mb_regex_groups_iter(const OnigUChar* name, const OnigUChar* name_end, int ngroup_num, int* group_nums, regex_t* reg, void* parg)
{
	mb_regex_groups_iter_args *args = (mb_regex_groups_iter_args *) parg;
	OnigRegion *region = args->region;
	int i, gn, ref, beg, end;

	if (ngroup_num <= 0) {
		return 0;
	}

	/*
	 * In case of duplicate groups, keep only the last succeeding one
	 * to be consistent with preg_match with the PCRE_DUPNAMES option.
	 * The lookup depends only on the name and the region, so it is done
	 * once instead of once per group number.
	 */
	ref = onig_name_to_backref_number(reg, name, name_end, region);

	for (i = 0; i < ngroup_num; i++) {
		gn = group_nums[i];
		if (ref != gn) {
			continue;
		}
		/* never index the region with a group number it does not hold */
		if (gn >= 0 && gn < region->num_regs) {
			beg = region->beg[gn];
			end = region->end[gn];
		} else {
			beg = end = -1;
		}
		if (beg >= 0 && beg < end && end <= args->search_len) {
			add_assoc_stringl_ex(args->groups, (char *)name, name_end - name, &args->search_str[beg], end - beg);
		} else {
			add_assoc_bool_ex(args->groups, (char *)name, name_end - name, 0);
		}
	}

	return 0;
}
/* }}} */

/*
 * Helpers for _php_mb_regex_ereg_replace_exec
 */
/* {{{ mb_regex_subst_advance */
/*
 * Returns p advanced by clen bytes, but never beyond eos. This keeps a
 * truncated multibyte character at the end of the replacement from dragging
 * the cursor (and the following copy) past the buffer.
 */
static inline char *mb_regex_subst_advance(char *p, int clen, char *eos)
{
	return (clen > eos - p) ? eos : p + clen;
}
/* }}} */

/* {{{ mb_regex_substitute */
/*
 * Appends the replacement string to pbuf, expanding backreferences against
 * the match described by regs.
 *
 *   pbuf                  - output buffer
 *   subject, subject_len  - string the offsets in regs refer to
 *   replace, replace_len  - replacement template
 *   regexp                - compiled pattern (used to resolve group names)
 *   regs                  - region of the current match
 *   enc                   - encoding used to step through `replace`
 *
 * Recognised escapes:
 *   \0 .. \9              numbered group; \1..\9 only when
 *                         onig_noname_group_capture_is_active(regexp)
 *   \k<name>  \k'name'    named group
 *   \k<N>     \k'N'       numbered group in decimal without leading zero,
 *                         under the same condition as \1..\9
 *
 * Any escape that is malformed, unterminated or refers to a group outside
 * regs is copied to the output verbatim. A valid reference to a group that
 * did not match, or matched the empty string, expands to nothing.
 */
static inline void mb_regex_substitute(
	smart_str *pbuf,
	const char *subject,
	size_t subject_len,
	char *replace,
	size_t replace_len,
	php_mb_regex_t *regexp,
	OnigRegion *regs,
	const mbfl_encoding *enc
) {
	char *p, *sp, *eos;
	int no; /* backreference group number */
	int clen; /* byte-length of the current character */

	p = replace;
	eos = replace + replace_len;

	while (p < eos) {
		clen = (int) php_mb_mbchar_bytes_ex(p, enc);
		if (clen != 1 || p[0] != '\\') {
			/* copy anything that's not an ascii backslash */
			sp = p;
			p = mb_regex_subst_advance(p, clen, eos);
			smart_str_appendl(pbuf, sp, p - sp);
			continue;
		}
		sp = p; /* save position */
		if (++p == eos) {
			/* lone backslash at the very end: nothing follows to escape */
			smart_str_appendl(pbuf, sp, p - sp);
			break;
		}
		clen = (int) php_mb_mbchar_bytes_ex(p, enc);
		if (clen != 1) {
			/* skip escaped multibyte char */
			p = mb_regex_subst_advance(p, clen, eos);
			smart_str_appendl(pbuf, sp, p - sp);
			continue;
		}
		no = -1;
		switch (p[0]) {
			case '0':
				no = 0;
				p++;
				break;
			case '1': case '2': case '3': case '4':
			case '5': case '6': case '7': case '8': case '9':
				if (!onig_noname_group_capture_is_active(regexp)) {
					/*
					 * FIXME:
					 * Oniguruma throws a compile error if numbered backrefs are used with named groups in the pattern.
					 * For now we just ignore them, but in the future we might want to raise a warning
					 * and abort the whole replace operation.
					 */
					p++;
					smart_str_appendl(pbuf, sp, p - sp);
					continue;
				}
				no = p[0] - '0';
				p++;
				break;
			case 'k': {
				char delim;
				char *name, *name_end;
				int maybe_num = 1;

				if (++p == eos) {
					/* "\k" at the very end */
					smart_str_appendl(pbuf, sp, p - sp);
					continue;
				}
				clen = (int) php_mb_mbchar_bytes_ex(p, enc);
				if (clen != 1 || (p[0] != '<' && p[0] != '\'')) {
					/* not a backref delimiter */
					p = mb_regex_subst_advance(p, clen, eos);
					smart_str_appendl(pbuf, sp, p - sp);
					continue;
				}
				/* try to consume everything until next delimiter */
				delim = p[0] == '<' ? '>' : '\'';
				name_end = name = p + 1;
				while (name_end < eos) {
					clen = (int) php_mb_mbchar_bytes_ex(name_end, enc);
					if (clen != 1) {
						name_end = mb_regex_subst_advance(name_end, clen, eos);
						maybe_num = 0;
						continue;
					}
					if (name_end[0] == delim) break;
					/* <ctype.h> functions require a value representable as unsigned char */
					if (maybe_num && !isdigit((unsigned char) name_end[0])) maybe_num = 0;
					name_end++;
				}
				if (name_end >= eos) {
					/* we failed to find the end delimiter: emit the rest as is */
					p = eos;
					smart_str_appendl(pbuf, sp, p - sp);
					continue;
				}
				p = name_end + 1;
				if (name_end == name) {
					/* the backref was empty */
					smart_str_appendl(pbuf, sp, p - sp);
					continue;
				}
				/* we have either a name or a number */
				if (maybe_num) {
					unsigned long num;

					if (!onig_noname_group_capture_is_active(regexp)) {
						/* see above note on mixing numbered & named backrefs */
						smart_str_appendl(pbuf, sp, p - sp);
						continue;
					}
					if (name[0] == '0' && name_end - name > 1) {
						/* 01 is not a valid number */
						break;
					}
					/*
					 * The digits are followed by the delimiter, so strtoul stops there.
					 * Range-check before narrowing so that an oversized number cannot
					 * wrap around into a valid group index.
					 */
					num = strtoul(name, NULL, 10);
					if (regs->num_regs > 0 && num < (unsigned long) regs->num_regs) {
						no = (int) num;
					}
					break;
				}
				no = onig_name_to_backref_number(regexp, (OnigUChar *)name, (OnigUChar *)name_end, regs);
				break;
			}
			default:
				/* unknown escape: clen is 1 here, keep backslash and character */
				p += clen;
				smart_str_appendl(pbuf, sp, p - sp);
				continue;
		}
		if (no < 0 || no >= regs->num_regs) {
			/* invalid group number reference, keep the escape sequence in the output */
			smart_str_appendl(pbuf, sp, p - sp);
			continue;
		}
		if (regs->beg[no] >= 0 && regs->beg[no] < regs->end[no] && (size_t)regs->end[no] <= subject_len) {
			smart_str_appendl(pbuf, subject + regs->beg[no], regs->end[no] - regs->beg[no]);
		}
	}
}
/* }}} */

/*
* php functions
*/
Expand Down Expand Up @@ -764,6 +938,11 @@ static void _php_mb_regex_ereg_exec(INTERNAL_FUNCTION_PARAMETERS, int icase)
add_index_bool(array, i, 0);
}
}

if (onig_number_of_names(re) > 0) {
mb_regex_groups_iter_args args = {array, string, string_len, regs};
onig_foreach_name(re, mb_regex_groups_iter, &args);
}
}

if (match_len == 0) {
Expand Down Expand Up @@ -810,14 +989,12 @@ static void _php_mb_regex_ereg_replace_exec(INTERNAL_FUNCTION_PARAMETERS, OnigOp
char *string;
size_t string_len;

char *p;
php_mb_regex_t *re;
OnigSyntaxType *syntax;
OnigRegion *regs = NULL;
smart_str out_buf = {0};
smart_str eval_buf = {0};
smart_str *pbuf;
size_t i;
int err, eval, n;
OnigUChar *pos;
OnigUChar *string_lim;
Expand Down Expand Up @@ -927,38 +1104,11 @@ static void _php_mb_regex_ereg_replace_exec(INTERNAL_FUNCTION_PARAMETERS, OnigOp
break;
}
if (err >= 0) {
#if moriyoshi_0
if (regs->beg[0] == regs->end[0]) {
php_error_docref(NULL, E_WARNING, "Empty regular expression");
break;
}
#endif
/* copy the part of the string before the match */
smart_str_appendl(&out_buf, (char *)pos, (size_t)((OnigUChar *)(string + regs->beg[0]) - pos));

if (!is_callable) {
/* copy replacement and backrefs */
i = 0;
p = replace;
while (i < replace_len) {
int fwd = (int) php_mb_mbchar_bytes_ex(p, enc);
n = -1;
if ((replace_len - i) >= 2 && fwd == 1 &&
p[0] == '\\' && p[1] >= '0' && p[1] <= '9') {
n = p[1] - '0';
}
if (n >= 0 && n < regs->num_regs) {
if (regs->beg[n] >= 0 && regs->beg[n] < regs->end[n] && (size_t)regs->end[n] <= string_len) {
smart_str_appendl(pbuf, string + regs->beg[n], regs->end[n] - regs->beg[n]);
}
p += 2;
i += 2;
} else {
smart_str_appendl(pbuf, p, fwd);
p += fwd;
i += fwd;
}
}
mb_regex_substitute(pbuf, string, string_len, replace, replace_len, re, regs, enc);
}

if (eval) {
Expand Down Expand Up @@ -998,6 +1148,10 @@ static void _php_mb_regex_ereg_replace_exec(INTERNAL_FUNCTION_PARAMETERS, OnigOp
for (i = 0; i < regs->num_regs; i++) {
add_next_index_stringl(&subpats, string + regs->beg[i], regs->end[i] - regs->beg[i]);
}
if (onig_number_of_names(re) > 0) {
mb_regex_groups_iter_args args = {&subpats, string, string_len, regs};
onig_foreach_name(re, mb_regex_groups_iter, &args);
}

ZVAL_COPY_VALUE(&args[0], &subpats);
/* null terminate buffer */
Expand Down Expand Up @@ -1291,6 +1445,15 @@ _php_mb_regex_ereg_search_exec(INTERNAL_FUNCTION_PARAMETERS, int mode)
add_index_bool(return_value, i, 0);
}
}
if (onig_number_of_names(MBREX(search_re)) > 0) {
mb_regex_groups_iter_args args = {
return_value,
Z_STRVAL(MBREX(search_str)),
Z_STRLEN(MBREX(search_str)),
MBREX(search_regs)
};
onig_foreach_name(MBREX(search_re), mb_regex_groups_iter, &args);
}
break;
default:
RETVAL_TRUE;
Expand Down Expand Up @@ -1417,6 +1580,15 @@ PHP_FUNCTION(mb_ereg_search_getregs)
add_index_bool(return_value, i, 0);
}
}
if (onig_number_of_names(MBREX(search_re)) > 0) {
mb_regex_groups_iter_args args = {
return_value,
Z_STRVAL(MBREX(search_str)),
len,
MBREX(search_regs)
};
onig_foreach_name(MBREX(search_re), mb_regex_groups_iter, &args);
}
} else {
RETVAL_FALSE;
}
Expand Down Expand Up @@ -1445,7 +1617,7 @@ PHP_FUNCTION(mb_ereg_search_setpos)
if ((position < 0) && (!Z_ISUNDEF(MBREX(search_str))) && (Z_TYPE(MBREX(search_str)) == IS_STRING)) {
position += Z_STRLEN(MBREX(search_str));
}

if (position < 0 || (!Z_ISUNDEF(MBREX(search_str)) && Z_TYPE(MBREX(search_str)) == IS_STRING && (size_t)position > Z_STRLEN(MBREX(search_str)))) {
php_error_docref(NULL, E_WARNING, "Position is out of range");
MBREX(search_pos) = 0;
Expand Down
37 changes: 37 additions & 0 deletions ext/mbstring/tests/mb_ereg_dupnames.phpt
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
--TEST--
Testing mb_ereg() duplicate named groups
--SKIPIF--
<?php
if (!extension_loaded('mbstring')) die('skip mbstring not enabled');
function_exists('mb_ereg') or die("skip mb_ereg() is not available in this build");
?>
--FILE--
<?php
mb_regex_encoding("UTF-8");
// Both alternatives use the group name "punct"; each is a 3-byte fullwidth
// punctuation mark, so only one of the two groups takes part in a match.
$pattern = '\w+((?<punct>？)|(?<punct>！))';
// First alternative matches: group 1 is set, group 2 is false.
mb_ereg($pattern, '中？', $m);
var_dump($m);
// Second alternative matches: group 1 is false, group 2 is set.
mb_ereg($pattern, '中！', $m);
var_dump($m);
?>
--EXPECT--
array(4) {
  [0]=>
  string(6) "中？"
  [1]=>
  string(3) "？"
  [2]=>
  bool(false)
  ["punct"]=>
  string(3) "？"
}
array(4) {
  [0]=>
  string(6) "中！"
  [1]=>
  bool(false)
  [2]=>
  string(3) "！"
  ["punct"]=>
  string(3) "！"
}
50 changes: 50 additions & 0 deletions ext/mbstring/tests/mb_ereg_named_subpatterns.phpt
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
--TEST--
Testing mb_ereg() named subpatterns
--SKIPIF--
<?php
if (!extension_loaded('mbstring')) die('skip mbstring not enabled');
function_exists('mb_ereg') or die("skip mb_ereg() is not available in this build");
?>
--FILE--
<?php
mb_regex_encoding("UTF-8");
// Both groups named, subject starts with two spaces: every group captures.
mb_ereg('(?<wsp>\s*)(?<word>\w+)', '  中国', $m);
var_dump($m);
// No leading whitespace: "wsp" matches the empty string and is reported as false.
mb_ereg('(?<wsp>\s*)(?<word>\w+)', '国', $m);
var_dump($m);
// Unnamed group next to a named one: only the named group is reported.
mb_ereg('(\s*)(?<word>\w+)', '  中国', $m);
var_dump($m);
?>
--EXPECT--
array(5) {
  [0]=>
  string(8) "  中国"
  [1]=>
  string(2) "  "
  [2]=>
  string(6) "中国"
  ["wsp"]=>
  string(2) "  "
  ["word"]=>
  string(6) "中国"
}
array(5) {
  [0]=>
  string(3) "国"
  [1]=>
  bool(false)
  [2]=>
  string(3) "国"
  ["wsp"]=>
  bool(false)
  ["word"]=>
  string(3) "国"
}
array(3) {
  [0]=>
  string(8) "  中国"
  [1]=>
  string(6) "中国"
  ["word"]=>
  string(6) "中国"
}
Loading