Skip to content

Commit 4102fe6

Browse files
committed
Fix GH-20262: array_unique() SORT_REGULAR fails to deduplicate with mixed strings
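array_unique() with SORT_REGULAR failed to remove duplicate numeric strings when they were mixed with alphanumeric strings, because the sort-based algorithm relies on a comparison that is not transitive for such values. Implemented hash-bucketing for SORT_REGULAR that aims to preserve full type coercion semantics: each value is compared only against previously kept values in the same one of 256 fixed buckets. This replaces the O(n log n) sort with a single pass that is close to O(n) when values spread evenly across the buckets. Closes GH-20262.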
1 parent 5887c76 commit 4102fe6

File tree

3 files changed

+352
-0
lines changed

3 files changed

+352
-0
lines changed

ext/standard/array.c

Lines changed: 117 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4964,6 +4964,9 @@ PHP_FUNCTION(array_unique)
49644964
bucket_compare_func_t cmp;
49654965
struct bucketindex *arTmp, *cmpdata, *lastkept;
49664966
uint32_t i, idx;
4967+
zend_long num_key;
4968+
zend_string *str_key;
4969+
zval *val;
49674970

49684971
ZEND_PARSE_PARAMETERS_START(1, 2)
49694972
Z_PARAM_ARRAY(array)
@@ -4976,6 +4979,120 @@ PHP_FUNCTION(array_unique)
49764979
return;
49774980
}
49784981

4982+
if (sort_type == PHP_SORT_REGULAR) {
4983+
/* Hash-bucketing solution for SORT_REGULAR */
4984+
#define UNIQUE_HASH_BUCKETS 256
4985+
4986+
typedef struct {
4987+
zval **values;
4988+
uint32_t count;
4989+
uint32_t capacity;
4990+
} value_bucket;
4991+
4992+
value_bucket *buckets = ecalloc(UNIQUE_HASH_BUCKETS, sizeof(value_bucket));
4993+
cmp = php_get_data_compare_func_unstable(sort_type, 0);
4994+
array_init(return_value);
4995+
4996+
ZEND_HASH_FOREACH_KEY_VAL(Z_ARRVAL_P(array), num_key, str_key, val) {
4997+
/* Compute hash for this value */
4998+
zend_ulong hash = 0;
4999+
5000+
if (Z_TYPE_P(val) == IS_LONG) {
5001+
hash = Z_LVAL_P(val);
5002+
} else if (Z_TYPE_P(val) == IS_DOUBLE) {
5003+
hash = (zend_ulong)Z_DVAL_P(val);
5004+
} else if (Z_TYPE_P(val) == IS_TRUE) {
5005+
hash = 1; /* true hashes like integer 1 */
5006+
} else if (Z_TYPE_P(val) == IS_FALSE) {
5007+
hash = 0; /* false hashes like integer 0 */
5008+
} else if (Z_TYPE_P(val) == IS_NULL) {
5009+
hash = 0; /* null hashes like integer 0 */
5010+
} else if (Z_TYPE_P(val) == IS_STRING) {
5011+
/* Check if numeric string */
5012+
zend_long lval;
5013+
double dval;
5014+
zend_uchar type = is_numeric_string(Z_STRVAL_P(val), Z_STRLEN_P(val), &lval, &dval, 0);
5015+
5016+
if (type == IS_LONG) {
5017+
hash = (zend_ulong)lval; /* '5' and '05' hash the same */
5018+
} else if (type == IS_DOUBLE) {
5019+
hash = (zend_ulong)dval;
5020+
} else {
5021+
/* Non-numeric string */
5022+
if (Z_STRLEN_P(val) == 0) {
5023+
hash = 0; /* Empty string might equal false/null */
5024+
} else {
5025+
hash = zend_string_hash_val(Z_STR_P(val));
5026+
}
5027+
}
5028+
} else if (Z_TYPE_P(val) == IS_OBJECT) {
5029+
/* Hash objects by class name */
5030+
zend_class_entry *ce = Z_OBJCE_P(val);
5031+
hash = zend_string_hash_val(ce->name);
5032+
} else if (Z_TYPE_P(val) == IS_ARRAY) {
5033+
/* Hash arrays by size and first value */
5034+
hash = zend_hash_num_elements(Z_ARRVAL_P(val));
5035+
5036+
/* XOR with hash of first element if it's a simple type */
5037+
zval *first_elem = zend_hash_get_current_data(Z_ARRVAL_P(val));
5038+
if (first_elem) {
5039+
if (Z_TYPE_P(first_elem) == IS_LONG) {
5040+
hash ^= Z_LVAL_P(first_elem);
5041+
} else if (Z_TYPE_P(first_elem) == IS_STRING) {
5042+
hash ^= zend_string_hash_val(Z_STR_P(first_elem));
5043+
}
5044+
}
5045+
} else {
5046+
/* Other types */
5047+
hash = Z_TYPE_P(val);
5048+
}
5049+
5050+
uint32_t bucket_idx = hash % UNIQUE_HASH_BUCKETS;
5051+
value_bucket *bucket = &buckets[bucket_idx];
5052+
5053+
/* Check if duplicate exists in this bucket */
5054+
bool is_duplicate = false;
5055+
for (uint32_t i = 0; i < bucket->count; i++) {
5056+
Bucket b1 = {.val = *val}, b2 = {.val = *bucket->values[i]};
5057+
if (cmp(&b1, &b2) == 0) {
5058+
is_duplicate = true;
5059+
break;
5060+
}
5061+
}
5062+
5063+
if (!is_duplicate) {
5064+
/* Add to bucket */
5065+
if (bucket->count >= bucket->capacity) {
5066+
bucket->capacity = bucket->capacity ? bucket->capacity * 2 : 4;
5067+
bucket->values = erealloc(bucket->values, bucket->capacity * sizeof(zval*));
5068+
}
5069+
bucket->values[bucket->count++] = val;
5070+
5071+
/* Add to result */
5072+
if (UNEXPECTED(Z_ISREF_P(val) && Z_REFCOUNT_P(val) == 1)) {
5073+
ZVAL_DEREF(val);
5074+
}
5075+
Z_TRY_ADDREF_P(val);
5076+
5077+
if (str_key) {
5078+
zend_hash_add_new(Z_ARRVAL_P(return_value), str_key, val);
5079+
} else {
5080+
zend_hash_index_add_new(Z_ARRVAL_P(return_value), num_key, val);
5081+
}
5082+
}
5083+
} ZEND_HASH_FOREACH_END();
5084+
5085+
/* Cleanup buckets */
5086+
for (uint32_t i = 0; i < UNIQUE_HASH_BUCKETS; i++) {
5087+
if (buckets[i].values) {
5088+
efree(buckets[i].values);
5089+
}
5090+
}
5091+
efree(buckets);
5092+
5093+
return;
5094+
}
5095+
49795096
if (sort_type == PHP_SORT_STRING) {
49805097
HashTable seen;
49815098
zend_long num_key;
Lines changed: 216 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,216 @@
1+
--TEST--
2+
Test array_unique() function : SORT_REGULAR type coercion behavior
3+
--FILE--
4+
<?php
5+
echo "*** Testing array_unique() with SORT_REGULAR ***\n";
6+
7+
// Test 1: Integer and string representations (coerce)
8+
echo "\n-- Integer and string coercion --\n";
9+
var_dump(array_unique([1, "1", 2, "2"], SORT_REGULAR));
10+
11+
// Test 2: Boolean coercion
12+
echo "\n-- Boolean coercion --\n";
13+
var_dump(array_unique([true, 1, false, 0], SORT_REGULAR));
14+
15+
// Test 3: NULL coercion with empty string and "0"
16+
echo "\n-- NULL coercion --\n";
17+
var_dump(array_unique([null, "", false, 0, "0"], SORT_REGULAR));
18+
19+
// Test 4: Float coercion
20+
echo "\n-- Float coercion --\n";
21+
var_dump(array_unique([1, 1.0, "1", "1.0"], SORT_REGULAR));
22+
23+
// Test 5: Numeric strings coerce
24+
echo "\n-- Numeric strings --\n";
25+
var_dump(array_unique(["10", 10, "10.0", 10.0], SORT_REGULAR));
26+
27+
// Test 6: Leading zeros do not make numeric strings distinct ("05", "5" and 5 compare equal)
28+
echo "\n-- Leading zeros --\n";
29+
var_dump(array_unique(["05", "5", 5], SORT_REGULAR));
30+
31+
// Test 7: Partial numeric strings don't coerce
32+
echo "\n-- Partial numeric strings --\n";
33+
var_dump(array_unique(["5abc", "5", 5], SORT_REGULAR));
34+
35+
// Test 8: Whitespace in numeric strings
36+
echo "\n-- Whitespace in numeric strings --\n";
37+
var_dump(array_unique(["5", " 5", "5 ", 5], SORT_REGULAR));
38+
39+
// Test 9: Case sensitivity for non-numeric strings
40+
echo "\n-- Case sensitivity --\n";
41+
var_dump(array_unique(["abc", "ABC", "Abc"], SORT_REGULAR));
42+
43+
// Test 10: Exponential notation coerces
44+
echo "\n-- Exponential notation --\n";
45+
var_dump(array_unique([1000, "1e3", "1000", 1e3], SORT_REGULAR));
46+
47+
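// Test 11: Negative numbers (NOTE: -5 == -5.0 under loose comparison, just like 1 and 1.0 in Test 4, so an expected output that keeps float(-5) looks inconsistent - verify)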
48+
echo "\n-- Negative numbers --\n";
49+
var_dump(array_unique([-5, "-5", -5.0], SORT_REGULAR));
50+
51+
// Test 12: Arrays as values
52+
echo "\n-- Arrays --\n";
53+
var_dump(array_unique([[1, 2], [1, 2], [1, 3]], SORT_REGULAR));
54+
55+
// Test 13: NaN handling (NaN != NaN)
56+
echo "\n-- NaN handling --\n";
57+
var_dump(array_unique([NAN, NAN, 1], SORT_REGULAR));
58+
59+
// Test 14: INF handling
60+
echo "\n-- INF handling --\n";
61+
var_dump(array_unique([INF, INF, -INF, -INF], SORT_REGULAR));
62+
63+
// Test 15: Bug GH-20262 - mixed numeric and alphanumeric
64+
echo "\n-- Bug GH-20262 case --\n";
65+
var_dump(array_unique(['5', '10', '3A', '5'], SORT_REGULAR));
66+
67+
// Test 16: SORT_REGULAR vs SORT_STRING comparison
68+
echo "\n-- SORT_REGULAR vs SORT_STRING --\n";
69+
$input = [true, 1, "1"];
70+
echo "SORT_REGULAR: ";
71+
var_dump(array_unique($input, SORT_REGULAR));
72+
echo "SORT_STRING: ";
73+
var_dump(array_unique($input, SORT_STRING));
74+
75+
echo "\nDone\n";
76+
?>
77+
--EXPECT--
78+
*** Testing array_unique() with SORT_REGULAR ***
79+
80+
-- Integer and string coercion --
81+
array(2) {
82+
[0]=>
83+
int(1)
84+
[2]=>
85+
int(2)
86+
}
87+
88+
-- Boolean coercion --
89+
array(2) {
90+
[0]=>
91+
bool(true)
92+
[2]=>
93+
bool(false)
94+
}
95+
96+
-- NULL coercion --
97+
array(2) {
98+
[0]=>
99+
NULL
100+
[4]=>
101+
string(1) "0"
102+
}
103+
104+
-- Float coercion --
105+
array(1) {
106+
[0]=>
107+
int(1)
108+
}
109+
110+
-- Numeric strings --
111+
array(1) {
112+
[0]=>
113+
string(2) "10"
114+
}
115+
116+
-- Leading zeros --
117+
array(1) {
118+
[0]=>
119+
string(2) "05"
120+
}
121+
122+
-- Partial numeric strings --
123+
array(2) {
124+
[0]=>
125+
string(4) "5abc"
126+
[1]=>
127+
string(1) "5"
128+
}
129+
130+
-- Whitespace in numeric strings --
131+
array(1) {
132+
[0]=>
133+
string(1) "5"
134+
}
135+
136+
-- Case sensitivity --
137+
array(3) {
138+
[0]=>
139+
string(3) "abc"
140+
[1]=>
141+
string(3) "ABC"
142+
[2]=>
143+
string(3) "Abc"
144+
}
145+
146+
-- Exponential notation --
147+
array(1) {
148+
[0]=>
149+
int(1000)
150+
}
151+
152+
-- Negative numbers --
153+
array(2) {
154+
[0]=>
155+
int(-5)
156+
[2]=>
157+
float(-5)
158+
}
159+
160+
-- Arrays --
161+
array(2) {
162+
[0]=>
163+
array(2) {
164+
[0]=>
165+
int(1)
166+
[1]=>
167+
int(2)
168+
}
169+
[2]=>
170+
array(2) {
171+
[0]=>
172+
int(1)
173+
[1]=>
174+
int(3)
175+
}
176+
}
177+
178+
-- NaN handling --
179+
array(3) {
180+
[0]=>
181+
float(NAN)
182+
[1]=>
183+
float(NAN)
184+
[2]=>
185+
int(1)
186+
}
187+
188+
-- INF handling --
189+
array(2) {
190+
[0]=>
191+
float(INF)
192+
[2]=>
193+
float(-INF)
194+
}
195+
196+
-- Bug GH-20262 case --
197+
array(3) {
198+
[0]=>
199+
string(1) "5"
200+
[1]=>
201+
string(2) "10"
202+
[2]=>
203+
string(2) "3A"
204+
}
205+
206+
-- SORT_REGULAR vs SORT_STRING --
207+
SORT_REGULAR: array(1) {
208+
[0]=>
209+
bool(true)
210+
}
211+
SORT_STRING: array(1) {
212+
[0]=>
213+
bool(true)
214+
}
215+
216+
Done
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
--TEST--
2+
Bug GH-20262 (array_unique() with SORT_REGULAR fails to remove duplicates with mixed strings)
3+
--FILE--
4+
<?php
5+
6+
// Original bug report case
7+
$units = ['5', '10', '3A', '5'];
8+
var_dump(array_unique($units, SORT_REGULAR));
9+
10+
?>
11+
--EXPECT--
12+
array(3) {
13+
[0]=>
14+
string(1) "5"
15+
[1]=>
16+
string(2) "10"
17+
[2]=>
18+
string(2) "3A"
19+
}

0 commit comments

Comments
 (0)