Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

【Hackathon 6th No.10】Add isin API to Paddle -part #64001

Merged
merged 9 commits into from
Jun 3, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions python/paddle/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -433,6 +433,7 @@
inner,
inverse,
isfinite,
isin,
isinf,
isnan,
isneginf,
Expand Down Expand Up @@ -730,6 +731,7 @@
'squeeze_',
'to_tensor',
'gather_nd',
'isin',
'isinf',
'isneginf',
'isposinf',
Expand Down
2 changes: 2 additions & 0 deletions python/paddle/tensor/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -306,6 +306,7 @@
inner,
inverse,
isfinite,
isin,
isinf,
isnan,
isneginf,
Expand Down Expand Up @@ -587,6 +588,7 @@
'kron',
'kthvalue',
'isfinite',
'isin',
'isinf',
'isnan',
'isneginf',
Expand Down
184 changes: 184 additions & 0 deletions python/paddle/tensor/math.py
Original file line number Diff line number Diff line change
Expand Up @@ -7969,3 +7969,187 @@ def sinc_(x, name=None):
paddle.sin_(x)
paddle.divide_(x, tmp)
return paddle.where(~paddle.isnan(x), x, paddle.full_like(x, 1.0))


def isin(x, test_x, assume_unique=False, invert=False, name=None):
    r"""
    Tests if each element of `x` is in `test_x`.

    Args:
        x (Tensor): The input Tensor. Supported data type: 'bfloat16', 'float16', 'float32', 'float64', 'int32', 'int64'.
        test_x (Tensor): Tensor values against which to test for each input element. Supported data type: 'bfloat16', 'float16', 'float32', 'float64', 'int32', 'int64'.
        assume_unique (bool, optional): If True, indicates both `x` and `test_x` contain unique elements, which could make the calculation faster. Note that the result may be incorrect when either tensor actually contains duplicates. Default: False.
        invert (bool, optional): Indicate whether to invert the boolean return tensor. If True, invert the results. Default: False.
        name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.

    Returns:
        out (Tensor), The output Tensor with the same shape as `x`.

    Examples:
        .. code-block:: python

            >>> import paddle
            >>> paddle.set_device('cpu')
            >>> x = paddle.to_tensor([-0., -2.1, 2.5, 1.0, -2.1], dtype='float32')
            >>> test_x = paddle.to_tensor([-2.1, 2.5], dtype='float32')
            >>> res = paddle.isin(x, test_x)
            >>> print(res)
            Tensor(shape=[5], dtype=bool, place=Place(cpu), stop_gradient=True,
            [False, True, True, False, True])

            >>> x = paddle.to_tensor([-0., -2.1, 2.5, 1.0, -2.1], dtype='float32')
            >>> test_x = paddle.to_tensor([-2.1, 2.5], dtype='float32')
            >>> res = paddle.isin(x, test_x, invert=True)
            >>> print(res)
            Tensor(shape=[5], dtype=bool, place=Place(cpu), stop_gradient=True,
            [True, False, False, True, False])

            >>> # Set `assume_unique` to True only when `x` and `test_x` contain unique values, otherwise the result may be incorrect.
            >>> x = paddle.to_tensor([0., 1., 2.]*20).reshape([20, 3])
            >>> test_x = paddle.to_tensor([0., 1.]*20)
            >>> correct_result = paddle.isin(x, test_x, assume_unique=False)
            >>> print(correct_result)
            Tensor(shape=[20, 3], dtype=bool, place=Place(cpu), stop_gradient=True,
            [[True , True , False],
             [True , True , False],
             [True , True , False],
             [True , True , False],
             [True , True , False],
             [True , True , False],
             [True , True , False],
             [True , True , False],
             [True , True , False],
             [True , True , False],
             [True , True , False],
             [True , True , False],
             [True , True , False],
             [True , True , False],
             [True , True , False],
             [True , True , False],
             [True , True , False],
             [True , True , False],
             [True , True , False],
             [True , True , False]])

            >>> incorrect_result = paddle.isin(x, test_x, assume_unique=True)
            >>> print(incorrect_result)
            Tensor(shape=[20, 3], dtype=bool, place=Place(cpu), stop_gradient=True,
            [[True , True , True ],
             [True , True , True ],
             [True , True , True ],
             [True , True , True ],
             [True , True , True ],
             [True , True , True ],
             [True , True , True ],
             [True , True , True ],
             [True , True , True ],
             [True , True , True ],
             [True , True , True ],
             [True , True , True ],
             [True , True , True ],
             [True , True , True ],
             [True , True , True ],
             [True , True , True ],
             [True , True , True ],
             [True , True , True ],
             [True , True , True ],
             [True , True , False]])

    """
    if not isinstance(x, (paddle.Tensor, Variable, paddle.pir.Value)):
        raise TypeError(f"x must be tensor type, but got {type(x)}")
    if not isinstance(test_x, (paddle.Tensor, Variable, paddle.pir.Value)):
        raise TypeError(f"test_x must be tensor type, but got {type(test_x)}")

    # bfloat16/float16 maps to 'uint16'/'float16' here; wider dtype support is
    # limited by paddle.searchsorted ('float32'/'float64'/'int32'/'int64').
    check_variable_and_dtype(
        x,
        "x",
        [
            'uint16',
            'float16',
            'float32',
            'float64',
            'int32',
            'int64',
        ],
        "isin",
    )

    check_variable_and_dtype(
        test_x,
        "test_x",
        [
            'uint16',
            'float16',
            'float32',
            'float64',
            'int32',
            'int64',
        ],
        "isin",
    )

    # A 0-D input is promoted to a 1-element 1-D tensor so the element-wise
    # logic below applies uniformly; the scalar shape is restored at the end.
    x_zero_dim = False
    if len(x.shape) == 0:
        x = x.reshape([1])
        x_zero_dim = True

    size_x = math.prod(x.shape)
    size_t = math.prod(test_x.shape)
    if size_t < math.pow(size_x, 0.145) * 10.0:
        # Brute-force search when test_x is small: broadcast-compare x against
        # every element of test_x, then reduce over the test_x axes.
        tmp = x.reshape(tuple(x.shape) + ((1,) * test_x.ndim))
        cmp = tmp == test_x
        dim = tuple(range(-1, -test_x.ndim - 1, -1))
        cmp = cmp.any(axis=dim)
        if invert:
            cmp = ~cmp
    else:
        x_flat = x.flatten()
        test_x_flat = test_x.flatten()
        if assume_unique:
            # If x and test_x both contain unique elements, sort the
            # concatenation with a stable argsort: an element of x is "in"
            # test_x exactly when it is immediately followed by an equal value
            # in the sorted order (which, given uniqueness, must come from the
            # other tensor).
            all_elements = paddle.concat([x_flat, test_x_flat])
            sorted_index = paddle.argsort(all_elements, stable=True)
            sorted_x = all_elements[sorted_index]

            duplicate_mask = paddle.full_like(sorted_index, False, dtype='bool')
            if not in_dynamic_mode():
                duplicate_mask = paddle.static.setitem(
                    duplicate_mask,
                    paddle.arange(duplicate_mask.numel() - 1),
                    sorted_x[1:] == sorted_x[:-1],
                )
            else:
                duplicate_mask[:-1] = sorted_x[1:] == sorted_x[:-1]

            if invert:
                duplicate_mask = duplicate_mask.logical_not()

            # Scatter the mask back from sorted order to original order; the
            # first x.numel() entries then correspond to the elements of x.
            # NOTE(review): this guard uses in_dynamic_or_pir_mode() while the
            # one above uses in_dynamic_mode() — confirm the asymmetry is
            # intentional for PIR.
            mask = paddle.empty_like(duplicate_mask)
            if not in_dynamic_or_pir_mode():
                mask = paddle.static.setitem(mask, sorted_index, duplicate_mask)
            else:
                mask[sorted_index] = duplicate_mask

            cmp = mask[0 : x.numel()].reshape(x.shape)
        else:
            # General case: binary-search each element of x in the sorted
            # test values. Out-of-range indices are clamped to 0; the final
            # equality check rejects those false candidates.
            sorted_test_x = paddle.sort(test_x_flat)
            idx = paddle.searchsorted(sorted_test_x, x_flat)
            test_idx = paddle.where(
                idx < sorted_test_x.numel(),
                idx,
                paddle.zeros_like(idx, 'int64'),
            )
            cmp = sorted_test_x[test_idx] == x_flat
            cmp = cmp.logical_not() if invert else cmp
            cmp = cmp.reshape(x.shape)

    if x_zero_dim:
        return cmp.reshape([])
    else:
        return cmp
1 change: 1 addition & 0 deletions python/paddle/tensor/search.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,7 @@ def argsort(x, axis=-1, descending=False, stable=False, name=None):
x,
'x',
[
'uint16',
'float16',
'float32',
'float64',
Expand Down
1 change: 1 addition & 0 deletions test/legacy_test/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -753,6 +753,7 @@ if(WITH_DISTRIBUTE)
endif()

# setting timeout value as 15S
set_tests_properties(test_isin PROPERTIES TIMEOUT 30)
set_tests_properties(test_binomial_op PROPERTIES TIMEOUT 30)
set_tests_properties(test_run PROPERTIES TIMEOUT 120)
set_tests_properties(test_sync_batch_norm_op PROPERTIES TIMEOUT 180)
Expand Down
Loading