Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adds list.unique API #7664

Merged
Merged 14 commits on Mar 31, 2021.
Merged
13 changes: 13 additions & 0 deletions python/cudf/cudf/_lib/cpp/lists/drop_list_duplicates.pxd
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Copyright (c) 2021, NVIDIA CORPORATION.

from libcpp.memory cimport unique_ptr

from cudf._lib.cpp.lists.lists_column_view cimport lists_column_view
from cudf._lib.cpp.column.column cimport column
from cudf._lib.cpp.types cimport null_equality

# Cython declaration of the C++ function cudf::lists::drop_list_duplicates.
# The extern block is marked `nogil`, so the function may be called inside
# a `with nogil:` section; `except +` asks Cython to translate C++
# exceptions into Python exceptions.
cdef extern from "cudf/lists/drop_list_duplicates.hpp" \
        namespace "cudf::lists" nogil:
    cdef unique_ptr[column] drop_list_duplicates(
        const lists_column_view lists_column,
        null_equality nulls_equal
    ) except +
23 changes: 22 additions & 1 deletion python/cudf/cudf/_lib/lists.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,16 @@ from cudf._lib.cpp.lists.count_elements cimport (
from cudf._lib.cpp.lists.explode cimport (
explode_outer as cpp_explode_outer
)
from cudf._lib.cpp.lists.drop_list_duplicates cimport (
drop_list_duplicates as cpp_drop_list_duplicates
)
from cudf._lib.cpp.lists.lists_column_view cimport lists_column_view
from cudf._lib.cpp.column.column_view cimport column_view
from cudf._lib.cpp.column.column cimport column

from cudf._lib.cpp.table.table cimport table
from cudf._lib.cpp.table.table_view cimport table_view
from cudf._lib.cpp.types cimport size_type
from cudf._lib.cpp.types cimport size_type, null_equality

from cudf._lib.column cimport Column
from cudf._lib.table cimport Table
Expand Down Expand Up @@ -58,3 +61,21 @@ def explode_outer(Table tbl, int explode_column_idx, bool ignore_index=False):
column_names=tbl._column_names,
index_names=None if ignore_index else tbl._index_names
)


def drop_list_duplicates(Column col, bool nulls_equal):
    """
    Call the C++ ``cudf::lists::drop_list_duplicates`` on a lists column.

    Parameters
    ----------
    col : Column
        Column whose view is wrapped in a ``lists_column_view`` and passed
        to the C++ function.
    nulls_equal : bool
        If True, ``null_equality.EQUAL`` is passed to the C++ function;
        otherwise ``null_equality.UNEQUAL`` is passed.

    Returns
    -------
    Column
        Column constructed from the ``unique_ptr[column]`` returned by the
        C++ function.
    """
    cdef unique_ptr[column] c_deduped
    cdef null_equality c_null_policy
    # The view is held through a shared_ptr and dereferenced at the call
    # site (presumably because lists_column_view cannot be declared as a
    # plain stack variable in Cython -- TODO confirm).
    cdef shared_ptr[lists_column_view] c_lists = (
        make_shared[lists_column_view](col.view())
    )

    if nulls_equal:
        c_null_policy = null_equality.EQUAL
    else:
        c_null_policy = null_equality.UNEQUAL

    # The C++ call runs with the GIL released.
    with nogil:
        c_deduped = move(
            cpp_drop_list_duplicates(c_lists.get()[0], c_null_policy)
        )

    return Column.from_unique_ptr(move(c_deduped))
35 changes: 34 additions & 1 deletion python/cudf/cudf/core/column/lists.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

import cudf
from cudf._lib.copying import segmented_gather
from cudf._lib.lists import count_elements
from cudf._lib.lists import count_elements, drop_list_duplicates
from cudf.core.buffer import Buffer
from cudf.core.column import ColumnBase, as_column, column
from cudf.core.column.methods import ColumnMethodsMixin
Expand Down Expand Up @@ -285,3 +285,36 @@ def take(self, lists_indices):
raise
else:
return res

def unique(self):
    """
    Return the distinct elements of each list in the column.

    The order of the elements inside each resulting list is not
    guaranteed. Null elements are compared as equal to each other
    (``nulls_equal=True``), so repeated nulls within a list are reduced
    to a single one.

    Returns
    -------
    ListColumn

    Raises
    ------
    NotImplementedError
        If the column's second child (``children[1]``) has a list dtype,
        i.e. the lists are nested.

    Examples
    --------
    >>> s = cudf.Series([[1, 1, 2, None, None], None, [4, 4], []])
    >>> s
    0    [1.0, 1.0, 2.0, nan, nan]
    1                         None
    2                   [4.0, 4.0]
    3                           []
    dtype: list
    >>> s.list.unique() # Order of list element is not guaranteed
    0    [1.0, 2.0, nan]
    1               None
    2              [4.0]
    3                 []
    dtype: list
    """
    # Nested lists are rejected up front.
    child_dtype = self._column.children[1].dtype
    if is_list_dtype(child_dtype):
        raise NotImplementedError("Nested lists unique is not supported.")

    deduplicated = drop_list_duplicates(self._column, nulls_equal=True)
    return self._return_or_inplace(deduplicated)
21 changes: 21 additions & 0 deletions python/cudf/cudf/tests/test_list.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,3 +159,24 @@ def test_take_invalid(invalid, exception):
gs = cudf.Series([[0, 1], [2, 3]])
with exception:
gs.list.take(invalid)


@pytest.mark.parametrize(
    ("data", "expected"),
    [
        ([[1, 1, 2, 2], [], None, [3, 4, 5]], [[1, 2], [], None, [3, 4, 5]]),
        ([[1, 1, 2, 2, None, None]], [[1, 2, None]]),
        ([[2, None, 1, None, 2]], [[1, 2, None]]),
        ([[], []], [[], []]),
        ([[], None], [[], None]),
    ],
)
def test_unique(data, expected):
    """Check ``Series.list.unique`` against hand-written expectations."""
    # The order of elements returned by unique() is not guaranteed, so
    # both sides are sorted list-by-list before being compared.
    want = cudf.Series(expected).list.sort_values()
    got = cudf.Series(data).list.unique().list.sort_values()

    assert_eq(want, got)