-
Notifications
You must be signed in to change notification settings - Fork 921
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Implement joins in pylibcudf (#14972)
Contributes to #13921 Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Ashwin Srinath (https://github.com/shwina) URL: #14972
- Loading branch information
Showing
10 changed files
with
218 additions
and
62 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
==== | ||
join | ||
==== | ||
|
||
.. automodule:: cudf._lib.pylibcudf.join | ||
:members: |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,73 +1,41 @@ | ||
# Copyright (c) 2020-2023, NVIDIA CORPORATION. | ||
# Copyright (c) 2020-2024, NVIDIA CORPORATION. | ||
|
||
from cudf.core.buffer import acquire_spill_lock | ||
|
||
from libcpp.memory cimport make_unique, unique_ptr | ||
from libcpp.pair cimport pair | ||
from libcpp.utility cimport move | ||
|
||
from rmm._lib.device_buffer cimport device_buffer | ||
|
||
cimport cudf._lib.cpp.join as cpp_join | ||
from cudf._lib.column cimport Column | ||
from cudf._lib.cpp.column.column cimport column | ||
from cudf._lib.cpp.table.table_view cimport table_view | ||
from cudf._lib.cpp.types cimport data_type, size_type, type_id | ||
from cudf._lib.utils cimport table_view_from_columns | ||
|
||
from cudf._lib import pylibcudf | ||
|
||
# The functions below return the *gathermaps* that represent | ||
# the join result when joining on the keys `lhs` and `rhs`. | ||
|
||
|
||
@acquire_spill_lock()
def join(list lhs, list rhs, how=None):
    """Compute the gather maps for a two-sided join on key columns.

    Parameters
    ----------
    lhs : list
        Key columns of the left table. Each element is converted with
        ``to_pylibcudf(mode="read")``.
    rhs : list
        Key columns of the right table, converted the same way.
    how : str
        One of ``"inner"``, ``"left"``, ``"outer"`` or ``"full"``;
        ``"outer"`` is treated as an alias for ``"full"``.

    Returns
    -------
    tuple
        ``(left_rows, right_rows)``: the two results of the pylibcudf join
        function, each wrapped with ``Column.from_pylibcudf``.

    Raises
    ------
    ValueError
        If ``how`` is not one of the supported join types (including the
        default ``None``).
    """
    if how == "outer":
        how = "full"
    # Validate against an explicit allow-list instead of probing the module
    # with getattr: other ``*_join`` names (e.g. "left_semi") would also
    # resolve to a function, but one that yields a single result, and the
    # mistake would only surface later when the result is unpacked.
    if how not in ("inner", "left", "full"):
        raise ValueError(f"Invalid join type {how}")
    join_func = getattr(pylibcudf.join, f"{how}_join")

    left_rows, right_rows = join_func(
        pylibcudf.Table([c.to_pylibcudf(mode="read") for c in lhs]),
        pylibcudf.Table([c.to_pylibcudf(mode="read") for c in rhs]),
    )
    return Column.from_pylibcudf(left_rows), Column.from_pylibcudf(right_rows)
|
||
|
||
@acquire_spill_lock()
def semi_join(list lhs, list rhs, how=None):
    """Compute the left gather map for a left-semi or left-anti join.

    Parameters
    ----------
    lhs : list
        Key columns of the left table. Each element is converted with
        ``to_pylibcudf(mode="read")``.
    rhs : list
        Key columns of the right table, converted the same way.
    how : str
        Either ``"leftsemi"`` or ``"leftanti"``.

    Returns
    -------
    tuple
        ``(left_rows, None)``: the pylibcudf join result wrapped with
        ``Column.from_pylibcudf``, followed by ``None`` (there is no
        right-hand gather map for these joins).

    Raises
    ------
    ValueError
        If ``how`` is anything other than the two supported values,
        including the default ``None``.
    """
    # Dispatch explicitly. Building the attribute name from ``how`` would
    # raise AttributeError for the default ``how=None`` and would accept
    # unrelated names such as "inner", whose result is not a single column.
    if how == "leftsemi":
        join_func = pylibcudf.join.left_semi_join
    elif how == "leftanti":
        join_func = pylibcudf.join.left_anti_join
    else:
        raise ValueError(f"Invalid join type {how}")

    return Column.from_pylibcudf(
        join_func(
            pylibcudf.Table([c.to_pylibcudf(mode="read") for c in lhs]),
            pylibcudf.Table([c.to_pylibcudf(mode="read") for c in rhs]),
        )
    ), None
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
# Copyright (c) 2024, NVIDIA CORPORATION. | ||
|
||
from .column cimport Column | ||
from .table cimport Table | ||
|
||
|
||
# Two-sided joins: each takes the left and right key tables and is declared to return a tuple. | ||
cpdef tuple inner_join(Table left_keys, Table right_keys) | ||
|
||
cpdef tuple left_join(Table left_keys, Table right_keys) | ||
|
||
cpdef tuple full_join(Table left_keys, Table right_keys) | ||
|
||
# Left-only joins: same parameters, but declared to return a single Column. | ||
cpdef Column left_semi_join(Table left_keys, Table right_keys) | ||
|
||
cpdef Column left_anti_join(Table left_keys, Table right_keys) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,159 @@ | ||
# Copyright (c) 2024, NVIDIA CORPORATION. | ||
|
||
from cython.operator import dereference | ||
|
||
from libcpp.memory cimport make_unique | ||
from libcpp.utility cimport move | ||
|
||
from rmm._lib.device_buffer cimport device_buffer | ||
|
||
from cudf._lib.cpp cimport join as cpp_join | ||
from cudf._lib.cpp.column.column cimport column | ||
from cudf._lib.cpp.types cimport data_type, size_type, type_id | ||
|
||
from .column cimport Column | ||
from .table cimport Table | ||
|
||
|
||
cdef Column _column_from_gather_map(cpp_join.gather_map_type gather_map):
    # Build a Column from a libcudf gather map: the map's released buffer
    # becomes the data of a new INT32 libcudf column, which is then passed
    # to Column.from_libcudf.
    #
    # Default-constructed buffer passed in the slot after the data buffer
    # (presumably the null mask -- see the libcudf column constructor).
    cdef device_buffer no_mask
    # Read the length before release() is called on the same object below.
    cdef size_type num_rows = gather_map.get()[0].size()
    return Column.from_libcudf(
        move(
            make_unique[column](
                data_type(type_id.INT32),
                num_rows,
                gather_map.get()[0].release(),
                move(no_mask),
                0
            )
        )
    )
|
||
|
||
cpdef tuple inner_join(Table left_keys, Table right_keys):
    """Compute the gather maps of an inner join of two key tables.

    For details, see :cpp:func:`inner_join`.

    Parameters
    ----------
    left_keys : Table
        The left table to join.
    right_keys : Table
        The right table to join.

    Returns
    -------
    Tuple[Column, Column]
        Row indices into the left and right tables, in that order, that
        make up the join result.
    """
    cdef cpp_join.gather_map_pair_type gather_maps
    # The libcudf call runs with the GIL released.
    with nogil:
        gather_maps = cpp_join.inner_join(left_keys.view(), right_keys.view())

    cdef Column left_rows = _column_from_gather_map(move(gather_maps.first))
    cdef Column right_rows = _column_from_gather_map(move(gather_maps.second))
    return left_rows, right_rows
|
||
|
||
cpdef tuple left_join(Table left_keys, Table right_keys):
    """Compute the gather maps of a left join of two key tables.

    For details, see :cpp:func:`left_join`.

    Parameters
    ----------
    left_keys : Table
        The left table to join.
    right_keys : Table
        The right table to join.

    Returns
    -------
    Tuple[Column, Column]
        Row indices into the left and right tables, in that order, that
        make up the join result.
    """
    cdef cpp_join.gather_map_pair_type gather_maps
    # The libcudf call runs with the GIL released.
    with nogil:
        gather_maps = cpp_join.left_join(left_keys.view(), right_keys.view())

    cdef Column left_rows = _column_from_gather_map(move(gather_maps.first))
    cdef Column right_rows = _column_from_gather_map(move(gather_maps.second))
    return left_rows, right_rows
|
||
|
||
cpdef tuple full_join(Table left_keys, Table right_keys):
    """Compute the gather maps of a full join of two key tables.

    For details, see :cpp:func:`full_join`.

    Parameters
    ----------
    left_keys : Table
        The left table to join.
    right_keys : Table
        The right table to join.

    Returns
    -------
    Tuple[Column, Column]
        Row indices into the left and right tables, in that order, that
        make up the join result.
    """
    cdef cpp_join.gather_map_pair_type gather_maps
    # The libcudf call runs with the GIL released.
    with nogil:
        gather_maps = cpp_join.full_join(left_keys.view(), right_keys.view())

    cdef Column left_rows = _column_from_gather_map(move(gather_maps.first))
    cdef Column right_rows = _column_from_gather_map(move(gather_maps.second))
    return left_rows, right_rows
|
||
|
||
cpdef Column left_semi_join(Table left_keys, Table right_keys):
    """Compute the gather map of a left semi join of two key tables.

    For details, see :cpp:func:`left_semi_join`.

    Parameters
    ----------
    left_keys : Table
        The left table to join.
    right_keys : Table
        The right table to join.

    Returns
    -------
    Column
        Row indices into the left table that make up the join result.
    """
    cdef cpp_join.gather_map_type left_map
    # The libcudf call runs with the GIL released.
    with nogil:
        left_map = cpp_join.left_semi_join(left_keys.view(), right_keys.view())

    cdef Column left_rows = _column_from_gather_map(move(left_map))
    return left_rows
|
||
|
||
cpdef Column left_anti_join(Table left_keys, Table right_keys):
    """Compute the gather map of a left anti join of two key tables.

    For details, see :cpp:func:`left_anti_join`.

    Parameters
    ----------
    left_keys : Table
        The left table to join.
    right_keys : Table
        The right table to join.

    Returns
    -------
    Column
        Row indices into the left table that make up the join result.
    """
    cdef cpp_join.gather_map_type left_map
    # The libcudf call runs with the GIL released.
    with nogil:
        left_map = cpp_join.left_anti_join(left_keys.view(), right_keys.view())

    cdef Column left_rows = _column_from_gather_map(move(left_map))
    return left_rows