Skip to content

Commit

Permalink
Implement joins in pylibcudf (#14972)
Browse files Browse the repository at this point in the history
Contributes to #13921

Authors:
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Ashwin Srinath (https://github.com/shwina)

URL: #14972
  • Loading branch information
vyasr authored Feb 6, 2024
1 parent 506d575 commit c7e3dc5
Show file tree
Hide file tree
Showing 10 changed files with 218 additions and 62 deletions.
1 change: 1 addition & 0 deletions docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ This page provides API documentation for pylibcudf.
copying
gpumemoryview
groupby
join
scalar
table
types
Expand Down
6 changes: 6 additions & 0 deletions docs/cudf/source/user_guide/api_docs/pylibcudf/join.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
====
join
====

.. automodule:: cudf._lib.pylibcudf.join
:members:
9 changes: 5 additions & 4 deletions python/cudf/cudf/_lib/cpp/join.pxd
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2020, NVIDIA CORPORATION.
# Copyright (c) 2020-2024, NVIDIA CORPORATION.

from libcpp cimport bool
from libcpp.memory cimport unique_ptr
Expand All @@ -13,19 +13,20 @@ from cudf._lib.cpp.table.table_view cimport table_view
from cudf._lib.cpp.types cimport size_type

ctypedef unique_ptr[device_uvector[size_type]] gather_map_type
ctypedef pair[gather_map_type, gather_map_type] gather_map_pair_type

cdef extern from "cudf/join.hpp" namespace "cudf" nogil:
cdef pair[gather_map_type, gather_map_type] inner_join(
cdef gather_map_pair_type inner_join(
const table_view left_keys,
const table_view right_keys,
) except +

cdef pair[gather_map_type, gather_map_type] left_join(
cdef gather_map_pair_type left_join(
const table_view left_keys,
const table_view right_keys,
) except +

cdef pair[gather_map_type, gather_map_type] full_join(
cdef gather_map_pair_type full_join(
const table_view left_keys,
const table_view right_keys,
) except +
Expand Down
76 changes: 22 additions & 54 deletions python/cudf/cudf/_lib/join.pyx
Original file line number Diff line number Diff line change
@@ -1,73 +1,41 @@
# Copyright (c) 2020-2023, NVIDIA CORPORATION.
# Copyright (c) 2020-2024, NVIDIA CORPORATION.

from cudf.core.buffer import acquire_spill_lock

from libcpp.memory cimport make_unique, unique_ptr
from libcpp.pair cimport pair
from libcpp.utility cimport move

from rmm._lib.device_buffer cimport device_buffer

cimport cudf._lib.cpp.join as cpp_join
from cudf._lib.column cimport Column
from cudf._lib.cpp.column.column cimport column
from cudf._lib.cpp.table.table_view cimport table_view
from cudf._lib.cpp.types cimport data_type, size_type, type_id
from cudf._lib.utils cimport table_view_from_columns

from cudf._lib import pylibcudf

# The functions below return the *gathermaps* that represent
# the join result when joining on the keys `lhs` and `rhs`.


@acquire_spill_lock()
def join(list lhs, list rhs, how=None):
cdef pair[cpp_join.gather_map_type, cpp_join.gather_map_type] c_result
cdef table_view c_lhs = table_view_from_columns(lhs)
cdef table_view c_rhs = table_view_from_columns(rhs)

if how == "inner":
with nogil:
c_result = move(cpp_join.inner_join(c_lhs, c_rhs))
elif how == "left":
with nogil:
c_result = move(cpp_join.left_join(c_lhs, c_rhs))
elif how == "outer":
with nogil:
c_result = move(cpp_join.full_join(c_lhs, c_rhs))
else:
if how == "outer":
how = "full"
if (join_func := getattr(pylibcudf.join, f"{how}_join", None)) is None:
raise ValueError(f"Invalid join type {how}")

cdef Column left_rows = _gather_map_as_column(move(c_result.first))
cdef Column right_rows = _gather_map_as_column(move(c_result.second))
return left_rows, right_rows
left_rows, right_rows = join_func(
pylibcudf.Table([c.to_pylibcudf(mode="read") for c in lhs]),
pylibcudf.Table([c.to_pylibcudf(mode="read") for c in rhs]),
)
return Column.from_pylibcudf(left_rows), Column.from_pylibcudf(right_rows)


@acquire_spill_lock()
def semi_join(list lhs, list rhs, how=None):
# left-semi and left-anti joins
cdef cpp_join.gather_map_type c_result
cdef table_view c_lhs = table_view_from_columns(lhs)
cdef table_view c_rhs = table_view_from_columns(rhs)

if how == "leftsemi":
with nogil:
c_result = move(cpp_join.left_semi_join(c_lhs, c_rhs))
elif how == "leftanti":
with nogil:
c_result = move(cpp_join.left_anti_join(c_lhs, c_rhs))
else:
if (
join_func := getattr(
pylibcudf.join, f"{how.replace('left', 'left_')}_join", None
)
) is None:
raise ValueError(f"Invalid join type {how}")

cdef Column left_rows = _gather_map_as_column(move(c_result))
return left_rows, None


cdef Column _gather_map_as_column(cpp_join.gather_map_type gather_map):
# help to convert a gather map to a Column
cdef device_buffer c_empty
cdef size_type size = gather_map.get()[0].size()
cdef unique_ptr[column] c_col = move(make_unique[column](
data_type(type_id.INT32),
size,
gather_map.get()[0].release(), move(c_empty), 0))
return Column.from_unique_ptr(move(c_col))
return Column.from_pylibcudf(
join_func(
pylibcudf.Table([c.to_pylibcudf(mode="read") for c in lhs]),
pylibcudf.Table([c.to_pylibcudf(mode="read") for c in rhs]),
)
), None
5 changes: 3 additions & 2 deletions python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,9 @@
# the License.
# =============================================================================

set(cython_sources aggregation.pyx binaryop.pyx column.pyx copying.pyx gpumemoryview.pyx
groupby.pyx interop.pyx scalar.pyx table.pyx types.pyx unary.pyx utils.pyx
set(cython_sources
aggregation.pyx binaryop.pyx column.pyx copying.pyx gpumemoryview.pyx groupby.pyx interop.pyx
join.pyx scalar.pyx table.pyx types.pyx unary.pyx utils.pyx
)
set(linked_libraries cudf::cudf)
rapids_cython_create_modules(
Expand Down
3 changes: 2 additions & 1 deletion python/cudf/cudf/_lib/pylibcudf/__init__.pxd
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# Copyright (c) 2023-2024, NVIDIA CORPORATION.

# TODO: Verify consistent usage of relative/absolute imports in pylibcudf.
from . cimport aggregation, binaryop, copying, groupby, interop, unary
from . cimport aggregation, binaryop, copying, groupby, interop, join, unary
from .column cimport Column
from .gpumemoryview cimport gpumemoryview
from .scalar cimport Scalar
Expand All @@ -21,6 +21,7 @@ __all__ = [
"gpumemoryview",
"groupby",
"interop",
"join",
"unary",
"types",
]
3 changes: 2 additions & 1 deletion python/cudf/cudf/_lib/pylibcudf/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# Copyright (c) 2023-2024, NVIDIA CORPORATION.

from . import aggregation, binaryop, copying, groupby, interop, unary
from . import aggregation, binaryop, copying, groupby, interop, join, unary
from .column import Column
from .gpumemoryview import gpumemoryview
from .scalar import Scalar
Expand All @@ -19,6 +19,7 @@
"gpumemoryview",
"groupby",
"interop",
"join",
"unary",
"types",
]
3 changes: 3 additions & 0 deletions python/cudf/cudf/_lib/pylibcudf/groupby.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,9 @@ cdef class GroupBy:
c_requests.push_back(move(request._to_libcudf_agg_request()))

cdef pair[unique_ptr[table], vector[aggregation_result]] c_res
# TODO: Need to capture C++ exceptions indicating that an invalid type was used.
# We rely on libcudf to tell us this rather than checking the types beforehand
# ourselves.
with nogil:
c_res = move(dereference(self.c_obj).aggregate(c_requests))
return GroupBy._parse_outputs(move(c_res))
Expand Down
15 changes: 15 additions & 0 deletions python/cudf/cudf/_lib/pylibcudf/join.pxd
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from .column cimport Column
from .table cimport Table


# Join entry points that return a tuple of Columns: one gather map for the
# left table and one for the right table.
cpdef tuple inner_join(Table left_keys, Table right_keys)

cpdef tuple left_join(Table left_keys, Table right_keys)

cpdef tuple full_join(Table left_keys, Table right_keys)

# Join entry points that return a single Column of left-table row indices.
cpdef Column left_semi_join(Table left_keys, Table right_keys)

cpdef Column left_anti_join(Table left_keys, Table right_keys)
159 changes: 159 additions & 0 deletions python/cudf/cudf/_lib/pylibcudf/join.pyx
Original file line number Diff line number Diff line change
@@ -0,0 +1,159 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from cython.operator import dereference

from libcpp.memory cimport make_unique
from libcpp.utility cimport move

from rmm._lib.device_buffer cimport device_buffer

from cudf._lib.cpp cimport join as cpp_join
from cudf._lib.cpp.column.column cimport column
from cudf._lib.cpp.types cimport data_type, size_type, type_id

from .column cimport Column
from .table cimport Table


cdef Column _column_from_gather_map(cpp_join.gather_map_type gather_map):
    """Convert a libcudf gather map into a pylibcudf Column.

    The buffer released from ``gather_map`` becomes the data of a newly
    constructed INT32 libcudf column, which is then wrapped with
    :meth:`Column.from_libcudf`.
    """
    # Default-constructed (empty) buffer handed to the column constructor
    # alongside the data buffer; the trailing 0 is passed with it.
    cdef device_buffer c_empty_buf
    # Query the length before release() is called on the gather map below.
    cdef size_type num_rows = gather_map.get()[0].size()
    return Column.from_libcudf(
        move(
            make_unique[column](
                data_type(type_id.INT32),
                num_rows,
                gather_map.get()[0].release(),
                move(c_empty_buf),
                0
            )
        )
    )


cpdef tuple inner_join(Table left_keys, Table right_keys):
    """Perform an inner join between two tables.

    For details, see :cpp:func:`inner_join`.

    Parameters
    ----------
    left_keys : Table
        The left table to join.
    right_keys : Table
        The right table to join.

    Returns
    -------
    Tuple[Column, Column]
        A tuple containing the row indices from the left and right tables
        after the join.
    """
    cdef cpp_join.gather_map_pair_type c_maps
    # The libcudf call runs with the GIL released.
    with nogil:
        c_maps = cpp_join.inner_join(left_keys.view(), right_keys.view())
    # Convert each half of the pair, left first, into a Column.
    cdef Column left_rows = _column_from_gather_map(move(c_maps.first))
    cdef Column right_rows = _column_from_gather_map(move(c_maps.second))
    return left_rows, right_rows


cpdef tuple left_join(Table left_keys, Table right_keys):
    """Perform a left join between two tables.

    For details, see :cpp:func:`left_join`.

    Parameters
    ----------
    left_keys : Table
        The left table to join.
    right_keys : Table
        The right table to join.

    Returns
    -------
    Tuple[Column, Column]
        A tuple containing the row indices from the left and right tables
        after the join.
    """
    cdef cpp_join.gather_map_pair_type c_maps
    # The libcudf call runs with the GIL released.
    with nogil:
        c_maps = cpp_join.left_join(left_keys.view(), right_keys.view())
    # Convert each half of the pair, left first, into a Column.
    cdef Column left_rows = _column_from_gather_map(move(c_maps.first))
    cdef Column right_rows = _column_from_gather_map(move(c_maps.second))
    return left_rows, right_rows


cpdef tuple full_join(Table left_keys, Table right_keys):
    """Perform a full join between two tables.

    For details, see :cpp:func:`full_join`.

    Parameters
    ----------
    left_keys : Table
        The left table to join.
    right_keys : Table
        The right table to join.

    Returns
    -------
    Tuple[Column, Column]
        A tuple containing the row indices from the left and right tables
        after the join.
    """
    cdef cpp_join.gather_map_pair_type c_maps
    # The libcudf call runs with the GIL released.
    with nogil:
        c_maps = cpp_join.full_join(left_keys.view(), right_keys.view())
    # Convert each half of the pair, left first, into a Column.
    cdef Column left_rows = _column_from_gather_map(move(c_maps.first))
    cdef Column right_rows = _column_from_gather_map(move(c_maps.second))
    return left_rows, right_rows


cpdef Column left_semi_join(Table left_keys, Table right_keys):
    """Perform a left semi join between two tables.

    For details, see :cpp:func:`left_semi_join`.

    Parameters
    ----------
    left_keys : Table
        The left table to join.
    right_keys : Table
        The right table to join.

    Returns
    -------
    Column
        A column containing the row indices from the left table after the
        join.
    """
    cdef cpp_join.gather_map_type c_map
    # The libcudf call runs with the GIL released.
    with nogil:
        c_map = cpp_join.left_semi_join(left_keys.view(), right_keys.view())
    cdef Column left_rows = _column_from_gather_map(move(c_map))
    return left_rows


cpdef Column left_anti_join(Table left_keys, Table right_keys):
    """Perform a left anti join between two tables.

    For details, see :cpp:func:`left_anti_join`.

    Parameters
    ----------
    left_keys : Table
        The left table to join.
    right_keys : Table
        The right table to join.

    Returns
    -------
    Column
        A column containing the row indices from the left table after the
        join.
    """
    cdef cpp_join.gather_map_type c_map
    # The libcudf call runs with the GIL released.
    with nogil:
        c_map = cpp_join.left_anti_join(left_keys.view(), right_keys.view())
    cdef Column left_rows = _column_from_gather_map(move(c_map))
    return left_rows

0 comments on commit c7e3dc5

Please sign in to comment.