diff --git a/python-package/xgboost/compat.py b/python-package/xgboost/compat.py index 96ff86a4a90b..da9dd1eb38ed 100644 --- a/python-package/xgboost/compat.py +++ b/python-package/xgboost/compat.py @@ -49,28 +49,19 @@ class DataFrame(object): # cudf try: - from cudf.dataframe import DataFrame as CUDF - from cudf.dataframe.column import Column as CUDF_COL - from libgdf_cffi import ffi as CUDF_FFI + from cudf.dataframe import DataFrame as CudfDataFrame + from cudf.dataframe.column import Column as CudfColumn CUDF_INSTALLED = True except ImportError: - class CUDF(object): + class CudfDataFrame(object): """ dummy object for cudf.dataframe.DataFrame """ pass - class CUDF_COL(object): + class CudfColumn(object): """ dummy object for cudf.dataframe.column.Column """ pass - class CUDF_FFI(object): - """ dummy object for libgdf_cffi.ffi ... FFI bindings to cudf """ - def new(self, *args, **kwargs): - pass - - def cast(self, *args, **kwargs): - pass - CUDF_INSTALLED = False # dt diff --git a/python-package/xgboost/core.py b/python-package/xgboost/core.py index 8c5e970f0bb1..38b0bd0b0a19 100644 --- a/python-package/xgboost/core.py +++ b/python-package/xgboost/core.py @@ -19,7 +19,7 @@ import numpy as np import scipy.sparse -from .compat import (STRING_TYPES, PY3, DataFrame, CUDF, CUDF_COL, CUDF_FFI, MultiIndex, py_str, +from .compat import (STRING_TYPES, PY3, DataFrame, CudfDataFrame, CudfColumn, MultiIndex, py_str, PANDAS_INSTALLED, DataTable) from .libpath import find_lib_path @@ -273,6 +273,7 @@ def _maybe_pandas_label(label): return label + DT_TYPE_MAPPER = {'bool': 'bool', 'int': 'int', 'real': 'float'} DT_TYPE_MAPPER2 = {'bool': 'i', 'int': 'int', 'real': 'float'} @@ -398,7 +399,7 @@ def __init__(self, data, label=None, missing=None, _check_call(_LIB.XGDMatrixCreateFromFile(c_str(data), ctypes.c_int(silent), ctypes.byref(self.handle))) - elif isinstance(data, CUDF): + elif isinstance(data, CudfDataFrame): self._init_from_cudf(data) elif isinstance(data, 
scipy.sparse.csr_matrix): self._init_from_csr(data) @@ -419,14 +420,14 @@ def __init__(self, data, label=None, missing=None, if label is not None: if isinstance(label, np.ndarray): self.set_label_npy2d(label) - elif isinstance(label, (CUDF, CUDF_COL)): - self.set_cudf_info('label', label) + elif isinstance(label, (CudfDataFrame, CudfColumn)): + self._set_cudf_info('label', label) else: self.set_label(label) if weight is not None: if isinstance(weight, np.ndarray): self.set_weight_npy2d(weight) - elif isinstance(weight, (CUDF, CUDF_COL)): - self.set_cudf_info('weight', weight) + elif isinstance(weight, (CudfDataFrame, CudfColumn)): + self._set_cudf_info('weight', weight) else: self.set_weight(weight) @@ -434,18 +435,31 @@ def __init__(self, data, label=None, missing=None, self.feature_names = feature_names self.feature_types = feature_types + def _set_cudf_info(self, field, data): + """ + Initialize info field from a GPU data frame or column. + """ + if isinstance(data, CudfDataFrame): + col_ptrs = data.to_interchange() + else: + # data is a single CUDF column: wrap its pointer in a one-element ctypes array instance + col_ptrs = (ctypes.c_void_p * 1)() + col_ptrs[0] = data.to_interchange() + _check_call(_LIB.XGDMatrixSetCUDFInfo + (self.handle, c_str(field), + col_ptrs, + ctypes.c_size_t(len(col_ptrs)))) + def _init_from_cudf(self, df): """ Initialize data from a GPU data frame. """ self.handle = ctypes.c_void_p() - col_ptrs = [df[col]._column.cffi_view for col in df.columns] - col_ptr_arr = CUDF_FFI.new('gdf_column*[]', col_ptrs) _check_call(_LIB.XGDMatrixCreateFromCUDF - (ctypes.c_void_p(int(CUDF_FFI.cast('uintptr_t', col_ptr_arr))), + (df.to_interchange(), ctypes.c_size_t(len(df.columns)), ctypes.byref(self.handle))) - + def _init_from_csr(self, csr): """ Initialize data from a CSR matrix. 
@@ -606,6 +620,9 @@ def set_float_info(self, field, data): and isinstance(data.base, np.ndarray) and (not data.flags.c_contiguous): self.set_float_info_npy2d(field, data) return + elif isinstance(data, (CudfDataFrame, CudfColumn)): + self._set_cudf_info(field, data) + return c_data = c_array(ctypes.c_float, data) _check_call(_LIB.XGDMatrixSetFloatInfo(self.handle, c_str(field), @@ -638,19 +655,6 @@ def set_float_info_npy2d(self, field, data): c_data, c_bst_ulong(len(data)))) - def set_cudf_info(self, field, data): - col_ptrs = [] - if isinstance(data, CUDF): - col_ptrs = [data[col]._column.cffi_view for col in data.columns] - else: - # data is a single CUDF column - col_ptrs = [data.cffi_view] - col_ptr_arr = CUDF_FFI.new('gdf_column*[]', col_ptrs) - _check_call(_LIB.XGDMatrixSetCUDFInfo - (self.handle, c_str(field), - ctypes.c_void_p(int(CUDF_FFI.cast('uintptr_t', col_ptr_arr))), - ctypes.c_size_t(len(col_ptrs)))) - def set_uint_info(self, field, data): """Set uint type property into the DMatrix. 
diff --git a/tests/python-gpu/test_gpu_gdf.py b/tests/python-gpu/test_gpu_gdf.py index 2dfd00f751ba..953fa737ef3b 100644 --- a/tests/python-gpu/test_gpu_gdf.py +++ b/tests/python-gpu/test_gpu_gdf.py @@ -1,45 +1,53 @@ +import unittest +import pytest +import testing as tm import numpy as np -import pandas as pd -try: - import cudf.dataframe as gdf -except ImportError as e: - print("Failed to import cuDF: " + str(e)) - print("Skipping this test") - return 0 from sklearn import datasets import sys -import unittest import xgboost as xgb - from regression_test_utilities import run_suite, parameter_combinations, \ assert_results_non_increasing, Dataset +try: + import cudf.dataframe as cudf +except ImportError: + pass + +pytestmark = pytest.mark.skipif( + tm.no_cudf()['condition'], + reason=tm.no_cudf()['reason']) -def get_gdf(): + +def get_cudf(): rng = np.random.RandomState(199) n = 50000 m = 20 - sparsity = 0.25 X, y = datasets.make_regression(n, m, random_state=rng) Xy = (np.ascontiguousarray - (np.transpose(np.concatenate((X, np.expand_dims(y, axis=1)), axis=1)))) - df = gdf.DataFrame(list(zip(['col%d' % i for i in range(m+1)], Xy))) + (np.transpose(np.concatenate((X, np.expand_dims(y, axis=1)), axis=1)))) + df = cudf.DataFrame(list(zip(['col%d' % i for i in range(m + 1)], Xy))) all_columns = list(df.columns) - cols_X = all_columns[0:len(all_columns)-1] - cols_y = [all_columns[len(all_columns)-1]] + cols_X = all_columns[0:len(all_columns) - 1] + cols_y = [all_columns[len(all_columns) - 1]] return df[cols_X], df[cols_y] -class TestGPU(unittest.TestCase): +class TestCudf(unittest.TestCase): + cudf_datasets = [Dataset("GDF", get_cudf, "reg:linear", "rmse")] - gdf_datasets = [Dataset("GDF", get_gdf, "reg:linear", "rmse")] - - def test_gdf(self): + def test_cudf(self): variable_param = {'n_gpus': [1], 'max_depth': [10], 'max_leaves': [255], 'max_bin': [255], 'grow_policy': ['lossguide']} for param in parameter_combinations(variable_param): param['tree_method'] = 'gpu_hist' 
gpu_results = run_suite(param, num_rounds=20, - select_datasets=self.gdf_datasets) + select_datasets=self.cudf_datasets) assert_results_non_increasing(gpu_results, 1e-2) + + def test_set_info_single_column(self): + X, y = get_cudf() + y = y[:, 0] + dtrain = xgb.DMatrix(X, y) + dtrain.set_float_info("weight", y) + dtrain.set_base_margin(y) diff --git a/tests/python/testing.py b/tests/python/testing.py index 234e3952716e..ae6064c792cf 100644 --- a/tests/python/testing.py +++ b/tests/python/testing.py @@ -1,5 +1,5 @@ # coding: utf-8 -from xgboost.compat import SKLEARN_INSTALLED, PANDAS_INSTALLED, DT_INSTALLED +from xgboost.compat import SKLEARN_INSTALLED, PANDAS_INSTALLED, DT_INSTALLED, CUDF_INSTALLED def no_sklearn(): @@ -17,6 +17,11 @@ def no_dt(): 'reason': 'Datatable is not installed.'} +def no_cudf(): + return {'condition': not CUDF_INSTALLED, + 'reason': 'cudf is not installed.'} + + def no_matplotlib(): reason = 'Matplotlib is not installed.' try: