Skip to content

Commit

Permalink
Tidy Python code
Browse files Browse the repository at this point in the history
  • Loading branch information
RAMitchell committed Mar 1, 2019
1 parent 0eee6da commit 63c9f8b
Show file tree
Hide file tree
Showing 4 changed files with 63 additions and 56 deletions.
17 changes: 4 additions & 13 deletions python-package/xgboost/compat.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,28 +49,19 @@ class DataFrame(object):

# cudf
try:
from cudf.dataframe import DataFrame as CUDF
from cudf.dataframe.column import Column as CUDF_COL
from libgdf_cffi import ffi as CUDF_FFI
from cudf.dataframe import DataFrame as CudfDataFrame
from cudf.dataframe.column import Column as CudfColumn
CUDF_INSTALLED = True
except ImportError:

class CUDF(object):
class CudfDataFrame(object):
""" dummy object for cudf.dataframe.DataFrame """
pass

class CUDF_COL(object):
class CudfColumn(object):
""" dummy object for cudf.dataframe.column.Column """
pass

class CUDF_FFI(object):
""" dummy object for libgdf_cffi.ffi ... FFI bindings to cudf """
def new(self, *args, **kwargs):
pass

def cast(self, *args, **kwargs):
pass

CUDF_INSTALLED = False

# dt
Expand Down
47 changes: 25 additions & 22 deletions python-package/xgboost/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
import numpy as np
import scipy.sparse

from .compat import (STRING_TYPES, PY3, DataFrame, CUDF, CUDF_COL, CUDF_FFI, MultiIndex, py_str,
from .compat import (STRING_TYPES, PY3, DataFrame, CudfDataFrame, CudfColumn, MultiIndex, py_str,
PANDAS_INSTALLED, DataTable)
from .libpath import find_lib_path

Expand Down Expand Up @@ -273,6 +273,7 @@ def _maybe_pandas_label(label):

return label


DT_TYPE_MAPPER = {'bool': 'bool', 'int': 'int', 'real': 'float'}

DT_TYPE_MAPPER2 = {'bool': 'i', 'int': 'int', 'real': 'float'}
Expand Down Expand Up @@ -398,7 +399,7 @@ def __init__(self, data, label=None, missing=None,
_check_call(_LIB.XGDMatrixCreateFromFile(c_str(data),
ctypes.c_int(silent),
ctypes.byref(self.handle)))
elif isinstance(data, CUDF):
elif isinstance(data, CudfDataFrame):
self._init_from_cudf(data)
elif isinstance(data, scipy.sparse.csr_matrix):
self._init_from_csr(data)
Expand All @@ -419,33 +420,46 @@ def __init__(self, data, label=None, missing=None,
if label is not None:
if isinstance(label, np.ndarray):
self.set_label_npy2d(label)
elif isinstance(label, (CUDF, CUDF_COL)):
self.set_cudf_info('label', label)
elif isinstance(label, (CudfDataFrame, CudfColumn)):
self._set_cudf_info('label', label)
else:
self.set_label(label)
if weight is not None:
if isinstance(weight, np.ndarray):
self.set_weight_npy2d(weight)
elif isinstance(weight, (CUDF, CUDF_COL)):
elif isinstance(weight, (CudfDataFrame, CudfColumn)):
self.set_cudf_info('weight', weight)
else:
self.set_weight(weight)

self.feature_names = feature_names
self.feature_types = feature_types

def _set_cudf_info(self, field, data):
    """Initialize an info field from a GPU data frame or column.

    Parameters
    ----------
    field : str
        Name of the info field to set; it is passed to the native
        library through ``c_str``.
    data : CudfDataFrame or CudfColumn
        Source of the values. Its ``to_interchange()`` result is what
        gets handed to ``XGDMatrixSetCUDFInfo``.
        NOTE(review): the return type of ``to_interchange()`` is not
        visible here -- confirm it supports ``len()`` for a data frame
        and is assignable to a ``c_void_p`` slot for a single column.
    """
    if isinstance(data, CudfDataFrame):
        col_ptrs = data.to_interchange()
    else:
        # data is a single cudf column. ``ctypes.c_void_p * 1`` only
        # names the array *type*; it must be instantiated before it can
        # be indexed or measured with ``len()``.
        col_ptrs = (ctypes.c_void_p * 1)()
        col_ptrs[0] = data.to_interchange()
    _check_call(_LIB.XGDMatrixSetCUDFInfo(self.handle,
                                          c_str(field),
                                          col_ptrs,
                                          ctypes.c_size_t(len(col_ptrs))))

def _init_from_cudf(self, df):
"""
Initialize data from a GPU data frame.
"""
self.handle = ctypes.c_void_p()
col_ptrs = [df[col]._column.cffi_view for col in df.columns]
col_ptr_arr = CUDF_FFI.new('gdf_column*[]', col_ptrs)
_check_call(_LIB.XGDMatrixCreateFromCUDF
(ctypes.c_void_p(int(CUDF_FFI.cast('uintptr_t', col_ptr_arr))),
(df.to_interchange(),
ctypes.c_size_t(len(df.columns)),
ctypes.byref(self.handle)))

def _init_from_csr(self, csr):
"""
Initialize data from a CSR matrix.
Expand Down Expand Up @@ -606,6 +620,8 @@ def set_float_info(self, field, data):
and isinstance(data.base, np.ndarray) and (not data.flags.c_contiguous):
self.set_float_info_npy2d(field, data)
return
elif isinstance(data, (CudfDataFrame, CudfColumn)):
self._set_cudf_info(field, data)
c_data = c_array(ctypes.c_float, data)
_check_call(_LIB.XGDMatrixSetFloatInfo(self.handle,
c_str(field),
Expand Down Expand Up @@ -638,19 +654,6 @@ def set_float_info_npy2d(self, field, data):
c_data,
c_bst_ulong(len(data))))

def set_cudf_info(self, field, data):
    """Set an info field of the DMatrix from a cudf data frame or column.

    The cffi views of the column(s) are packed into a ``gdf_column*[]``
    array whose address is passed to ``XGDMatrixSetCUDFInfo`` together
    with the number of columns.
    """
    if isinstance(data, CUDF):
        # A data frame contributes one cffi view per column.
        views = [data[name]._column.cffi_view for name in data.columns]
    else:
        # data is a single CUDF column
        views = [data.cffi_view]
    view_array = CUDF_FFI.new('gdf_column*[]', views)
    _check_call(_LIB.XGDMatrixSetCUDFInfo(
        self.handle,
        c_str(field),
        ctypes.c_void_p(int(CUDF_FFI.cast('uintptr_t', view_array))),
        ctypes.c_size_t(len(views))))

def set_uint_info(self, field, data):
"""Set uint type property into the DMatrix.
Expand Down
48 changes: 28 additions & 20 deletions tests/python-gpu/test_gpu_gdf.py
Original file line number Diff line number Diff line change
@@ -1,45 +1,53 @@
import unittest
import pytest
import testing as tm
import numpy as np
import pandas as pd
try:
import cudf.dataframe as gdf
except ImportError as e:
print("Failed to import cuDF: " + str(e))
print("Skipping this test")
return 0
from sklearn import datasets
import sys
import unittest
import xgboost as xgb

from regression_test_utilities import run_suite, parameter_combinations, \
assert_results_non_increasing, Dataset

try:
import cudf.dataframe as cudf
except ImportError:
pass

pytestmark = pytest.mark.skipif(
tm.no_cudf()['condition'],
reason=tm.no_cudf()['reason'])

def get_gdf():

def get_cudf():
rng = np.random.RandomState(199)
n = 50000
m = 20
sparsity = 0.25
X, y = datasets.make_regression(n, m, random_state=rng)
Xy = (np.ascontiguousarray
(np.transpose(np.concatenate((X, np.expand_dims(y, axis=1)), axis=1))))
df = gdf.DataFrame(list(zip(['col%d' % i for i in range(m+1)], Xy)))
(np.transpose(np.concatenate((X, np.expand_dims(y, axis=1)), axis=1))))
df = cudf.DataFrame(list(zip(['col%d' % i for i in range(m + 1)], Xy)))
all_columns = list(df.columns)
cols_X = all_columns[0:len(all_columns)-1]
cols_y = [all_columns[len(all_columns)-1]]
cols_X = all_columns[0:len(all_columns) - 1]
cols_y = [all_columns[len(all_columns) - 1]]
return df[cols_X], df[cols_y]


class TestGPU(unittest.TestCase):
class TestCudf(unittest.TestCase):
cudf_datasets = [Dataset("GDF", get_cudf, "reg:linear", "rmse")]

gdf_datasets = [Dataset("GDF", get_gdf, "reg:linear", "rmse")]

def test_gdf(self):
def test_cudf(self):
variable_param = {'n_gpus': [1], 'max_depth': [10], 'max_leaves': [255],
'max_bin': [255],
'grow_policy': ['lossguide']}
for param in parameter_combinations(variable_param):
param['tree_method'] = 'gpu_hist'
gpu_results = run_suite(param, num_rounds=20,
select_datasets=self.gdf_datasets)
select_datasets=self.cudf_datasets)
assert_results_non_increasing(gpu_results, 1e-2)

def test_set_info_single_column(self):
    """Build a DMatrix from cudf data and set info fields from one column."""
    features, target = get_cudf()
    # Index the target frame with [:, 0]; the result is reused as the
    # label, the "weight" float info and the base margin.
    column = target[:, 0]
    dmat = xgb.DMatrix(features, column)
    dmat.set_float_info("weight", column)
    dmat.set_base_margin(column)
7 changes: 6 additions & 1 deletion tests/python/testing.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# coding: utf-8
from xgboost.compat import SKLEARN_INSTALLED, PANDAS_INSTALLED, DT_INSTALLED
from xgboost.compat import SKLEARN_INSTALLED, PANDAS_INSTALLED, DT_INSTALLED, CUDF_INSTALLED


def no_sklearn():
Expand All @@ -17,6 +17,11 @@ def no_dt():
'reason': 'Datatable is not installed.'}


def no_cudf():
    """Return the skip condition and reason used when cudf is unavailable."""
    cudf_missing = not CUDF_INSTALLED
    return dict(condition=cudf_missing, reason='cudf is not installed.')


def no_matplotlib():
reason = 'Matplotlib is not installed.'
try:
Expand Down

0 comments on commit 63c9f8b

Please sign in to comment.