diff --git a/.gitignore b/.gitignore
index 81aad72480..cf134eb478 100644
--- a/.gitignore
+++ b/.gitignore
@@ -18,6 +18,7 @@ __pycache__
htmlcov
build/
build_prims/
+cmake-build*
cuml.egg-info/
dist/
python/cuml/**/*.cpp
@@ -29,6 +30,9 @@ log
dask-worker-space/
tmp/
+## files pickled in notebooks when run during python docstring generation
+docs/source/*.model
+
## eclipse
.project
.cproject
diff --git a/BUILD.md b/BUILD.md
index 96cb5ee868..8787c2b253 100644
--- a/BUILD.md
+++ b/BUILD.md
@@ -15,10 +15,17 @@ To install cuML from source, ensure the following dependencies are met:
9. NCCL (>=2.4)
10. UCX [optional] (>= 1.7) - enables point-to-point messaging in the cuML standard communicator. This is necessary for many multi-node multi-GPU cuML algorithms to function.
-It is recommended to use conda for environment/package management. If doing so, a convenience environment .yml file is located in `conda/environments/cuml_dec_cudax.y.yml` (replace x.y for your CUDA version). This file contains most of the dependencies mentioned above (notable exceptions are `gcc` and `zlib`). To use it, for example to create an environment named `cuml_dev` for CUDA 10.0 and Python 3.7, you can use the follow command:
+It is recommended to use conda for environment/package management. If doing so, a convenience environment .yml file is located in `conda/environments/cuml_dev_cudax.y.yml` (replace x.y with your CUDA version). This file contains most of the dependencies mentioned above (notable exceptions are `gcc` and `zlib`). To use it, for example to create an environment named `cuml_dev` for CUDA 10.2 and Python 3.7, you can use the following command:
+```bash
+conda create -n cuml_dev python=3.7
+conda env update -n cuml_dev --file=conda/environments/cuml_dev_cuda10.2.yml
```
-conda env create -n cuml_dev python=3.7 --file=conda/environments/cuml_dev_cuda10.0.yml
+
+These conda environments are based on the general RAPIDS meta packages that install common dependencies for RAPIDS projects. To install different versions of packages contained in those meta packages after creating the environment, it is recommended to remove those meta packages (without removing the actual packages contained in the environment) with the following command (with the environment activated):
+
+```bash
+conda remove --force rapids-build-env rapids-notebook-env rapids-doc-env
```
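+
+For example, a specific version of one of the contained packages can then be pinned directly (`your-pkg` is a placeholder, following the example in the [RAPIDS dependency management docs](https://docs.rapids.ai/maintainers/depmgmt/)):
+
+```bash
+conda install "your-pkg=1.0.0"
+```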

## Installing from Source:
diff --git a/CHANGELOG.md b/CHANGELOG.md
index b5f0f9238a..c2c9f83eeb 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,6 +1,145 @@
+# cuML 0.17.0 (Date TBD)
+
+## New Features
+
+## Improvements
+- PR #3070: Speed up dask/test_datasets tests
+- PR #3075: Speed up test_linear_model tests
+- PR #3078: Speed up test_incremental_pca tests
+- PR #2902: `matrix/matrix.cuh` in RAFT namespacing
+- PR #2903: Moving linalg's gemm, gemv, transpose to RAFT namespaces
+- PR #2905: `stats` prims `mean_center`, `sum` to RAFT namespaces
+- PR #2904: Moving `linalg` basic math ops to RAFT namespaces
+- PR #3000: Pin cmake policies to cmake 3.17 version, bump project version to 0.17
+- PR #3083: Improving test_make_blobs testing time
+- PR #2906: Moving `linalg` decomp to RAFT namespaces
+- PR #2996: Removing the max_depth restriction for switching to the batched backend
+- PR #3004: Remove Single Process Multi GPU (SPMG) code
+- PR #3044: Move leftover `linalg` and `stats` to RAFT namespaces
+- PR #3067: Deleting prims moved to RAFT and updating header paths
+- PR #3074: Reducing dask coordinate descent test runtime
+
+## Bug Fixes
+- PR #3072: Fusing metrics and score directories in src_prims
+- PR #3037: Avoid logging deadlock in multi-threaded C code
+- PR #2983: Fix seeding of KISS99 RNG
+- PR #3011: Fix unused initialize_embeddings parameter in Barnes-Hut t-SNE
+- PR #3008: Check number of columns in check_array validator
+- PR #3012: Increasing learning rate for SGD log loss and invscaling pytests
+- PR #3021: Fix a hang in cuML RF experimental backend
+- PR #3039: Update RF and decision tree parameter initializations in benchmark codes
+- PR #3061: Handle C++ exception thrown from FIL predict
+- PR #3073: Update mathjax CDN URL for documentation
+- PR #3062: Bumping xgboost version to match cuml version
+- PR #3086: Reverting FIL Notebook Testing
+
+# cuML 0.16.0 (Date TBD)
+
+## New Features
+- PR #2922: Install RAFT headers with cuML
+- PR #2909: Update allgatherv for compatibility with latest RAFT
+- PR #2677: Ability to export RF trees as JSON
+- PR #2698: Distributed TF-IDF transformer
+- PR #2476: Porter Stemmer
+- PR #2789: Dask LabelEncoder
+- PR #2152: add FIL C++ benchmark
+- PR #2638: Improve cython build with custom `build_ext`
+- PR #2866: Support XGBoost-style multiclass models (gradient boosted decision trees) in FIL C++
+- PR #2874: Issue warning for degraded accuracy with float64 models in Treelite
+- PR #2881: Introduces experimental batched backend for random forest
+- PR #2916: Add SKLearn multi-class GBDT model support in FIL
+
+## Improvements
+- PR #2947: Add more warnings for accuracy degradation with 64-bit models
+- PR #2873: Remove empty marker kernel code for NVTX markers
+- PR #2796: Remove tokens of length 1 by default for text vectorizers
+- PR #2741: Use rapids build packages in conda environments
+- PR #2735: Update seed to random_state in random forest and associated tests
+- PR #2739: Use cusparse_wrappers.h from RAFT
+- PR #2729: Replace `cupy.sparse` with `cupyx.scipy.sparse`
+- PR #2749: Correct docs for python version used in cuml_dev conda environment
+- PR #2747: Adopting raft::handle_t and raft::comms::comms_t in cuML
+- PR #2762: Fix broken links and provide minor edits to docs
+- PR #2723: Support and enable convert_dtype in estimator predict
+- PR #2758: Match sklearn's default n_components behavior for PCA
+- PR #2770: Fix doxygen version during cmake
+- PR #2766: Update default RandomForestRegressor score function to use r2
+- PR #2775: Enabling mg gtests w/ raft mpi comms
+- PR #2783: Add pytest that will fail when GPU IDs in Dask cluster are not unique
+- PR #2784: Add SparseCumlArray container for sparse index/data arrays
+- PR #2785: Add in cuML-specific dev conda dependencies
+- PR #2778: Add README for FIL
+- PR #2799: Reenable lightgbm test with lower (1%) proba accuracy
+- PR #2800: Align cuML's spdlog version with RMM's
+- PR #2824: Make data conversions warnings be debug level
+- PR #2835: Rng prims, utils, and dependencies in RAFT
+- PR #2541: Improve Documentation Examples and Source Linking
+- PR #2837: Make the FIL node reorder loop more obvious
+- PR #2849: make num_classes significant in FLOAT_SCALAR case
+- PR #2792: Project flash (new build process) script changes
+- PR #2850: Clean up unused params in paramsPCA
+- PR #2871: Add timing function to utils
+- PR #2863: in FIL, rename leaf_value_t enums to more descriptive
+- PR #2867: improve stability of FIL benchmark measurements
+- PR #2798: Add python tests for FIL multiclass classification of lightgbm models
+- PR #2892: Update ci/local/README.md
+- PR #2910: Adding Support for CuPy 8.x
+- PR #2914: Add tests for XGBoost multi-class models in FIL
+- PR #2622: Simplify tSNE perplexity search
+- PR #2930: Pin libfaiss to <=1.6.3
+- PR #2928: Updating Estimators Derived from Base for Consistency
+- PR #2942: Adding `cuml.experimental` to the Docs
+- PR #3010: Improve gpuCI Scripts
+
+## Bug Fixes
+- PR #2973: Allow data imputation for nan values
+- PR #2982: Adjust kneighbors classifier test threshold to avoid intermittent failure
+- PR #2885: Changing test target for NVTX wrapper test
+- PR #2882: Allow import on machines without GPUs
+- PR #2875: Bug fix to enable colorful NVTX markers
+- PR #2744: Supporting larger number of classes in KNeighborsClassifier
+- PR #2769: Remove outdated doxygen options for 1.8.20
+- PR #2787: Skip lightgbm test for version 3 and above temporarily
+- PR #2805: Retain index in stratified splitting for dataframes
+- PR #2781: Use Python print to correctly redirect spdlogs when sys.stdout is changed
+- PR #2813: Fix memory access in generation of non-row-major random blobs
+- PR #2810: Update RF MNMG threshold to prevent sporadic test failure
+- PR #2808: Relax Doxygen version required in CMake to coincide with integration repo
+- PR #2818: Fix parsing of singlegpu option in build command
+- PR #2827: Force use of whole dataset when sample bootstrapping is disabled
+- PR #2829: Fixing description for labels in docs and removing row number constraint from PCA xform/inverse_xform
+- PR #2832: Updating stress tests that fail with OOM
+- PR #2831: Removing repeated capture and parameter in lambda function
+- PR #2847: Workaround for TSNE lockup, change caching preference.
+- PR #2842: KNN index preprocessors were using incorrect n_samples
+- PR #2848: Fix typo in Python docstring for UMAP
+- PR #2856: Fix LabelEncoder for filtered input
+- PR #2855: Updates for RMM being header only
+- PR #2844: Fix for OPG KNN Classifier & Regressor
+- PR #2880: Fix bugs in Auto-ARIMA when s==None
+- PR #2877: TSNE exception for n_components > 2
+- PR #2879: Update unit test for LabelEncoder on filtered input
+- PR #2932: Marking KBinsDiscretizer pytests as xfail
+- PR #2925: Fixing Owner Bug When Slicing CumlArray Objects
+- PR #2931: Fix notebook error handling in gpuCI
+- PR #2941: Fixing dask tsvd stress test failure
+- PR #2943: Remove unused shuffle_features parameter
+- PR #2940: Correcting labels meta dtype for `cuml.dask.make_classification`
+- PR #2965: Notebooks update
+- PR #2955: Fix for conftest for singlegpu build
+- PR #2968: Remove shuffle_features from RF param names
+- PR #2957: Fix ols test size for stability
+- PR #2972: Upgrade Treelite to 0.93
+- PR #2981: Prevent unguarded import of sklearn in SVC
+- PR #2984: Fix GPU test scripts gcov error
+- PR #2990: Reduce MNMG kneighbors regressor test threshold
+- PR #2997: Changing ARIMA `get/set_params` to `get/set_fit_params`
+
# cuML 0.15.0 (Date TBD)

## New Features
+- PR #2581: Added model persistence via joblib in each section of estimator_intro.ipynb
- PR #2554: Hashing Vectorizer and general vectorizer improvements
- PR #2240: Making Dask models pickleable
- PR #2267: CountVectorizer estimator
@@ -12,11 +151,23 @@
- PR #2394: Adding cosine & correlation distance for KNN
- PR #2392: PCA can accept sparse inputs, and sparse prim for computing covariance
- PR #2465: Support pandas 1.0+
+- PR #2550: Single GPU Target Encoder
- PR #2519: Precision recall curve using cupy
- PR #2500: Replace UMAP functionality dependency on nvgraph with RAFT Spectral Clustering
+- PR #2502: cuML Implementation of `sklearn.metrics.pairwise_distances`
- PR #2520: TfidfVectorizer estimator
- PR #2211: MNMG KNN Classifier & Regressor
- PR #2461: Add KNN Sparse Output Functionality
+- PR #2615: Incremental PCA
+- PR #2594: Confidence intervals for ARIMA forecasts
+- PR #2607: Add support for probability estimates in SVC
+- PR #2618: SVM class and sample weights
+- PR #2635: Decorator to generate docstrings with autodetection of parameters
+- PR #2270: Multi class MNMG RF
+- PR #2661: CUDA-11 support for single-gpu code
+- PR #2322: Sparse FIL forests with 8-byte nodes
+- PR #2675: Update conda recipes to support CUDA 11
+- PR #2645: Add experimental, sklearn-based preprocessing

## Improvements
- PR #2336: Eliminate `rmm.device_array` usage
@@ -46,6 +197,7 @@
- PR #2403: Support for input and output type consistency in logistic regression predict_proba
- PR #2473: Add metrics.roc_auc_score to API docs. Additional readability and minor docs bug fixes
- PR #2468: Add `_n_features_in_` attribute to all single GPU estimators that implement fit
+- PR #2489: Removing explicit FAISS build and adding dependency on libfaiss conda package
- PR #2480: Moving MNMG glm and solvers to cuml
- PR #2490: Moving MNMG KMeans to cuml
- PR #2483: Moving MNMG KNN to cuml
@@ -55,6 +207,7 @@
- PR #2237: Refactor RF cython code
- PR #2513: Fixing LGTM Analysis Issues
- PR #2099: Raise an error when float64 data is used with dask RF
+- PR #2522: Renaming a few arguments in KNeighbors* to be more readable
- PR #2499: Provide access to `cuml.DBSCAN` core samples
- PR #2526: Removing PCA TSQR as a solver due to scalability issues
- PR #2536: Update conda upload versions for new supported CUDA/Python
@@ -69,8 +222,25 @@
- PR #2591: Generate benchmark datsets using `cuml.datasets`
- PR #2548: Fix limitation on number of rows usable with tSNE and refactor memory allocation
- PR #2589: including cuda-11 build fixes into raft
+- PR #2599: Add Stratified train_test_split
- PR #2487: Set classes_ attribute during classifier fit
- PR #2605: Reduce memory usage in tSNE
+- PR #2611: Adding building doxygen docs to gpu ci
+- PR #2631: Enabling use of gtest conda package for build
+- PR #2623: Fixing kmeans score() API to be compatible with Scikit-learn
+- PR #2629: Add naive_bayes api docs
+- PR #2643: 'dense' and 'sparse' values of `storage_type` for FIL
+- PR #2691: Generic Base class attribute setter
+- PR #2666: Update MBSGD documentation to mention that the model is experimental
+- PR #2687: Update xgboost version to 1.2.0dev.rapidsai0.15
+- PR #2684: CUDA 11 conda development environment yml and faiss patch
+- PR #2648: Replace CNMeM with `rmm::mr::pool_memory_resource`.
+- PR #2686: Improve SVM tests
+- PR #2692: Changing LBFGS log level
+- PR #2705: Add sum operator and base operator overloader functions to cumlarray
+- PR #2701: Updating README + Adding ref to UMAP paper
+- PR #2721: Update API docs
+- PR #2730: Unpin cumlprims in conda recipes for release

## Bug Fixes
- PR #2369: Update RF code to fix set_params memory leak
@@ -94,6 +264,8 @@
- PR #2497: Changes to accomodate cuDF unsigned categorical changes
- PR #2209: Fix FIL benchmark for gpuarray-c input
- PR #2507: Import `treelite.sklearn`
- PR #2532: Updating doxygen in new MG headers
- PR #2521: Fixing invalid smem calculation in KNeighborsCLassifier
- PR #2515: Increase tolerance for LogisticRegression test
@@ -105,12 +277,41 @@
- PR #2535: Fix issue with incorrect docker image being used in local build script
- PR #2542: Fix small memory leak in TSNE
- PR #2552: Fixed the length argument of updateDevice calls in RF test
+- PR #2565: Fix cell allocation code to avoid loops in quad-tree. Prevent NaNs causing infinite descent
- PR #2563: Update scipy call for arima gradient test
- PR #2569: Fix for cuDF update
- PR #2508: Use keyword parameters in sklearn.datasets.make_* functions
+- PR #2587: Attributes for estimators relying on solvers
- PR #2586: Fix SVC decision function data type
- PR #2573: Considering managed memory as device type on checking for KMeans
- PR #2574: Fixing include path in `tsvd_mg.pyx`
+- PR #2506: Fix usage of CumlArray attributes on `cuml.common.base.Base`
+- PR #2593: Fix inconsistency in train_test_split
+- PR #2609: Fix small doxygen issues
+- PR #2610: Remove cuDF tolist call
+- PR #2613: Removing thresholds from kmeans score tests (SG+MG)
+- PR #2616: Small test code fix for pandas dtype tests
+- PR #2617: Fix floating point precision error in tSNE
+- PR #2625: Update Estimator notebook to resolve errors
+- PR #2634: singlegpu build option fixes
+- PR #2641: [Breaking] Make `max_depth` in RF compatible with scikit-learn
+- PR #2650: Make max_depth behave consistently for max_depth > 14
+- PR #2651: AutoARIMA Python bug fix
+- PR #2654: Fix for vectorizer concatenations
+- PR #2655: Fix C++ RF predict function access of rows/samples array
+- PR #2649: Cleanup sphinx doc warnings for 0.15
+- PR #2668: Order conversion improvements to account for cupy behavior changes
+- PR #2669: Revert PR 2655 Revert "Fixes C++ RF predict function"
+- PR #2683: Fix incorrect "Bad CumlArray Use" error messages on test failures
+- PR #2695: Fix debug build issue due to incorrect host/device method setup
+- PR #2709: Fixing OneHotEncoder Overflow Error
+- PR #2710: Fix SVC doc statement about predict_proba
+- PR #2726: Return correct output type in QN
+- PR #2711: Fix Dask RF failure intermittently
+- PR #2718: Fix temp directory for py.test
+- PR #2719: Set KNeighborsRegressor output dtype according to training target dtype
+- PR #2720: Updates to outdated links
+- PR #2722: Getting cuML covariance test passing w/ Cupy 7.8 & CUDA 11

# cuML 0.14.0 (03 Jun 2020)
@@ -132,6 +333,7 @@
- PR #2256: Add a `make_arima` generator
- PR #2245: ElasticNet, Lasso and Coordinate Descent MNMG
- PR #2242: Pandas input support with output as NumPy arrays by default
+- PR #2551: Add cuML RF multiclass prediction using FIL from python
- PR #1728: Added notebook testing to gpuCI gpu build

## Improvements
@@ -283,6 +485,8 @@
- PR #2295: Fix convert_to_dtype copy even with same dtype
- PR #2305: Fixed race condition in DBScan
- PR #2354: Fix broken links in README
+- PR #2619: Explicitly skip raft test folder for pytest 6.0.0
+- PR #2788: Set the minimum number of columns that can be sampled to 1 to fix 0 mem allocation error

# cuML 0.13.0 (31 Mar 2020)
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 3ced0b646a..afb28bc23e 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -24,7 +24,7 @@ into three categories:

### Your first issue

-1. Read the project's [README.md](https://github.com/rapidsai/cuml/blob/master/README.md)
+1. Read the project's [README.md](https://github.com/rapidsai/cuml/blob/main/README.md)
to learn how to setup the development environment.
2. Find an issue to work on. The best way is to look for the [good first issue](https://github.com/rapidsai/cuml/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) or [help wanted](https://github.com/rapidsai/cuml/issues?q=is%3Aissue+is%3Aopen+label%3A%22help+wanted%22) labels
@@ -62,12 +62,12 @@ implementation of the issue, ask them in the issue instead of the PR.

The cuML repository has two main branches:

-1.
`master` branch: it contains the last released version. Only hotfixes are targeted and merged into it. +1. `main` branch: it contains the last released version. Only hotfixes are targeted and merged into it. 2. `branch-x.y`: it is the development branch which contains the upcoming release. All the new features should be based on this branch and Merge/Pull request should target this branch (with the exception of hotfixes). ### Additional details -For every new version `x.y` of cuML there is a corresponding branch called `branch-x.y`, from where new feature development starts and PRs will be targeted and merged before its release. The exceptions to this are the 'hotfixes' that target the `master` branch, which target critical issues raised by Github users and are directly merged to `master` branch, and create a new subversion of the project. While trying to patch an issue which requires a 'hotfix', please state the intent in the PR. +For every new version `x.y` of cuML there is a corresponding branch called `branch-x.y`, from where new feature development starts and PRs will be targeted and merged before its release. The exceptions to this are the 'hotfixes' that target the `main` branch, which target critical issues raised by Github users and are directly merged to `main` branch, and create a new subversion of the project. While trying to patch an issue which requires a 'hotfix', please state the intent in the PR. For all development, your changes should be pushed into a branch (created using the naming instructions below) in your own fork of cuML and then create a pull request when the code is ready. diff --git a/Dockerfile b/Dockerfile index 074a7058e3..a9d92e37b3 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -# From: https://github.com/rapidsai/cudf/blob/master/Dockerfile +# From: https://github.com/rapidsai/cudf/blob/main/Dockerfile FROM cudf ENV CONDA_ENV=cudf diff --git a/README.md b/README.md index 52e96148d0..a2ec4ba307 100644 --- a/README.md +++ b/README.md @@ -14,7 +14,7 @@ For large datasets, these GPU-based implementations can complete 10-50x faster than their CPU equivalents. For details on performance, see the [cuML Benchmarks Notebook](https://github.com/rapidsai/cuml/tree/branch-0.14/notebooks/tools). -As an example, the following Python snippet loads input and computes DBSCAN clusters, all on GPU: +As an example, the following Python snippet loads input and computes DBSCAN clusters, all on GPU, using cuDF: ```python import cudf from cuml.cluster import DBSCAN @@ -43,10 +43,25 @@ dtype: int32 cuML also features multi-GPU and multi-node-multi-GPU operation, using [Dask](https://www.dask.org), for a growing list of algorithms. The following Python snippet reads input from a CSV file and performs a NearestNeighbors query across a cluster of Dask workers, using multiple GPUs on a single node: + + +Initialize a `LocalCUDACluster` configured with [UCX](https://github.com/rapidsai/ucx-py) for fast transport of CUDA arrays ```python -# Create a Dask CUDA cluster w/ one worker per device +# Initialize UCX for high-speed transport of CUDA arrays from dask_cuda import LocalCUDACluster -cluster = LocalCUDACluster() + +# Create a Dask single-node CUDA cluster w/ one worker per device +cluster = LocalCUDACluster(protocol="ucx", + enable_tcp_over_ucx=True, + enable_nvlink=True, + enable_infiniband=False) +``` + +Load data and perform `k-Nearest Neighbors` search. 
`cuml.dask` estimators also support `Dask.Array` as input: +```python + +from dask.distributed import Client +client = Client(cluster) # Read CSV file in parallel across workers import dask_cudf @@ -54,16 +69,15 @@ df = dask_cudf.read_csv("/path/to/csv") # Fit a NearestNeighbors model and query it from cuml.dask.neighbors import NearestNeighbors -nn = NearestNeighbors(n_neighbors = 10) +nn = NearestNeighbors(n_neighbors = 10, client=client) nn.fit(df) neighbors = nn.kneighbors(df) ``` - For additional examples, browse our complete [API documentation](https://docs.rapids.ai/api/cuml/stable/), or check out our example [walkthrough -notebooks](https://github.com/rapidsai/cuml/tree/branch-0.14/notebooks). Finally, you +notebooks](https://github.com/rapidsai/cuml/tree/branch-0.15/notebooks). Finally, you can find complete end-to-end examples in the [notebooks-contrib repo](https://github.com/rapidsai/notebooks-contrib). @@ -74,6 +88,7 @@ repo](https://github.com/rapidsai/notebooks-contrib). | **Clustering** | Density-Based Spatial Clustering of Applications with Noise (DBSCAN) | | | | K-Means | Multi-node multi-GPU via Dask | | **Dimensionality Reduction** | Principal Components Analysis (PCA) | Multi-node multi-GPU via Dask| +| | Incremental PCA | Experimental | | | Truncated Singular Value Decomposition (tSVD) | Multi-node multi-GPU via Dask | | | Uniform Manifold Approximation and Projection (UMAP) | Multi-node multi-GPU Inference via Dask | | | Random Projection | | @@ -82,17 +97,18 @@ repo](https://github.com/rapidsai/notebooks-contrib). | | Linear Regression with Lasso or Ridge Regularization | Multi-node multi-GPU via Dask | | | ElasticNet Regression | | | | Logistic Regression | | +| | Naive Bayes | Multi-node multi-GPU via Dask | | | Stochastic Gradient Descent (SGD), Coordinate Descent (CD), and Quasi-Newton (QN) (including L-BFGS and OWL-QN) solvers for linear models | | | **Nonlinear Models for Regression or Classification** | Random Forest (RF) Classification | Experimental multi-node multi-GPU via Dask | | | Random Forest (RF) Regression | Experimental multi-node multi-GPU via Dask | | | Inference for decision tree-based models | Forest Inference Library (FIL) | -| | K-Nearest Neighbors (KNN) | Multi-node multi-GPU via Dask, uses [Faiss](https://github.com/facebookresearch/faiss) for Nearest Neighbors Query. | -| | K-Nearest Neighbors (KNN) Classification | | -| | K-Nearest Neighbors (KNN) Regression | | +| | K-Nearest Neighbors (KNN) Classification | Multi-node multi-GPU via Dask+[UCX](https://github.com/rapidsai/ucx-py), uses [Faiss](https://github.com/facebookresearch/faiss) for Nearest Neighbors Query. | +| | K-Nearest Neighbors (KNN) Regression | Multi-node multi-GPU via Dask+[UCX](https://github.com/rapidsai/ucx-py), uses [Faiss](https://github.com/facebookresearch/faiss) for Nearest Neighbors Query. | | | Support Vector Machine Classifier (SVC) | | | | Epsilon-Support Vector Regression (SVR) | | | **Time Series** | Holt-Winters Exponential Smoothing | | | | Auto-regressive Integrated Moving Average (ARIMA) | Supports seasonality (SARIMA) | +| **Other** | K-Nearest Neighbors (KNN) Search | Multi-node multi-GPU via Dask+[UCX](https://github.com/rapidsai/ucx-py), uses [Faiss](https://github.com/facebookresearch/faiss) for Nearest Neighbors Query. | --- ## Installation @@ -115,12 +131,14 @@ For additional details on the technologies behind cuML, as well as a broader ove Please consider citing this when using cuML in a project. 
You can use the citation BibTeX:

-> @article{raschka2020machine,
->   title={Machine Learning in Python: Main developments and technology trends in data science, machine learning, and artificial intelligence},
->   author={Raschka, Sebastian and Patterson, Joshua and Nolet, Corey},
->   journal={arXiv preprint arXiv:2002.04803},
->   year={2020}
-> }
+```
+@article{raschka2020machine,
+  title={Machine Learning in Python: Main developments and technology trends in data science, machine learning, and artificial intelligence},
+  author={Raschka, Sebastian and Patterson, Joshua and Nolet, Corey},
+  journal={arXiv preprint arXiv:2002.04803},
+  year={2020}
+}
+```

## Contact
diff --git a/build.sh b/build.sh
index 74cd7e944d..1366b0cf9c 100755
--- a/build.sh
+++ b/build.sh
@@ -19,7 +19,7 @@ ARGS=$*
REPODIR=$(cd $(dirname $0); pwd)

VALIDTARGETS="clean libcuml cuml cpp-mgtests prims bench prims-bench cppdocs pydocs"
-VALIDFLAGS="-v -g -n --allgpuarch --singlegpu --nvtx --show_depr_warn -h --help "
+VALIDFLAGS="-v -g -n --allgpuarch --buildfaiss --buildgtest --singlegpu --nvtx --show_depr_warn -h --help "
VALIDARGS="${VALIDTARGETS} ${VALIDFLAGS}"
HELP="$0 [<target> ...] [<flag> ...] where <target> is:
@@ -27,7 +27,7 @@ HELP="$0 [<target> ...] [<flag> ...] where <target> is:
   libcuml - build the cuml C++ code only. Also builds the C-wrapper library around the C++ code.
   cuml - build the cuml Python package
-  cpp-mgtests - Build libcuml mnmg tests. Builds MPI communicator, adding MPI as dependency.
+  cpp-mgtests - build libcuml mnmg tests. Builds MPI communicator, adding MPI as dependency.
   prims - build the ML prims tests
   bench - build the cuml C++ benchmark
   prims-bench - build the ml-prims C++ benchmark
@@ -38,6 +38,8 @@ HELP="$0 [<target> ...] [<flag> ...] where <target> is:
   -g - build for debug
   -n - no install step
   --allgpuarch - build for all supported GPU architectures
+  --buildfaiss - build faiss statically into libcuml
+  --buildgtest - build googletest library
   --singlegpu - Build libcuml and cuml without multigpu components
   --nvtx - Enable nvtx for profiling support
   --show_depr_warn - show cmake deprecation warnings
@@ -45,7 +47,7 @@ HELP="$0 [<target> ...] [<flag> ...] where <target> is:
   default action (no args) is to build and install 'libcuml', 'cuml', and 'prims' targets only for the detected GPU arch
"
-LIBCUML_BUILD_DIR=${REPODIR}/cpp/build
+LIBCUML_BUILD_DIR=${LIBCUML_BUILD_DIR:=${REPODIR}/cpp/build}
CUML_BUILD_DIR=${REPODIR}/python/build
PYTHON_DEPS_CLONE=${REPODIR}/python/external_repositories
BUILD_DIRS="${LIBCUML_BUILD_DIR} ${CUML_BUILD_DIR} ${PYTHON_DEPS_CLONE}"
@@ -62,13 +64,13 @@
CLEAN=0
BUILD_DISABLE_DEPRECATION_WARNING=ON
BUILD_CUML_STD_COMMS=ON
BUILD_CPP_MG_TESTS=OFF
+BUILD_STATIC_FAISS=OFF

# Set defaults for vars that may not have been defined externally
#  FIXME: if INSTALL_PREFIX is not set, check PREFIX, then check
#         CONDA_PREFIX, but there is no fallback from there!
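+
+# Example (illustrative sketch, using only the targets and flags documented in
+# the help text above): a from-source build that statically links faiss, builds
+# googletest, and targets all supported GPU architectures would be invoked as:
+#   ./build.sh clean libcuml cuml prims --buildfaiss --buildgtest --allgpuarch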
INSTALL_PREFIX=${INSTALL_PREFIX:=${PREFIX:=${CONDA_PREFIX}}} PARALLEL_LEVEL=${PARALLEL_LEVEL:=""} -BUILD_ABI=${BUILD_ABI:=ON} function hasArg { (( ${NUMARGS} != 0 )) && (echo " ${ARGS} " | grep -q " $1 ") @@ -119,6 +121,12 @@ fi if hasArg cpp-mgtests; then BUILD_CPP_MG_TESTS=ON fi +if hasArg --buildfaiss; then + BUILD_STATIC_FAISS=ON +fi +if hasArg --buildgtest; then + BUILD_GTEST=ON +fi if hasArg --nvtx; then NVTX=ON fi @@ -162,7 +170,6 @@ if completeBuild || hasArg libcuml || hasArg prims || hasArg bench || hasArg pri cd ${LIBCUML_BUILD_DIR} cmake -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} \ - -DCMAKE_CXX11_ABI=${BUILD_ABI} \ -DBLAS_LIBRARIES=${INSTALL_PREFIX}/lib/libopenblas.so.0 \ ${GPU_ARCH} \ -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \ @@ -171,6 +178,7 @@ if completeBuild || hasArg libcuml || hasArg prims || hasArg bench || hasArg pri -DWITH_UCX=ON \ -DBUILD_CUML_MPI_COMMS=${BUILD_CPP_MG_TESTS} \ -DBUILD_CUML_MG_TESTS=${BUILD_CPP_MG_TESTS} \ + -DBUILD_STATIC_FAISS=${BUILD_STATIC_FAISS} \ -DNVTX=${NVTX} \ -DPARALLEL_LEVEL=${PARALLEL_LEVEL} \ -DNCCL_PATH=${INSTALL_PREFIX} \ @@ -216,10 +224,9 @@ fi if completeBuild || hasArg cuml || hasArg pydocs; then cd ${REPODIR}/python if [[ ${INSTALL_TARGET} != "" ]]; then - python setup.py build_ext -j${PARALLEL_LEVEL:-1} --inplace ${SINGLEGPU_PYTHON_FLAG} - python setup.py install --single-version-externally-managed --record=record.txt ${SINGLEGPU_PYTHON_FLAG} + python setup.py build_ext -j${PARALLEL_LEVEL:-1} ${SINGLEGPU_PYTHON_FLAG} --library-dir=${LIBCUML_BUILD_DIR} install --single-version-externally-managed --record=record.txt else - python setup.py build_ext -j${PARALLEL_LEVEL:-1} --inplace --library-dir=${LIBCUML_BUILD_DIR} ${SINGLEGPU_PYTHON_FLAG} + python setup.py build_ext -j${PARALLEL_LEVEL:-1} --library-dir=${LIBCUML_BUILD_DIR} ${SINGLEGPU_PYTHON_FLAG} fi if hasArg pydocs; then diff --git a/ci/checks/black_lists.sh b/ci/checks/black_lists.sh index 8d1a63c47b..2ed13a2135 100755 --- a/ci/checks/black_lists.sh +++ b/ci/checks/black_lists.sh @@ -6,7 +6,6 @@ # PR_TARGET_BRANCH is set by the CI enviroment -# Checkout master for comparison git checkout --quiet $PR_TARGET_BRANCH # Switch back to tip of PR branch diff --git a/ci/checks/changelog.sh b/ci/checks/changelog.sh index 41cb6d6bd8..946c005f68 100755 --- a/ci/checks/changelog.sh +++ b/ci/checks/changelog.sh @@ -4,17 +4,17 @@ # cuML CHANGELOG Tester # ######################### -# Checkout master for comparison -git checkout --quiet master +# Checkout main for comparison +git checkout --force --quiet main # Switch back to tip of PR branch -git checkout --quiet current-pr-branch +git checkout --force --quiet current-pr-branch # Ignore errors during searching set +e # Get list of modified files between matster and PR branch -CHANGELOG=`git diff --name-only master...current-pr-branch | grep CHANGELOG.md` +CHANGELOG=`git diff --name-only main...current-pr-branch | grep CHANGELOG.md` # Check if CHANGELOG has PR ID PRNUM=`cat CHANGELOG.md | grep "$PR_ID"` RETVAL=0 diff --git a/ci/cpu/build.sh b/ci/cpu/build.sh index 6d2cddd80c..f762e5502f 100755 --- a/ci/cpu/build.sh +++ b/ci/cpu/build.sh @@ -1,27 +1,24 @@ #!/bin/bash # Copyright (c) 2018, NVIDIA CORPORATION. 
-###################################### -# cuML CPU conda build script for CI # -###################################### +############################################## +# cuML CPU conda build script for CI # +############################################## set -ex -# Logger function for build status output -function logger() { - echo -e "\n>>>> $@\n" -} - # Set path and build parallel level -export PATH=/conda/bin:/usr/local/cuda/bin:$PATH -export PARALLEL_LEVEL=4 - -# Set versions of packages needed to be grabbed -export CUDF_VERSION=0.8.* -export NVSTRINGS_VERSION=0.8.* -export RMM_VERSION=0.8.* +export PATH=/opt/conda/bin:/usr/local/cuda/bin:$PATH +export PARALLEL_LEVEL=${PARALLEL_LEVEL:-4} # Set home to the job's workspace export HOME=$WORKSPACE +# Determine CUDA release version +export CUDA_REL=${CUDA_VERSION%.*} + + # Setup 'gpuci_conda_retry' for build retries (results in 2 total attempts) +export GPUCI_CONDA_RETRY_MAX=1 +export GPUCI_CONDA_RETRY_SLEEP=30 + # Switch to project root; also root of repo checkout cd $WORKSPACE @@ -34,17 +31,22 @@ fi # SETUP - Check environment ################################################################################ -logger "Get env..." +gpuci_logger "Check environment variables" env -logger "Activate conda env..." -source activate gdf +gpuci_logger "Activate conda env" +. /opt/conda/etc/profile.d/conda.sh +conda activate rapids -logger "Check versions..." +gpuci_logger "Check compiler versions" python --version -gcc --version -g++ --version -conda list +$CC --version +$CXX --version + +gpuci_logger "Check conda environment" +conda info +conda config --show-sources +conda list --show-channel-urls # FIX Added to deal with Anancoda SSL verification issues during conda builds conda config --set ssl_verify False @@ -53,18 +55,32 @@ conda config --set ssl_verify False # BUILD - Conda package builds (conda deps: libcuml <- cuml) ################################################################################ -logger "Build conda pkg for libcuml..." -source ci/cpu/libcuml/build_libcuml.sh +if [[ -z "$PROJECT_FLASH" || "$PROJECT_FLASH" == "0" ]]; then + if [ "$BUILD_LIBCUML" == '1' -o "$BUILD_CUML" == '1' ]; then + gpuci_logger "Build conda pkg for libcuml" + gpuci_conda_retry build conda/recipes/libcuml + fi +else + if [ "$BUILD_LIBCUML" == '1' ]; then + gpuci_logger "PROJECT FLASH: Build conda pkg for libcuml" + gpuci_conda_retry build conda/recipes/libcuml --dirty --no-remove-work-dir + fi +fi -logger "Build conda pkg for cuml..." -source ci/cpu/cuml/build_cuml.sh +if [ "$BUILD_CUML" == '1' ]; then + if [[ -z "$PROJECT_FLASH" || "$PROJECT_FLASH" == "0" ]]; then + gpuci_logger "Build conda pkg for cuml" + gpuci_conda_retry build conda/recipes/cuml --python=${PYTHON} + else + gpuci_logger "PROJECT FLASH: Build conda pkg for cuml" + gpuci_conda_retry build -c ci/artifacts/cuml/cpu/conda-bld/ --dirty --no-remove-work-dir conda/recipes/cuml --python=${PYTHON} + fi +fi ################################################################################ # UPLOAD - Conda packages ################################################################################ -logger "Upload conda pkgs for libcuml..." -source ci/cpu/libcuml/upload-anaconda.sh +gpuci_logger "Upload conda pkgs" +source ci/cpu/upload.sh -logger "Upload conda pkg for cuml..." 
-source ci/cpu/cuml/upload-anaconda.sh diff --git a/ci/cpu/cuml/build_cuml.sh b/ci/cpu/cuml/build_cuml.sh deleted file mode 100644 index 561b439318..0000000000 --- a/ci/cpu/cuml/build_cuml.sh +++ /dev/null @@ -1,10 +0,0 @@ -#!/usr/bin/env bash - -set -e - -if [ "$BUILD_CUML" == '1' ]; then - echo "Building cuML" - CUDA_REL=${CUDA_VERSION%.*} - conda build conda/recipes/cuml --python=${PYTHON} - -fi diff --git a/ci/cpu/cuml/upload-anaconda.sh b/ci/cpu/cuml/upload-anaconda.sh deleted file mode 100755 index 6a79b85919..0000000000 --- a/ci/cpu/cuml/upload-anaconda.sh +++ /dev/null @@ -1,33 +0,0 @@ -#!/bin/bash -# -# Adopted from https://github.com/tmcdonell/travis-scripts/blob/dfaac280ac2082cd6bcaba3217428347899f2975/update-accelerate-buildbot.sh - -set -e - -if [ "$BUILD_CUML" == "1" ]; then - CUDA_REL=${CUDA_VERSION%.*} - - export UPLOADFILE=`conda build conda/recipes/cuml -c conda-forge -c numba -c conda-forge/label/rc_ucx -c rapidsai -c nvidia -c pytorch -c defaults --python=${PYTHON} --output` - - SOURCE_BRANCH=master - - LABEL_OPTION="--label main" - echo "LABEL_OPTION=${LABEL_OPTION}" - - test -e ${UPLOADFILE} - - # Restrict uploads to master branch - if [ ${GIT_BRANCH} != ${SOURCE_BRANCH} ]; then - echo "Skipping upload" - return 0 - fi - - if [ -z "$MY_UPLOAD_KEY" ]; then - echo "No upload key" - return 0 - fi - - echo "Upload" - echo ${UPLOADFILE} - anaconda -t ${MY_UPLOAD_KEY} upload -u ${CONDA_USERNAME:-rapidsai} ${LABEL_OPTION} --skip-existing ${UPLOADFILE} -fi diff --git a/ci/cpu/libcuml/build_libcuml.sh b/ci/cpu/libcuml/build_libcuml.sh deleted file mode 100755 index c4b88af80e..0000000000 --- a/ci/cpu/libcuml/build_libcuml.sh +++ /dev/null @@ -1,10 +0,0 @@ -#!/usr/bin/env bash - -set -e - -if [ "$BUILD_LIBCUML" == '1' -o "$BUILD_CUML" == '1' ]; then - echo "Building libcuml" - CUDA_REL=${CUDA_VERSION%.*} - - conda build conda/recipes/libcuml --python=${PYTHON} -fi diff --git a/ci/cpu/libcuml/upload-anaconda.sh b/ci/cpu/libcuml/upload-anaconda.sh deleted file mode 100644 index 26634fe865..0000000000 --- a/ci/cpu/libcuml/upload-anaconda.sh +++ /dev/null @@ -1,33 +0,0 @@ -#!/bin/bash -# -# Adopted from https://github.com/tmcdonell/travis-scripts/blob/dfaac280ac2082cd6bcaba3217428347899f2975/update-accelerate-buildbot.sh - -set -e - -if [ "$BUILD_LIBCUML" == "1" ]; then - CUDA_REL=${CUDA_VERSION%.*} - - export UPLOADFILE=`conda build conda/recipes/libcuml -c conda-forge -c numba -c conda-forge/label/rc_ucx -c nvidia -c rapidsai -c pytorch -c defaults --python=${PYTHON} --output` - - SOURCE_BRANCH=master - - LABEL_OPTION="--label main" - echo "LABEL_OPTION=${LABEL_OPTION}" - - test -e ${UPLOADFILE} - - # Restrict uploads to master branch - if [ ${GIT_BRANCH} != ${SOURCE_BRANCH} ]; then - echo "Skipping upload" - return 0 - fi - - if [ -z "$MY_UPLOAD_KEY" ]; then - echo "No upload key" - return 0 - fi - - echo "Upload" - echo ${UPLOADFILE} - anaconda -t ${MY_UPLOAD_KEY} upload -u ${CONDA_USERNAME:-rapidsai} ${LABEL_OPTION} --skip-existing ${UPLOADFILE} -fi diff --git a/ci/cpu/prebuild.sh b/ci/cpu/prebuild.sh index 04096e28ed..a362d2b9ca 100644 --- a/ci/cpu/prebuild.sh +++ b/ci/cpu/prebuild.sh @@ -1,9 +1,15 @@ #!/usr/bin/env bash -export BUILD_CUML=1 +export UPLOAD_CUML=1 if [[ "$PYTHON" == "3.7" ]]; then - export BUILD_LIBCUML=1 + export UPLOAD_LIBCUML=1 else - export BUILD_LIBCUML=0 + export UPLOAD_LIBCUML=0 fi + +if [[ -z "$PROJECT_FLASH" || "$PROJECT_FLASH" == "0" ]]; then + #If project flash is not activate, always build both + export BUILD_LIBCUML=1 + export BUILD_CUML=1 +fi \ 
No newline at end of file diff --git a/ci/cpu/upload.sh b/ci/cpu/upload.sh new file mode 100644 index 0000000000..586cabae55 --- /dev/null +++ b/ci/cpu/upload.sh @@ -0,0 +1,54 @@ +#!/bin/bash +# +# Adopted from https://github.com/tmcdonell/travis-scripts/blob/dfaac280ac2082cd6bcaba3217428347899f2975/update-accelerate-buildbot.sh + +set -e + +# Setup 'gpuci_retry' for upload retries (results in 4 total attempts) +export GPUCI_RETRY_MAX=3 +export GPUCI_RETRY_SLEEP=30 + +# Set default label options if they are not defined elsewhere +export LABEL_OPTION=${LABEL_OPTION:-"--label main"} + +# Skip uploads unless BUILD_MODE == "branch" +if [[ ${BUILD_MODE} != "branch" ]]; then + echo "Skipping upload" + return 0 +fi + +# Skip uploads if there is no upload key +if [[ -z "$MY_UPLOAD_KEY" ]]; then + echo "No upload key" + return 0 +fi + +################################################################################ +# SETUP - Get conda file output locations +################################################################################ + +gpuci_logger "Get conda file output locations" + +export LIBCUML_FILE=`conda build conda/recipes/libcuml --output` +export CUML_FILE=`conda build conda/recipes/cuml --python=$PYTHON --output` + +################################################################################ +# UPLOAD - Conda packages +################################################################################ + +gpuci_logger "Starting conda uploads" + +if [[ "$BUILD_LIBCUML" == "1" && "$UPLOAD_LIBCUML" == "1" ]]; then + test -e ${LIBCUML_FILE} + echo "Upload libcuml" + echo ${LIBCUML_FILE} + gpuci_retry anaconda -t ${MY_UPLOAD_KEY} upload -u ${CONDA_USERNAME:-rapidsai} ${LABEL_OPTION} --skip-existing ${LIBCUML_FILE} +fi + +if [[ "$BUILD_CUML" == "1" && "$UPLOAD_CUML" == "1" ]]; then + test -e ${CUML_FILE} + echo "Upload cuml" + echo ${CUML_FILE} + gpuci_retry anaconda -t ${MY_UPLOAD_KEY} upload -u ${CONDA_USERNAME:-rapidsai} ${LABEL_OPTION} --skip-existing ${CUML_FILE} +fi + diff --git a/ci/docs/build.sh b/ci/docs/build.sh index 762e580b49..863f9328d4 100644 --- a/ci/docs/build.sh +++ b/ci/docs/build.sh @@ -18,32 +18,39 @@ export LIBCUDF_KERNEL_CACHE_PATH="$HOME/.jitify-cache" export NIGHTLY_VERSION=$(echo $BRANCH_VERSION | awk -F. '{print $2}') export PROJECTS=(cuml libcuml) -logger "Check environment..." +gpuci_logger "Check environment" env -logger "Check GPU usage..." +gpuci_logger "Check GPU usage" nvidia-smi -logger "Activate conda env..." -source activate rapids + +gpuci_logger "Activate conda env" +. /opt/conda/etc/profile.d/conda.sh +conda activate rapids + # TODO: Move installs to docs-build-env meta package conda install -c anaconda beautifulsoup4 jq pip install sphinx-markdown-tables -logger "Check versions..." +gpuci_logger "Check versions" python --version $CC --version $CXX --version -conda list + +gpuci_logger "Show conda info" +conda info +conda config --show-sources +conda list --show-channel-urls # Build Doxygen docs -logger "Build Doxygen docs..." +gpuci_logger "Build Doxygen docs" cd $PROJECT_WORKSPACE/cpp/build make doc # Build Python docs -logger "Build Sphinx docs..." +gpuci_logger "Build Sphinx docs" cd $PROJECT_WORKSPACE/docs make html @@ -54,7 +61,7 @@ for PROJECT in ${PROJECTS[@]}; do if [ ! 
-d "api/$PROJECT/$BRANCH_VERSION" ]; then mkdir -p api/$PROJECT/$BRANCH_VERSION fi - rm -rf $DOCS_WORKSPACE/api/$PROJECT/$BRANCH_VERSION/* + rm -rf $DOCS_WORKSPACE/api/$PROJECT/$BRANCH_VERSION/* done diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh index 46224f011c..9223621ebd 100755 --- a/ci/gpu/build.sh +++ b/ci/gpu/build.sh @@ -1,32 +1,32 @@ #!/bin/bash # Copyright (c) 2018-2020, NVIDIA CORPORATION. -######################################### -# cuML GPU build and test script for CI # -######################################### +############################################## +# cuML GPU build and test script for CI # +############################################## + set -e NUMARGS=$# ARGS=$* -# Logger function for build status output -function logger() { - echo -e "\n>>>> $@\n" -} - # Arg parsing function function hasArg { (( ${NUMARGS} != 0 )) && (echo " ${ARGS} " | grep -q " $1 ") } # Set path and build parallel level -export PATH=/conda/bin:/usr/local/cuda/bin:$PATH -export PARALLEL_LEVEL=4 -export CUDA_REL=${CUDA_VERSION%.*} +export PATH=/opt/conda/bin:/usr/local/cuda/bin:$PATH +export PARALLEL_LEVEL=${PARALLEL_LEVEL:-4} # Set home to the job's workspace export HOME=$WORKSPACE -# Parse git describei +# Determine CUDA release version +export CUDA_REL=${CUDA_VERSION%.*} + +# Switch to project root; also root of repo checkout cd $WORKSPACE + +# Parse git describe export GIT_DESCRIBE_TAG=`git describe --tags` export MINOR_VERSION=`echo $GIT_DESCRIBE_TAG | grep -o -E '([0-9]+\.[0-9]+)'` @@ -34,114 +34,208 @@ export MINOR_VERSION=`echo $GIT_DESCRIBE_TAG | grep -o -E '([0-9]+\.[0-9]+)'` # SETUP - Check environment ################################################################################ -logger "Check environment..." +gpuci_logger "Check environment" env -logger "Check GPU usage..." +gpuci_logger "Check GPU usage" nvidia-smi -logger "Activate conda env..." -source activate gdf -conda install -c conda-forge -c rapidsai -c rapidsai-nightly -c nvidia \ +gpuci_logger "Activate conda env" +. 
/opt/conda/etc/profile.d/conda.sh +conda activate rapids + +gpuci_logger "Install dependencies" +gpuci_conda_retry install -c conda-forge -c rapidsai -c rapidsai-nightly -c nvidia \ "cudatoolkit=${CUDA_REL}" \ "cudf=${MINOR_VERSION}" \ "rmm=${MINOR_VERSION}" \ - "libcumlprims=0.15.0a200720" \ + "libcumlprims=${MINOR_VERSION}" \ "dask-cudf=${MINOR_VERSION}" \ "dask-cuda=${MINOR_VERSION}" \ "ucx-py=${MINOR_VERSION}" \ - "xgboost==1.1.0dev.rapidsai0.15" \ - "rapids-build-env=$MINOR_VERSION.*" \ - "rapids-notebook-env=$MINOR_VERSION.*" + "xgboost=1.2.0dev.rapidsai${MINOR_VERSION}" \ + "rapids-build-env=${MINOR_VERSION}.*" \ + "rapids-notebook-env=${MINOR_VERSION}.*" \ + "rapids-doc-env=${MINOR_VERSION}.*" # https://docs.rapids.ai/maintainers/depmgmt/ -# conda remove -f rapids-build-env rapids-notebook-env -# conda install "your-pkg=1.0.0" - +# gpuci_conda_retry remove --force rapids-build-env rapids-notebook-env +# gpuci_conda_retry install -y "your-pkg=1.0.0" -# Install contextvars on Python 3.6 +gpuci_logger "Install contextvars if needed" py_ver=$(python -c "import sys; print('.'.join(map(str, sys.version_info[:2])))") if [ "$py_ver" == "3.6" ];then conda install contextvars fi -# Install the master version of dask, distributed, and dask-ml -logger "pip install git+https://github.com/dask/distributed.git --upgrade --no-deps" +gpuci_logger "Install the master version of dask and distributed" +set -x pip install "git+https://github.com/dask/distributed.git" --upgrade --no-deps -logger "pip install git+https://github.com/dask/dask.git --upgrade --no-deps" pip install "git+https://github.com/dask/dask.git" --upgrade --no-deps +set +x - -logger "Check versions..." +gpuci_logger "Check compiler versions" python --version $CC --version $CXX --version -conda list -################################################################################ -# BUILD - Build libcuml, cuML, and prims from source -################################################################################ +gpuci_logger "Check conda environment" +conda info +conda config --show-sources +conda list --show-channel-urls -logger "Adding ${CONDA_PREFIX}/lib to LD_LIBRARY_PATH" +gpuci_logger "Adding ${CONDA_PREFIX}/lib to LD_LIBRARY_PATH" export LD_LIBRARY_PATH_CACHED=$LD_LIBRARY_PATH export LD_LIBRARY_PATH=$CONDA_PREFIX/lib:$LD_LIBRARY_PATH -logger "Build libcuml, cuml, prims and bench targets..." -$WORKSPACE/build.sh clean libcuml cuml prims bench -v +if [[ -z "$PROJECT_FLASH" || "$PROJECT_FLASH" == "0" ]]; then + gpuci_logger "Building doxygen C++ docs" + $WORKSPACE/build.sh cppdocs -v -logger "Resetting LD_LIBRARY_PATH..." + ################################################################################ + # BUILD - Build libcuml, cuML, and prims from source + ################################################################################ -export LD_LIBRARY_PATH=$LD_LIBRARY_PATH_CACHED -export LD_LIBRARY_PATH_CACHED="" + gpuci_logger "Build from source" + $WORKSPACE/build.sh clean libcuml cuml prims bench -v -cd $WORKSPACE + gpuci_logger "Resetting LD_LIBRARY_PATH" -################################################################################ -# TEST - Run GoogleTest and py.tests for libcuml and cuML -################################################################################ + export LD_LIBRARY_PATH=$LD_LIBRARY_PATH_CACHED + export LD_LIBRARY_PATH_CACHED="" -if hasArg --skip-tests; then - logger "Skipping Tests..." - exit 0 -fi + cd $WORKSPACE -logger "Check GPU usage..." 
-nvidia-smi + ################################################################################ + # TEST - Run GoogleTest and py.tests for libcuml and cuML + ################################################################################ + set +e -Eo pipefail + EXITCODE=0 + trap "EXITCODE=1" ERR + + if hasArg --skip-tests; then + gpuci_logger "Skipping Tests" + exit 0 + fi -logger "GoogleTest for libcuml..." -cd $WORKSPACE/cpp/build -GTEST_OUTPUT="xml:${WORKSPACE}/test-results/libcuml_cpp/" ./test/ml + gpuci_logger "Check GPU usage" + nvidia-smi -logger "Python pytest for cuml..." -cd $WORKSPACE/python + gpuci_logger "GoogleTest for libcuml" + set -x + cd $WORKSPACE/cpp/build + GTEST_OUTPUT="xml:${WORKSPACE}/test-results/libcuml_cpp/" ./test/ml -pytest --cache-clear --junitxml=${WORKSPACE}/junit-cuml.xml -v -s -m "not memleak" --durations=50 --timeout=300 --ignore=cuml/test/dask + + gpuci_logger "Python pytest for cuml" + cd $WORKSPACE/python -timeout 7200 sh -c "pytest cuml/test/dask --cache-clear --junitxml=${WORKSPACE}/junit-cuml-mg.xml -v -s -m 'not memleak' --durations=50 --timeout=300" + pytest --cache-clear --basetemp=${WORKSPACE}/cuml-cuda-tmp --junitxml=${WORKSPACE}/junit-cuml.xml -v -s -m "not memleak" --durations=50 --timeout=300 --ignore=cuml/test/dask --ignore=cuml/raft --cov-config=.coveragerc --cov=cuml --cov-report=xml:${WORKSPACE}/python/cuml/cuml-coverage.xml --cov-report term + timeout 7200 sh -c "pytest cuml/test/dask --cache-clear --basetemp=${WORKSPACE}/cuml-mg-cuda-tmp --junitxml=${WORKSPACE}/junit-cuml-mg.xml -v -s -m 'not memleak' --durations=50 --timeout=300" -################################################################################ -# TEST - Run notebook tests -################################################################################ -${WORKSPACE}/ci/gpu/test-notebooks.sh 2>&1 | tee nbtest.log -python ${WORKSPACE}/ci/utils/nbtestlog2junitxml.py nbtest.log + ################################################################################ + # TEST - Run notebook tests + ################################################################################ -################################################################################ -# TEST - Run GoogleTest for ml-prims -################################################################################ - -logger "Run ml-prims test..." -cd $WORKSPACE/cpp/build -GTEST_OUTPUT="xml:${WORKSPACE}/test-results/prims/" ./test/prims + gpuci_logger "Notebook tests" + ${WORKSPACE}/ci/gpu/test-notebooks.sh 2>&1 | tee nbtest.log + python ${WORKSPACE}/ci/utils/nbtestlog2junitxml.py nbtest.log -################################################################################ -# TEST - Run GoogleTest for ml-prims, but with cuda-memcheck enabled -################################################################################ + ################################################################################ + # TEST - Run GoogleTest for ml-prims + ################################################################################ -if [ "$BUILD_MODE" = "branch" ] && [ "$BUILD_TYPE" = "gpu" ]; then - logger "GoogleTest for ml-prims with cuda-memcheck enabled..." 
+ gpuci_logger "Run ml-prims test" cd $WORKSPACE/cpp/build - python ../scripts/cuda-memcheck.py -tool memcheck -exe ./test/prims + GTEST_OUTPUT="xml:${WORKSPACE}/test-results/prims/" ./test/prims + + ################################################################################ + # TEST - Run GoogleTest for ml-prims, but with cuda-memcheck enabled + ################################################################################ + + if [ "$BUILD_MODE" = "branch" ] && [ "$BUILD_TYPE" = "gpu" ]; then + gpuci_logger "GoogleTest for ml-prims with cuda-memcheck enabled..." + cd $WORKSPACE/cpp/build + python ../scripts/cuda-memcheck.py -tool memcheck -exe ./test/prims + fi +else + #Project Flash + export LIBCUML_BUILD_DIR="$WORKSPACE/ci/artifacts/cuml/cpu/conda_work/cpp/build" + export LD_LIBRARY_PATH="$LIBCUML_BUILD_DIR:$LD_LIBRARY_PATH" + + if hasArg --skip-tests; then + gpuci_logger "Skipping Tests" + exit 0 + fi + + gpuci_logger "Check GPU usage" + nvidia-smi + + gpuci_logger "Update binaries" + cd $LIBCUML_BUILD_DIR + chrpath -d libcuml.so + chrpath -d libcuml++.so + patchelf --replace-needed `patchelf --print-needed libcuml++.so | grep faiss` libfaiss.so libcuml++.so + + gpuci_logger "GoogleTest for libcuml" + cd $LIBCUML_BUILD_DIR + chrpath -d ./test/ml + patchelf --replace-needed `patchelf --print-needed ./test/ml | grep faiss` libfaiss.so ./test/ml + GTEST_OUTPUT="xml:${WORKSPACE}/test-results/libcuml_cpp/" ./test/ml + + gpuci_logger "Installing libcuml" + conda install -c $WORKSPACE/ci/artifacts/cuml/cpu/conda-bld/ libcuml + + gpuci_logger "Building cuml" + "$WORKSPACE/build.sh" -v cuml + + gpuci_logger "Python pytest for cuml" + cd $WORKSPACE/python + + pytest --cache-clear --basetemp=${WORKSPACE}/cuml-cuda-tmp --junitxml=${WORKSPACE}/junit-cuml.xml -v -s -m "not memleak" --durations=50 --timeout=300 --ignore=cuml/test/dask --ignore=cuml/raft --cov-config=.coveragerc --cov=cuml --cov-report=xml:${WORKSPACE}/python/cuml/cuml-coverage.xml --cov-report term + + timeout 7200 sh -c "pytest cuml/test/dask --cache-clear --basetemp=${WORKSPACE}/cuml-mg-cuda-tmp --junitxml=${WORKSPACE}/junit-cuml-mg.xml -v -s -m 'not memleak' --durations=50 --timeout=300" + + ################################################################################ + # TEST - Run notebook tests + ################################################################################ + + gpuci_logger "Notebook tests" + set +e -Eo pipefail + EXITCODE=0 + trap "EXITCODE=1" ERR + + ${WORKSPACE}/ci/gpu/test-notebooks.sh 2>&1 | tee nbtest.log + python ${WORKSPACE}/ci/utils/nbtestlog2junitxml.py nbtest.log + + ################################################################################ + # TEST - Run GoogleTest for ml-prims + ################################################################################ + + gpuci_logger "Run ml-prims test" + cd $LIBCUML_BUILD_DIR + chrpath -d ./test/prims + patchelf --replace-needed `patchelf --print-needed ./test/prims | grep faiss` libfaiss.so ./test/prims + GTEST_OUTPUT="xml:${WORKSPACE}/test-results/prims/" ./test/prims + + ################################################################################ + # TEST - Run GoogleTest for ml-prims, but with cuda-memcheck enabled + ################################################################################ + + if [ "$BUILD_MODE" = "branch" ] && [ "$BUILD_TYPE" = "gpu" ]; then + logger "GoogleTest for ml-prims with cuda-memcheck enabled..." 
+ cd $WORKSPACE/ci/artifacts/cuml/cpu/conda_work/cpp/build + python ../scripts/cuda-memcheck.py -tool memcheck -exe ./test/prims + fi + + gpuci_logger "Building doxygen C++ docs" + #Need to run in standard directory, not our artifact dir + unset LIBCUML_BUILD_DIR + $WORKSPACE/build.sh cppdocs -v + fi + +return ${EXITCODE} diff --git a/ci/local/README.md b/ci/local/README.md index 1e1520bac9..6425d40f0c 100644 --- a/ci/local/README.md +++ b/ci/local/README.md @@ -18,19 +18,19 @@ Build and test your local repository using a base gpuCI Docker image where: -H Show this help text -r Path to repository (defaults to working directory) - -i Use Docker image (default is gpuci/rapidsai-base:cuda10.0-ubuntu16.04-gcc5-py3.6) + -i Use Docker image (default is gpuci/rapidsai:${NIGHTLY_VERSION}-cuda10.1-devel-ubuntu16.04-py3.7) -s Skip building and testing and start an interactive shell in a container of the Docker image ``` Example Usage: -`bash build.sh -r ~/rapids/cuml -i gpuci/rapidsai-base:cuda10.1-ubuntu16.04-gcc5-py3.6` +`bash build.sh -r ~/rapids/cuml -i gpuci/rapidsai:0.16-cuda10.2-devel-ubuntu16.04-py3.7` -For a full list of available gpuCI docker images, visit our [DockerHub](https://hub.docker.com/r/gpuci/rapidsai-base/tags) page. +For a full list of available gpuCI docker images, visit our [DockerHub](https://hub.docker.com/r/gpuci/rapidsai/tags) page. Style Check: ```bash $ bash ci/local/build.sh -r ~/rapids/cuml -s -$ source activate gdf #Activate gpuCI conda environment +$ source activate rapids # Activate gpuCI conda environment $ cd rapids $ flake8 python ``` @@ -42,7 +42,7 @@ There are some caveats to be aware of when using this script, especially if you ### Docker Image Build Repository -The docker image will generate build artifacts in a folder on your machine located in the `root` directory of the repository you passed to the script. For the above example, the directory is named `~/rapids/cuml/build_rapidsai-base_cuda10.1-ubuntu16.04-gcc5-py3.6/`. Feel free to remove this directory after the script is finished. +The docker image will generate build artifacts in a folder on your machine located in the `root` directory of the repository you passed to the script. For the above example, the directory is named `~/rapids/cuml/build_rapidsai_cuda10.1-ubuntu16.04-py3.7/`. Feel free to remove this directory after the script is finished. *Note*: The script *will not* override your local build repository. Your local environment stays in tact. diff --git a/ci/mg/build.sh b/ci/mg/build.sh index 9821d28896..9f742edb52 100644 --- a/ci/mg/build.sh +++ b/ci/mg/build.sh @@ -43,7 +43,7 @@ nvidia-smi logger "Activate conda env..." 
source activate gdf conda install -c conda-forge -c rapidsai -c rapidsai-nightly -c nvidia \ - "cupy>=7,<8.0.0a0" \ + "cupy>7.1.0,<9.0.0a0" \ "cudatoolkit=${CUDA_REL}" \ "cudf=${MINOR_VERSION}" \ "rmm=${MINOR_VERSION}" \ diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh index eac4dd42d6..cb8fca4073 100755 --- a/ci/release/update-version.sh +++ b/ci/release/update-version.sh @@ -47,7 +47,7 @@ function sed_runner() { sed -i.bak ''"$1"'' $2 && rm -f ${2}.bak } -sed_runner 's/'"cuML VERSION .* LANGUAGES"'/'"cuML VERSION ${NEXT_FULL_TAG} LANGUAGES"'/g' cpp/CMakeLists.txt +sed_runner 's/'"CUML VERSION .* LANGUAGES"'/'"cuML VERSION ${NEXT_FULL_TAG} LANGUAGES"'/g' cpp/CMakeLists.txt # RTD update sed_runner 's/version = .*/version = '"'${NEXT_SHORT_TAG}'"'/g' docs/source/conf.py sed_runner 's/release = .*/release = '"'${NEXT_FULL_TAG}'"'/g' docs/source/conf.py @@ -59,4 +59,7 @@ for FILE in conda/environments/*.yml; do sed_runner "s/dask-cudf=${CURRENT_SHORT_TAG}/dask-cudf=${NEXT_SHORT_TAG}/g" ${FILE}; sed_runner "s/ucx-py=${CURRENT_SHORT_TAG}/ucx-py=${NEXT_SHORT_TAG}/g" ${FILE}; sed_runner "s/libcumlprims=${CURRENT_SHORT_TAG}/libcumlprims=${NEXT_SHORT_TAG}/g" ${FILE}; + sed_runner "s/rapids-build-env=${CURRENT_SHORT_TAG}/rapids-build-env=${NEXT_SHORT_TAG}/g" ${FILE}; + sed_runner "s/rapids-notebook-env=${CURRENT_SHORT_TAG}/rapids-notebook-env=${NEXT_SHORT_TAG}/g" ${FILE}; + sed_runner "s/rapids-doc-env=${CURRENT_SHORT_TAG}/rapids-doc-env=${NEXT_SHORT_TAG}/g" ${FILE}; done diff --git a/codecov.yml b/codecov.yml new file mode 100644 index 0000000000..c0a3a2fba2 --- /dev/null +++ b/codecov.yml @@ -0,0 +1,5 @@ +#Configuration File for CodeCov +coverage: + status: + project: off + patch: off diff --git a/conda/environments/cuml_dev_cuda10.0.yml b/conda/environments/cuml_dev_cuda10.0.yml deleted file mode 100644 index 16fbebdcee..0000000000 --- a/conda/environments/cuml_dev_cuda10.0.yml +++ /dev/null @@ -1,43 +0,0 @@ -name: cuml_dev -channels: -- rapidsai -- nvidia -- rapidsai-nightly -- conda-forge -dependencies: -- cudatoolkit=10.0 -- clang=8.0.1 -- clang-tools=8.0.1 -- cmake=3.14.5 -- numba>=0.46 -- cupy>=7,<8.0.0a0 -- cudf=0.15* -- rmm=0.15* -- cython>=0.29,<0.30 -- pytest>=4.6 -- pytest-timeout -- scikit-learn>=0.21 -- umap-learn>=0.3.9 -- scikit-learn>=0.21 -- dask>=2.12.0 -- distributed>=2.12.0 -- dask-cuda=0.15* -- dask-cudf=0.15* -- dask-ml -- ucx-py=0.15* -- nccl>=2.5 -- libcumlprims=0.15.0a200720 -- statsmodels -- treelite=0.92 -- doxygen -- sphinx -- sphinx_rtd_theme -- numpydoc -- nbsphinx -- recommonmark -- ipython -- pip -- pip: - - sphinx_markdown_tables - - git+https://github.com/dask/dask.git - - git+https://github.com/dask/distributed.git diff --git a/conda/environments/cuml_dev_cuda10.1.yml b/conda/environments/cuml_dev_cuda10.1.yml index 43795aa095..c456bc6b5d 100644 --- a/conda/environments/cuml_dev_cuda10.1.yml +++ b/conda/environments/cuml_dev_cuda10.1.yml @@ -6,38 +6,32 @@ channels: - conda-forge dependencies: - cudatoolkit=10.1 -- clang=8.0.1 -- clang-tools=8.0.1 -- cmake=3.14.5 -- numba>=0.46 -- cupy>=7,<8.0.0a0 -- cudf=0.15* -- rmm=0.15* -- cython>=0.29,<0.30 -- pytest>=4.6 -- pytest-timeout -- scikit-learn>=0.21 -- umap-learn>=0.3.9 -- scikit-learn>=0.21 -- dask>=2.12.0 -- distributed>=2.12.0 -- dask-cuda=0.15* -- dask-cudf=0.15* +- rapids-build-env=0.17 +- rapids-notebook-env=0.17 +- rapids-doc-env=0.17 +- cudf=0.17.* +- rmm=0.17.* +- libcumlprims=0.17.* +- dask-cudf=0.17.* +- dask-cuda=0.17.* +- ucx-py=0.17.* - dask-ml -- ucx-py=0.15* -- nccl>=2.5 -- 
libcumlprims=0.15.0a200720 -- statsmodels -- treelite=0.92 -- doxygen -- sphinx -- sphinx_rtd_theme -- numpydoc -- nbsphinx -- recommonmark -- ipython +- doxygen>=1.8.20 +- libfaiss>=1.6.3 +- faiss-proc=*=cuda +- umap-learn +- scikit-learn=0.23.1 +- treelite=0.93 - pip - pip: - sphinx_markdown_tables - git+https://github.com/dask/dask.git - git+https://github.com/dask/distributed.git + +# rapids-build-env, notebook-env and doc-env meta packages are defined in +# https://docs.rapids.ai/maintainers/depmgmt/ + +# To install different versions of packages contained in those meta packages, +# it is recommended to remove those meta packages (without removing the actual +# packages contained in the environment) first with: +# conda remove --force rapids-build-env rapids-notebook-env rapids-doc-env diff --git a/conda/environments/cuml_dev_cuda10.2.yml b/conda/environments/cuml_dev_cuda10.2.yml index db52085a1e..0078bf4221 100644 --- a/conda/environments/cuml_dev_cuda10.2.yml +++ b/conda/environments/cuml_dev_cuda10.2.yml @@ -6,38 +6,32 @@ channels: - conda-forge dependencies: - cudatoolkit=10.2 -- clang=8.0.1 -- clang-tools=8.0.1 -- cmake=3.14.5 -- numba>=0.46 -- cupy>=7,<8.0.0a0 -- cudf=0.15* -- rmm=0.15* -- cython>=0.29,<0.30 -- pytest>=4.6 -- pytest-timeout -- scikit-learn>=0.21 -- umap-learn>=0.3.9 -- scikit-learn>=0.21 -- dask>=2.12.0 -- distributed>=2.12.0 -- dask-cuda=0.15* -- dask-cudf=0.15* +- rapids-build-env=0.17 +- rapids-notebook-env=0.17 +- rapids-doc-env=0.17 +- cudf=0.17.* +- rmm=0.17.* +- libcumlprims=0.17.* +- dask-cudf=0.17.* +- dask-cuda=0.17.* +- ucx-py=0.17.* - dask-ml -- ucx-py=0.15* -- nccl>=2.5 -- libcumlprims=0.15.0a200720 -- statsmodels -- treelite=0.92 -- doxygen -- sphinx -- sphinx_rtd_theme -- numpydoc -- nbsphinx -- recommonmark -- ipython +- doxygen>=1.8.20 +- libfaiss>=1.6.3 +- faiss-proc=*=cuda +- umap-learn +- scikit-learn=0.23.1 +- treelite=0.93 - pip - pip: - sphinx_markdown_tables - git+https://github.com/dask/dask.git - git+https://github.com/dask/distributed.git + +# rapids-build-env, notebook-env and doc-env are defined in +# https://docs.rapids.ai/maintainers/depmgmt/ + +# To install different versions of packages contained in those meta packages, +# it is recommended to remove those meta packages (without removing the actual +# packages contained in the environment) first with: +# conda remove --force rapids-build-env rapids-notebook-env rapids-doc-env diff --git a/conda/environments/cuml_dev_cuda11.0.yml b/conda/environments/cuml_dev_cuda11.0.yml new file mode 100644 index 0000000000..7282bc7493 --- /dev/null +++ b/conda/environments/cuml_dev_cuda11.0.yml @@ -0,0 +1,37 @@ +name: cuml_dev +channels: +- rapidsai +- nvidia +- rapidsai-nightly +- conda-forge +dependencies: +- cudatoolkit=11.0 +- rapids-build-env=0.17 +- rapids-notebook-env=0.17 +- rapids-doc-env=0.17 +- cudf=0.17.* +- rmm=0.17.* +- libcumlprims=0.17.* +- dask-cudf=0.17.* +- dask-cuda=0.17.* +- ucx-py=0.17.* +- dask-ml +- doxygen>=1.8.20 +- libfaiss>=1.6.3 +- faiss-proc=*=cuda +- umap-learn +- scikit-learn=0.23.1 +- treelite=0.93 +- pip +- pip: + - sphinx_markdown_tables + - git+https://github.com/dask/dask.git + - git+https://github.com/dask/distributed.git + +# rapids-build-env, notebook-env and doc-env are defined in +# https://docs.rapids.ai/maintainers/depmgmt/ + +# To install different versions of packages contained in those meta packages, +# it is recommended to remove those meta packages (without removing the actual +# packages contained in the environment) first with: +# conda remove 
--force rapids-build-env rapids-notebook-env rapids-doc-env diff --git a/conda/recipes/cuml/meta.yaml b/conda/recipes/cuml/meta.yaml index 14a49504ff..740d238d1d 100644 --- a/conda/recipes/cuml/meta.yaml +++ b/conda/recipes/cuml/meta.yaml @@ -28,10 +28,10 @@ requirements: - setuptools - cython>=0.29,<0.30 - cmake>=3.14 - - treelite=0.92 + - treelite=0.93 - cudf {{ minor_version }} - libcuml={{ version }} - - libcumlprims 0.15.0a200720 + - libcumlprims {{ minor_version }} - cudatoolkit {{ cuda_version }}.* - ucx-py {{ minor_version }} run: @@ -39,9 +39,9 @@ requirements: - cudf {{ minor_version }} - dask-cudf {{ minor_version }} - libcuml={{ version }} - - libcumlprims=0.15.0a200720 - - treelite=0.92 - - cupy>=7,<8.0.0a0 + - libcumlprims {{ minor_version }} + - treelite=0.93 + - cupy>7.1.0,<9.0.0a0 - nccl>=2.5 - ucx-py {{ minor_version }} - dask>=2.12.0 diff --git a/conda/recipes/libcuml/build.sh b/conda/recipes/libcuml/build.sh index 04b629c8b9..318fa1b445 100644 --- a/conda/recipes/libcuml/build.sh +++ b/conda/recipes/libcuml/build.sh @@ -5,9 +5,8 @@ if [ -n "$MACOSX_DEPLOYMENT_TARGET" ]; then export MACOSX_DEPLOYMENT_TARGET=10.11 fi -# show environment -printenv -# Cleanup local git -git clean -xdf - -./build.sh clean libcuml -v --allgpuarch +if [[ -z "$PROJECT_FLASH" || "$PROJECT_FLASH" == "0" ]]; then + ./build.sh clean libcuml -v --allgpuarch +else + ./build.sh clean libcuml prims -v --allgpuarch +fi diff --git a/conda/recipes/libcuml/meta.yaml b/conda/recipes/libcuml/meta.yaml index e37d8363be..012f911253 100644 --- a/conda/recipes/libcuml/meta.yaml +++ b/conda/recipes/libcuml/meta.yaml @@ -23,6 +23,7 @@ build: - CUDAHOSTCXX - PARALLEL_LEVEL - VERSION_SUFFIX + - PROJECT_FLASH requirements: build: @@ -30,20 +31,25 @@ requirements: - clang=8.0.1 - clang-tools=8.0.1 host: - - nccl 2.5.* + - nccl >=2.5 - cudf {{ minor_version }} - cudatoolkit {{ cuda_version }}.* - ucx-py {{ minor_version }} - - libcumlprims=0.15.0a200720 + - libcumlprims {{ minor_version }} - lapack - - treelite=0.92 + - treelite=0.93 + - faiss-proc=*=cuda + - gtest=1.10.0 + - libfaiss=1.6.3 run: - - libcumlprims=0.15.0a200720 + - libcumlprims {{ minor_version }} - cudf {{ minor_version }} - nccl>=2.5 - ucx-py {{ minor_version }} - {{ pin_compatible('cudatoolkit', max_pin='x.x') }} - - treelite=0.92 + - treelite=0.93 + - faiss-proc=*=cuda + - libfaiss=1.6.3 about: home: http://rapids.ai/ diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 61e90b4985..7cb4d471f1 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -16,9 +16,9 @@ set (CMAKE_FIND_NO_INSTALL_PREFIX TRUE FORCE) -cmake_minimum_required(VERSION 3.14 FATAL_ERROR) +cmake_minimum_required(VERSION 3.14...3.17 FATAL_ERROR) -project(CUML VERSION 0.15.0 LANGUAGES C CXX CUDA) +project(CUML VERSION 0.17.0 LANGUAGES C CXX CUDA) ############################################################################## # - build type --------------------------------------------------------------- @@ -64,7 +64,9 @@ option(BUILD_CUML_STD_COMMS "Build the standard NCCL+UCX Communicator" ON) option(BUILD_CUML_MPI_COMMS "Build the MPI+NCCL Communicator (used for testing)" OFF) -option(CMAKE_CXX11_ABI "Enable the GLIBCXX11 ABI" ON) +option(BUILD_STATIC_FAISS "Build the FAISS library for nearest neighbors search on GPU" OFF) + +option(BUILD_GTEST "Build the GTEST library for running libcuml++ and prims test executables" OFF) option(DETECT_CONDA_ENV "Enable detection of conda environment for dependencies" ON) @@ -72,8 +74,6 @@ option(DISABLE_OPENMP "Disable OpenMP" OFF) 
option(ENABLE_CUMLPRIMS_MG "Enable algorithms that use libcumlprims_mg" ON) -option(EMPTY_MARKER_KERNEL "Enable empty marker kernel after nvtxRangePop" OFF) - option(KERNEL_INFO "Enable kernel resource usage info" OFF) option(LINE_INFO "Enable lineinfo in nvcc" OFF) @@ -169,22 +169,34 @@ endif(BUILD_CUML_MG_TESTS AND NOT SINGLEGPU) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/cmake/templates) -GENERATE_FIND_MODULE(NAME cumlprims_mg - HEADER_NAME cumlprims.hpp - LIBRARY_NAME cumlprims - LOCATION cumlprims) +if(ENABLE_CUMLPRIMS_MG) + GENERATE_FIND_MODULE( + NAME cumlprims_mg + HEADER_NAME cumlprims.hpp + LIBRARY_NAME cumlprims + LOCATION cumlprims) +endif(ENABLE_CUMLPRIMS_MG) -GENERATE_FIND_MODULE(NAME NCCL - HEADER_NAME nccl.h - LIBRARY_NAME nccl) +if(BUILD_CUML_STD_COMMS OR BUILD_CUML_MPI_COMMS) +GENERATE_FIND_MODULE( + NAME NCCL + HEADER_NAME nccl.h + LIBRARY_NAME nccl) +endif(BUILD_CUML_STD_COMMS OR BUILD_CUML_MPI_COMMS) -GENERATE_FIND_MODULE(NAME RMM - HEADER_NAME rmm/device_buffer.hpp - LIBRARY_NAME rmm) +if(BUILD_CUML_STD_COMMS) + GENERATE_FIND_MODULE( + NAME UCX + HEADER_NAME ucp/api/ucp.h + LIBRARY_NAME ucp) +endif(BUILD_CUML_STD_COMMS) -GENERATE_FIND_MODULE(NAME UCX - HEADER_NAME ucp/api/ucp.h - LIBRARY_NAME ucp) +if(NOT BUILD_STATIC_FAISS) + GENERATE_FIND_MODULE( + NAME FAISS + HEADER_NAME faiss/IndexFlat.h + LIBRARY_NAME faiss) +endif(NOT BUILD_STATIC_FAISS) set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_BINARY_DIR}/cmake) @@ -254,9 +266,7 @@ endif(KERNEL_INFO) if(NVTX) set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -DNVTX_ENABLED") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DNVTX_ENABLED") - if(EMPTY_MARKER_KERNEL) - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -DENABLE_EMPTY_MARKER_KERNEL") - endif(EMPTY_MARKER_KERNEL) + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS}") endif(NVTX) if(CMAKE_BUILD_TYPE MATCHES Debug) @@ -269,6 +279,14 @@ if("${GPU_ARCHS}" STREQUAL "") evaluate_gpu_archs(GPU_ARCHS) endif() + +# CUDA 11 onwards cub ships with CTK +if((CUDA_VERSION_MAJOR EQUAL 11) OR (CUDA_VERSION_MAJOR GREATER 11)) + set(CUB_IS_PART_OF_CTK ON) +else() + set(CUB_IS_PART_OF_CTK OFF) +endif() + if("${GPU_ARCHS}" STREQUAL "ALL") set(GPU_ARCHS "60") if((CUDA_VERSION_MAJOR EQUAL 9) OR (CUDA_VERSION_MAJOR GREATER 9)) @@ -277,6 +295,9 @@ if("${GPU_ARCHS}" STREQUAL "ALL") if((CUDA_VERSION_MAJOR EQUAL 10) OR (CUDA_VERSION_MAJOR GREATER 10)) set(GPU_ARCHS "${GPU_ARCHS};75") endif() + if((CUDA_VERSION_MAJOR EQUAL 11) OR (CUDA_VERSION_MAJOR GREATER 11)) + set(GPU_ARCHS "${GPU_ARCHS};80") + endif() endif() message("-- Building for GPU_ARCHS = ${GPU_ARCHS}") @@ -290,17 +311,6 @@ list(GET GPU_ARCHS -1 ptx) set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode arch=compute_${ptx},code=compute_${ptx}") set(FAISS_GPU_ARCHS "${FAISS_GPU_ARCHS} -gencode arch=compute_${ptx},code=compute_${ptx}") -if(CMAKE_COMPILER_IS_GNUCXX) - if(NOT CMAKE_CXX11_ABI) - message(STATUS "Disabling the GLIBCXX11 ABI") - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=0") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=0") - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler -D_GLIBCXX_USE_CXX11_ABI=0") - elseif(CMAKE_CXX11_ABI) - message(STATUS "Enabling the GLIBCXX11 ABI") - endif(NOT CMAKE_CXX11_ABI) -endif(CMAKE_COMPILER_IS_GNUCXX) - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcudafe --diag_suppress=unrecognized_gcc_pragma") @@ -317,37 +327,53 @@ set(CUML_INCLUDE_DIRECTORIES ${CMAKE_CURRENT_SOURCE_DIR}/src ${CMAKE_CURRENT_SOURCE_DIR}/src_prims ${CMAKE_CURRENT_SOURCE_DIR}/test/prims - ${FAISS_DIR}/src/ 
${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES} ${CUTLASS_DIR}/src/cutlass - ${CUB_DIR}/src/cub ${SPDLOG_DIR}/src/spdlog/include + ${FAISS_INCLUDE_DIRS} ${RAFT_DIR}/cpp/include - ${RMM_INCLUDE_DIRS} - ) + ${RMM_INCLUDE_DIRS}) + +if(NOT CUB_IS_PART_OF_CTK) + list(APPEND CUML_INCLUDE_DIRECTORIES ${CUB_DIR}/src/cub) +endif(NOT CUB_IS_PART_OF_CTK) set(CUML_PUBLIC_LINK_LIBRARIES ${CUDA_cublas_LIBRARY} ${CUDA_curand_LIBRARY} ${CUDA_cusolver_LIBRARY} ${CUDA_CUDART_LIBRARY} - ${CUDA_cusparse_LIBRARY} - ) + ${CUDA_cusparse_LIBRARY}) + set(CUML_PRIVATE_LINK_LIBRARIES - faisslib + FAISS::FAISS treelite::treelite treelite::treelite_runtime - RMM::RMM ) +if(BUILD_CUML_STD_COMMS OR BUILD_CUML_MPI_COMMS) + list(APPEND CUML_INCLUDE_DIRECTORIES + ${NCCL_INCLUDE_DIRS}) + + list(APPEND CUML_PRIVATE_LINK_LIBRARIES + NCCL::NCCL) +endif(BUILD_CUML_STD_COMMS OR BUILD_CUML_MPI_COMMS) + +if(BUILD_CUML_MPI_COMMS) + list(APPEND CUML_INCLUDE_DIRECTORIES + ${MPI_CXX_INCLUDE_PATH}) + + list(APPEND CUML_PRIVATE_LINK_LIBRARIES + ${MPI_CXX_LIBRARIES}) +endif(BUILD_CUML_MPI_COMMS) + if(ENABLE_CUMLPRIMS_MG) list(APPEND CUML_INCLUDE_DIRECTORIES ${cumlprims_mg_INCLUDE_DIRS}) list(APPEND CUML_PRIVATE_LINK_LIBRARIES cumlprims_mg::cumlprims_mg) - endif(ENABLE_CUMLPRIMS_MG) ############################################################################## @@ -358,14 +384,12 @@ if(BUILD_CUML_CPP_LIBRARY) # single GPU components add_library(${CUML_CPP_TARGET} SHARED - src/arima/batched_kalman.cu src/arima/batched_arima.cu + src/arima/batched_kalman.cu src/common/cumlHandle.cpp src/common/cuml_api.cpp - src/common/cuML_comms_impl.cpp src/common/logger.cpp src/common/nvtx.cu - src/comms/cuML_comms_test.cpp src/datasets/make_arima.cu src/datasets/make_blobs.cu src/datasets/make_regression.cu @@ -465,17 +489,6 @@ if(BUILD_CUML_TESTS OR BUILD_CUML_MG_TESTS OR BUILD_PRIMS_TESTS) add_subdirectory(test ${PROJECT_BINARY_DIR}/test) endif(BUILD_CUML_TESTS OR BUILD_CUML_MG_TESTS OR BUILD_PRIMS_TESTS) -############################################################################## -# - build comms ------------------------------------------------------------------------------ - -if(BUILD_CUML_STD_COMMS) - add_subdirectory(comms/std) -endif(BUILD_CUML_STD_COMMS) - -if(BUILD_CUML_MPI_COMMS) - add_subdirectory(comms/mpi) -endif(BUILD_CUML_MPI_COMMS) - ############################################################################## # - build examples ------------------------------------------------------------------------------ @@ -491,6 +504,7 @@ install(TARGETS ${CUML_CPP_TARGET} DESTINATION lib) install(DIRECTORY ${CUML_INCLUDE_DIR}/cuml DESTINATION include) +install(DIRECTORY ${RAFT_DIR}/cpp/include/ DESTINATION include/cuml) ############################################################################## # - build benchmark executable ----------------------------------------------- diff --git a/cpp/Doxyfile.in b/cpp/Doxyfile.in index 7a8200db78..d8cd284118 100644 --- a/cpp/Doxyfile.in +++ b/cpp/Doxyfile.in @@ -230,12 +230,6 @@ TAB_SIZE = 4 ALIASES = -# This tag can be used to specify a number of word-keyword mappings (TCL only). -# A mapping has the form "name=value". For example adding "class=itcl::class" -# will allow you to use the command class in the itcl::class meaning. - -TCL_SUBST = - # Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources # only. Doxygen will then generate output that is more tailored for C. For # instance, some of the names that are used will be different. 
The list of all @@ -771,8 +765,7 @@ WARN_LOGFILE = # spaces. See also FILE_PATTERNS and EXTENSION_MAPPING # Note: If this tag is empty the current directory is searched. -INPUT = @CMAKE_CURRENT_SOURCE_DIR@/comms \ - @CMAKE_CURRENT_SOURCE_DIR@/include \ +INPUT = @CMAKE_CURRENT_SOURCE_DIR@/include \ @CMAKE_CURRENT_SOURCE_DIR@/src \ @CMAKE_CURRENT_SOURCE_DIR@/src_prims @@ -873,7 +866,11 @@ EXAMPLE_RECURSIVE = NO # that contain images that are to be included in the documentation (see the # \image command). -IMAGE_PATH = @CMAKE_CURRENT_SOURCE_DIR@/doxygen/images +# IMAGE_PATH = @CMAKE_CURRENT_SOURCE_DIR@/doxygen/images + +# temporarily using cmake_current_source_dir for image path since we don't have images, +# comment the above whenever images are needed in the doxygen/images folder +IMAGE_PATH = @CMAKE_CURRENT_SOURCE_DIR@/ # The INPUT_FILTER tag can be used to specify a program that doxygen should # invoke to filter for each input file. Doxygen will invoke the filter program @@ -1017,25 +1014,6 @@ USE_HTAGS = NO VERBATIM_HEADERS = YES -# If the CLANG_ASSISTED_PARSING tag is set to YES then doxygen will use the -# clang parser (see: http://clang.llvm.org/) for more accurate parsing at the -# cost of reduced performance. This can be particularly helpful with template -# rich C++ code for which doxygen's built-in parser lacks the necessary type -# information. -# Note: The availability of this option depends on whether or not doxygen was -# generated with the -Duse-libclang=ON option for CMake. -# The default value is: NO. - -CLANG_ASSISTED_PARSING = NO - -# If clang assisted parsing is enabled you can provide the compiler with command -# line options that you would normally use when invoking the compiler. Note that -# the include paths will already be set by doxygen for the files and directories -# specified with INPUT and INCLUDE_PATH. -# This tag requires that the tag CLANG_ASSISTED_PARSING is set to YES. - -CLANG_OPTIONS = - #--------------------------------------------------------------------------- # Configuration options related to the alphabetical class index #--------------------------------------------------------------------------- @@ -1500,10 +1478,10 @@ MATHJAX_FORMAT = HTML-CSS # Content Delivery Network so you can quickly see the result without installing # MathJax. However, it is strongly recommended to install a local copy of # MathJax from http://www.mathjax.org before deployment. -# The default value is: http://cdn.mathjax.org/mathjax/latest. +# The default value is: https://cdn.mathjax.org/mathjax/latest. # This tag requires that the tag USE_MATHJAX is set to YES. -MATHJAX_RELPATH = http://cdn.mathjax.org/mathjax/latest +MATHJAX_RELPATH = https://cdn.mathjax.org/mathjax/latest # The MATHJAX_EXTENSIONS tag can be used to specify one or more MathJax # extension names that should be enabled during MathJax rendering. For example @@ -2113,11 +2091,6 @@ EXTERNAL_GROUPS = YES EXTERNAL_PAGES = YES -# The PERL_PATH should be the absolute path and name of the perl script -# interpreter (i.e. the result of 'which perl'). -# The default file (with absolute path) is: /usr/bin/perl. - -PERL_PATH = /usr/bin/perl #--------------------------------------------------------------------------- # Configuration options related to the dot tool @@ -2132,14 +2105,6 @@ PERL_PATH = /usr/bin/perl CLASS_DIAGRAMS = YES -# You can define message sequence charts within doxygen comments using the \msc -# command. 
Doxygen will then run the mscgen tool (see:
-# http://www.mcternan.me.uk/mscgen/)) to produce the chart and insert it in the
-# documentation. The MSCGEN_PATH tag allows you to specify the directory where
-# the mscgen tool resides. If left empty the tool is assumed to be found in the
-# default search path.
-
-MSCGEN_PATH =
 # You can include diagrams made with dia in doxygen documentation. Doxygen will
 # then run dia to produce the diagram and insert it in the documentation. The
diff --git a/cpp/README.md b/cpp/README.md
index d258e084c6..f4d9076710 100644
--- a/cpp/README.md
+++ b/cpp/README.md
@@ -33,6 +33,7 @@ Current cmake offers the following configuration options:
 | Flag | Possible Values | Default Value | Behavior |
 | --- | --- | --- | --- |
 | BUILD_CUML_CPP_LIBRARY | [ON, OFF] | ON | Enable/disable building libcuml++ shared library. Setting this variable to `OFF` sets the variables BUILD_CUML_TESTS, BUILD_CUML_MG_TESTS and BUILD_CUML_EXAMPLES to `OFF` |
+| BUILD_GTEST | [ON, OFF] | ON | Enable/disable building Googletest for test executables. The library search path will be used to find an existing version. |
 | BUILD_CUML_TESTS | [ON, OFF] | ON | Enable/disable building cuML algorithm test executable `ml_test`. |
 | BUILD_CUML_MG_TESTS | [ON, OFF] | ON | Enable/disable building cuML algorithm test executable `ml_mg_test`. Requires MPI to be installed. When enabled, BUILD_CUML_MPI_COMMS will be automatically set to ON. |
 | BUILD_PRIMS_TESTS | [ON, OFF] | ON | Enable/disable building cuML algorithm test executable `prims_test`. |
@@ -42,7 +43,7 @@ Current cmake offers the following configuration options:
 | SINGLEGPU | [ON, OFF] | OFF | Disable all mnmg components. Disables building of all multi-GPU algorithms and all comms library components. Removes libcumlprims, UCX-py and NCCL dependencies. Overrides values of BUILD_CUML_MG_TESTS, BUILD_CUML_STD_COMMS, WITH_UCX and BUILD_CUML_MPI_COMMS. |
 | BUILD_CUML_EXAMPLES | [ON, OFF] | ON | Enable/disable building cuML C++ API usage examples. |
 | BUILD_CUML_BENCH | [ON, OFF] | ON | Enable/disable building of the cuML C++ benchmark. |
-| CMAKE_CXX11_ABI | [ON, OFF] | ON | Enable/disable the GLIBCXX11 ABI |
+| BUILD_STATIC_FAISS | [ON, OFF] | OFF | Enable/disable building and static linking of FAISS into cuML. When this option is disabled, the build will search for an installed version of FAISS. |
 | DISABLE_OPENMP | [ON, OFF] | OFF | Set to `ON` to disable OpenMP |
 | GPU_ARCHS | List of GPU architectures, semicolon-separated | Empty | List of GPU architectures that all artifacts are compiled for. Passing ALL means compiling for all currently supported GPU architectures: 60;70;75. If you don't pass this flag, then the build system will try to look for the GPU card installed on the system and compile only for that. |
@@ -50,7 +51,9 @@ Current cmake offers the following configuration options:
 | Flag | Possible Values | Default Value | Behavior |
 | --- | --- | --- | --- |
-| BLAS_LIBRARIES | path/to/blas_lib | "" | Optional variable allowing to manually specify location of BLAS library. |
+| BLAS_LIBRARIES | path/to/blas_lib | "" | Optional variable allowing to manually specify location of BLAS library. This is only used when BUILD_STATIC_FAISS=ON |
+| FAISS_ROOT | path/to/faiss | "" | Optional variable allowing to manually specify the location of FAISS. |
+| GTEST_ROOT | path/to/gtest | "" | Optional variable allowing to manually specify the location of Googletest. |
 | NCCL_PATH| path/to/nccl | "" | Optional variable allowing to manually specify location of NCCL library. |
 | CUMLPRIMS_MG_PATH | path/to/libcumlprims | "" | Optional variable allowing to manually specify location of libcumlprims library. |
@@ -83,3 +86,36 @@ Current external submodules are:
 2. [CUB](https://github.com/NVlabs/cub)
 3. [Faiss](https://github.com/facebookresearch/faiss)
 4. [Google Test](https://github.com/google/googletest)
+
+## Using cuML libraries
+
+After building cuML, you can use its functionality in other C/C++ applications
+by linking against the generated libraries. The following trivial example shows
+how to make external use of cuML's logger:
+
+```cpp
+// main.cpp
+#include <cuml/common/logger.hpp>
+
+int main(int argc, char *argv[]) {
+  CUML_LOG_WARN("This is a warning from the cuML logger!");
+  return 0;
+}
+```
+
+To compile this example, we must point the compiler to where cuML was
+installed. Assuming you did not provide a custom `$CMAKE_INSTALL_PREFIX`, this
+will default to the `$CONDA_PREFIX` environment variable.
+
+```bash
+$ export LD_LIBRARY_PATH="${CONDA_PREFIX}/lib"
+$ nvcc \
+    main.cpp \
+    -o cuml_logger_example \
+    "-L${CONDA_PREFIX}/lib" \
+    "-I${CONDA_PREFIX}/include" \
+    "-I${CONDA_PREFIX}/include/cuml/raft" \
+    -lcuml++
+$ ./cuml_logger_example
+[W] [13:26:43.503068] This is a warning from the cuML logger!
+```
diff --git a/cpp/bench/CMakeLists.txt b/cpp/bench/CMakeLists.txt
index 63cecba1ac..48de6fe1bd 100644
--- a/cpp/bench/CMakeLists.txt
+++ b/cpp/bench/CMakeLists.txt
@@ -27,10 +27,13 @@ if(BUILD_CUML_BENCH)
     sg/kmeans.cu
     sg/main.cpp
     sg/rf_classifier.cu
-    sg/rf_regressor.cu
+    # FIXME: RF Regressor is having an issue where the tests now seem to take
+    # forever to finish, as opposed to the classifier counterparts!
+    # sg/rf_regressor.cu
     sg/svc.cu
     sg/svr.cu
     sg/umap.cu
+    sg/fil.cu
   )
 target_link_libraries(sg_benchmark
@@ -62,9 +65,11 @@ if(BUILD_CUML_PRIMS_BENCH)
     prims/permute.cu
     prims/reduce.cu
     prims/rng.cu
-    ../src/common/logger.cpp # because prims is header only!
-  )
+    ../src/common/logger.cpp) # because prims is header only!
+  if(NOT CUB_IS_PART_OF_CTK)
+    add_dependencies(prims_benchmark cub)
+  endif(NOT CUB_IS_PART_OF_CTK)
 add_dependencies(prims_benchmark spdlog)
 target_link_libraries(prims_benchmark ${CUDA_cublas_LIBRARY} benchmarklib)
diff --git a/cpp/bench/common/ml_benchmark.hpp b/cpp/bench/common/ml_benchmark.hpp
index 678c32da52..a205256cda 100644
--- a/cpp/bench/common/ml_benchmark.hpp
+++ b/cpp/bench/common/ml_benchmark.hpp
@@ -17,8 +17,8 @@
 #pragma once
 #include
-#include
 #include
+#include
 #include
 #include
 #include
diff --git a/cpp/bench/prims/add.cu b/cpp/bench/prims/add.cu
index 92eadcc289..c89c7413d4 100644
--- a/cpp/bench/prims/add.cu
+++ b/cpp/bench/prims/add.cu
@@ -14,7 +14,7 @@
  * limitations under the License.
*/ -#include +#include #include "../common/ml_benchmark.hpp" namespace MLCommon { @@ -28,8 +28,8 @@ struct AddParams { template struct AddBench : public Fixture { AddBench(const std::string& name, const AddParams& p) - : Fixture(name, - std::shared_ptr(new defaultDeviceAllocator)), + : Fixture(name, std::shared_ptr( + new raft::mr::device::default_allocator)), params(p) {} protected: @@ -45,7 +45,7 @@ struct AddBench : public Fixture { void runBenchmark(::benchmark::State& state) override { loopOnState(state, [this]() { - MLCommon::LinAlg::add(ptr0, ptr0, ptr1, params.len, stream); + raft::linalg::add(ptr0, ptr0, ptr1, params.len, stream); }); } diff --git a/cpp/bench/prims/distance_common.cuh b/cpp/bench/prims/distance_common.cuh index 895a6be86e..112d17d18f 100644 --- a/cpp/bench/prims/distance_common.cuh +++ b/cpp/bench/prims/distance_common.cuh @@ -14,7 +14,7 @@ * limitations under the License. */ -#include +#include #include #include "../common/ml_benchmark.hpp" @@ -26,11 +26,11 @@ struct Params { int m, n, k; }; // struct Params -template +template struct Distance : public Fixture { Distance(const std::string& name, const Params& p) - : Fixture(name, - std::shared_ptr(new defaultDeviceAllocator)), + : Fixture(name, std::shared_ptr( + new raft::mr::device::default_allocator)), params(p) {} protected: diff --git a/cpp/bench/prims/distance_cosine.cu b/cpp/bench/prims/distance_cosine.cu index 6f9291535e..5f937fdd43 100644 --- a/cpp/bench/prims/distance_cosine.cu +++ b/cpp/bench/prims/distance_cosine.cu @@ -20,7 +20,8 @@ namespace MLCommon { namespace Bench { namespace Distance { -DIST_BENCH_REGISTER(DistanceCosine, MLCommon::Distance::EucExpandedCosine); +DIST_BENCH_REGISTER(DistanceCosine, + ML::Distance::DistanceType::EucExpandedCosine); } // namespace Distance } // namespace Bench diff --git a/cpp/bench/prims/distance_exp_l2.cu b/cpp/bench/prims/distance_exp_l2.cu index 31ca18f5f9..9940e6ba28 100644 --- a/cpp/bench/prims/distance_exp_l2.cu +++ b/cpp/bench/prims/distance_exp_l2.cu @@ -20,8 +20,9 @@ namespace MLCommon { namespace Bench { namespace Distance { -DIST_BENCH_REGISTER(DistanceL2Sq, MLCommon::Distance::EucExpandedL2); -DIST_BENCH_REGISTER(DistanceL2Sqrt, MLCommon::Distance::EucExpandedL2Sqrt); +DIST_BENCH_REGISTER(DistanceL2Sq, ML::Distance::DistanceType::EucExpandedL2); +DIST_BENCH_REGISTER(DistanceL2Sqrt, + ML::Distance::DistanceType::EucExpandedL2Sqrt); } // namespace Distance } // namespace Bench diff --git a/cpp/bench/prims/distance_l1.cu b/cpp/bench/prims/distance_l1.cu index 6abb0cb8aa..1e97e9b891 100644 --- a/cpp/bench/prims/distance_l1.cu +++ b/cpp/bench/prims/distance_l1.cu @@ -20,7 +20,7 @@ namespace MLCommon { namespace Bench { namespace Distance { -DIST_BENCH_REGISTER(DistanceL1, MLCommon::Distance::EucUnexpandedL1); +DIST_BENCH_REGISTER(DistanceL1, ML::Distance::DistanceType::EucUnexpandedL1); } // namespace Distance } // namespace Bench diff --git a/cpp/bench/prims/distance_unexp_l2.cu b/cpp/bench/prims/distance_unexp_l2.cu index 5bbf3d81f3..82e65c69ea 100644 --- a/cpp/bench/prims/distance_unexp_l2.cu +++ b/cpp/bench/prims/distance_unexp_l2.cu @@ -20,9 +20,10 @@ namespace MLCommon { namespace Bench { namespace Distance { -DIST_BENCH_REGISTER(DistanceUnexpL2Sq, MLCommon::Distance::EucUnexpandedL2); +DIST_BENCH_REGISTER(DistanceUnexpL2Sq, + ML::Distance::DistanceType::EucUnexpandedL2); DIST_BENCH_REGISTER(DistanceUnexpL2Sqrt, - MLCommon::Distance::EucUnexpandedL2Sqrt); + ML::Distance::DistanceType::EucUnexpandedL2Sqrt); } // namespace Distance } // namespace Bench 
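
The `cpp/bench/prims` hunks above are one mechanical migration repeated per file: `MLCommon::LinAlg::*` and `MLCommon::Random::Rng` move to `raft::linalg::*` and `raft::random::Rng`, and the fixtures swap `defaultDeviceAllocator` for `raft::mr::device::default_allocator`, with the call signatures otherwise unchanged. A minimal standalone sketch of the post-migration idiom follows; the `#include` targets are elided in this diff, so the header paths below are best guesses at the 0.17-era RAFT layout, while the calls themselves mirror the ones used in the benchmarks:

```cpp
// Sketch only: the raft:: idiom the prims benchmarks migrate to.
#include <memory>

#include <cuda_runtime.h>
#include <raft/cudart_utils.h>           // assumed home of CUDA_CHECK
#include <raft/linalg/add.cuh>           // assumed home of raft::linalg::add
#include <raft/mr/device/allocator.hpp>  // raft::mr::device::default_allocator
#include <raft/random/rng.cuh>           // assumed home of raft::random::Rng

int main() {
  cudaStream_t stream;
  CUDA_CHECK(cudaStreamCreate(&stream));

  // Fixtures now construct RAFT's allocator instead of defaultDeviceAllocator.
  auto alloc = std::make_shared<raft::mr::device::default_allocator>();

  const int len = 1 << 20;
  auto* in1 = static_cast<float*>(alloc->allocate(len * sizeof(float), stream));
  auto* in2 = static_cast<float*>(alloc->allocate(len * sizeof(float), stream));
  auto* out = static_cast<float*>(alloc->allocate(len * sizeof(float), stream));

  // MLCommon::Random::Rng -> raft::random::Rng, same uniform() signature.
  raft::random::Rng r(123456ULL);
  r.uniform(in1, len, -1.f, 1.f, stream);
  r.uniform(in2, len, -1.f, 1.f, stream);

  // MLCommon::LinAlg::add -> raft::linalg::add, same argument order.
  raft::linalg::add(out, in1, in2, len, stream);

  CUDA_CHECK(cudaStreamSynchronize(stream));
  alloc->deallocate(in1, len * sizeof(float), stream);
  alloc->deallocate(in2, len * sizeof(float), stream);
  alloc->deallocate(out, len * sizeof(float), stream);
  CUDA_CHECK(cudaStreamDestroy(stream));
  return 0;
}
```
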
diff --git a/cpp/bench/prims/fused_l2_nn.cu b/cpp/bench/prims/fused_l2_nn.cu index fcb2608c85..60a2a6cfaa 100644 --- a/cpp/bench/prims/fused_l2_nn.cu +++ b/cpp/bench/prims/fused_l2_nn.cu @@ -14,11 +14,11 @@ * limitations under the License. */ -#include +#include #include #include -#include -#include +#include +#include #include "../common/ml_benchmark.hpp" namespace MLCommon { @@ -32,8 +32,8 @@ struct FLNParams { template struct FusedL2NN : public Fixture { FusedL2NN(const std::string& name, const FLNParams& p) - : Fixture(name, - std::shared_ptr(new defaultDeviceAllocator)), + : Fixture(name, std::shared_ptr( + new raft::mr::device::default_allocator)), params(p) {} protected: @@ -44,14 +44,14 @@ struct FusedL2NN : public Fixture { alloc(yn, params.n); alloc(out, params.m); alloc(workspace, params.m); - MLCommon::Random::Rng r(123456ULL); + raft::random::Rng r(123456ULL); r.uniform(x, params.m * params.k, T(-1.0), T(1.0), stream); r.uniform(y, params.n * params.k, T(-1.0), T(1.0), stream); - MLCommon::LinAlg::rowNorm(xn, x, params.k, params.m, - MLCommon::LinAlg::L2Norm, true, stream); - MLCommon::LinAlg::rowNorm(yn, y, params.k, params.n, - MLCommon::LinAlg::L2Norm, true, stream); - auto blks = ceildiv(params.m, 256); + raft::linalg::rowNorm(xn, x, params.k, params.m, raft::linalg::L2Norm, true, + stream); + raft::linalg::rowNorm(yn, y, params.k, params.n, raft::linalg::L2Norm, true, + stream); + auto blks = raft::ceildiv(params.m, 256); MLCommon::Distance::initKernel, int> <<>>(out, params.m, std::numeric_limits::max(), op); diff --git a/cpp/bench/prims/gram_matrix.cu b/cpp/bench/prims/gram_matrix.cu index 5961a5ee38..f8cbf664c6 100644 --- a/cpp/bench/prims/gram_matrix.cu +++ b/cpp/bench/prims/gram_matrix.cu @@ -18,7 +18,7 @@ #include #include #include -#include +#include #include #include #include @@ -40,8 +40,8 @@ struct GramTestParams { template struct GramMatrix : public Fixture { GramMatrix(const std::string& name, const GramTestParams& p) - : Fixture(name, - std::shared_ptr(new defaultDeviceAllocator)), + : Fixture(name, std::shared_ptr( + new raft::mr::device::default_allocator)), params(p) { std::vector kernel_names{"linear", "poly", "rbf", "tanh"}; std::ostringstream oss; @@ -61,7 +61,7 @@ struct GramMatrix : public Fixture { alloc(A, params.m * params.k); alloc(B, params.k * params.n); alloc(C, params.m * params.n); - MLCommon::Random::Rng r(123456ULL); + raft::random::Rng r(123456ULL); r.uniform(A, params.m * params.k, T(-1.0), T(1.0), stream); r.uniform(B, params.k * params.n, T(-1.0), T(1.0), stream); } diff --git a/cpp/bench/prims/make_blobs.cu b/cpp/bench/prims/make_blobs.cu index ca7eff42f2..0e9f65a5f3 100644 --- a/cpp/bench/prims/make_blobs.cu +++ b/cpp/bench/prims/make_blobs.cu @@ -29,8 +29,8 @@ struct Params { template struct MakeBlobs : public Fixture { MakeBlobs(const std::string& name, const Params& p) - : Fixture(name, - std::shared_ptr(new defaultDeviceAllocator)), + : Fixture(name, std::shared_ptr( + new raft::mr::device::default_allocator)), params(p) {} protected: diff --git a/cpp/bench/prims/map_then_reduce.cu b/cpp/bench/prims/map_then_reduce.cu index d5c757d003..2bd8bf2501 100644 --- a/cpp/bench/prims/map_then_reduce.cu +++ b/cpp/bench/prims/map_then_reduce.cu @@ -14,7 +14,7 @@ * limitations under the License. 
*/ -#include +#include #include "../common/ml_benchmark.hpp" namespace MLCommon { @@ -33,8 +33,8 @@ struct Identity { template struct MapThenReduce : public Fixture { MapThenReduce(const std::string& name, const Params& p) - : Fixture(name, - std::shared_ptr(new defaultDeviceAllocator)), + : Fixture(name, std::shared_ptr( + new raft::mr::device::default_allocator)), params(p) {} protected: @@ -50,8 +50,8 @@ struct MapThenReduce : public Fixture { void runBenchmark(::benchmark::State& state) override { loopOnState(state, [this]() { - MLCommon::LinAlg::mapThenSumReduce(out, params.len, Identity(), stream, - in); + raft::linalg::mapThenSumReduce(out, params.len, Identity(), stream, + in); }); } diff --git a/cpp/bench/prims/matrix_vector_op.cu b/cpp/bench/prims/matrix_vector_op.cu index 62b1ebaa76..4dd7a3ea75 100644 --- a/cpp/bench/prims/matrix_vector_op.cu +++ b/cpp/bench/prims/matrix_vector_op.cu @@ -14,7 +14,7 @@ * limitations under the License. */ -#include +#include #include "../common/ml_benchmark.hpp" namespace MLCommon { @@ -29,8 +29,8 @@ struct Params { template struct MatVecOp : public Fixture { MatVecOp(const std::string& name, const Params& p) - : Fixture(name, - std::shared_ptr(new defaultDeviceAllocator)), + : Fixture(name, std::shared_ptr( + new raft::mr::device::default_allocator)), params(p) {} protected: @@ -50,9 +50,9 @@ struct MatVecOp : public Fixture { void runBenchmark(::benchmark::State& state) override { loopOnState(state, [this]() { - MLCommon::LinAlg::matrixVectorOp(out, in, vec, params.cols, params.rows, - params.rowMajor, params.bcastAlongRows, - Sum(), stream); + raft::linalg::matrixVectorOp(out, in, vec, params.cols, params.rows, + params.rowMajor, params.bcastAlongRows, + raft::Sum(), stream); }); } diff --git a/cpp/bench/prims/permute.cu b/cpp/bench/prims/permute.cu index 1b54d6e8cb..8d3b8f1157 100644 --- a/cpp/bench/prims/permute.cu +++ b/cpp/bench/prims/permute.cu @@ -14,9 +14,9 @@ * limitations under the License. */ -#include +#include +#include #include -#include #include "../common/ml_benchmark.hpp" namespace MLCommon { @@ -31,8 +31,8 @@ struct Params { template struct Permute : public Fixture { Permute(const std::string& name, const Params& p) - : Fixture(name, - std::shared_ptr(new defaultDeviceAllocator)), + : Fixture(name, std::shared_ptr( + new raft::mr::device::default_allocator)), params(p) {} protected: @@ -44,7 +44,7 @@ struct Permute : public Fixture { } else { perms = nullptr; } - MLCommon::Random::Rng r(123456ULL); + raft::random::Rng r(123456ULL); if (params.needShuffle) { alloc(out, matLen); alloc(in, matLen); @@ -67,7 +67,7 @@ struct Permute : public Fixture { } void runBenchmark(::benchmark::State& state) override { - MLCommon::Random::Rng r(123456ULL); + raft::random::Rng r(123456ULL); loopOnState(state, [this, &r]() { MLCommon::Random::permute(perms, out, in, params.cols, params.rows, params.rowMajor, stream); diff --git a/cpp/bench/prims/reduce.cu b/cpp/bench/prims/reduce.cu index cfcb193ffb..0ed557ab71 100644 --- a/cpp/bench/prims/reduce.cu +++ b/cpp/bench/prims/reduce.cu @@ -14,7 +14,7 @@ * limitations under the License. 
*/ -#include +#include #include "../common/ml_benchmark.hpp" namespace MLCommon { @@ -29,8 +29,8 @@ struct Params { template struct Reduce : public Fixture { Reduce(const std::string& name, const Params& p) - : Fixture(name, - std::shared_ptr(new defaultDeviceAllocator)), + : Fixture(name, std::shared_ptr( + new raft::mr::device::default_allocator)), params(p) {} protected: @@ -46,8 +46,8 @@ struct Reduce : public Fixture { void runBenchmark(::benchmark::State& state) override { loopOnState(state, [this]() { - MLCommon::LinAlg::reduce(dots, data, params.cols, params.rows, T(0.f), - true, params.alongRows, stream); + raft::linalg::reduce(dots, data, params.cols, params.rows, T(0.f), true, + params.alongRows, stream); }); } diff --git a/cpp/bench/prims/rng.cu b/cpp/bench/prims/rng.cu index 934794bd80..af1281eb0e 100644 --- a/cpp/bench/prims/rng.cu +++ b/cpp/bench/prims/rng.cu @@ -14,8 +14,8 @@ * limitations under the License. */ -#include -#include +#include +#include #include "../common/ml_benchmark.hpp" namespace MLCommon { @@ -38,15 +38,15 @@ template struct Params { int len; RandomType type; - MLCommon::Random::GeneratorType gtype; + raft::random::GeneratorType gtype; T start, end; }; // struct Params template struct RngBench : public Fixture { RngBench(const std::string& name, const Params& p) - : Fixture(name, - std::shared_ptr(new defaultDeviceAllocator)), + : Fixture(name, std::shared_ptr( + new raft::mr::device::default_allocator)), params(p) {} protected: @@ -59,7 +59,7 @@ struct RngBench : public Fixture { } void runBenchmark(::benchmark::State& state) override { - MLCommon::Random::Rng r(123456ULL, params.gtype); + raft::random::Rng r(123456ULL, params.gtype); loopOnState(state, [this, &r]() { switch (params.type) { case RNG_Normal: @@ -100,7 +100,7 @@ struct RngBench : public Fixture { template static std::vector> getInputs() { - using namespace MLCommon::Random; + using namespace raft::random; return { {1024 * 1024, RNG_Uniform, GenPhilox, T(-1.0), T(1.0)}, {32 * 1024 * 1024, RNG_Uniform, GenPhilox, T(-1.0), T(1.0)}, diff --git a/cpp/bench/sg/arima_loglikelihood.cu b/cpp/bench/sg/arima_loglikelihood.cu index f3befac912..d13e859e49 100644 --- a/cpp/bench/sg/arima_loglikelihood.cu +++ b/cpp/bench/sg/arima_loglikelihood.cu @@ -22,8 +22,9 @@ #include #include -#include +#include +#include #include "benchmark.cuh" namespace ML { @@ -46,13 +47,12 @@ class ArimaLoglikelihood : public TsFixtureRandom { using MLCommon::Bench::CudaEventTimer; auto& handle = *this->handle; - auto stream = handle.getStream(); + auto stream = handle.get_stream(); auto counting = thrust::make_counting_iterator(0); // Generate random parameters int N = order.complexity(); - MLCommon::Random::Rng gpu_gen(this->params.seed, - MLCommon::Random::GenPhilox); + raft::random::Rng gpu_gen(this->params.seed, raft::random::GenPhilox); gpu_gen.uniform(param, N * this->params.batch_size, -1.0, 1.0, stream); // Set sigma2 parameters to 1.0 DataT* x = param; // copy the object attribute for thrust @@ -75,8 +75,8 @@ class ArimaLoglikelihood : public TsFixtureRandom { Fixture::allocateBuffers(state); auto& handle = *this->handle; - auto stream = handle.getStream(); - auto allocator = handle.getDeviceAllocator(); + auto stream = handle.get_stream(); + auto allocator = handle.get_device_allocator(); // Buffer for the model parameters param = (DataT*)allocator->allocate( @@ -86,28 +86,24 @@ class ArimaLoglikelihood : public TsFixtureRandom { loglike = (DataT*)allocator->allocate( this->params.batch_size * sizeof(DataT), 
stream); residual = (DataT*)allocator->allocate( - this->params.batch_size * (this->params.n_obs - order.lost_in_diff()) * - sizeof(DataT), - stream); + this->params.batch_size * this->params.n_obs * sizeof(DataT), stream); } void deallocateBuffers(const ::benchmark::State& state) { Fixture::deallocateBuffers(state); auto& handle = *this->handle; - auto stream = handle.getStream(); - auto allocator = handle.getDeviceAllocator(); + auto stream = handle.get_stream(); + auto allocator = handle.get_device_allocator(); allocator->deallocate( param, order.complexity() * this->params.batch_size * sizeof(DataT), stream); allocator->deallocate(loglike, this->params.batch_size * sizeof(DataT), stream); - allocator->deallocate(residual, - this->params.batch_size * - (this->params.n_obs - order.lost_in_diff()) * - sizeof(DataT), - stream); + allocator->deallocate( + residual, this->params.batch_size * this->params.n_obs * sizeof(DataT), + stream); } protected: diff --git a/cpp/bench/sg/benchmark.cuh b/cpp/bench/sg/benchmark.cuh index 2e448a0a51..2669b79019 100644 --- a/cpp/bench/sg/benchmark.cuh +++ b/cpp/bench/sg/benchmark.cuh @@ -17,8 +17,8 @@ #pragma once #include -#include #include +#include #include #include #include "../common/ml_benchmark.hpp" @@ -32,15 +32,16 @@ namespace Bench { class Fixture : public MLCommon::Bench::Fixture { public: Fixture(const std::string& name) - : MLCommon::Bench::Fixture( - name, std::shared_ptr(new defaultDeviceAllocator)) {} + : MLCommon::Bench::Fixture(name, + std::shared_ptr( + new raft::mr::device::default_allocator)) {} Fixture() = delete; void SetUp(const ::benchmark::State& state) override { - handle.reset(new cumlHandle(NumStreams)); - d_alloc = handle->getDeviceAllocator(); + handle.reset(new raft::handle_t(NumStreams)); + d_alloc = handle->get_device_allocator(); MLCommon::Bench::Fixture::SetUp(state); - handle->setStream(stream); + handle->set_stream(stream); } void TearDown(const ::benchmark::State& state) override { @@ -82,7 +83,7 @@ class Fixture : public MLCommon::Bench::Fixture { generateMetrics(state); } - std::unique_ptr handle; + std::unique_ptr handle; ///@todo: ideally, this should be determined at runtime based on the inputs /// passed to the fixture. 
That will require a whole lot of plumbing of diff --git a/cpp/bench/sg/dataset.cuh b/cpp/bench/sg/dataset.cuh index 1cb72bef53..ce9d243a85 100644 --- a/cpp/bench/sg/dataset.cuh +++ b/cpp/bench/sg/dataset.cuh @@ -16,15 +16,15 @@ #pragma once -#include -#include +#include +#include #include -#include #include #include #include #include -#include +#include +#include #include #include #include @@ -81,17 +81,17 @@ struct Dataset { L* y; /** allocate space needed for the dataset */ - void allocate(const cumlHandle& handle, const DatasetParams& p) { - auto allocator = handle.getDeviceAllocator(); - auto stream = handle.getStream(); + void allocate(const raft::handle_t& handle, const DatasetParams& p) { + auto allocator = handle.get_device_allocator(); + auto stream = handle.get_stream(); X = (D*)allocator->allocate(p.nrows * p.ncols * sizeof(D), stream); y = (L*)allocator->allocate(p.nrows * sizeof(L), stream); } /** free-up the buffers */ - void deallocate(const cumlHandle& handle, const DatasetParams& p) { - auto allocator = handle.getDeviceAllocator(); - auto stream = handle.getStream(); + void deallocate(const raft::handle_t& handle, const DatasetParams& p) { + auto allocator = handle.get_device_allocator(); + auto stream = handle.get_stream(); allocator->deallocate(X, p.nrows * p.ncols * sizeof(D), stream); allocator->deallocate(y, p.nrows * sizeof(L), stream); } @@ -103,12 +103,12 @@ struct Dataset { * Generate random blobs data. Args are the same as in make_blobs. * Assumes that the user has already called `allocate` */ - void blobs(const cumlHandle& handle, const DatasetParams& p, + void blobs(const raft::handle_t& handle, const DatasetParams& p, const BlobsParams& b) { - const auto& handle_impl = handle.getImpl(); - auto stream = handle_impl.getStream(); - auto cublas_handle = handle_impl.getCublasHandle(); - auto allocator = handle_impl.getDeviceAllocator(); + const auto& handle_impl = handle; + auto stream = handle_impl.get_stream(); + auto cublas_handle = handle_impl.get_cublas_handle(); + auto allocator = handle_impl.get_device_allocator(); // Make blobs will generate labels of type IdxT which has to be an integer // type. We cast it to a different output type if needed. @@ -124,7 +124,7 @@ struct Dataset { b.shuffle, D(b.center_box_min), D(b.center_box_max), b.seed); if (!std::is_same::value) { - MLCommon::LinAlg::unaryOp( + raft::linalg::unaryOp( y, tmpY, p.nrows, [] __device__(IdxT z) { return (L)z; }, stream); allocator->deallocate(tmpY, p.nrows * sizeof(IdxT), stream); } @@ -134,15 +134,15 @@ struct Dataset { * Generate random regression data. Args are the same as in make_regression. 
* Assumes that the user has already called `allocate` */ - void regression(const cumlHandle& handle, const DatasetParams& p, + void regression(const raft::handle_t& handle, const DatasetParams& p, const RegressionParams& r) { ASSERT(!isClassification(), "make_regression: is only for regression problems!"); - const auto& handle_impl = handle.getImpl(); - auto stream = handle_impl.getStream(); - auto cublas_handle = handle_impl.getCublasHandle(); - auto cusolver_handle = handle_impl.getcusolverDnHandle(); - auto allocator = handle_impl.getDeviceAllocator(); + const auto& handle_impl = handle; + auto stream = handle_impl.get_stream(); + auto cublas_handle = handle_impl.get_cublas_handle(); + auto cusolver_handle = handle_impl.get_cusolver_dn_handle(); + auto allocator = handle_impl.get_device_allocator(); D* tmpX = X; @@ -150,12 +150,11 @@ struct Dataset { tmpX = (D*)allocator->allocate(p.nrows * p.ncols * sizeof(D), stream); } MLCommon::Random::make_regression( - tmpX, y, p.nrows, p.ncols, r.n_informative, cublas_handle, - cusolver_handle, allocator, stream, (D*)nullptr, 1, D(r.bias), - r.effective_rank, D(r.tail_strength), D(r.noise), r.shuffle, r.seed); + handle, tmpX, y, p.nrows, p.ncols, r.n_informative, stream, (D*)nullptr, + 1, D(r.bias), r.effective_rank, D(r.tail_strength), D(r.noise), r.shuffle, + r.seed); if (!p.rowMajor) { - MLCommon::LinAlg::transpose(tmpX, X, p.nrows, p.ncols, cublas_handle, - stream); + raft::linalg::transpose(handle, tmpX, X, p.nrows, p.ncols, stream); allocator->deallocate(tmpX, p.nrows * p.ncols * sizeof(D), stream); } } @@ -173,7 +172,7 @@ struct Dataset { * std::vector& y, int lineNum, const DatasetParams& p);` */ template - void read_csv(const cumlHandle& handle, const std::string& csvfile, + void read_csv(const raft::handle_t& handle, const std::string& csvfile, const DatasetParams& p, Lambda readOp) { if (isClassification() && p.nclasses <= 0) { ASSERT(false, @@ -192,9 +191,9 @@ struct Dataset { counter++; } myfile.close(); - auto stream = handle.getStream(); - MLCommon::copy(X, &(_X[0]), p.nrows * p.ncols, stream); - MLCommon::copy(y, &(_y[0]), p.nrows, stream); + auto stream = handle.get_stream(); + raft::copy(X, &(_X[0]), p.nrows * p.ncols, stream); + raft::copy(y, &(_y[0]), p.nrows, stream); } private: diff --git a/cpp/bench/sg/dataset_ts.cuh b/cpp/bench/sg/dataset_ts.cuh index 686e45e697..b43029d22b 100644 --- a/cpp/bench/sg/dataset_ts.cuh +++ b/cpp/bench/sg/dataset_ts.cuh @@ -17,10 +17,11 @@ #pragma once #include -#include #include +#include -#include +#include +#include namespace ML { namespace Bench { @@ -42,25 +43,25 @@ struct TimeSeriesDataset { DataT* X; /** allocate space needed for the dataset */ - void allocate(const cumlHandle& handle, const TimeSeriesParams& p) { - auto allocator = handle.getDeviceAllocator(); - auto stream = handle.getStream(); + void allocate(const raft::handle_t& handle, const TimeSeriesParams& p) { + auto allocator = handle.get_device_allocator(); + auto stream = handle.get_stream(); X = (DataT*)allocator->allocate(p.batch_size * p.n_obs * sizeof(DataT), stream); } /** free-up the buffers */ - void deallocate(const cumlHandle& handle, const TimeSeriesParams& p) { - auto allocator = handle.getDeviceAllocator(); - auto stream = handle.getStream(); + void deallocate(const raft::handle_t& handle, const TimeSeriesParams& p) { + auto allocator = handle.get_device_allocator(); + auto stream = handle.get_stream(); allocator->deallocate(X, p.batch_size * p.n_obs * sizeof(DataT), stream); } /** generate random time series 
(normal distribution) */ - void random(const cumlHandle& handle, const TimeSeriesParams& p, DataT mu = 0, - DataT sigma = 1) { - MLCommon::Random::Rng gpu_gen(p.seed, MLCommon::Random::GenPhilox); - gpu_gen.normal(X, p.batch_size * p.n_obs, mu, sigma, handle.getStream()); + void random(const raft::handle_t& handle, const TimeSeriesParams& p, + DataT mu = 0, DataT sigma = 1) { + raft::random::Rng gpu_gen(p.seed, raft::random::GenPhilox); + gpu_gen.normal(X, p.batch_size * p.n_obs, mu, sigma, handle.get_stream()); } }; diff --git a/cpp/bench/sg/fil.cu b/cpp/bench/sg/fil.cu new file mode 100644 index 0000000000..094e735c1a --- /dev/null +++ b/cpp/bench/sg/fil.cu @@ -0,0 +1,194 @@ +/* + * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "benchmark.cuh" + +namespace ML { +namespace Bench { +namespace fil { + +struct Params { + DatasetParams data; + RegressionParams blobs; + ModelHandle model; + ML::fil::storage_type_t storage; + ML::fil::algo_t algo; + RF_params rf; + int predict_repetitions; +}; + +class FIL : public RegressionFixture { + typedef RegressionFixture Base; + + public: + FIL(const std::string& name, const Params& p) + /* + fitting to linear combinations in "y" normally yields trees that check + values of all significant columns, as well as their linear + combinations in "X". During inference, the exact threshold + values do not affect speed. The distribution of column popularity does + not affect speed barring lots of uninformative columns in succession. + Hence, this method represents real datasets well enough for both + classification and regression. + */ + : RegressionFixture(name, p.data, p.blobs), + model(p.model), + p_rest(p) {} + + static void regression_to_classification(float* y, int nrows, int nclasses, + cudaStream_t stream) { + raft::linalg::unaryOp( + y, y, nrows, + [=] __device__(float a) { + return float(lroundf(fabsf(a) * 1000. * nclasses) % nclasses); + }, + stream); + } + + protected: + void runBenchmark(::benchmark::State& state) override { + if (!params.rowMajor) { + state.SkipWithError("FIL only supports row-major inputs"); + } + if (params.nclasses > 1) { + // convert regression ranges into [0..nclasses-1] + regression_to_classification(data.y, params.nrows, params.nclasses, + stream); + } + // create model + ML::RandomForestRegressorF rf_model; + auto* mPtr = &rf_model; + mPtr->trees = nullptr; + size_t train_nrows = std::min(params.nrows, 1000); + fit(*handle, mPtr, data.X, train_nrows, params.ncols, data.y, p_rest.rf); + CUDA_CHECK(cudaStreamSynchronize(stream)); + + ML::build_treelite_forest(&model, &rf_model, params.ncols, + params.nclasses > 1 ? 
2 : 1); + ML::fil::treelite_params_t tl_params = { + .algo = p_rest.algo, + .output_class = params.nclasses > 1, // cuML RF forest + .threshold = 1.f / params.nclasses, //Fixture::DatasetParams + .storage_type = p_rest.storage}; + ML::fil::from_treelite(*handle, &forest, model, &tl_params); + + // only time prediction + this->loopOnState(state, [this]() { + // Dataset allocates y assuming one output value per input row, + // so not supporting predict_proba yet + for (int i = 0; i < p_rest.predict_repetitions; i++) { + ML::fil::predict(*this->handle, this->forest, this->data.y, + this->data.X, this->params.nrows, false); + } + }); + } + + void allocateBuffers(const ::benchmark::State& state) override { + Base::allocateBuffers(state); + } + + void deallocateBuffers(const ::benchmark::State& state) override { + ML::fil::free(*handle, forest); + Base::deallocateBuffers(state); + } + + private: + ML::fil::forest_t forest; + ModelHandle model; + Params p_rest; +}; + +struct FilBenchParams { + int nrows; + int ncols; + int nclasses; + int max_depth; + int ntrees; + ML::fil::storage_type_t storage; + ML::fil::algo_t algo; +}; + +std::vector getInputs() { + std::vector out; + Params p; + p.data.rowMajor = true; + p.blobs = { + .n_informative = -1, // Just a placeholder value, anyway changed below + .effective_rank = -1, // Just a placeholder value, anyway changed below + .bias = 0.f, + .tail_strength = 0.1, + .noise = 0.01, + .shuffle = false, + .seed = 12345ULL}; + + set_rf_params(p.rf, // Output RF parameters + 1, // n_trees, just a placeholder value, anyway changed below + true, // bootstrap + 1.f, // rows_sample + 1234, // seed + 8); // n_streams + + set_tree_params(p.rf.tree_params, // Output tree parameters + 10, // max_depth, just a placeholder value, + // anyway changed below + (1 << 20), // max_leaves + 1, // max_features + 32, // n_bins + 1, // split_algo + 3, // min_rows_per_node + 0.0f, // min_impurity_decrease + true, // bootstrap_features + ML::CRITERION::MSE, // split_criterion + false, // quantile_per_tree + false, // use_experimental_backend + 128); // max_batch_size + + using ML::fil::algo_t; + using ML::fil::storage_type_t; + std::vector var_params = { + {(int)1e6, 20, 1, 5, 1000, storage_type_t::DENSE, algo_t::BATCH_TREE_REORG}, + {(int)1e6, 20, 2, 5, 1000, storage_type_t::DENSE, + algo_t::BATCH_TREE_REORG}}; + for (auto& i : var_params) { + p.data.nrows = i.nrows; + p.data.ncols = i.ncols; + p.blobs.n_informative = i.ncols / 3; + p.blobs.effective_rank = i.ncols / 3; + p.data.nclasses = i.nclasses; + p.rf.tree_params.max_depth = i.max_depth; + p.rf.n_trees = i.ntrees; + p.storage = i.storage; + p.algo = i.algo; + p.predict_repetitions = 10; + out.push_back(p); + } + return out; +} + +ML_BENCH_REGISTER(Params, FIL, "", getInputs()); + +} // end namespace fil +} // end namespace Bench +} // end namespace ML diff --git a/cpp/bench/sg/rf_classifier.cu b/cpp/bench/sg/rf_classifier.cu index 54859d5f15..ec1f95cba0 100644 --- a/cpp/bench/sg/rf_classifier.cu +++ b/cpp/bench/sg/rf_classifier.cu @@ -77,22 +77,34 @@ std::vector getInputs() { std::vector out; Params p; p.data.rowMajor = false; - p.blobs.cluster_std = 10.0; - p.blobs.shuffle = false; - p.blobs.center_box_min = -10.0; - p.blobs.center_box_max = 10.0; - p.blobs.seed = 12345ULL; - p.rf.bootstrap = true; - p.rf.rows_sample = 1.f; - p.rf.tree_params.max_leaves = 1 << 20; - p.rf.tree_params.min_rows_per_node = 3; - p.rf.tree_params.n_bins = 32; - p.rf.tree_params.bootstrap_features = true; - p.rf.tree_params.quantile_per_tree = 
false; - p.rf.tree_params.split_algo = 1; - p.rf.tree_params.split_criterion = (ML::CRITERION)0; - p.rf.n_trees = 500; - p.rf.n_streams = 8; + p.blobs = {10.0, // cluster_std + false, // shuffle + -10.0, // center_box_min + 10.0, // center_box_max + 2152953ULL}; //seed + + set_rf_params(p.rf, // Output RF parameters + 500, // n_trees + true, // bootstrap + 1.f, // rows_sample + 1234, // seed + 8); // n_streams + + set_tree_params(p.rf.tree_params, // Output tree parameters + 10, // max_depth, this is anyway changed below + (1 << 20), // max_leaves + 0.3, // max_features, just a placeholder value, + // anyway changed below + 32, // n_bins + 1, // split_algo + 3, // min_rows_per_node + 0.0f, // min_impurity_decrease + true, // bootstrap_features + ML::CRITERION::GINI, // split_criterion + false, // quantile_per_tree + false, // use_experimental_backend + 128); // max_batch_size + std::vector rowcols = { {160000, 64, 2}, {640000, 64, 8}, @@ -105,7 +117,7 @@ std::vector getInputs() { p.data.ncols = rc.ncols; p.data.nclasses = rc.nclasses; p.rf.tree_params.max_features = 1.f / std::sqrt(float(rc.ncols)); - for (auto max_depth : std::vector({8, 10})) { + for (auto max_depth : std::vector({7, 9})) { p.rf.tree_params.max_depth = max_depth; out.push_back(p); } diff --git a/cpp/bench/sg/rf_regressor.cu b/cpp/bench/sg/rf_regressor.cu index 8235ce382c..1ed292a089 100644 --- a/cpp/bench/sg/rf_regressor.cu +++ b/cpp/bench/sg/rf_regressor.cu @@ -77,23 +77,37 @@ std::vector getInputs() { struct std::vector out; RegParams p; p.data.rowMajor = false; - p.regression.shuffle = true; // better to shuffle when n_informative < ncols - p.regression.seed = 12345ULL; - p.regression.effective_rank = -1; // dataset generation will be faster - p.regression.bias = 4.5; - p.regression.tail_strength = 0.5; // unused when effective_rank = -1 - p.regression.noise = 1.; - p.rf.bootstrap = true; - p.rf.rows_sample = 1.f; - p.rf.tree_params.max_leaves = 1 << 20; - p.rf.tree_params.min_rows_per_node = 3; - p.rf.tree_params.n_bins = 32; - p.rf.tree_params.bootstrap_features = true; - p.rf.tree_params.quantile_per_tree = false; - p.rf.tree_params.split_algo = 1; - p.rf.tree_params.split_criterion = ML::CRITERION::MSE; - p.rf.n_trees = 500; - p.rf.n_streams = 8; + p.regression = { + .shuffle = true, // Better to shuffle when n_informative < ncols + .effective_rank = -1, // dataset generation will be faster + .bias = 4.5, + .tail_strength = 0.5, // unused when effective_rank = -1 + .noise = 1.0, + .seed = 12345ULL}; + + set_rf_params(p.rf, // Output RF parameters + 500, // n_trees + true, // bootstrap + 1.f, // rows_sample + 1234, // seed + 8); // n_streams + + set_tree_params(p.rf.tree_params, // Output tree parameters + 10, // max_depth, just a place holder value, + // anyway changed below + (1 << 20), // max_leaves + 0.3, // max_features, just a place holder value, + // anyway changed below + 32, // n_bins + 1, // split_algo + 3, // min_rows_per_node + 0.0f, // min_impurity_decrease + true, // bootstrap_features + ML::CRITERION::MSE, // split_criterion + false, // quantile_per_tree + false, // use_experimental_backend + 128); // max_batch_size + std::vector dim_info = {{500000, 500, 400}}; for (auto& di : dim_info) { // Let's run Bosch only for float type @@ -102,7 +116,7 @@ std::vector getInputs() { p.data.ncols = di.ncols; p.regression.n_informative = di.n_informative; p.rf.tree_params.max_features = 1.f; - for (auto max_depth : std::vector({8, 12, 16})) { + for (auto max_depth : std::vector({7, 11, 15})) { 
p.rf.tree_params.max_depth = max_depth; out.push_back(p); } diff --git a/cpp/bench/sg/umap.cu b/cpp/bench/sg/umap.cu index e4395b9268..d7ddb31552 100644 --- a/cpp/bench/sg/umap.cu +++ b/cpp/bench/sg/umap.cu @@ -14,9 +14,9 @@ * limitations under the License. */ -#include #include #include +#include #include #include "benchmark.cuh" @@ -40,7 +40,7 @@ __global__ void castKernel(OutT* out, const InT* in, IdxT len) { template void cast(OutT* out, const InT* in, IdxT len, cudaStream_t stream) { static const int TPB = 256; - auto nblks = MLCommon::ceildiv(len, TPB); + auto nblks = raft::ceildiv(len, TPB); castKernel<<>>(out, in, len); CUDA_CHECK(cudaGetLastError()); } diff --git a/cpp/cmake/Dependencies.cmake b/cpp/cmake/Dependencies.cmake index 6fcbf742c6..ffa014641b 100644 --- a/cpp/cmake/Dependencies.cmake +++ b/cpp/cmake/Dependencies.cmake @@ -39,7 +39,7 @@ else(DEFINED ENV{RAFT_PATH}) ExternalProject_Add(raft GIT_REPOSITORY https://github.com/rapidsai/raft.git - GIT_TAG b6ef2a825bfcd47aa46d634a46049da791b43fa0 + GIT_TAG 9b3afe67895fbea397fb2c72375157aadfc132d8 PREFIX ${RAFT_DIR} CONFIGURE_COMMAND "" BUILD_COMMAND "" @@ -53,7 +53,7 @@ endif(DEFINED ENV{RAFT_PATH}) ############################################################################## # - cumlprims (binary dependency) -------------------------------------------- -if(NOT DISABLE_CUMLPRIMS_MG) +if(ENABLE_CUMLPRIMS_MG) if(DEFINED ENV{CUMLPRIMS_MG_PATH}) set(CUMLPRIMS_MG_PATH ENV{CUMLPRIMS_MG_PATH}}) @@ -74,31 +74,47 @@ if(NOT DISABLE_CUMLPRIMS_MG) endif(EXISTS "${CUMLPRIMS_MG_PATH}/lib/libcumlprims.so") endif(NOT CUMLPRIMS_MG_PATH) -endif(NOT DISABLE_CUMLPRIMS_MG) +endif(ENABLE_CUMLPRIMS_MG) ############################################################################## # - RMM ---------------------------------------------------------------------- -# find package module uses RMM_INSTALL_DIR for Hints, checking RMM_ROOT env variable -# to match other RAPIDS repos. 
-set(RMM_INSTALL_DIR ENV{RMM_ROOT}) +find_path(RMM_INCLUDE_DIRS "rmm" + HINTS + "$ENV{RMM_ROOT}/include" + "$ENV{CONDA_PREFIX}/include/rmm" + "$ENV{CONDA_PREFIX}/include") -find_package(RMM - REQUIRED) +message(STATUS "RMM: RMM_INCLUDE_DIRS set to ${RMM_INCLUDE_DIRS}") +############################################################################## +# - NCCL --------------------------------------------------------------------- + +if(BUILD_CUML_MPI_COMMS OR BUILD_CUML_STD_COMMS) + find_package(NCCL REQUIRED) +endif(BUILD_CUML_MPI_COMMS OR BUILD_CUML_STD_COMMS) + +############################################################################## +# - MPI --------------------------------------------------------------------- + +if(BUILD_CUML_MPI_COMMS) + find_package(MPI REQUIRED) +endif(BUILD_CUML_MPI_COMMS) ############################################################################## # - cub - (header only) ------------------------------------------------------ -set(CUB_DIR ${CMAKE_CURRENT_BINARY_DIR}/cub CACHE STRING "Path to cub repo") -ExternalProject_Add(cub - GIT_REPOSITORY https://github.com/thrust/cub.git - GIT_TAG 1.8.0 - PREFIX ${CUB_DIR} - CONFIGURE_COMMAND "" - BUILD_COMMAND "" - INSTALL_COMMAND "") +if(NOT CUB_IS_PART_OF_CTK) + set(CUB_DIR ${CMAKE_CURRENT_BINARY_DIR}/cub CACHE STRING "Path to cub repo") + ExternalProject_Add(cub + GIT_REPOSITORY https://github.com/thrust/cub.git + GIT_TAG 1.8.0 + PREFIX ${CUB_DIR} + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + INSTALL_COMMAND "") +endif(NOT CUB_IS_PART_OF_CTK) ############################################################################## # - cutlass - (header only) -------------------------------------------------- @@ -120,7 +136,7 @@ set(SPDLOG_DIR ${CMAKE_CURRENT_BINARY_DIR}/spdlog CACHE STRING "Path to spdlog install directory") ExternalProject_Add(spdlog GIT_REPOSITORY https://github.com/gabime/spdlog.git - GIT_TAG v1.x + GIT_TAG v1.7.0 PREFIX ${SPDLOG_DIR} CONFIGURE_COMMAND "" BUILD_COMMAND "" @@ -129,69 +145,85 @@ ExternalProject_Add(spdlog ############################################################################## # - faiss -------------------------------------------------------------------- -set(FAISS_DIR ${CMAKE_CURRENT_BINARY_DIR}/faiss CACHE STRING - "Path to FAISS source directory") -ExternalProject_Add(faiss - GIT_REPOSITORY https://github.com/facebookresearch/faiss.git - GIT_TAG v1.6.2 - CONFIGURE_COMMAND LIBS=-pthread - CPPFLAGS=-w - LDFLAGS=-L${CMAKE_INSTALL_PREFIX}/lib - ${CMAKE_CURRENT_BINARY_DIR}/faiss/src/faiss/configure - --prefix=${CMAKE_CURRENT_BINARY_DIR}/faiss - --with-blas=${BLAS_LIBRARIES} - --with-cuda=${CUDA_TOOLKIT_ROOT_DIR} - --with-cuda-arch=${FAISS_GPU_ARCHS} - -v - PREFIX ${FAISS_DIR} - BUILD_COMMAND make -j${PARALLEL_LEVEL} VERBOSE=1 - BUILD_BYPRODUCTS ${FAISS_DIR}/lib/libfaiss.a - INSTALL_COMMAND make -s install > /dev/null - UPDATE_COMMAND "" - BUILD_IN_SOURCE 1) - -ExternalProject_Get_Property(faiss install_dir) - -add_library(faisslib STATIC IMPORTED) - -set_property(TARGET faisslib PROPERTY - IMPORTED_LOCATION ${FAISS_DIR}/lib/libfaiss.a) +if(BUILD_STATIC_FAISS) + set(FAISS_DIR ${CMAKE_CURRENT_BINARY_DIR}/faiss CACHE STRING + "Path to FAISS source directory") + ExternalProject_Add(faiss + GIT_REPOSITORY https://github.com/facebookresearch/faiss.git + GIT_TAG a5b850dec6f1cd6c88ab467bfd5e87b0cac2e41d + CONFIGURE_COMMAND LIBS=-pthread + CPPFLAGS=-w + LDFLAGS=-L${CMAKE_INSTALL_PREFIX}/lib + ${CMAKE_CURRENT_BINARY_DIR}/faiss/src/faiss/configure + --prefix=${CMAKE_CURRENT_BINARY_DIR}/faiss + 
--with-blas=${BLAS_LIBRARIES} + --with-cuda=${CUDA_TOOLKIT_ROOT_DIR} + --with-cuda-arch=${FAISS_GPU_ARCHS} + -v + PREFIX ${FAISS_DIR} + BUILD_COMMAND make -j${PARALLEL_LEVEL} VERBOSE=1 + BUILD_BYPRODUCTS ${FAISS_DIR}/lib/libfaiss.a + BUILD_ALWAYS 1 + INSTALL_COMMAND make -s install > /dev/null + UPDATE_COMMAND "" + BUILD_IN_SOURCE 1 + PATCH_COMMAND patch -p1 -N < ${CMAKE_CURRENT_SOURCE_DIR}/cmake/faiss_cuda11.patch || true) + + ExternalProject_Get_Property(faiss install_dir) + add_library(FAISS::FAISS STATIC IMPORTED) + set_property(TARGET FAISS::FAISS PROPERTY + IMPORTED_LOCATION ${FAISS_DIR}/lib/libfaiss.a) + # to account for the FAISS file reorg that happened recently after the current + # pinned commit, just change the following line to + # set(FAISS_INCLUDE_DIRS "${FAISS_DIR}/src/faiss") + set(FAISS_INCLUDE_DIRS "${FAISS_DIR}/src") +else() + set(FAISS_INSTALL_DIR ENV{FAISS_ROOT}) + find_package(FAISS REQUIRED) +endif(BUILD_STATIC_FAISS) ############################################################################## # - treelite build ----------------------------------------------------------- -find_package(Treelite 0.92 REQUIRED) +find_package(Treelite 0.93 REQUIRED) ############################################################################## -# - googletest --------------------------------------------------------------- - -set(GTEST_DIR ${CMAKE_CURRENT_BINARY_DIR}/googletest CACHE STRING - "Path to googletest repo") -set(GTEST_BINARY_DIR ${PROJECT_BINARY_DIR}/googletest) -set(GTEST_INSTALL_DIR ${GTEST_BINARY_DIR}/install) -set(GTEST_LIB ${GTEST_INSTALL_DIR}/lib/libgtest_main.a) -include(ExternalProject) -ExternalProject_Add(googletest - GIT_REPOSITORY https://github.com/google/googletest.git - GIT_TAG 6ce9b98f541b8bcd84c5c5b3483f29a933c4aefb - PREFIX ${GTEST_DIR} - CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=<INSTALL_DIR> - -DBUILD_SHARED_LIBS=OFF - -DCMAKE_INSTALL_LIBDIR=lib - BUILD_BYPRODUCTS ${GTEST_DIR}/lib/libgtest.a - ${GTEST_DIR}/lib/libgtest_main.a - UPDATE_COMMAND "") - -add_library(gtestlib STATIC IMPORTED) -add_library(gtest_mainlib STATIC IMPORTED) - -set_property(TARGET gtestlib PROPERTY - IMPORTED_LOCATION ${GTEST_DIR}/lib/libgtest.a) -set_property(TARGET gtest_mainlib PROPERTY - IMPORTED_LOCATION ${GTEST_DIR}/lib/libgtest_main.a) - -add_dependencies(gtestlib googletest) -add_dependencies(gtest_mainlib googletest) +# - googletest build ----------------------------------------------------------- + +if(BUILD_GTEST) + set(GTEST_DIR ${CMAKE_CURRENT_BINARY_DIR}/googletest CACHE STRING + "Path to googletest repo") + set(GTEST_BINARY_DIR ${PROJECT_BINARY_DIR}/googletest) + set(GTEST_INSTALL_DIR ${GTEST_BINARY_DIR}/install) + set(GTEST_LIB ${GTEST_INSTALL_DIR}/lib/libgtest_main.a) + include(ExternalProject) + ExternalProject_Add(googletest + GIT_REPOSITORY https://github.com/google/googletest.git + GIT_TAG 6ce9b98f541b8bcd84c5c5b3483f29a933c4aefb + PREFIX ${GTEST_DIR} + CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=<INSTALL_DIR> + -DBUILD_SHARED_LIBS=OFF + -DCMAKE_INSTALL_LIBDIR=lib + BUILD_BYPRODUCTS ${GTEST_DIR}/lib/libgtest.a + ${GTEST_DIR}/lib/libgtest_main.a + UPDATE_COMMAND "") + + add_library(GTest::GTest STATIC IMPORTED) + add_library(GTest::Main STATIC IMPORTED) + + set_property(TARGET GTest::GTest PROPERTY + IMPORTED_LOCATION ${GTEST_DIR}/lib/libgtest.a) + set_property(TARGET GTest::Main PROPERTY + IMPORTED_LOCATION ${GTEST_DIR}/lib/libgtest_main.a) + + set(GTEST_INCLUDE_DIRS "${GTEST_DIR}") + + add_dependencies(GTest::GTest googletest) + add_dependencies(GTest::Main googletest) + +else() + find_package(GTest 
REQUIRED) +endif(BUILD_GTEST) ############################################################################## # - googlebench --------------------------------------------------------------- @@ -225,10 +257,14 @@ set_property(TARGET benchmarklib PROPERTY # TODO: Change to using build.sh and make targets instead of this -add_dependencies(cub raft) -add_dependencies(cutlass cub) +if(CUB_IS_PART_OF_CTK) + add_dependencies(cutlass raft) +else() + add_dependencies(cub raft) + add_dependencies(cutlass cub) +endif(CUB_IS_PART_OF_CTK) add_dependencies(spdlog cutlass) -add_dependencies(googletest spdlog) -add_dependencies(benchmark googletest) -add_dependencies(faiss benchmark) -add_dependencies(faisslib faiss) +add_dependencies(GTest::GTest spdlog) +add_dependencies(benchmark GTest::GTest) +add_dependencies(FAISS::FAISS benchmark) +add_dependencies(FAISS::FAISS faiss) diff --git a/cpp/cmake/doxygen.cmake b/cpp/cmake/doxygen.cmake index b27cb39290..07b2d1488a 100644 --- a/cpp/cmake/doxygen.cmake +++ b/cpp/cmake/doxygen.cmake @@ -1,4 +1,4 @@ -# Copyright (c) 2019, NVIDIA CORPORATION. +# Copyright (c) 2019-2020, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,7 +13,7 @@ # limitations under the License. # -find_package(Doxygen 1.8.11) +find_package(Doxygen 1.8.12 REQUIRED) function(add_doxygen_target) if(Doxygen_FOUND) diff --git a/cpp/cmake/faiss_cuda11.patch b/cpp/cmake/faiss_cuda11.patch new file mode 100644 index 0000000000..496ca0e7b2 --- /dev/null +++ b/cpp/cmake/faiss_cuda11.patch @@ -0,0 +1,40 @@ +diff --git a/configure b/configure +index ed40dae..f88ed0a 100755 +--- a/configure ++++ b/configure +@@ -2970,7 +2970,7 @@ ac_link='$CXX -o conftest$ac_exeext $CXXFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ex + ac_compiler_gnu=$ac_cv_cxx_compiler_gnu + + +- ax_cxx_compile_alternatives="11 0x" ax_cxx_compile_cxx11_required=true ++ ax_cxx_compile_alternatives="14 11 0x" ax_cxx_compile_cxx11_required=true + ac_ext=cpp + ac_cpp='$CXXCPP $CPPFLAGS' + ac_compile='$CXX -c $CXXFLAGS $CPPFLAGS conftest.$ac_ext >&5' +diff --git a/gpu/utils/DeviceDefs.cuh b/gpu/utils/DeviceDefs.cuh +index 89d3dda..bc0f9b5 100644 +--- a/gpu/utils/DeviceDefs.cuh ++++ b/gpu/utils/DeviceDefs.cuh +@@ -13,7 +13,7 @@ + namespace faiss { namespace gpu { + + #ifdef __CUDA_ARCH__ +-#if __CUDA_ARCH__ <= 750 ++#if __CUDA_ARCH__ <= 800 + constexpr int kWarpSize = 32; + #else + #error Unknown __CUDA_ARCH__; please define parameters for compute capability +diff --git a/gpu/utils/MatrixMult-inl.cuh b/gpu/utils/MatrixMult-inl.cuh +index ede225e..4f7eb44 100644 +--- a/gpu/utils/MatrixMult-inl.cuh ++++ b/gpu/utils/MatrixMult-inl.cuh +@@ -51,6 +51,9 @@ rawGemm(cublasHandle_t handle, + auto cBT = GetCudaType::Type; + + // Always accumulate in f32 ++# if __CUDACC_VER_MAJOR__ >= 11 ++ cublasSetMathMode(handle, CUBLAS_MATH_DISALLOW_REDUCED_PRECISION_REDUCTION); ++# endif + return cublasSgemmEx(handle, transa, transb, m, n, k, + &fAlpha, A, cAT, lda, + B, cBT, ldb, diff --git a/cpp/comms/mpi/CMakeLists.txt b/cpp/comms/mpi/CMakeLists.txt deleted file mode 100644 index 5f6713b709..0000000000 --- a/cpp/comms/mpi/CMakeLists.txt +++ /dev/null @@ -1,53 +0,0 @@ -# -# Copyright (c) 2019-2020, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -cmake_minimum_required(VERSION 3.14 FATAL_ERROR) -project(cuML-comms-MPI LANGUAGES CXX CUDA) - -find_package(MPI REQUIRED) - -if(NOT NCCL_PATH) - find_package(NCCL REQUIRED) -else() - set(NCCL_INCLUDE_DIRS ${NCCL_PATH}/include) - set(NCCL_LIBRARIES ${NCCL_PATH}/lib/libnccl.so) - set(NCCL_FOUND ON) -endif(NOT NCCL_PATH) - -set(CMAKE_CXX_STANDARD 14) -set(CMAKE_CXX_STANDARD_REQUIRED ON) - -include_directories(include - ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES} - ${MPI_CXX_INCLUDE_PATH} - ../../include - ../../src - ../../src_prims -) - -set(MPI_COMMS_LINK_LIBRARIES ${CUML_CPP_TARGET} ${MPI_C_LIBRARIES}) - -if (NCCL_FOUND) - add_definitions(-DHAVE_NCCL) - include_directories( ${NCCL_INCLUDE_DIRS} ) - list(APPEND MPI_COMMS_LINK_LIBRARIES ${NCCL_LIBRARIES}) -endif() - -add_library(cumlcommsmpi SHARED src/cuML_comms_mpi_impl.cpp) -target_link_libraries(cumlcommsmpi ${MPI_COMMS_LINK_LIBRARIES}) -target_compile_options(cumlcommsmpi PUBLIC ${MPI_C_COMPILE_FLAGS}) - -install(TARGETS cumlcommsmpi DESTINATION lib) diff --git a/cpp/comms/mpi/include/cuML_comms.hpp b/cpp/comms/mpi/include/cuML_comms.hpp deleted file mode 100644 index 1df2c3f27d..0000000000 --- a/cpp/comms/mpi/include/cuML_comms.hpp +++ /dev/null @@ -1,26 +0,0 @@ -/* - * Copyright (c) 2019, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include - -namespace ML { - -void initialize_mpi_comms(cumlHandle& handle, MPI_Comm comm); - -} // end namespace ML diff --git a/cpp/comms/mpi/src/cuML_comms_mpi_impl.cpp b/cpp/comms/mpi/src/cuML_comms_mpi_impl.cpp deleted file mode 100644 index ad2b5ab98a..0000000000 --- a/cpp/comms/mpi/src/cuML_comms_mpi_impl.cpp +++ /dev/null @@ -1,413 +0,0 @@ -/* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "cuML_comms_mpi_impl.hpp" - -#include -#include - -#include -#include -#include - -#include - -#define MPI_CHECK(call) \ - do { \ - int status = call; \ - if (MPI_SUCCESS != status) { \ - int mpi_error_string_lenght = 0; \ - char mpi_error_string[MPI_MAX_ERROR_STRING]; \ - MPI_Error_string(status, mpi_error_string, &mpi_error_string_lenght); \ - ASSERT(MPI_SUCCESS == status, "ERROR: MPI call='%s'. Reason:%s\n", \ - #call, mpi_error_string); \ - } \ - } while (0) - -#define MPI_CHECK_NO_THROW(call) \ - do { \ - int status = call; \ - if (MPI_SUCCESS != status) { \ - int mpi_error_string_lenght = 0; \ - char mpi_error_string[MPI_MAX_ERROR_STRING]; \ - MPI_Error_string(status, mpi_error_string, &mpi_error_string_lenght); \ - CUML_LOG_ERROR("MPI call='%s' at file=%s line=%d failed with %s ", \ - #call, __FILE__, __LINE__, mpi_error_string); \ - } \ - } while (0) - -#define NCCL_CHECK(call) \ - do { \ - ncclResult_t status = call; \ - ASSERT(ncclSuccess == status, "ERROR: NCCL call='%s'. Reason:%s\n", #call, \ - ncclGetErrorString(status)); \ - } while (0) - -#define NCCL_CHECK_NO_THROW(call) \ - do { \ - ncclResult_t status = call; \ - if (status != ncclSuccess) { \ - CUML_LOG_ERROR("NCCL call='%s' failed. Reason:%s\n", #call, \ - ncclGetErrorString(status)); \ - } \ - } while (0) - -namespace ML { - -namespace { -size_t getDatatypeSize(const cumlMPICommunicator_impl::datatype_t datatype) { - switch (datatype) { - case MLCommon::cumlCommunicator::CHAR: - return sizeof(char); - case MLCommon::cumlCommunicator::UINT8: - return sizeof(unsigned char); - case MLCommon::cumlCommunicator::INT: - return sizeof(int); - case MLCommon::cumlCommunicator::UINT: - return sizeof(unsigned int); - case MLCommon::cumlCommunicator::INT64: - return sizeof(long long int); - case MLCommon::cumlCommunicator::UINT64: - return sizeof(unsigned long long int); - case MLCommon::cumlCommunicator::FLOAT: - return sizeof(float); - case MLCommon::cumlCommunicator::DOUBLE: - return sizeof(double); - default: - // Execution should never reach here. This takes care of compiler warning. - return 0; - } -} - -MPI_Datatype getMPIDatatype( - const cumlMPICommunicator_impl::datatype_t datatype) { - switch (datatype) { - case MLCommon::cumlCommunicator::CHAR: - return MPI_CHAR; - case MLCommon::cumlCommunicator::UINT8: - return MPI_UNSIGNED_CHAR; - case MLCommon::cumlCommunicator::INT: - return MPI_INT; - case MLCommon::cumlCommunicator::UINT: - return MPI_UNSIGNED; - case MLCommon::cumlCommunicator::INT64: - return MPI_LONG_LONG; - case MLCommon::cumlCommunicator::UINT64: - return MPI_UNSIGNED_LONG_LONG; - case MLCommon::cumlCommunicator::FLOAT: - return MPI_FLOAT; - case MLCommon::cumlCommunicator::DOUBLE: - return MPI_DOUBLE; - default: - // Execution should never reach here. This takes care of compiler warning. - return MPI_DOUBLE; - } -} - -MPI_Op getMPIOp(const cumlMPICommunicator_impl::op_t op) { - switch (op) { - case MLCommon::cumlCommunicator::SUM: - return MPI_SUM; - case MLCommon::cumlCommunicator::PROD: - return MPI_PROD; - case MLCommon::cumlCommunicator::MIN: - return MPI_MIN; - case MLCommon::cumlCommunicator::MAX: - return MPI_MAX; - default: - // Execution should never reach here. This takes care of compiler warning. 
- return MPI_MAX; - } -} - -#ifdef HAVE_NCCL -ncclDataType_t getNCCLDatatype( - const cumlMPICommunicator_impl::datatype_t datatype) { - switch (datatype) { - case MLCommon::cumlCommunicator::CHAR: - return ncclChar; - case MLCommon::cumlCommunicator::UINT8: - return ncclUint8; - case MLCommon::cumlCommunicator::INT: - return ncclInt; - case MLCommon::cumlCommunicator::UINT: - return ncclUint32; - case MLCommon::cumlCommunicator::INT64: - return ncclInt64; - case MLCommon::cumlCommunicator::UINT64: - return ncclUint64; - case MLCommon::cumlCommunicator::FLOAT: - return ncclFloat; - case MLCommon::cumlCommunicator::DOUBLE: - return ncclDouble; - default: - // Execution should never reach here. This takes care of compiler warning. - return ncclDouble; - } -} - -ncclRedOp_t getNCCLOp(const cumlMPICommunicator_impl::op_t op) { - switch (op) { - case MLCommon::cumlCommunicator::SUM: - return ncclSum; - case MLCommon::cumlCommunicator::PROD: - return ncclProd; - case MLCommon::cumlCommunicator::MIN: - return ncclMin; - case MLCommon::cumlCommunicator::MAX: - return ncclMax; - default: - // Execution should never reach here. This takes care of compiler warning. - return ncclMax; - } -} -#endif -} // namespace - -void initialize_mpi_comms(cumlHandle& handle, MPI_Comm comm) { - auto communicator = std::make_shared( - std::unique_ptr( - new cumlMPICommunicator_impl(comm))); - handle.getImpl().setCommunicator(communicator); -} - -cumlMPICommunicator_impl::cumlMPICommunicator_impl(MPI_Comm comm, - const bool owns_mpi_comm) - : _owns_mpi_comm(owns_mpi_comm), - _mpi_comm(comm), - _size(0), - _rank(1), - _next_request_id(0) { - int mpi_is_initialized = 0; - MPI_CHECK(MPI_Initialized(&mpi_is_initialized)); - ASSERT(mpi_is_initialized, "ERROR: MPI is not initialized!"); - MPI_CHECK(MPI_Comm_size(_mpi_comm, &_size)); - MPI_CHECK(MPI_Comm_rank(_mpi_comm, &_rank)); -#ifdef HAVE_NCCL - //get NCCL unique ID at rank 0 and broadcast it to all others - ncclUniqueId id; - if (0 == _rank) NCCL_CHECK(ncclGetUniqueId(&id)); - MPI_CHECK(MPI_Bcast((void*)&id, sizeof(id), MPI_BYTE, 0, _mpi_comm)); - - //initializing NCCL - NCCL_CHECK(ncclCommInitRank(&_nccl_comm, _size, id, _rank)); -#endif -} - -cumlMPICommunicator_impl::~cumlMPICommunicator_impl() { -#ifdef HAVE_NCCL - //finalizing NCCL - NCCL_CHECK_NO_THROW(ncclCommDestroy(_nccl_comm)); -#endif - if (_owns_mpi_comm) { - MPI_CHECK_NO_THROW(MPI_Comm_free(&_mpi_comm)); - } -} - -int cumlMPICommunicator_impl::getSize() const { return _size; } - -int cumlMPICommunicator_impl::getRank() const { return _rank; } - -std::unique_ptr -cumlMPICommunicator_impl::commSplit(int color, int key) const { - MPI_Comm new_comm; - MPI_CHECK(MPI_Comm_split(_mpi_comm, color, key, &new_comm)); - return std::unique_ptr( - new cumlMPICommunicator_impl(new_comm, true)); -} - -void cumlMPICommunicator_impl::barrier() const { - MPI_CHECK(MPI_Barrier(_mpi_comm)); -} - -void cumlMPICommunicator_impl::isend(const void* buf, int size, int dest, - int tag, request_t* request) const { - MPI_Request mpi_req; - request_t req_id; - if (_free_requests.empty()) { - req_id = _next_request_id++; - } else { - auto it = _free_requests.begin(); - req_id = *it; - _free_requests.erase(it); - } - MPI_CHECK(MPI_Isend(buf, size, MPI_BYTE, dest, tag, _mpi_comm, &mpi_req)); - _requests_in_flight.insert(std::make_pair(req_id, mpi_req)); - *request = req_id; -} - -void cumlMPICommunicator_impl::irecv(void* buf, int size, int source, int tag, - request_t* request) const { - if (source == CUML_ANY_SOURCE) source = 
MPI_ANY_SOURCE; - - MPI_Request mpi_req; - request_t req_id; - if (_free_requests.empty()) { - req_id = _next_request_id++; - } else { - auto it = _free_requests.begin(); - req_id = *it; - _free_requests.erase(it); - } - - MPI_CHECK(MPI_Irecv(buf, size, MPI_BYTE, source, tag, _mpi_comm, &mpi_req)); - _requests_in_flight.insert(std::make_pair(req_id, mpi_req)); - *request = req_id; -} - -void cumlMPICommunicator_impl::waitall(int count, - request_t array_of_requests[]) const { - std::vector requests; - requests.reserve(count); - for (int i = 0; i < count; ++i) { - auto req_it = _requests_in_flight.find(array_of_requests[i]); - ASSERT(_requests_in_flight.end() != req_it, - "ERROR: waitall on invalid request: %d", array_of_requests[i]); - requests.push_back(req_it->second); - _free_requests.insert(req_it->first); - _requests_in_flight.erase(req_it); - } - MPI_CHECK(MPI_Waitall(requests.size(), requests.data(), MPI_STATUSES_IGNORE)); -} - -void cumlMPICommunicator_impl::allreduce(const void* sendbuff, void* recvbuff, - int count, datatype_t datatype, - op_t op, cudaStream_t stream) const { -#ifdef HAVE_NCCL - NCCL_CHECK(ncclAllReduce(sendbuff, recvbuff, count, getNCCLDatatype(datatype), - getNCCLOp(op), _nccl_comm, stream)); -#else - CUDA_CHECK(cudaStreamSynchronize(stream)); - MPI_CHECK(MPI_Allreduce(sendbuff, recvbuff, count, getMPIDatatype(datatype), - getMPIOp(op), _mpi_comm)); -#endif -} - -void cumlMPICommunicator_impl::bcast(void* buff, int count, datatype_t datatype, - int root, cudaStream_t stream) const { -#ifdef HAVE_NCCL - NCCL_CHECK(ncclBroadcast(buff, buff, count, getNCCLDatatype(datatype), root, - _nccl_comm, stream)); -#else - CUDA_CHECK(cudaStreamSynchronize(stream)); - MPI_CHECK(MPI_Bcast(buff, count, getMPIDatatype(datatype), root, _mpi_comm)); -#endif -} - -void cumlMPICommunicator_impl::reduce(const void* sendbuff, void* recvbuff, - int count, datatype_t datatype, op_t op, - int root, cudaStream_t stream) const { -#ifdef HAVE_NCCL - NCCL_CHECK(ncclReduce(sendbuff, recvbuff, count, getNCCLDatatype(datatype), - getNCCLOp(op), root, _nccl_comm, stream)); -#else - CUDA_CHECK(cudaStreamSynchronize(stream)); - MPI_CHECK(MPI_Reduce(sendbuff, recvbuff, count, getMPIDatatype(datatype), - getMPIOp(op), root, _mpi_comm)); -#endif -} - -void cumlMPICommunicator_impl::allgather(const void* sendbuff, void* recvbuff, - int sendcount, datatype_t datatype, - cudaStream_t stream) const { -#ifdef HAVE_NCCL - NCCL_CHECK(ncclAllGather(sendbuff, recvbuff, sendcount, - getNCCLDatatype(datatype), _nccl_comm, stream)); -#else - CUDA_CHECK(cudaStreamSynchronize(stream)); - MPI_CHECK(MPI_Allgather(sendbuff, sendcount, getMPIDatatype(datatype), - recvbuff, sendcount, getMPIDatatype(datatype), - _mpi_comm)); -#endif -} - -void cumlMPICommunicator_impl::allgatherv(const void* sendbuf, void* recvbuf, - const int recvcounts[], - const int displs[], - datatype_t datatype, - cudaStream_t stream) const { -#ifdef HAVE_NCCL - //From: "An Empirical Evaluation of Allgatherv on Multi-GPU Systems" - https://arxiv.org/pdf/1812.05964.pdf - //Listing 1 on page 4. 
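The comment above cites the per-root broadcast trick that the ncclBroadcast loop just below relies on. A minimal sketch of the same idea in plain host-side MPI, with illustrative buffer names rather than the cuML API:

```cpp
#include <mpi.h>

#include <cstring>
#include <vector>

// Emulate MPI_Allgatherv with one broadcast per root: every rank first
// places its own contribution at its displacement, then each rank in turn
// broadcasts its slot to all peers.
void allgatherv_by_bcast(const int* sendbuf, int* recvbuf,
                         const std::vector<int>& recvcounts,
                         const std::vector<int>& displs, MPI_Comm comm) {
  int rank, size;
  MPI_Comm_rank(comm, &rank);
  MPI_Comm_size(comm, &size);
  // MPI_Bcast reads and writes the same buffer, so copy our slot in first.
  std::memcpy(recvbuf + displs[rank], sendbuf,
              recvcounts[rank] * sizeof(int));
  for (int root = 0; root < size; ++root) {
    MPI_Bcast(recvbuf + displs[root], recvcounts[root], MPI_INT, root, comm);
  }
}
```

The NCCL variant that follows avoids the initial copy because ncclBroadcast takes separate send and receive buffers; only the root's sendbuf is read on each iteration.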
- for (int root = 0; root < _size; ++root) { - NCCL_CHECK(ncclBroadcast( - sendbuf, - static_cast(recvbuf) + displs[root] * getDatatypeSize(datatype), - recvcounts[root], getNCCLDatatype(datatype), root, _nccl_comm, stream)); - } -#else - CUDA_CHECK(cudaStreamSynchronize(stream)); - MPI_CHECK(MPI_Allgatherv(sendbuf, recvcounts[_rank], getMPIDatatype(datatype), - recvbuf, recvcounts, displs, - getMPIDatatype(datatype), _mpi_comm)); -#endif -} - -void cumlMPICommunicator_impl::reducescatter(const void* sendbuff, - void* recvbuff, int recvcount, - datatype_t datatype, op_t op, - cudaStream_t stream) const { -#ifdef HAVE_NCCL - NCCL_CHECK(ncclReduceScatter(sendbuff, recvbuff, recvcount, - getNCCLDatatype(datatype), getNCCLOp(op), - _nccl_comm, stream)); -#else - CUDA_CHECK(cudaStreamSynchronize(stream)); - std::vector recvcounts(_size, recvcount); - MPI_CHECK(MPI_Reduce_scatter(sendbuff, recvbuff, recvcounts.data(), - getMPIDatatype(datatype), getMPIOp(op), - _mpi_comm)); -#endif -} - -MLCommon::cumlCommunicator::status_t cumlMPICommunicator_impl::syncStream( - cudaStream_t stream) const { -#ifdef HAVE_NCCL - cudaError_t cudaErr; - ncclResult_t ncclErr, ncclAsyncErr; - while (1) { - cudaErr = cudaStreamQuery(stream); - if (cudaErr == cudaSuccess) return status_t::commStatusSuccess; - - if (cudaErr != cudaErrorNotReady) { - // An error occurred querying the status of the stream - return status_t::commStatusError; - } - - ncclErr = ncclCommGetAsyncError(_nccl_comm, &ncclAsyncErr); - if (ncclErr != ncclSuccess) { - // An error occurred retrieving the asynchronous error - return status_t::commStatusError; - } - - if (ncclAsyncErr != ncclSuccess) { - // An asynchronous error happened. Stop the operation and destroy - // the communicator - ncclErr = ncclCommAbort(_nccl_comm); - if (ncclErr != ncclSuccess) - // Caller may abort with an exception or try to re-create a new communicator. - return status_t::commStatusAbort; - } - - // Let other threads (including NCCL threads) use the CPU. - pthread_yield(); - } -#else - CUDA_CHECK(cudaStreamSynchronize(stream)); - return status_t::commStatusSuccess; -#endif -} -} // end namespace ML diff --git a/cpp/comms/mpi/src/cuML_comms_mpi_impl.hpp b/cpp/comms/mpi/src/cuML_comms_mpi_impl.hpp deleted file mode 100644 index 165d2df8c6..0000000000 --- a/cpp/comms/mpi/src/cuML_comms_mpi_impl.hpp +++ /dev/null @@ -1,94 +0,0 @@ -/* - * Copyright (c) 2019, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
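The isend/irecv/waitall plumbing above hands out integer request ids and recycles them through a free set, so the map of in-flight requests stays compact. A self-contained sketch of that bookkeeping pattern, with illustrative names (the real code maps ids to MPI_Request or ucp_request handles):

```cpp
#include <cassert>
#include <unordered_map>
#include <unordered_set>

struct RequestPool {
  using request_t = unsigned int;
  request_t next_id = 0;
  std::unordered_set<request_t> free_ids;
  std::unordered_map<request_t, void*> in_flight;  // id -> backend handle

  // Called by isend/irecv: reuse a freed id if possible, else mint one.
  request_t acquire(void* backend_handle) {
    request_t id;
    if (free_ids.empty()) {
      id = next_id++;
    } else {
      auto it = free_ids.begin();
      id = *it;
      free_ids.erase(it);
    }
    in_flight.emplace(id, backend_handle);
    return id;
  }

  // Called by waitall: retire the id and hand back the backend handle.
  void* release(request_t id) {
    auto it = in_flight.find(id);
    assert(it != in_flight.end() && "waitall on invalid request");
    void* handle = it->second;
    free_ids.insert(id);
    in_flight.erase(it);
    return handle;
  }
};
```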
- */ - -#pragma once - -#include -#include -#include - -#include - -#ifdef HAVE_NCCL -#include -#endif - -#include - -namespace ML { - -class cumlMPICommunicator_impl : public MLCommon::cumlCommunicator_iface { - public: - cumlMPICommunicator_impl() = delete; - - cumlMPICommunicator_impl(MPI_Comm comm, const bool owns_mpi_comm = false); - - virtual ~cumlMPICommunicator_impl(); - - virtual int getSize() const; - virtual int getRank() const; - - virtual std::unique_ptr commSplit( - int color, int key) const; - - virtual void barrier() const; - - virtual void isend(const void* buf, int size, int dest, int tag, - request_t* request) const; - - virtual void irecv(void* buf, int size, int source, int tag, - request_t* request) const; - - virtual void waitall(int count, request_t array_of_requests[]) const; - - virtual void allreduce(const void* sendbuff, void* recvbuff, int count, - datatype_t datatype, op_t op, - cudaStream_t stream) const; - - virtual void bcast(void* buff, int count, datatype_t datatype, int root, - cudaStream_t stream) const; - - virtual void reduce(const void* sendbuff, void* recvbuff, int count, - datatype_t datatype, op_t op, int root, - cudaStream_t stream) const; - - virtual void allgather(const void* sendbuff, void* recvbuff, int sendcount, - datatype_t datatype, cudaStream_t stream) const; - - virtual void allgatherv(const void* sendbuf, void* recvbuf, - const int recvcounts[], const int displs[], - datatype_t datatype, cudaStream_t stream) const; - - virtual void reducescatter(const void* sendbuff, void* recvbuff, - int recvcount, datatype_t datatype, op_t op, - cudaStream_t stream) const; - - virtual status_t syncStream(cudaStream_t stream) const; - - private: - bool _owns_mpi_comm; - MPI_Comm _mpi_comm; -#ifdef HAVE_NCCL - ncclComm_t _nccl_comm; -#endif - int _size; - int _rank; - mutable request_t _next_request_id; - mutable std::unordered_map _requests_in_flight; - mutable std::unordered_set _free_requests; -}; - -} // end namespace ML diff --git a/cpp/comms/std/CMakeLists.txt b/cpp/comms/std/CMakeLists.txt deleted file mode 100644 index 16891178c7..0000000000 --- a/cpp/comms/std/CMakeLists.txt +++ /dev/null @@ -1,60 +0,0 @@ -# -# Copyright (c) 2019-2020, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -cmake_minimum_required(VERSION 3.14 FATAL_ERROR) -project(cuML-comms LANGUAGES CXX CUDA) - -set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CUML_DIR}/cmake") - -option(WITH_UCX "Uses UCX for P2P comms" ON) - -if(NOT NCCL_PATH) - find_package(NCCL REQUIRED) -else() - message("-- Manually set NCCL PATH to ${NCCL_PATH}") - set(NCCL_INCLUDE_DIRS ${NCCL_PATH}/include) - set(NCCL_LIBRARIES ${NCCL_PATH}/lib/libnccl.so) -endif(NOT NCCL_PATH) - -set(CMAKE_CXX_STANDARD 14) -set(CMAKE_CXX_STANDARD_REQUIRED ON) - -include_directories(include - ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES} - ../../include - ../../src - ../../src_prims -) - -set(COMMS_LINK_LIBRARIES ${CUML_CPP_TARGET}) - -# Note this option will be removed once UCX conda package is released -if(WITH_UCX) - # dlopen is used to dynamically load the needed ucp symbols at runtime. - # Only the UCX include directories are needed for compiling - find_package(UCX) - include_directories(${UCX_INCLUDE_DIRS}) - add_compile_definitions(WITH_UCX=1) -endif(WITH_UCX) - -add_definitions(-DHAVE_NCCL) -include_directories( ${NCCL_INCLUDE_DIRS} ) -list(APPEND COMMS_LINK_LIBRARIES ${NCCL_LIBRARIES}) - -add_library(cumlcomms SHARED src/cuML_std_comms_impl.cpp) -target_link_libraries(cumlcomms ${COMMS_LINK_LIBRARIES}) - -install(TARGETS cumlcomms DESTINATION lib) diff --git a/cpp/comms/std/include/cuML_comms.hpp b/cpp/comms/std/include/cuML_comms.hpp deleted file mode 100644 index 61b9623f18..0000000000 --- a/cpp/comms/std/include/cuML_comms.hpp +++ /dev/null @@ -1,56 +0,0 @@ -/* - * Copyright (c) 2019, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -#ifdef WITH_UCX -#include -#endif - -#include - -namespace ML { - -#ifdef WITH_UCX -/** - * @brief Given initialized comms handles for NCCL and UCP, this function builds a - * cumlCommunicator object and injects it into the given cumlHandle instance. - * @param handle the cuml handle to inject a new communicator instance into - * @param comm initialized nccl communicator - * @param ucp_worker the ucp_worker for the current initialized ucp context - * @param eps an array of endpoints to the other ucp workers in the cluster - * @param size the size of the cluster (number of elements in eps) - * @param rank rank of the current worker - */ -void inject_comms(cumlHandle &handle, ncclComm_t comm, ucp_worker_h ucp_worker, - ucp_ep_h *eps, int size, int rank); -#endif - -/** - * @brief Given an initialized comms handle for NCCL, this function builds a - * cumlCommunicator object and injects it into the given cumlHandle instance. - * The underlying cumlCommunicator will only have support for collective - * communications functions. 
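As a usage sketch for the collective-only overload described above: NCCL is bootstrapped entirely outside cuML, and only the finished communicator is handed to the handle. The include paths and the single-rank bootstrap are assumptions for illustration, not a documented recipe:

```cpp
#include <cuML.hpp>        // assumed header for ML::cumlHandle at this version
#include <cuML_comms.hpp>  // the header being removed here
#include <nccl.h>

int main() {
  // Bootstrap a trivial single-rank NCCL clique outside of cuML.
  ncclUniqueId id;
  ncclGetUniqueId(&id);
  ncclComm_t comm;
  ncclCommInitRank(&comm, /*nranks=*/1, id, /*rank=*/0);

  // Hand the ready communicator to cuML; on this code path the handle
  // gains collective support only, no point-to-point messaging.
  ML::cumlHandle handle;
  ML::inject_comms(handle, comm, /*size=*/1, /*rank=*/0);

  // ... run multi-GPU-capable algorithms through `handle` ...
  ncclCommDestroy(comm);
  return 0;
}
```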
- * @param handle the cuml handle to inject a new communicator instance into - * @param comm initialized nccl communicator - * @param size the size of the cluster - * @param rank rank of the current worker - */ -void inject_comms(cumlHandle &handle, ncclComm_t comm, int size, int rank); - -} // end namespace ML diff --git a/cpp/comms/std/src/cuML_comms_py.hpp b/cpp/comms/std/src/cuML_comms_py.hpp deleted file mode 100644 index 5267c7cb15..0000000000 --- a/cpp/comms/std/src/cuML_comms_py.hpp +++ /dev/null @@ -1,73 +0,0 @@ -/* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -#include - -namespace ML { - -bool ucx_enabled(); - -/** - * @brief This function wraps the inject comms functions in - * cpp/comms/std/include/cuML_comms.hpp to decouple the Python - * layer from the optional UCX dependency in the C++ build. This - * allows the Cython to compile without having to propagate the `WITH_UCX` - * directive to that layer. - * @param handle the cuml handle to inject a new communicator instance into - * @param comm initialized nccl communicator - * @param ucp_worker: ucp_worker_h instance for the current initialized ucp context - * @param eps an array of ucp_ep_h endpoints to the other ucp workers in the cluster - * @param size the size of the cluster (number of elements in eps) - * @param rank rank of the current worker - */ -void inject_comms_py(cumlHandle *handle, ncclComm_t comm, void *ucp_worker, - void *eps, int size, int rank); - -/** - * @brief This function follows the design of the wrapper function in - * cpp/comms/std/include/cuML_comms.hpp to decouple the Python layer - * injection functions from the C++ layer functions. - * @param handle the cuml handle to inject a new communicator instance into - * @param comm initialized nccl communicator - * @param size the size of the cluster (number of elements in eps) - * @param rank rank of the current worker - */ -void inject_comms_py_coll(cumlHandle *handle, ncclComm_t comm, int size, - int rank); - -/** - * @brief Stores the given character array on the given ncclUniqueId struct. - * @param id the ncclUniqueId struct instance to store the given character array - * @param uniqueId the unique id char array to store on the ncclUniqueId - * @param size id size - */ -void ncclUniqueIdFromChar(ncclUniqueId *id, char *uniqueId, int size); - -/** - * @brief Returns a NCCL unique ID as a character array. PyTorch - * uses this same approach, so that it can be more easily - * converted to a native Python string by Cython and further - * serialized to be sent across process & node boundaries. - * - * @param uid nccl unique id for establishing a new clique. 
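Concretely, the uid round-trip these helpers document amounts to copying NCCL's fixed-size opaque byte blob in and out of a char array. A minimal sketch using only public NCCL definitions:

```cpp
#include <nccl.h>

#include <cstring>

int main() {
  // Generate a clique id and flatten it to bytes (what get_unique_id does)...
  ncclUniqueId id;
  ncclGetUniqueId(&id);
  char wire[NCCL_UNIQUE_ID_BYTES];
  std::memcpy(wire, id.internal, sizeof(wire));

  // ...ship `wire` across process/node boundaries as an ordinary string,
  // then restore it verbatim (what ncclUniqueIdFromChar does on arrival).
  ncclUniqueId restored;
  std::memcpy(restored.internal, wire, sizeof(wire));
  return 0;
}
```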
- * @param size uid size - */ -void get_unique_id(char *uid, int size); -} // namespace ML diff --git a/cpp/comms/std/src/cuML_std_comms_impl.cpp b/cpp/comms/std/src/cuML_std_comms_impl.cpp deleted file mode 100644 index dab51f3129..0000000000 --- a/cpp/comms/std/src/cuML_std_comms_impl.cpp +++ /dev/null @@ -1,498 +0,0 @@ -/* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "cuML_std_comms_impl.hpp" - -#include - -#ifdef WITH_UCX -constexpr bool UCX_ENABLED = true; -#else -constexpr bool UCX_ENABLED = false; -#endif - -#ifdef WITH_UCX -#include -#include -#include "ucp_helper.h" -#endif - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#include - -#include - -#include - -#define NCCL_CHECK(call) \ - do { \ - ncclResult_t status = call; \ - ASSERT(ncclSuccess == status, "ERROR: NCCL call='%s'. Reason:%s\n", #call, \ - ncclGetErrorString(status)); \ - } while (0) - -#define NCCL_CHECK_NO_THROW(call) \ - do { \ - ncclResult_t status = call; \ - if (status != ncclSuccess) { \ - CUML_LOG_ERROR("NCCL call='%s' failed. Reason:%s\n", #call, \ - ncclGetErrorString(status)); \ - } \ - } while (0) - -namespace ML { - -namespace { - -size_t getDatatypeSize(const cumlStdCommunicator_impl::datatype_t datatype) { - switch (datatype) { - case MLCommon::cumlCommunicator::CHAR: - return sizeof(char); - case MLCommon::cumlCommunicator::UINT8: - return sizeof(uint8_t); - case MLCommon::cumlCommunicator::INT: - return sizeof(int); - case MLCommon::cumlCommunicator::UINT: - return sizeof(unsigned int); - case MLCommon::cumlCommunicator::INT64: - return sizeof(int64_t); - case MLCommon::cumlCommunicator::UINT64: - return sizeof(uint64_t); - case MLCommon::cumlCommunicator::FLOAT: - return sizeof(float); - case MLCommon::cumlCommunicator::DOUBLE: - return sizeof(double); - } -} - -ncclDataType_t getNCCLDatatype( - const cumlStdCommunicator_impl::datatype_t datatype) { - switch (datatype) { - case MLCommon::cumlCommunicator::CHAR: - return ncclChar; - case MLCommon::cumlCommunicator::UINT8: - return ncclUint8; - case MLCommon::cumlCommunicator::INT: - return ncclInt; - case MLCommon::cumlCommunicator::UINT: - return ncclUint32; - case MLCommon::cumlCommunicator::INT64: - return ncclInt64; - case MLCommon::cumlCommunicator::UINT64: - return ncclUint64; - case MLCommon::cumlCommunicator::FLOAT: - return ncclFloat; - case MLCommon::cumlCommunicator::DOUBLE: - return ncclDouble; - } -} - -ncclRedOp_t getNCCLOp(const cumlStdCommunicator_impl::op_t op) { - switch (op) { - case MLCommon::cumlCommunicator::SUM: - return ncclSum; - case MLCommon::cumlCommunicator::PROD: - return ncclProd; - case MLCommon::cumlCommunicator::MIN: - return ncclMin; - case MLCommon::cumlCommunicator::MAX: - return ncclMax; - } -} -} // namespace - -bool ucx_enabled() { return UCX_ENABLED; } - -/** - * @brief Underlying comms, like NCCL and UCX, should be initialized and ready for use, - * and maintained, outside of 
the cuML Comms lifecycle. This allows us to decouple the - * ownership of the actual comms from cuml so that they can also be used outside of cuml. - * - * For instance, nccl-py can be used to bootstrap a ncclComm_t before it is - * used to construct a cuml comms instance. UCX endpoints can be bootstrapped - * in Python using ucx-py, before being used to construct a cuML comms instance. - */ -#ifdef WITH_UCX -void inject_comms(cumlHandle &handle, ncclComm_t comm, ucp_worker_h ucp_worker, - std::shared_ptr eps, int size, int rank) { - auto communicator = std::make_shared( - std::unique_ptr( - new cumlStdCommunicator_impl(comm, ucp_worker, eps, size, rank))); - handle.getImpl().setCommunicator(communicator); -} -#endif - -void inject_comms(cumlHandle &handle, ncclComm_t comm, int size, int rank) { - auto communicator = std::make_shared( - std::unique_ptr( - new cumlStdCommunicator_impl(comm, size, rank))); - handle.getImpl().setCommunicator(communicator); -} - -void inject_comms_py_coll(cumlHandle *handle, ncclComm_t comm, int size, - int rank) { - inject_comms(*handle, comm, size, rank); -} - -void inject_comms_py(ML::cumlHandle *handle, ncclComm_t comm, void *ucp_worker, - void *eps, int size, int rank) { -#ifdef WITH_UCX - std::shared_ptr eps_sp = - std::make_shared(new ucp_ep_h[size]); - - size_t *size_t_ep_arr = (size_t *)eps; - - for (int i = 0; i < size; i++) { - size_t ptr = size_t_ep_arr[i]; - ucp_ep_h *ucp_ep_v = (ucp_ep_h *)*eps_sp; - - if (ptr != 0) { - ucp_ep_h eps_ptr = (ucp_ep_h)size_t_ep_arr[i]; - ucp_ep_v[i] = eps_ptr; - } else { - ucp_ep_v[i] = nullptr; - } - } - - inject_comms(*handle, comm, (ucp_worker_h)ucp_worker, eps_sp, size, rank); -#else - inject_comms(*handle, comm, size, rank); -#endif -} - -void ncclUniqueIdFromChar(ncclUniqueId *id, char *uniqueId, int size) { - memcpy(id->internal, uniqueId, size); -} - -void get_unique_id(char *uid, int size) { - ncclUniqueId id; - ncclGetUniqueId(&id); - - memcpy(uid, id.internal, size); -} - -#ifdef WITH_UCX -cumlStdCommunicator_impl::cumlStdCommunicator_impl( - ncclComm_t comm, ucp_worker_h ucp_worker, std::shared_ptr eps, - int size, int rank) - : _nccl_comm(comm), - _ucp_worker(ucp_worker), - _ucp_eps(eps), - _size(size), - _rank(rank), - _next_request_id(0) { - initialize(); - p2p_enabled = true; -} -#endif - -cumlStdCommunicator_impl::cumlStdCommunicator_impl(ncclComm_t comm, int size, - int rank) - : _nccl_comm(comm), _size(size), _rank(rank) { - initialize(); -} - -void cumlStdCommunicator_impl::initialize() { - CUDA_CHECK(cudaStreamCreate(&_stream)); - - CUDA_CHECK(cudaMalloc(&_sendbuff, sizeof(int))); - CUDA_CHECK(cudaMalloc(&_recvbuff, sizeof(int))); -} - -cumlStdCommunicator_impl::~cumlStdCommunicator_impl() { - CUDA_CHECK_NO_THROW(cudaStreamDestroy(_stream)); - - CUDA_CHECK_NO_THROW(cudaFree(_sendbuff)); - CUDA_CHECK_NO_THROW(cudaFree(_recvbuff)); -} - -int cumlStdCommunicator_impl::getSize() const { return _size; } - -int cumlStdCommunicator_impl::getRank() const { return _rank; } - -std::unique_ptr -cumlStdCommunicator_impl::commSplit(int color, int key) const { - // Not supported by NCCL - ASSERT(false, - "ERROR: commSplit called but not yet supported in this comms " - "implementation."); -} - -void cumlStdCommunicator_impl::barrier() const { - CUDA_CHECK(cudaMemsetAsync(_sendbuff, 1, sizeof(int), _stream)); - CUDA_CHECK(cudaMemsetAsync(_recvbuff, 1, sizeof(int), _stream)); - - allreduce(_sendbuff, _recvbuff, 1, MLCommon::cumlCommunicator::INT, - MLCommon::cumlCommunicator::SUM, _stream); - - 
ASSERT(syncStream(_stream) == status_t::commStatusSuccess, - "ERROR: syncStream failed. This can be caused by a failed rank."); -} - -void cumlStdCommunicator_impl::get_request_id(request_t *req) const { -#ifdef WITH_UCX - - request_t req_id; - - if (this->_free_requests.empty()) - req_id = this->_next_request_id++; - else { - auto it = this->_free_requests.begin(); - req_id = *it; - this->_free_requests.erase(it); - } - *req = req_id; -#endif -} - -void cumlStdCommunicator_impl::isend(const void *buf, int size, int dest, - int tag, request_t *request) const { - ASSERT(UCX_ENABLED, "cuML Comms not built with UCX support"); - ASSERT(p2p_enabled, - "cuML Comms instance was not initialized for point-to-point"); - -#ifdef WITH_UCX - ASSERT(_ucp_worker != nullptr, - "ERROR: UCX comms not initialized on communicator."); - - get_request_id(request); - ucp_ep_h ep_ptr = (*_ucp_eps)[dest]; - - ucp_request *ucp_req = (ucp_request *)malloc(sizeof(ucp_request)); - - this->_ucp_handler.ucp_isend(ucp_req, ep_ptr, buf, size, tag, - default_tag_mask, getRank()); - - CUML_LOG_DEBUG( - "%d: Created send request [id=%llu], ptr=%llu, to=%llu, ep=%llu", getRank(), - (unsigned long long)*request, (unsigned long long)ucp_req->req, - (unsigned long long)dest, (unsigned long long)ep_ptr); - - _requests_in_flight.insert(std::make_pair(*request, ucp_req)); -#endif -} - -void cumlStdCommunicator_impl::irecv(void *buf, int size, int source, int tag, - request_t *request) const { - ASSERT(UCX_ENABLED, "cuML Comms not built with UCX support"); - ASSERT(p2p_enabled, - "cuML Comms instance was not initialized for point-to-point"); - -#ifdef WITH_UCX - ASSERT(_ucp_worker != nullptr, - "ERROR: UCX comms not initialized on communicator."); - - get_request_id(request); - - ucp_ep_h ep_ptr = (*_ucp_eps)[source]; - - ucp_tag_t tag_mask = default_tag_mask; - - if (source == CUML_ANY_SOURCE) { - tag_mask = any_rank_tag_mask; - } - - ucp_request *ucp_req = (ucp_request *)malloc(sizeof(ucp_request)); - _ucp_handler.ucp_irecv(ucp_req, _ucp_worker, ep_ptr, buf, size, tag, tag_mask, - source); - - CUML_LOG_DEBUG( - "%d: Created receive request [id=%llu], ptr=%llu, from=%llu, ep=%llu", - getRank(), (unsigned long long)*request, (unsigned long long)ucp_req->req, - (unsigned long long)source, (unsigned long long)ep_ptr); - - _requests_in_flight.insert(std::make_pair(*request, ucp_req)); -#endif -} - -void cumlStdCommunicator_impl::waitall(int count, - request_t array_of_requests[]) const { - ASSERT(UCX_ENABLED, "cuML Comms not built with UCX support"); - ASSERT(p2p_enabled, - "cuML Comms instance was not initialized for point-to-point"); - -#ifdef WITH_UCX - ASSERT(_ucp_worker != nullptr, - "ERROR: UCX comms not initialized on communicator."); - - std::vector requests; - requests.reserve(count); - - time_t start = time(NULL); - - for (int i = 0; i < count; ++i) { - auto req_it = _requests_in_flight.find(array_of_requests[i]); - ASSERT(_requests_in_flight.end() != req_it, - "ERROR: waitall on invalid request: %d", array_of_requests[i]); - requests.push_back(req_it->second); - _free_requests.insert(req_it->first); - _requests_in_flight.erase(req_it); - } - - while (requests.size() > 0) { - time_t now = time(NULL); - - // Timeout if we have not gotten progress or completed any requests - // in 10 or more seconds. 
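Note that the timeout in waitall() here is measured from the last observed progress, not from entry, so long transfers that keep moving are never aborted. A distilled, backend-free sketch of that reset-on-progress loop, where a boolean completion test stands in for ucp_worker_progress plus the completed flag:

```cpp
#include <chrono>
#include <functional>
#include <stdexcept>
#include <vector>

// Wait for a set of requests, aborting only if *no* request completes
// for 10 consecutive seconds; any completion resets the deadline.
void wait_all(std::vector<std::function<bool()>> pending) {
  using clock = std::chrono::steady_clock;
  auto last_progress = clock::now();
  while (!pending.empty()) {
    if (clock::now() - last_progress >= std::chrono::seconds(10)) {
      throw std::runtime_error("Timed out waiting for requests.");
    }
    for (auto it = pending.begin(); it != pending.end();) {
      if ((*it)()) {                    // poll: true means completed
        it = pending.erase(it);
        last_progress = clock::now();   // progress resets the timeout
      } else {
        ++it;
      }
    }
  }
}
```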
- ASSERT(now - start < 10, "Timed out waiting for requests."); - - for (std::vector::iterator it = requests.begin(); - it != requests.end();) { - bool restart = false; // resets the timeout when any progress was made - - // Causes UCP to progress through the send/recv message queue - while (_ucp_handler.ucp_progress(_ucp_worker) != 0) { - restart = true; - } - - auto req = *it; - - // If the message needs release, we know it will be sent/received - // asynchronously, so we will need to track and verify its state - if (req->needs_release) { - ASSERT(UCS_PTR_IS_PTR(req->req), - "UCX Request Error. Request is not valid UCX pointer"); - ASSERT(!UCS_PTR_IS_ERR(req->req), "UCX Request Error: %d\n", - UCS_PTR_STATUS(req->req)); - ASSERT(req->req->completed == 1 || req->req->completed == 0, - "request->completed not a valid value: %d\n", - req->req->completed); - } - - // If a message was sent synchronously (eg. completed before - // `isend`/`irecv` completed) or an asynchronous message - // is complete, we can go ahead and clean it up. - if (!req->needs_release || req->req->completed == 1) { - restart = true; - CUML_LOG_DEBUG( - "%d: request completed. [ptr=%llu, num_left=%lu," - " other_rank=%d, is_send=%d, completed_immediately=%d]", - getRank(), (unsigned long long)req->req, requests.size() - 1, - req->other_rank, req->is_send_request, !req->needs_release); - - // perform cleanup - _ucp_handler.free_ucp_request(req); - - // remove from pending requests - it = requests.erase(it); - } else { - ++it; - } - // if any progress was made, reset the timeout start time - if (restart) { - start = time(NULL); - } - } - } - -#endif -} - -void cumlStdCommunicator_impl::allreduce(const void *sendbuff, void *recvbuff, - int count, datatype_t datatype, - op_t op, cudaStream_t stream) const { - NCCL_CHECK(ncclAllReduce(sendbuff, recvbuff, count, getNCCLDatatype(datatype), - getNCCLOp(op), _nccl_comm, stream)); -} - -void cumlStdCommunicator_impl::bcast(void *buff, int count, datatype_t datatype, - int root, cudaStream_t stream) const { - NCCL_CHECK(ncclBroadcast(buff, buff, count, getNCCLDatatype(datatype), root, - _nccl_comm, stream)); -} - -void cumlStdCommunicator_impl::reduce(const void *sendbuff, void *recvbuff, - int count, datatype_t datatype, op_t op, - int root, cudaStream_t stream) const { - NCCL_CHECK(ncclReduce(sendbuff, recvbuff, count, getNCCLDatatype(datatype), - getNCCLOp(op), root, _nccl_comm, stream)); -} - -void cumlStdCommunicator_impl::allgather(const void *sendbuff, void *recvbuff, - int sendcount, datatype_t datatype, - cudaStream_t stream) const { - NCCL_CHECK(ncclAllGather(sendbuff, recvbuff, sendcount, - getNCCLDatatype(datatype), _nccl_comm, stream)); -} - -void cumlStdCommunicator_impl::allgatherv(const void *sendbuf, void *recvbuf, - const int recvcounts[], - const int displs[], - datatype_t datatype, - cudaStream_t stream) const { - //From: "An Empirical Evaluation of Allgatherv on Multi-GPU Systems" - https://arxiv.org/pdf/1812.05964.pdf - //Listing 1 on page 4. 
- for (int root = 0; root < _size; ++root) - NCCL_CHECK(ncclBroadcast( - sendbuf, - static_cast(recvbuf) + displs[root] * getDatatypeSize(datatype), - recvcounts[root], getNCCLDatatype(datatype), root, _nccl_comm, stream)); -} - -void cumlStdCommunicator_impl::reducescatter(const void *sendbuff, - void *recvbuff, int recvcount, - datatype_t datatype, op_t op, - cudaStream_t stream) const { - NCCL_CHECK(ncclReduceScatter(sendbuff, recvbuff, recvcount, - getNCCLDatatype(datatype), getNCCLOp(op), - _nccl_comm, stream)); -} - -MLCommon::cumlCommunicator::status_t cumlStdCommunicator_impl::syncStream( - cudaStream_t stream) const { - cudaError_t cudaErr; - ncclResult_t ncclErr, ncclAsyncErr; - while (1) { - cudaErr = cudaStreamQuery(stream); - if (cudaErr == cudaSuccess) return status_t::commStatusSuccess; - - if (cudaErr != cudaErrorNotReady) { - // An error occurred querying the status of the stream - return status_t::commStatusError; - } - - ncclErr = ncclCommGetAsyncError(_nccl_comm, &ncclAsyncErr); - if (ncclErr != ncclSuccess) { - // An error occurred retrieving the asynchronous error - return status_t::commStatusError; - } - - if (ncclAsyncErr != ncclSuccess) { - // An asynchronous error happened. Stop the operation and destroy - // the communicator - ncclErr = ncclCommAbort(_nccl_comm); - if (ncclErr != ncclSuccess) - // Caller may abort with an exception or try to re-create a new communicator. - return status_t::commStatusAbort; - } - - // Let other threads (including NCCL threads) use the CPU. - pthread_yield(); - } -} - -} // end namespace ML diff --git a/cpp/comms/std/src/cuML_std_comms_impl.hpp b/cpp/comms/std/src/cuML_std_comms_impl.hpp deleted file mode 100644 index 9237fc83f9..0000000000 --- a/cpp/comms/std/src/cuML_std_comms_impl.hpp +++ /dev/null @@ -1,140 +0,0 @@ -/* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include - -#include - -#include - -#ifdef WITH_UCX -#include -#include -#include "ucp_helper.h" -#endif - -namespace ML { - -/** - * @brief A cumlCommunicator implementation capable of running collective communications - * with NCCL and point-to-point-communications with UCX. Note that the latter is optional. - * - * Underlying comms, like NCCL and UCX, should be initialized and ready for use, - * and maintained, outside of the cuML Comms lifecycle. This allows us to decouple the - * ownership of the actual comms from cuml so that they can also be used outside of cuml. - * - * For instance, nccl-py can be used to bootstrap a ncclComm_t before it is - * used to construct a cuml comms instance. UCX endpoints can be bootstrapped - * in Python using ucx-py, before being used to construct a cuML comms instance. - */ -class cumlStdCommunicator_impl : public MLCommon::cumlCommunicator_iface { - public: - cumlStdCommunicator_impl() = delete; - -#ifdef WITH_UCX - - /** - * @brief Constructor for collective + point-to-point operation. 
- * @param comm initialized nccl comm - * @param ucp_worker initialized ucp_worker instance - * @param eps shared pointer to array of ucp endpoints - * @param size size of the cluster - * @param rank rank of the current worker - */ - cumlStdCommunicator_impl(ncclComm_t comm, ucp_worker_h ucp_worker, - std::shared_ptr eps, int size, int rank); -#endif - - /** - * @brief constructor for collective-only operation - * @param comm initilized nccl communicator - * @param size size of the cluster - * @param rank rank of the current worker - */ - cumlStdCommunicator_impl(ncclComm_t comm, int size, int rank); - - virtual ~cumlStdCommunicator_impl(); - - virtual int getSize() const; - - virtual int getRank() const; - - virtual std::unique_ptr commSplit( - int color, int key) const; - - virtual void barrier() const; - - virtual void isend(const void* buf, int size, int dest, int tag, - request_t* request) const; - - virtual void irecv(void* buf, int size, int source, int tag, - request_t* request) const; - - virtual void waitall(int count, request_t array_of_requests[]) const; - - virtual void allreduce(const void* sendbuff, void* recvbuff, int count, - datatype_t datatype, op_t op, - cudaStream_t stream) const; - - virtual void bcast(void* buff, int count, datatype_t datatype, int root, - cudaStream_t stream) const; - - virtual void reduce(const void* sendbuff, void* recvbuff, int count, - datatype_t datatype, op_t op, int root, - cudaStream_t stream) const; - - virtual void allgather(const void* sendbuff, void* recvbuff, int sendcount, - datatype_t datatype, cudaStream_t stream) const; - - virtual void allgatherv(const void* sendbuf, void* recvbuf, - const int recvcounts[], const int displs[], - datatype_t datatype, cudaStream_t stream) const; - - virtual void reducescatter(const void* sendbuff, void* recvbuff, - int recvcount, datatype_t datatype, op_t op, - cudaStream_t stream) const; - - virtual status_t syncStream(cudaStream_t stream) const; - - private: - ncclComm_t _nccl_comm; - cudaStream_t _stream; - - int *_sendbuff, *_recvbuff; - - int _size; - int _rank; - - void initialize(); - void get_request_id(request_t* req) const; - bool p2p_enabled = false; - -#ifdef WITH_UCX - comms_ucp_handler _ucp_handler; - ucp_worker_h _ucp_worker; - std::shared_ptr _ucp_eps; - mutable request_t _next_request_id; - mutable std::unordered_map - _requests_in_flight; - mutable std::unordered_set _free_requests; -#endif -}; - -} // end namespace ML diff --git a/cpp/comms/std/src/ucp_helper.h b/cpp/comms/std/src/ucp_helper.h deleted file mode 100644 index fbb8b3e110..0000000000 --- a/cpp/comms/std/src/ucp_helper.h +++ /dev/null @@ -1,240 +0,0 @@ -/* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include -#include -#include -#include -#include -#include -#include - -#pragma once - -typedef void (*dlsym_print_info)(ucp_ep_h, FILE *); -typedef void (*dlsym_rec_free)(void *); -typedef int (*dlsym_worker_progress)(ucp_worker_h); - -typedef ucs_status_ptr_t (*dlsym_send)(ucp_ep_h, const void *, size_t, - ucp_datatype_t, ucp_tag_t, - ucp_send_callback_t); -typedef ucs_status_ptr_t (*dlsym_recv)(ucp_worker_h, void *, size_t count, - ucp_datatype_t datatype, ucp_tag_t, - ucp_tag_t, ucp_tag_recv_callback_t); - -/** - * Standard UCX request object that will be passed - * around asynchronously. This object is really - * opaque and the comms layer only cares that it - * has been completed. Because cuml comms do not - * initialize the ucx application context, it doesn't - * own this object and thus it's important not to - * modify this struct. - */ -struct ucx_context { - int completed; -}; - -/** - * Wraps the `ucx_context` request and adds a few - * other fields for trace logging and cleanup. - */ -class ucp_request { - public: - struct ucx_context *req; - bool needs_release = true; - int other_rank = -1; - bool is_send_request = false; -}; - -// by default, match the whole tag -static const ucp_tag_t default_tag_mask = -1; - -// Only match the passed in tag, not the rank. This -// enables simulated multi-cast. -static const ucp_tag_t any_rank_tag_mask = 0xFFFF0000; - -// Per the MPI API, receiving from a rank of -1 denotes receiving -// from any rank that used the expected tag. -static const int UCP_ANY_RANK = -1; - -/** - * @brief Asynchronous send callback sets request to completed - */ -static void send_callback(void *request, ucs_status_t status) { - struct ucx_context *context = (struct ucx_context *)request; - context->completed = 1; -} - -/** - * @brief Asynchronous recv callback sets request to completed - */ -static void recv_callback(void *request, ucs_status_t status, - ucp_tag_recv_info_t *info) { - struct ucx_context *context = (struct ucx_context *)request; - context->completed = 1; -} - -/** - * Helper class for managing `dlopen` state and - * interacting with ucp. 
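The helper class below resolves every UCP entry point through dlopen/dlsym, so libucp.so stays a soft, runtime-optional dependency. A minimal standalone sketch of that pattern for a single symbol, with error handling that mirrors load_ucp_handle() and assert_dlerror():

```cpp
#include <dlfcn.h>

#include <cstdio>

typedef int (*worker_progress_fn)(void*);  // stand-in for ucp_worker_h

int main() {
  void* handle = dlopen("libucp.so", RTLD_LAZY | RTLD_NODELETE);
  if (!handle) {
    std::fprintf(stderr, "Cannot open UCX library: %s\n", dlerror());
    return 1;
  }
  dlerror();  // clear any stale error state before calling dlsym
  auto fn = (worker_progress_fn)dlsym(handle, "ucp_worker_progress");
  const char* error = dlerror();
  if (error != nullptr) {
    std::fprintf(stderr, "Error loading function symbol: %s\n", error);
    return 1;
  }
  (void)fn;  // would be invoked with a live ucp_worker_h
  dlclose(handle);
  return 0;
}
```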
- */ -class comms_ucp_handler { - public: - comms_ucp_handler() { - load_ucp_handle(); - load_send_func(); - load_recv_func(); - load_free_req_func(); - load_print_info_func(); - load_worker_progress_func(); - } - - ~comms_ucp_handler() { dlclose(ucp_handle); } - - private: - void *ucp_handle; - - dlsym_print_info print_info_func; - dlsym_rec_free req_free_func; - dlsym_worker_progress worker_progress_func; - dlsym_send send_func; - dlsym_recv recv_func; - - void load_ucp_handle() { - ucp_handle = dlopen("libucp.so", RTLD_LAZY | RTLD_NOLOAD | RTLD_NODELETE); - if (!ucp_handle) { - ucp_handle = dlopen("libucp.so", RTLD_LAZY | RTLD_NODELETE); - ASSERT(ucp_handle, "Cannot open UCX library: %s\n", dlerror()); - } - // Reset any potential error - dlerror(); - } - - void assert_dlerror() { - char *error = dlerror(); - ASSERT(error == NULL, "Error loading function symbol: %s\n", error); - } - - void load_send_func() { - send_func = (dlsym_send)dlsym(ucp_handle, "ucp_tag_send_nb"); - assert_dlerror(); - } - - void load_free_req_func() { - req_free_func = (dlsym_rec_free)dlsym(ucp_handle, "ucp_request_free"); - assert_dlerror(); - } - - void load_print_info_func() { - print_info_func = (dlsym_print_info)dlsym(ucp_handle, "ucp_ep_print_info"); - assert_dlerror(); - } - - void load_worker_progress_func() { - worker_progress_func = - (dlsym_worker_progress)dlsym(ucp_handle, "ucp_worker_progress"); - assert_dlerror(); - } - - void load_recv_func() { - recv_func = (dlsym_recv)dlsym(ucp_handle, "ucp_tag_recv_nb"); - assert_dlerror(); - } - - ucp_tag_t build_message_tag(int rank, int tag) const { - // keeping the rank in the lower bits enables debugging. - return ((uint32_t)tag << 31) | (uint32_t)rank; - } - - public: - int ucp_progress(ucp_worker_h worker) const { - return (*(worker_progress_func))(worker); - } - - /** - * @brief Frees any memory underlying the given ucp request object - */ - void free_ucp_request(ucp_request *request) const { - if (request->needs_release) { - request->req->completed = 0; - (*(req_free_func))(request->req); - } - free(request); - } - - /** - * @brief Asynchronously send data to the given endpoint using the given tag - */ - void ucp_isend(ucp_request *req, ucp_ep_h ep_ptr, const void *buf, int size, - int tag, ucp_tag_t tag_mask, int rank) const { - ucp_tag_t ucp_tag = build_message_tag(rank, tag); - - CUML_LOG_DEBUG("Sending tag: %ld", ucp_tag); - - ucs_status_ptr_t send_result = (*(send_func))( - ep_ptr, buf, size, ucp_dt_make_contig(1), ucp_tag, send_callback); - struct ucx_context *ucp_req = (struct ucx_context *)send_result; - if (UCS_PTR_IS_ERR(send_result)) { - ASSERT(!UCS_PTR_IS_ERR(send_result), - "unable to send UCX data message (%d)\n", - UCS_PTR_STATUS(send_result)); - /** - * If the request didn't fail, but it's not OK, it is in flight. - * Expect the handler to be invoked - */ - } else if (UCS_PTR_STATUS(send_result) != UCS_OK) { - /** - * If the request is OK, it's already been completed and we don't need to wait on it. - * The request will be a nullptr, however, so we need to create a new request - * and set it to completed to make the "waitall()" function work properly. - */ - req->needs_release = true; - } else { - req->needs_release = false; - } - - req->other_rank = rank; - req->is_send_request = true; - req->req = ucp_req; - } - - /** - * @brief Asynchronously receive data from given endpoint with the given tag. 
- */ - void ucp_irecv(ucp_request *req, ucp_worker_h worker, ucp_ep_h ep_ptr, - void *buf, int size, int tag, ucp_tag_t tag_mask, - int sender_rank) const { - ucp_tag_t ucp_tag = build_message_tag(sender_rank, tag); - - CUML_LOG_DEBUG("%d: Receiving tag: %ld", ucp_tag); - - ucs_status_ptr_t recv_result = - (*(recv_func))(worker, buf, size, ucp_dt_make_contig(1), ucp_tag, - tag_mask, recv_callback); - - struct ucx_context *ucp_req = (struct ucx_context *)recv_result; - - req->req = ucp_req; - req->needs_release = true; - req->is_send_request = false; - req->other_rank = sender_rank; - - ASSERT(!UCS_PTR_IS_ERR(recv_result), - "unable to receive UCX data message (%d)\n", - UCS_PTR_STATUS(recv_result)); - } -}; diff --git a/cpp/examples/dbscan/dbscan_example.cpp b/cpp/examples/dbscan/dbscan_example.cpp index db13720701..273d1fa71e 100644 --- a/cpp/examples/dbscan/dbscan_example.cpp +++ b/cpp/examples/dbscan/dbscan_example.cpp @@ -23,13 +23,7 @@ #include #include -#ifdef HAVE_CUB -#include -#endif //HAVE_CUB - -#ifdef HAVE_RMM -#include -#endif //HAVE_RMM +#include #include #include @@ -140,29 +134,12 @@ int main(int argc, char* argv[]) { } } - ML::cumlHandle cumlHandle; + raft::handle_t handle; -#ifdef HAVE_RMM - rmmOptions_t rmmOptions; - rmmOptions.allocation_mode = PoolAllocation; - rmmOptions.initial_pool_size = 0; - rmmOptions.enable_logging = false; - rmmError_t rmmStatus = rmmInitialize(&rmmOptions); - if (RMM_SUCCESS != rmmStatus) { - std::cerr << "WARN: Could not initialize RMM: " - << rmmGetErrorString(rmmStatus) << std::endl; - } -#endif //HAVE_RMM -#ifdef HAVE_RMM - std::shared_ptr allocator(new ML::rmmAllocatorAdapter()); -#elif defined(HAVE_CUB) std::shared_ptr allocator( - new ML::cachingDeviceAllocator()); -#else - std::shared_ptr allocator( - new ML::defaultDeviceAllocator()); -#endif // HAVE_RMM - cumlHandle.setDeviceAllocator(allocator); + new raft::mr::device::default_allocator()); + + handle.set_device_allocator(allocator); std::vector h_inputData; @@ -204,7 +181,7 @@ int main(int argc, char* argv[]) { cudaStream_t stream; CUDA_RT_CALL(cudaStreamCreate(&stream)); - cumlHandle.setStream(stream); + handle.set_stream(stream); std::vector h_labels(nRows); int* d_labels = nullptr; @@ -223,7 +200,7 @@ int main(int argc, char* argv[]) { << "eps - " << eps << std::endl << "max_bytes_per_batch - " << max_bytes_per_batch << std::endl; - ML::dbscanFit(cumlHandle, d_inputData, nRows, nCols, eps, minPts, d_labels, + ML::dbscanFit(handle, d_inputData, nRows, nCols, eps, minPts, d_labels, nullptr, max_bytes_per_batch, false); CUDA_RT_CALL(cudaMemcpyAsync(h_labels.data(), d_labels, nRows * sizeof(int), cudaMemcpyDeviceToHost, stream)); diff --git a/cpp/examples/kmeans/kmeans_example.cpp b/cpp/examples/kmeans/kmeans_example.cpp index 20bba55298..aeb03b2c67 100644 --- a/cpp/examples/kmeans/kmeans_example.cpp +++ b/cpp/examples/kmeans/kmeans_example.cpp @@ -23,13 +23,7 @@ #include -#ifdef HAVE_CUB -#include -#endif //HAVE_CUB - -#ifdef HAVE_RMM -#include -#endif // HAVE_RMM +#include #include #include @@ -92,17 +86,6 @@ int main(int argc, char *argv[]) { << "(" << cudaGetErrorString(cudaStatus) << ")" << std::endl; return 1; } -#ifdef HAVE_RMM - rmmOptions_t rmmOptions; - rmmOptions.allocation_mode = PoolAllocation; - rmmOptions.initial_pool_size = 0; - rmmOptions.enable_logging = false; - rmmError_t rmmStatus = rmmInitialize(&rmmOptions); - if (RMM_SUCCESS != rmmStatus) { - std::cerr << "WARN: Could not initialize RMM: " - << rmmGetErrorString(rmmStatus) << std::endl; - } -#endif // 
HAVE_RMM } std::vector h_srcdata; @@ -143,22 +126,16 @@ int main(int argc, char *argv[]) { std::cout << "Run KMeans with k=" << params.n_clusters << ", max_iterations=" << params.max_iter << std::endl; - ML::cumlHandle cumlHandle; -#ifdef HAVE_RMM - std::shared_ptr allocator( - new ML::rmmAllocatorAdapter()); -#elif defined(HAVE_CUB) - std::shared_ptr allocator( - new ML::cachingDeviceAllocator()); -#else + raft::handle_t handle; + std::shared_ptr allocator( - new ML::defaultDeviceAllocator()); -#endif // HAVE_RMM - cumlHandle.setDeviceAllocator(allocator); + new raft::mr::device::default_allocator()); + + handle.set_device_allocator(allocator); cudaStream_t stream; CUDA_RT_CALL(cudaStreamCreate(&stream)); - cumlHandle.setStream(stream); + handle.set_stream(stream); // srcdata size n_samples * n_features double *d_srcdata = nullptr; @@ -178,9 +155,8 @@ int main(int argc, char *argv[]) { double inertia = 0; int n_iter = 0; - ML::kmeans::fit_predict(cumlHandle, params, d_srcdata, n_samples, - n_features, 0, d_pred_centroids, d_pred_labels, - inertia, n_iter); + ML::kmeans::fit_predict(handle, params, d_srcdata, n_samples, n_features, 0, + d_pred_centroids, d_pred_labels, inertia, n_iter); std::vector h_pred_labels(n_samples); CUDA_RT_CALL(cudaMemcpyAsync(h_pred_labels.data(), d_pred_labels, diff --git a/cpp/include/cuml/cluster/dbscan.hpp b/cpp/include/cuml/cluster/dbscan.hpp index ecd717c0c8..e1a1dbe350 100644 --- a/cpp/include/cuml/cluster/dbscan.hpp +++ b/cpp/include/cuml/cluster/dbscan.hpp @@ -21,8 +21,6 @@ namespace ML { -/** @} */ - /** * @defgroup DbscanCpp C++ implementation of Dbscan algo * @brief Fits a DBSCAN model on an input feature matrix and outputs the labels @@ -45,20 +43,20 @@ namespace ML { * @{ */ -void dbscanFit(const cumlHandle &handle, float *input, int n_rows, int n_cols, - float eps, int min_pts, int *labels, +void dbscanFit(const raft::handle_t &handle, float *input, int n_rows, + int n_cols, float eps, int min_pts, int *labels, int *core_sample_indices = nullptr, size_t max_bytes_per_batch = 0, int verbosity = CUML_LEVEL_INFO); -void dbscanFit(const cumlHandle &handle, double *input, int n_rows, int n_cols, - double eps, int min_pts, int *labels, +void dbscanFit(const raft::handle_t &handle, double *input, int n_rows, + int n_cols, double eps, int min_pts, int *labels, int *core_sample_indices = nullptr, size_t max_bytes_per_batch = 0, int verbosity = CUML_LEVEL_INFO); -void dbscanFit(const cumlHandle &handle, float *input, int64_t n_rows, +void dbscanFit(const raft::handle_t &handle, float *input, int64_t n_rows, int64_t n_cols, float eps, int min_pts, int64_t *labels, int64_t *core_sample_indices = nullptr, size_t max_bytes_per_batch = 0, int verbosity = CUML_LEVEL_INFO); -void dbscanFit(const cumlHandle &handle, double *input, int64_t n_rows, +void dbscanFit(const raft::handle_t &handle, double *input, int64_t n_rows, int64_t n_cols, double eps, int min_pts, int64_t *labels, int64_t *core_sample_indices = nullptr, size_t max_bytes_per_batch = 0, int verbosity = CUML_LEVEL_INFO); diff --git a/cpp/include/cuml/cluster/kmeans.hpp b/cpp/include/cuml/cluster/kmeans.hpp index 882b67e6fa..7ac7c5e4ae 100644 --- a/cpp/include/cuml/cluster/kmeans.hpp +++ b/cpp/include/cuml/cluster/kmeans.hpp @@ -53,7 +53,7 @@ struct KMeansParams { int seed = 0; // Metric to use for distance computation. 
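The dbscan and kmeans examples above now share the same setup boilerplate in place of the old `cumlHandle` plus `#ifdef` allocator selection: construct a `raft::handle_t`, attach a device allocator, and bind an explicit CUDA stream. Condensed into one place as a sketch; the include paths and the `shared_ptr` element type are assumptions, since the diff rendering stripped the bracketed text:

```cpp
#include <cstdio>
#include <memory>
#include <cuda_runtime.h>
#include <raft/handle.hpp>               // assumed include path
#include <raft/mr/device/allocator.hpp>  // assumed include path

int main() {
  raft::handle_t handle;

  // Element type assumed to be raft::mr::device::allocator, the base of
  // default_allocator (template argument stripped in the diff above).
  std::shared_ptr<raft::mr::device::allocator> allocator(
    new raft::mr::device::default_allocator());
  handle.set_device_allocator(allocator);

  cudaStream_t stream;
  if (cudaStreamCreate(&stream) != cudaSuccess) {
    std::fprintf(stderr, "failed to create stream\n");
    return 1;
  }
  handle.set_stream(stream);

  // ... invoke cuML algorithms that take `handle` here ...

  cudaStreamDestroy(stream);
  return 0;
}
```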
Any metric from - // MLCommon::Distance::DistanceType can be used + // ML::Distance::DistanceType can be used int metric = 0; // Number of instance k-means algorithm will be run with different seeds. @@ -96,12 +96,12 @@ struct KMeansParams { closest cluster center. * @param[out] n_iter Number of iterations run. */ -void fit_predict(const ML::cumlHandle &handle, const KMeansParams ¶ms, +void fit_predict(const raft::handle_t &handle, const KMeansParams ¶ms, const float *X, int n_samples, int n_features, const float *sample_weight, float *centroids, int *labels, float &inertia, int &n_iter); -void fit_predict(const ML::cumlHandle &handle, const KMeansParams ¶ms, +void fit_predict(const raft::handle_t &handle, const KMeansParams ¶ms, const double *X, int n_samples, int n_features, const double *sample_weight, double *centroids, int *labels, double &inertia, int &n_iter); @@ -128,12 +128,12 @@ void fit_predict(const ML::cumlHandle &handle, const KMeansParams ¶ms, * @param[out] n_iter Number of iterations run. */ -void fit(const ML::cumlHandle &handle, const KMeansParams ¶ms, +void fit(const raft::handle_t &handle, const KMeansParams ¶ms, const float *X, int n_samples, int n_features, const float *sample_weight, float *centroids, float &inertia, int &n_iter); -void fit(const ML::cumlHandle &handle, const KMeansParams ¶ms, +void fit(const raft::handle_t &handle, const KMeansParams ¶ms, const double *X, int n_samples, int n_features, const double *sample_weight, double *centroids, double &inertia, int &n_iter); @@ -158,12 +158,12 @@ void fit(const ML::cumlHandle &handle, const KMeansParams ¶ms, * closest cluster center. */ -void predict(const ML::cumlHandle &handle, const KMeansParams ¶ms, +void predict(const raft::handle_t &handle, const KMeansParams ¶ms, const float *centroids, const float *X, int n_samples, int n_features, const float *sample_weight, int *labels, float &inertia); -void predict(const ML::cumlHandle &handle, const KMeansParams ¶ms, +void predict(const raft::handle_t &handle, const KMeansParams ¶ms, const double *centroids, const double *X, int n_samples, int n_features, const double *sample_weight, int *labels, double &inertia); @@ -184,14 +184,14 @@ void predict(const ML::cumlHandle &handle, const KMeansParams ¶ms, * sample in 'X' (it should be same as the dimension for each cluster centers in * 'centroids'). * @param[in] metric Metric to use for distance computation. Any - * metric from MLCommon::Distance::DistanceType can be used + * metric from ML::Distance::DistanceType can be used * @param[out] X_new X transformed in the new space.. */ -void transform(const ML::cumlHandle &handle, const KMeansParams ¶ms, +void transform(const raft::handle_t &handle, const KMeansParams ¶ms, const float *centroids, const float *X, int n_samples, int n_features, int metric, float *X_new); -void transform(const ML::cumlHandle &handle, const KMeansParams ¶ms, +void transform(const raft::handle_t &handle, const KMeansParams ¶ms, const double *centroids, const double *X, int n_samples, int n_features, int metric, double *X_new); diff --git a/cpp/include/cuml/cluster/kmeans_mg.hpp b/cpp/include/cuml/cluster/kmeans_mg.hpp index b10f5fe3f0..cba1fd3c72 100644 --- a/cpp/include/cuml/cluster/kmeans_mg.hpp +++ b/cpp/include/cuml/cluster/kmeans_mg.hpp @@ -43,11 +43,11 @@ namespace opg { * @param[out] n_iter Number of iterations run. 
*/ -void fit(const ML::cumlHandle &handle, const KMeansParams ¶ms, +void fit(const raft::handle_t &handle, const KMeansParams ¶ms, const float *X, int n_samples, int n_features, float *centroids, float &inertia, int &n_iter); -void fit(const ML::cumlHandle &handle, const KMeansParams ¶ms, +void fit(const raft::handle_t &handle, const KMeansParams ¶ms, const double *X, int n_samples, int n_features, double *centroids, double &inertia, int &n_iter); diff --git a/cpp/include/cuml/cluster/spectral.hpp b/cpp/include/cuml/cluster/spectral.hpp index 6a51e1773d..d984f217fd 100644 --- a/cpp/include/cuml/cluster/spectral.hpp +++ b/cpp/include/cuml/cluster/spectral.hpp @@ -35,8 +35,8 @@ namespace Spectral { * @param n_components the number of components to project the X into * @param out output array for embedding (size n*n_comonents) */ -void fit_embedding(const cumlHandle &handle, int *rows, int *cols, float *vals, - int nnz, int n, int n_components, float *out); +void fit_embedding(const raft::handle_t &handle, int *rows, int *cols, + float *vals, int nnz, int n, int n_components, float *out); } // namespace Spectral } // namespace ML diff --git a/cpp/include/cuml/common/callbackSink.hpp b/cpp/include/cuml/common/callbackSink.hpp new file mode 100644 index 0000000000..abd4c33a7e --- /dev/null +++ b/cpp/include/cuml/common/callbackSink.hpp @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include + +#define SPDLOG_HEADER_ONLY +#include +#include +#include + +namespace spdlog { +namespace sinks { + +typedef void (*LogCallback)(int lvl, const char* msg); + +template +class CallbackSink : public base_sink { + public: + explicit CallbackSink(std::string tag = "spdlog", + LogCallback callback = nullptr, + void (*flush)() = nullptr) + : _callback{callback}, _flush{flush} {}; + + void set_callback(LogCallback callback) { _callback = callback; } + void set_flush(void (*flush)()) { _flush = flush; } + + protected: + void sink_it_(const details::log_msg& msg) override { + spdlog::memory_buf_t formatted; + base_sink::formatter_->format(msg, formatted); + std::string msg_string = fmt::to_string(formatted); + + if (_callback) { + _callback(static_cast(msg.level), msg_string.c_str()); + } else { + std::cout << msg_string; + } + } + + void flush_() override { + if (_flush) { + _flush(); + } else { + std::cout << std::flush; + } + } + + LogCallback _callback; + void (*_flush)(); +}; + +using callback_sink_mt = CallbackSink; +using callback_sink_st = CallbackSink; + +} // end namespace sinks +} // end namespace spdlog diff --git a/cpp/include/cuml/common/cuml_allocator.hpp b/cpp/include/cuml/common/cuml_allocator.hpp index fd68f1d367..215c1ad3f2 100644 --- a/cpp/include/cuml/common/cuml_allocator.hpp +++ b/cpp/include/cuml/common/cuml_allocator.hpp @@ -19,148 +19,12 @@ #include #include -namespace MLCommon { - -/** - * @brief Interface for a asynchronous device allocator. 
- * - * A implementation of this interface can make the following assumptions - * - It does not need to be but it can allow asynchronous allocate and deallocate. - * - Allocations may be always on the device that was specified on construction. - */ -class deviceAllocator { - public: - /** - * @brief Asynchronously allocates device memory. - * - * An implementation of this need to return a allocation of n bytes properly align bytes - * on the configured device. The allocation can optionally be asynchronous in the sense - * that it is only save to use after all work submitted to the passed in stream prior to - * the call to allocate has completed. If the allocation is used before, e.g. in another - * stream the behaviour may be undefined. - * @todo: Add alignment requirments. - * - * @param[in] n number of bytes to allocate - * @param[in] stream stream to issue the possible asynchronous allocation in - */ - virtual void* allocate(std::size_t n, cudaStream_t stream) = 0; - - /** - * @brief Asynchronously deallocates device memory - * - * An implementation of this need to ensure that the allocation that the passed in pointer - * points to remains usable until all work sheduled in stream prior to the call to - * deallocate has completed. - * - * @param[inout] p pointer to the buffer to deallocte - * @param[in] n size of the buffer to deallocte in bytes - * @param[in] stream stream in which the allocation might be still in use - */ - virtual void deallocate(void* p, std::size_t n, cudaStream_t stream) = 0; - - virtual ~deviceAllocator() {} -}; - -/** - * @brief Interface for a asynchronous host allocations. - * - * A implementation of this interface can make the following assumptions - * - It does not need to be but it can allow asynchronous allocate and deallocate. - * - Allocations don't need to be zero copy accessible form a device. - */ -class hostAllocator { - public: - /** - * @brief Asynchronously allocates host memory. - * - * An implementation of this need to return a allocation of n bytes properly align bytes - * on the host. The allocation can optionally be asynchronous in the sense - * that it is only save to use after all work submitted to the passed in stream prior to - * the call to allocate has completed. If the allocation is used before, e.g. in another - * stream the behaviour may be undefined. - * @todo: Add alignment requirments. - * - * @param[in] n number of bytes to allocate - * @param[in] stream stream to issue the possible asynchronous allocation in - */ - virtual void* allocate(std::size_t n, cudaStream_t stream) = 0; - - /** - * @brief Asynchronously deallocates host memory - * - * An implementation of this need to ensure that the allocation that the passed in pointer - * points to remains usable until all work sheduled in stream prior to the call to - * deallocate has completed. - * - * @param[inout] p pointer to the buffer to deallocte - * @param[in] n size of the buffer to deallocte in bytes - * @param[in] stream stream in which the allocation might be still in use - */ - virtual void deallocate(void* p, std::size_t n, cudaStream_t stream) = 0; - - virtual ~hostAllocator() {} -}; - -/** Default cudaMalloc/cudaFree based device allocator */ -class defaultDeviceAllocator : public deviceAllocator { - public: - /** - * @brief asynchronosly allocate n bytes that can be used after all work in - * stream sheduled prior to this call has completetd. 
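The contract being deleted here (and now provided by RAFT) is stream-ordered: a buffer returned by `allocate(n, stream)` is only guaranteed usable once all work previously enqueued on `stream` has completed, and `deallocate` must keep the buffer valid until prior work on `stream` finishes. A minimal sketch of a conforming implementation, written against a hypothetical stand-in for the interface; `cudaMalloc`/`cudaFree` synchronize device-wide, so they satisfy the contract trivially:

```cpp
#include <cstddef>
#include <cuda_runtime.h>

// Hypothetical stand-in for the deviceAllocator interface removed here.
class device_allocator {
 public:
  virtual void* allocate(std::size_t n, cudaStream_t stream) = 0;
  virtual void deallocate(void* p, std::size_t n, cudaStream_t stream) = 0;
  virtual ~device_allocator() {}
};

// cudaMalloc/cudaFree are device-synchronous, so this implementation
// trivially meets the stream-ordering requirements described above.
class default_device_allocator : public device_allocator {
 public:
  void* allocate(std::size_t n, cudaStream_t) override {
    void* p = nullptr;
    cudaMalloc(&p, n);
    return p;
  }
  void deallocate(void* p, std::size_t, cudaStream_t) override {
    cudaFree(p);
  }
};
```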
- * - * @param[in] n size of the allocation in bytes - * @param[in] stream the stream to use for the asynchronous allocations - */ - virtual void* allocate(std::size_t n, cudaStream_t stream) { - void* ptr = 0; - CUDA_CHECK(cudaMalloc(&ptr, n)); - return ptr; - } +#include +#include - /** - * @brief asynchronosly free an allocation of n bytes that can be reused after - * all work in stream scheduled prior to this call has completed. - * - * @param[in] p pointer to n bytes of memory to be deallocated - * @param[in] n size of the allocation to release in bytes - * @param[in] stream the stream to use for the asynchronous free - */ - virtual void deallocate(void* p, std::size_t n, cudaStream_t stream) { - CUDA_CHECK_NO_THROW(cudaFree(p)); - } - - virtual ~defaultDeviceAllocator() {} -}; - -/** Default cudaMallocHost/cudaFreeHost based host allocator */ -class defaultHostAllocator : public hostAllocator { - public: - /** - * @brief allocate n bytes that can be used after all work in - * stream sheduled prior to this call has completetd. - * - * @param[in] n size of the allocation in bytes - * @param[in] stream the stream to use for the asynchronous allocations - */ - virtual void* allocate(std::size_t n, cudaStream_t stream) { - void* ptr = 0; - CUDA_CHECK(cudaMallocHost(&ptr, n)); - return ptr; - } - - /** - * @brief free an allocation of n bytes that can be reused after - * all work in stream scheduled prior to this call has completed. - * - * @param[in] p pointer to n bytes of memory to be deallocated - * @param[in] n size of the allocation to release in bytes - * @param[in] stream the stream to use for the asynchronous free - */ - virtual void deallocate(void* p, std::size_t n, cudaStream_t stream) { - CUDA_CHECK_NO_THROW(cudaFreeHost(p)); - } +namespace MLCommon { - virtual ~defaultHostAllocator() {} -}; +using deviceAllocator = raft::mr::device::allocator; +using hostAllocator = raft::mr::host::allocator; }; // end namespace MLCommon diff --git a/cpp/include/cuml/common/logger.hpp b/cpp/include/cuml/common/logger.hpp index 0e9c2c285c..ac6e81ec81 100644 --- a/cpp/include/cuml/common/logger.hpp +++ b/cpp/include/cuml/common/logger.hpp @@ -17,12 +17,18 @@ #include #include +#include #include #include namespace spdlog { class logger; -}; +namespace sinks { +template +class CallbackSink; +using callback_sink_mt = CallbackSink; +}; // namespace sinks +}; // namespace spdlog namespace ML { @@ -104,6 +110,20 @@ class Logger { */ void setPattern(const std::string& pattern); + /** + * @brief Register a callback function to be run in place of usual log call + * + * @param[in] callback the function to be run on all logged messages + */ + void setCallback(void (*callback)(int lvl, const char* msg)); + + /** + * @brief Register a flush function compatible with the registered callback + * + * @param[in] flush the function to use when flushing logs + */ + void setFlush(void (*flush)()); + /** * @brief Tells whether messages will be logged for the given log level * @@ -133,10 +153,16 @@ class Logger { */ void log(int level, const char* fmt, ...); + /** + * @brief Flush logs by calling flush on underlying logger + */ + void flush(); + private: Logger(); ~Logger() {} + std::shared_ptr sink; std::shared_ptr logger; std::string currPattern; static const std::string DefaultPattern; diff --git a/cpp/include/cuml/common/rmmAllocatorAdapter.hpp b/cpp/include/cuml/common/rmmAllocatorAdapter.hpp deleted file mode 100644 index dcbac6ec43..0000000000 --- a/cpp/include/cuml/common/rmmAllocatorAdapter.hpp +++ /dev/null 
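Together with the `CallbackSink` added above, the new `setCallback`/`setFlush` declarations let a host application capture every formatted log line instead of having it written to stdout. A sketch of the intended registration, assuming the usual `ML::Logger::get()` singleton accessor, which is not shown in this hunk:

```cpp
#include <cuml/common/logger.hpp>
#include <cstdio>

// Receives every formatted log line together with its spdlog level.
static void my_log_callback(int lvl, const char* msg) {
  std::fprintf(stderr, "[cuml:%d] %s", lvl, msg);
}

static void my_flush() { std::fflush(stderr); }

void install_logging_hooks() {
  ML::Logger::get().setCallback(my_log_callback);  // get() is assumed here
  ML::Logger::get().setFlush(my_flush);
}
```

Per `sink_it_` above, passing a null callback falls back to writing the formatted message to stdout, so resetting the hook restores the default behavior.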
@@ -1,65 +0,0 @@ -/* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include -#include - -namespace ML { - -/** - * @brief Implemententation of ML::deviceAllocator using the - * RAPIDS Memory Manager (RMM) for allocations. - * - * rmmAllocatorAdapter does not initialize RMM. If RMM is not initialized on - * construction of rmmAllocatorAdapter allocations fall back to cudaMalloc. - */ -class rmmAllocatorAdapter : public ML::deviceAllocator { - public: - rmmAllocatorAdapter() {} - - /** - * @brief asynchronosly allocate n bytes that can be used after all work in - * stream sheduled prior to this call has completetd. - * - * @param[in] n size of the allocation in bytes - * @param[in] stream the stream to use for the asynchronous allocations - */ - virtual void* allocate(std::size_t n, cudaStream_t stream) { - void* ptr = 0; - ptr = rmm::mr::get_default_resource()->allocate(n, stream); - return ptr; - } - - /** - * @brief asynchronosly free an allocation of n bytes that can be reused after - * all work in stream scheduled prior to this call has completed. - * - * @param[in] p pointer to n bytes of memory to be deallocated - * @param[in] n size of the allocation to release in bytes - * @param[in] stream the stream to use for the asynchronous free - */ - virtual void deallocate(void* p, std::size_t n, cudaStream_t stream) { - rmm::mr::get_default_resource()->deallocate(p, n, stream); - } - - virtual ~rmmAllocatorAdapter() {} -}; - -} // end namespace ML diff --git a/cpp/include/cuml/common/rmmPoolAllocatorAdapter.hpp b/cpp/include/cuml/common/rmmPoolAllocatorAdapter.hpp deleted file mode 100644 index 7282c67f9a..0000000000 --- a/cpp/include/cuml/common/rmmPoolAllocatorAdapter.hpp +++ /dev/null @@ -1,49 +0,0 @@ -/* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include -#include -#include "rmmAllocatorAdapter.hpp" - -namespace ML { - -/** - * @brief Implemententation of ML::deviceAllocator using the RMM pool - * - * @todo rmmPoolAllocatorAdapter currently only uses the default ctor of the - * underlying pool allocator (ie cnmem). 
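The pool adapter deleted here follows a small RAII pattern worth naming: install a memory resource as the process-wide default in the constructor, remember the previous one, and restore it in the destructor so the swap stays scoped. The same pattern, sketched with a hypothetical resource registry standing in for `rmm::mr::set_default_resource`:

```cpp
struct memory_resource {
  virtual ~memory_resource() = default;
};

// Hypothetical global "default resource" slot, standing in for
// rmm::mr::get_default_resource / set_default_resource.
static memory_resource* g_default = nullptr;

static memory_resource* set_default(memory_resource* mr) {
  memory_resource* prev = g_default;
  g_default = mr;
  return prev;
}

// RAII guard: install a resource on construction, restore the previous one
// on destruction -- the pattern rmmPoolAllocatorAdapter used for its pool.
class scoped_default_resource {
 public:
  explicit scoped_default_resource(memory_resource* mr)
    : prev_(set_default(mr)) {}
  ~scoped_default_resource() { set_default(prev_); }

 private:
  memory_resource* prev_;
};
```

Because the restore happens in a destructor, the previous resource comes back even if the scope is left via an exception.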
- */ -class rmmPoolAllocatorAdapter : public rmmAllocatorAdapter { - public: - rmmPoolAllocatorAdapter() : cnmem_mr_() { - prev_mr_ = rmm::mr::set_default_resource(&cnmem_mr_); - } - - ~rmmPoolAllocatorAdapter() { - // restore the previous memory resource when this object goes out-of-scope - rmm::mr::set_default_resource(prev_mr_); - } - - private: - rmm::mr::cnmem_memory_resource cnmem_mr_; - rmm::mr::device_memory_resource* prev_mr_; -}; - -} // end namespace ML diff --git a/cpp/include/cuml/common/utils.hpp b/cpp/include/cuml/common/utils.hpp index 7962c6bb3d..3fcd535825 100644 --- a/cpp/include/cuml/common/utils.hpp +++ b/cpp/include/cuml/common/utils.hpp @@ -18,101 +18,10 @@ #include #include +#include #include +#include #include #include #include #include "logger.hpp" - -namespace MLCommon { -/** base exception class for the cuML or ml-prims project */ -class Exception : public std::exception { - public: - /** default ctor */ - Exception() throw() : std::exception(), msg() {} - - /** copy ctor */ - Exception(const Exception& src) throw() : std::exception(), msg(src.what()) { - collectCallStack(); - } - - /** ctor from an input message */ - Exception(const std::string& _msg) throw() : std::exception(), msg(_msg) { - collectCallStack(); - } - - /** dtor */ - virtual ~Exception() throw() {} - - /** get the message associated with this exception */ - virtual const char* what() const throw() { return msg.c_str(); } - - private: - /** message associated with this exception */ - std::string msg; - - /** append call stack info to this exception's message for ease of debug */ - // Courtesy: https://www.gnu.org/software/libc/manual/html_node/Backtraces.html - void collectCallStack() throw() { -#ifdef __GNUC__ - const int MaxStackDepth = 64; - void* stack[MaxStackDepth]; - auto depth = backtrace(stack, MaxStackDepth); - std::ostringstream oss; - oss << std::endl << "Obtained " << depth << " stack frames" << std::endl; - char** strings = backtrace_symbols(stack, depth); - if (strings == nullptr) { - oss << "But no stack trace could be found!" << std::endl; - msg += oss.str(); - return; - } - ///@todo: support for demangling of C++ symbol names - for (int i = 0; i < depth; ++i) { - oss << "#" << i << " in " << strings[i] << std::endl; - } - free(strings); - msg += oss.str(); -#endif // __GNUC__ - } -}; - -/** macro to throw a runtime error */ -#define THROW(fmt, ...) \ - do { \ - std::string msg; \ - char errMsg[2048]; \ - std::snprintf(errMsg, sizeof(errMsg), \ - "Exception occured! file=%s line=%d: ", __FILE__, __LINE__); \ - msg += errMsg; \ - std::snprintf(errMsg, sizeof(errMsg), fmt, ##__VA_ARGS__); \ - msg += errMsg; \ - throw MLCommon::Exception(msg); \ - } while (0) - -/** macro to check for a conditional and assert on failure */ -#define ASSERT(check, fmt, ...) \ - do { \ - if (!(check)) THROW(fmt, ##__VA_ARGS__); \ - } while (0) - -/** check for cuda runtime API errors and assert accordingly */ -#define CUDA_CHECK(call) \ - do { \ - cudaError_t status = call; \ - ASSERT(status == cudaSuccess, "FAIL: call='%s'. Reason:%s", #call, \ - cudaGetErrorString(status)); \ - } while (0) - -/** - * @brief check for cuda runtime API errors but log error instead of raising - * exception. 
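These macros (now supplied by RAFT) all reduce to one pattern: evaluate the CUDA call exactly once inside a `do { ... } while (0)` wrapper, compare against `cudaSuccess`, and report with `#call`, `__FILE__`, and `__LINE__` context; the `_NO_THROW` variant logs instead of throwing so it stays safe in destructors. A minimal, self-contained version of the throwing flavor (aborting rather than throwing, to stay dependency-free):

```cpp
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

// Minimal version of the removed CUDA_CHECK: evaluate the call once, then
// fail loudly with call/file/line context on any non-success status.
#define CUDA_CHECK_MIN(call)                                        \
  do {                                                              \
    cudaError_t status = (call);                                    \
    if (status != cudaSuccess) {                                    \
      std::fprintf(stderr, "CUDA call='%s' failed at %s:%d: %s\n",  \
                   #call, __FILE__, __LINE__,                       \
                   cudaGetErrorString(status));                     \
      std::abort();                                                 \
    }                                                               \
  } while (0)

int main() {
  void* p = nullptr;
  CUDA_CHECK_MIN(cudaMalloc(&p, 256));
  CUDA_CHECK_MIN(cudaFree(p));
  return 0;
}
```

The `do { ... } while (0)` wrapper makes the macro behave like a single statement, so it composes correctly with unbraced `if`/`else`.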
- */ -#define CUDA_CHECK_NO_THROW(call) \ - do { \ - cudaError_t status = call; \ - if (status != cudaSuccess) { \ - CUML_LOG_ERROR("CUDA call='%s' at file=%s line=%d failed with %s ", \ - #call, __FILE__, __LINE__, cudaGetErrorString(status)); \ - } \ - } while (0) -}; // namespace MLCommon diff --git a/cpp/include/cuml/cuml.hpp b/cpp/include/cuml/cuml.hpp index cc7a5cbca2..27e2c06f54 100644 --- a/cpp/include/cuml/cuml.hpp +++ b/cpp/include/cuml/cuml.hpp @@ -19,105 +19,10 @@ #include #include #include +#include #include namespace ML { - -class cumlHandle_impl; - using MLCommon::deviceAllocator; using MLCommon::hostAllocator; - -using MLCommon::defaultDeviceAllocator; -using MLCommon::defaultHostAllocator; - -/** - * @brief Handle to manage resources needed by cuML algorithms. - */ -class cumlHandle { - public: - /** - * @brief construct a cumlHandle with default paramters. - * @param n_streams number of internal streams to be setup - * - * The default paramters are - * - stream: default or NULL stream - * - DeviceAllocator: cudaMalloc - * - HostAllocator: cudaMallocHost - * @{ - */ - cumlHandle(int n_streams); - cumlHandle(); - /** @} */ - /** - * @brief releases all resources internally manged by cumlHandle. - */ - ~cumlHandle(); - /** - * @brief sets the stream to which all cuML work issued via this handle should be ordered. - * - * @param[in] stream the stream to which cuML work should be ordered. - */ - void setStream(cudaStream_t stream); - /** - * @brief gets the stream to which all cuML work issued via this handle should be ordered. - * - * @returns the stream to which cuML work should be ordered. - */ - cudaStream_t getStream() const; - /** Get the cached device properties of the device this handle is for */ - const cudaDeviceProp& getDeviceProperties() const; - /** - * @brief sets the allocator to use for all device allocations done in cuML. - * - * @param[in] allocator the deviceAllocator to use for device allocations. - */ - void setDeviceAllocator(std::shared_ptr allocator); - /** - * @brief gets the allocator to use for all device allocations done in cuML. - * - * @returns the deviceAllocator to use for device allocations. - */ - std::shared_ptr getDeviceAllocator() const; - /** - * @brief sets the allocator to use for substantial host allocations done in cuML. - * - * @param[in] allocator the hostAllocator to use for host allocations. - */ - void setHostAllocator(std::shared_ptr allocator); - /** - * @brief gets the allocator to use for substantial host allocations done in cuML. - * - * @returns the hostAllocator to use for host allocations. - */ - std::shared_ptr getHostAllocator() const; - /** - * @brief API to query Num of work streams set during handle creation. - * @returns num of streams in the handle. - */ - int getNumInternalStreams(); - - /** - * @brief API to get the internal streams as a vector. - * @return vector of internal streams in the handle - */ - std::vector getInternalStreams() const; - - /** - * @brief for internal use only. - */ - const cumlHandle_impl& getImpl() const; - /** - * @brief for internal use only. 
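Everything `cumlHandle` managed (stream, device/host allocators, cached device properties, the internal stream pool) now lives directly on `raft::handle_t`, which is why most of this diff is a mechanical rename. A rough before/after correspondence as a sketch; `set_stream` and `set_device_allocator` appear elsewhere in this diff, while the getter names and the `n_streams` constructor are assumptions about the RAFT API of this era:

```cpp
#include <cuda_runtime.h>
#include <raft/handle.hpp>  // assumed include path

void handle_migration_sketch() {
  // cumlHandle h(4);          -> handle with 4 internal streams (assumed ctor)
  raft::handle_t h(4);

  cudaStream_t s;
  cudaStreamCreate(&s);

  h.set_stream(s);                    // was: h.setStream(s);
  cudaStream_t cur = h.get_stream();  // was: h.getStream();  (assumed name)
  (void)cur;

  // h.setDeviceAllocator(a);  -> h.set_device_allocator(a);
  // h.getDeviceAllocator();   -> h.get_device_allocator();   (assumed name)

  cudaStreamDestroy(s);
}
```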
- */ - cumlHandle_impl& getImpl(); - - /** for internal use only */ - static int getDefaultNumInternalStreams(); - - private: - static constexpr int _default_num_internal_streams = 0; - std::unique_ptr _impl; -}; - -} // end namespace ML +} // namespace ML diff --git a/cpp/include/cuml/datasets/make_arima.hpp b/cpp/include/cuml/datasets/make_arima.hpp index f8da8c24b1..ab1ed41a24 100644 --- a/cpp/include/cuml/datasets/make_arima.hpp +++ b/cpp/include/cuml/datasets/make_arima.hpp @@ -37,11 +37,12 @@ namespace Datasets { * @param[in] seed Seed for the random number generator * @{ */ -void make_arima(const cumlHandle& handle, float* out, int batch_size, int n_obs, - ARIMAOrder order, float scale = 1.0f, float noise_scale = 0.2f, - float intercept_scale = 1.0f, uint64_t seed = 0ULL); +void make_arima(const raft::handle_t& handle, float* out, int batch_size, + int n_obs, ARIMAOrder order, float scale = 1.0f, + float noise_scale = 0.2f, float intercept_scale = 1.0f, + uint64_t seed = 0ULL); -void make_arima(const cumlHandle& handle, double* out, int batch_size, +void make_arima(const raft::handle_t& handle, double* out, int batch_size, int n_obs, ARIMAOrder order, double scale = 1.0, double noise_scale = 0.2, double intercept_scale = 1.0, uint64_t seed = 0ULL); diff --git a/cpp/include/cuml/datasets/make_blobs.hpp b/cpp/include/cuml/datasets/make_blobs.hpp index 2f9c6171b6..baacae16fa 100644 --- a/cpp/include/cuml/datasets/make_blobs.hpp +++ b/cpp/include/cuml/datasets/make_blobs.hpp @@ -53,29 +53,29 @@ namespace Datasets { * @param[in] seed seed for the RNG * @{ */ -void make_blobs(const cumlHandle& handle, float* out, int64_t* labels, +void make_blobs(const raft::handle_t& handle, float* out, int64_t* labels, int64_t n_rows, int64_t n_cols, int64_t n_clusters, bool row_major = true, const float* centers = nullptr, const float* cluster_std = nullptr, const float cluster_std_scalar = 1.f, bool shuffle = true, float center_box_min = -10.f, float center_box_max = 10.f, uint64_t seed = 0ULL); -void make_blobs(const cumlHandle& handle, double* out, int64_t* labels, +void make_blobs(const raft::handle_t& handle, double* out, int64_t* labels, int64_t n_rows, int64_t n_cols, int64_t n_clusters, bool row_major = true, const double* centers = nullptr, const double* cluster_std = nullptr, const double cluster_std_scalar = 1.0, bool shuffle = true, double center_box_min = -10.0, double center_box_max = 10.0, uint64_t seed = 0ULL); -void make_blobs(const cumlHandle& handle, float* out, int* labels, int n_rows, - int n_cols, int n_clusters, bool row_major = true, +void make_blobs(const raft::handle_t& handle, float* out, int* labels, + int n_rows, int n_cols, int n_clusters, bool row_major = true, const float* centers = nullptr, const float* cluster_std = nullptr, const float cluster_std_scalar = 1.f, bool shuffle = true, float center_box_min = -10.f, float center_box_max = 10.0, uint64_t seed = 0ULL); -void make_blobs(const cumlHandle& handle, double* out, int* labels, int n_rows, - int n_cols, int n_clusters, bool row_major = true, +void make_blobs(const raft::handle_t& handle, double* out, int* labels, + int n_rows, int n_cols, int n_clusters, bool row_major = true, const double* centers = nullptr, const double* cluster_std = nullptr, const double cluster_std_scalar = 1.0, bool shuffle = true, diff --git a/cpp/include/cuml/datasets/make_regression.hpp b/cpp/include/cuml/datasets/make_regression.hpp index f163cfac21..c6aa8c5f8f 100644 --- a/cpp/include/cuml/datasets/make_regression.hpp +++ 
b/cpp/include/cuml/datasets/make_regression.hpp @@ -51,28 +51,28 @@ namespace Datasets { * @param[in] shuffle Shuffle the samples and the features * @param[in] seed Seed for the random number generator */ -void make_regression(const cumlHandle& handle, float* out, float* values, +void make_regression(const raft::handle_t& handle, float* out, float* values, int64_t n_rows, int64_t n_cols, int64_t n_informative, float* coef = nullptr, int64_t n_targets = 1LL, float bias = 0.0f, int64_t effective_rank = -1LL, float tail_strength = 0.5f, float noise = 0.0f, bool shuffle = true, uint64_t seed = 0ULL); -void make_regression(const cumlHandle& handle, double* out, double* values, +void make_regression(const raft::handle_t& handle, double* out, double* values, int64_t n_rows, int64_t n_cols, int64_t n_informative, double* coef = nullptr, int64_t n_targets = 1LL, double bias = 0.0, int64_t effective_rank = -1LL, double tail_strength = 0.5, double noise = 0.0, bool shuffle = true, uint64_t seed = 0ULL); -void make_regression(const cumlHandle& handle, float* out, float* values, +void make_regression(const raft::handle_t& handle, float* out, float* values, int n_rows, int n_cols, int n_informative, float* coef = nullptr, int n_targets = 1LL, float bias = 0.0f, int effective_rank = -1LL, float tail_strength = 0.5f, float noise = 0.0f, bool shuffle = true, uint64_t seed = 0ULL); -void make_regression(const cumlHandle& handle, double* out, double* values, +void make_regression(const raft::handle_t& handle, double* out, double* values, int n_rows, int n_cols, int n_informative, double* coef = nullptr, int n_targets = 1LL, double bias = 0.0, int effective_rank = -1LL, diff --git a/cpp/include/cuml/decomposition/params.hpp b/cpp/include/cuml/decomposition/params.hpp index dabc904156..014d52735d 100644 --- a/cpp/include/cuml/decomposition/params.hpp +++ b/cpp/include/cuml/decomposition/params.hpp @@ -19,19 +19,12 @@ namespace ML { /** - * @defgroup pcaSolver: enumeration for pca solvers. - * @param AUTO: Fastest solver will be used based on input shape and n_components. - * @param FULL: All the eigenvectors and singular values (or eigenvalues) will be generated. - * @param ARPACK: tsvd using power method. Lanczos will be included in the future. - * @param RANDOMIZED: randomized svd * @param COV_EIG_DQ: covariance of input will be used along with eigen decomposition using divide and conquer method for symmetric matrices * @param COV_EIG_JACOBI: covariance of input will be used along with eigen decomposition using jacobi method for symmetric matrices - * @{ */ enum class solver : int { COV_EIG_DQ, COV_EIG_JACOBI, - RANDOMIZED, }; class params { @@ -48,7 +41,6 @@ class paramsSolver : public params { //math_t tol = 0.0; float tol = 0.0; int n_iterations = 15; - int random_state; int verbose = 0; }; @@ -56,9 +48,7 @@ template class paramsTSVDTemplate : public paramsSolver { public: int n_components = 1; - int max_sweeps = 15; enum_solver algorithm = enum_solver::COV_EIG_DQ; - bool trans_input = false; }; /** @@ -68,19 +58,16 @@ class paramsTSVDTemplate : public paramsSolver { * use fit_transform(X) instead. * @param whiten: When True (False by default) the components_ vectors are multiplied by the square root of n_samples and * then divided by the singular values to ensure uncorrelated outputs with unit component-wise variances. - * @param svd_solver: the solver to be used in PCA. + * @param algorithm: the solver to be used in PCA. 
* @param tol: Tolerance for singular values computed by svd_solver == ‘arpack’ or svd_solver == ‘COV_EIG_JACOBI’ - * @param iterated_power: Number of iterations for the power method computed by svd_solver == ‘randomized’ or - * jacobi method by svd_solver == 'COV_EIG_JACOBI'. - * @param random_state: RandomState instance or None, optional (default None) + * @param n_iterations: Number of iterations for the power method computed by jacobi method (svd_solver == 'COV_EIG_JACOBI'). * @param verbose: 0: no error message printing, 1: print error messages - * @param max_sweeps: number of sweeps jacobi method uses. The more the better accuracy. */ template class paramsPCATemplate : public paramsTSVDTemplate { public: - bool copy = true; + bool copy = true; // TODO unused, see #2830 and #2833 bool whiten = false; }; diff --git a/cpp/include/cuml/decomposition/pca.hpp b/cpp/include/cuml/decomposition/pca.hpp index 2bb577fd75..8710b03a2f 100644 --- a/cpp/include/cuml/decomposition/pca.hpp +++ b/cpp/include/cuml/decomposition/pca.hpp @@ -21,32 +21,32 @@ namespace ML { -void pcaFit(cumlHandle &handle, float *input, float *components, +void pcaFit(raft::handle_t &handle, float *input, float *components, float *explained_var, float *explained_var_ratio, float *singular_vals, float *mu, float *noise_vars, const paramsPCA &prms); -void pcaFit(cumlHandle &handle, double *input, double *components, +void pcaFit(raft::handle_t &handle, double *input, double *components, double *explained_var, double *explained_var_ratio, double *singular_vals, double *mu, double *noise_vars, const paramsPCA &prms); -void pcaFitTransform(cumlHandle &handle, float *input, float *trans_input, +void pcaFitTransform(raft::handle_t &handle, float *input, float *trans_input, float *components, float *explained_var, float *explained_var_ratio, float *singular_vals, float *mu, float *noise_vars, const paramsPCA &prms); -void pcaFitTransform(cumlHandle &handle, double *input, double *trans_input, +void pcaFitTransform(raft::handle_t &handle, double *input, double *trans_input, double *components, double *explained_var, double *explained_var_ratio, double *singular_vals, double *mu, double *noise_vars, const paramsPCA &prms); -void pcaInverseTransform(cumlHandle &handle, float *trans_input, +void pcaInverseTransform(raft::handle_t &handle, float *trans_input, float *components, float *singular_vals, float *mu, float *input, const paramsPCA &prms); -void pcaInverseTransform(cumlHandle &handle, double *trans_input, +void pcaInverseTransform(raft::handle_t &handle, double *trans_input, double *components, double *singular_vals, double *mu, double *input, const paramsPCA &prms); -void pcaTransform(cumlHandle &handle, float *input, float *components, +void pcaTransform(raft::handle_t &handle, float *input, float *components, float *trans_input, float *singular_vals, float *mu, const paramsPCA &prms); -void pcaTransform(cumlHandle &handle, double *input, double *components, +void pcaTransform(raft::handle_t &handle, double *input, double *components, double *trans_input, double *singular_vals, double *mu, const paramsPCA &prms); diff --git a/cpp/include/cuml/decomposition/pca_mg.hpp b/cpp/include/cuml/decomposition/pca_mg.hpp index 5b3f83a18b..302aaf4fd1 100644 --- a/cpp/include/cuml/decomposition/pca_mg.hpp +++ b/cpp/include/cuml/decomposition/pca_mg.hpp @@ -47,13 +47,13 @@ namespace opg { * @param[in] prms: data structure that includes all the parameters from input size to algorithm * @param[in] verbose */ -void fit(cumlHandle &handle, 
+void fit(raft::handle_t &handle, std::vector *> &input_data, MLCommon::Matrix::PartDescriptor &input_desc, float *components, float *explained_var, float *explained_var_ratio, float *singular_vals, float *mu, float *noise_vars, paramsPCAMG prms, bool verbose = false); -void fit(cumlHandle &handle, +void fit(raft::handle_t &handle, std::vector *> &input_data, MLCommon::Matrix::PartDescriptor &input_desc, double *components, double *explained_var, double *explained_var_ratio, @@ -76,7 +76,7 @@ void fit(cumlHandle &handle, * @param[in] prms: data structure that includes all the parameters from input size to algorithm * @param[in] verbose */ -void fit_transform(cumlHandle &handle, +void fit_transform(raft::handle_t &handle, MLCommon::Matrix::RankSizePair **rank_sizes, size_t n_parts, MLCommon::Matrix::floatData_t **input, MLCommon::Matrix::floatData_t **trans_input, @@ -84,7 +84,7 @@ void fit_transform(cumlHandle &handle, float *explained_var_ratio, float *singular_vals, float *mu, float *noise_vars, paramsPCAMG prms, bool verbose); -void fit_transform(cumlHandle &handle, +void fit_transform(raft::handle_t &handle, MLCommon::Matrix::RankSizePair **rank_sizes, size_t n_parts, MLCommon::Matrix::doubleData_t **input, MLCommon::Matrix::doubleData_t **trans_input, @@ -106,14 +106,16 @@ void fit_transform(cumlHandle &handle, * @param[in] prms: data structure that includes all the parameters from input size to algorithm * @param[in] verbose */ -void transform(cumlHandle &handle, MLCommon::Matrix::RankSizePair **rank_sizes, - size_t n_parts, MLCommon::Matrix::Data **input, - float *components, MLCommon::Matrix::Data **trans_input, +void transform(raft::handle_t &handle, + MLCommon::Matrix::RankSizePair **rank_sizes, size_t n_parts, + MLCommon::Matrix::Data **input, float *components, + MLCommon::Matrix::Data **trans_input, float *singular_vals, float *mu, paramsPCAMG prms, bool verbose); -void transform(cumlHandle &handle, MLCommon::Matrix::RankSizePair **rank_sizes, - size_t n_parts, MLCommon::Matrix::Data **input, - double *components, MLCommon::Matrix::Data **trans_input, +void transform(raft::handle_t &handle, + MLCommon::Matrix::RankSizePair **rank_sizes, size_t n_parts, + MLCommon::Matrix::Data **input, double *components, + MLCommon::Matrix::Data **trans_input, double *singular_vals, double *mu, paramsPCAMG prms, bool verbose); @@ -130,7 +132,7 @@ void transform(cumlHandle &handle, MLCommon::Matrix::RankSizePair **rank_sizes, * @param[in] prms: data structure that includes all the parameters from input size to algorithm * @param[in] verbose */ -void inverse_transform(cumlHandle &handle, +void inverse_transform(raft::handle_t &handle, MLCommon::Matrix::RankSizePair **rank_sizes, size_t n_parts, MLCommon::Matrix::Data **trans_input, @@ -139,7 +141,7 @@ void inverse_transform(cumlHandle &handle, bool verbose); void inverse_transform( - cumlHandle &handle, MLCommon::Matrix::RankSizePair **rank_sizes, + raft::handle_t &handle, MLCommon::Matrix::RankSizePair **rank_sizes, size_t n_parts, MLCommon::Matrix::Data **trans_input, double *components, MLCommon::Matrix::Data **input, double *singular_vals, double *mu, paramsPCAMG prms, bool verbose); diff --git a/cpp/include/cuml/decomposition/sign_flip_mg.hpp b/cpp/include/cuml/decomposition/sign_flip_mg.hpp index 8930d03d79..2563a740cb 100644 --- a/cpp/include/cuml/decomposition/sign_flip_mg.hpp +++ b/cpp/include/cuml/decomposition/sign_flip_mg.hpp @@ -35,12 +35,12 @@ namespace opg { * @param[in] n_stream: number of streams * @{ */ -void 
sign_flip(cumlHandle &handle, +void sign_flip(raft::handle_t &handle, std::vector *> &input_data, MLCommon::Matrix::PartDescriptor &input_desc, float *components, int n_components, cudaStream_t *streams, int n_stream); -void sign_flip(cumlHandle &handle, +void sign_flip(raft::handle_t &handle, std::vector *> &input_data, MLCommon::Matrix::PartDescriptor &input_desc, double *components, int n_components, cudaStream_t *streams, int n_stream); diff --git a/cpp/include/cuml/decomposition/tsvd.hpp b/cpp/include/cuml/decomposition/tsvd.hpp index 8296f0dba0..66e76bac1d 100644 --- a/cpp/include/cuml/decomposition/tsvd.hpp +++ b/cpp/include/cuml/decomposition/tsvd.hpp @@ -21,27 +21,27 @@ namespace ML { -void tsvdFit(cumlHandle &handle, float *input, float *components, +void tsvdFit(raft::handle_t &handle, float *input, float *components, float *singular_vals, const paramsTSVD &prms); -void tsvdFit(cumlHandle &handle, double *input, double *components, +void tsvdFit(raft::handle_t &handle, double *input, double *components, double *singular_vals, const paramsTSVD &prms); -void tsvdInverseTransform(cumlHandle &handle, float *trans_input, +void tsvdInverseTransform(raft::handle_t &handle, float *trans_input, float *components, float *input, const paramsTSVD &prms); -void tsvdInverseTransform(cumlHandle &handle, double *trans_input, +void tsvdInverseTransform(raft::handle_t &handle, double *trans_input, double *components, double *input, const paramsTSVD &prms); -void tsvdTransform(cumlHandle &handle, float *input, float *components, +void tsvdTransform(raft::handle_t &handle, float *input, float *components, float *trans_input, const paramsTSVD &prms); -void tsvdTransform(cumlHandle &handle, double *input, double *components, +void tsvdTransform(raft::handle_t &handle, double *input, double *components, double *trans_input, const paramsTSVD &prms); -void tsvdFitTransform(cumlHandle &handle, float *input, float *trans_input, +void tsvdFitTransform(raft::handle_t &handle, float *input, float *trans_input, float *components, float *explained_var, float *explained_var_ratio, float *singular_vals, const paramsTSVD &prms); -void tsvdFitTransform(cumlHandle &handle, double *input, double *trans_input, - double *components, double *explained_var, - double *explained_var_ratio, double *singular_vals, - const paramsTSVD &prms); +void tsvdFitTransform(raft::handle_t &handle, double *input, + double *trans_input, double *components, + double *explained_var, double *explained_var_ratio, + double *singular_vals, const paramsTSVD &prms); } // namespace ML diff --git a/cpp/include/cuml/decomposition/tsvd_mg.hpp b/cpp/include/cuml/decomposition/tsvd_mg.hpp index 16573dba39..5c1b4d01b6 100644 --- a/cpp/include/cuml/decomposition/tsvd_mg.hpp +++ b/cpp/include/cuml/decomposition/tsvd_mg.hpp @@ -37,12 +37,12 @@ namespace opg { * @param[in] prms: data structure that includes all the parameters from input size to algorithm * @param[in] verbose */ -void fit(cumlHandle &handle, MLCommon::Matrix::RankSizePair **rank_sizes, +void fit(raft::handle_t &handle, MLCommon::Matrix::RankSizePair **rank_sizes, size_t n_parts, MLCommon::Matrix::floatData_t **input, float *components, float *singular_vals, paramsTSVD prms, bool verbose = false); -void fit(cumlHandle &handle, MLCommon::Matrix::RankSizePair **rank_sizes, +void fit(raft::handle_t &handle, MLCommon::Matrix::RankSizePair **rank_sizes, size_t n_parts, MLCommon::Matrix::doubleData_t **input, double *components, double *singular_vals, paramsTSVD prms, bool verbose = false); @@ 
-61,7 +61,7 @@ void fit(cumlHandle &handle, MLCommon::Matrix::RankSizePair **rank_sizes, * @param[in] prms: data structure that includes all the parameters from input size to algorithm * @param[in] verbose */ -void fit_transform(cumlHandle &handle, +void fit_transform(raft::handle_t &handle, std::vector *> &input_data, MLCommon::Matrix::PartDescriptor &input_desc, std::vector *> &trans_data, @@ -70,7 +70,7 @@ void fit_transform(cumlHandle &handle, float *explained_var_ratio, float *singular_vals, paramsTSVD prms, bool verbose); -void fit_transform(cumlHandle &handle, +void fit_transform(raft::handle_t &handle, std::vector *> &input_data, MLCommon::Matrix::PartDescriptor &input_desc, std::vector *> &trans_data, @@ -90,15 +90,17 @@ void fit_transform(cumlHandle &handle, * @param[in] prms: data structure that includes all the parameters from input size to algorithm * @param[in] verbose */ -void transform(cumlHandle &handle, MLCommon::Matrix::RankSizePair **rank_sizes, - size_t n_parts, MLCommon::Matrix::Data **input, - float *components, MLCommon::Matrix::Data **trans_input, - paramsTSVD prms, bool verbose); +void transform(raft::handle_t &handle, + MLCommon::Matrix::RankSizePair **rank_sizes, size_t n_parts, + MLCommon::Matrix::Data **input, float *components, + MLCommon::Matrix::Data **trans_input, paramsTSVD prms, + bool verbose); -void transform(cumlHandle &handle, MLCommon::Matrix::RankSizePair **rank_sizes, - size_t n_parts, MLCommon::Matrix::Data **input, - double *components, MLCommon::Matrix::Data **trans_input, - paramsTSVD prms, bool verbose); +void transform(raft::handle_t &handle, + MLCommon::Matrix::RankSizePair **rank_sizes, size_t n_parts, + MLCommon::Matrix::Data **input, double *components, + MLCommon::Matrix::Data **trans_input, paramsTSVD prms, + bool verbose); /** * @brief performs MNMG inverse transform operation for the output. 
@@ -111,14 +113,14 @@ void transform(cumlHandle &handle, MLCommon::Matrix::RankSizePair **rank_sizes, * @param[in] prms: data structure that includes all the parameters from input size to algorithm * @param[in] verbose */ -void inverse_transform(cumlHandle &handle, +void inverse_transform(raft::handle_t &handle, MLCommon::Matrix::RankSizePair **rank_sizes, size_t n_parts, MLCommon::Matrix::Data **trans_input, float *components, MLCommon::Matrix::Data **input, paramsTSVD prms, bool verbose); -void inverse_transform(cumlHandle &handle, +void inverse_transform(raft::handle_t &handle, MLCommon::Matrix::RankSizePair **rank_sizes, size_t n_parts, MLCommon::Matrix::Data **trans_input,
diff --git a/cpp/include/cuml/distance/distance_type.h b/cpp/include/cuml/distance/distance_type.h new file mode 100644 index 0000000000..da881c35b9 --- /dev/null +++ b/cpp/include/cuml/distance/distance_type.h @@ -0,0 +1,23 @@ +#pragma once + +namespace ML { +namespace Distance { + +/** enum to tell how to compute Euclidean distance */ +enum DistanceType : unsigned short { + /** evaluate as dist_ij = sum(x_ik^2) + sum(y_jk^2) - 2*sum(x_ik * y_jk) */ + EucExpandedL2 = 0, + /** same as above, but inside the epilogue, perform square root operation */ + EucExpandedL2Sqrt = 1, + /** cosine distance */ + EucExpandedCosine = 2, + /** L1 distance */ + EucUnexpandedL1 = 3, + /** evaluate as dist_ij += (x_ik - y_jk)^2 */ + EucUnexpandedL2 = 4, + /** same as above, but inside the epilogue, perform square root operation */ + EucUnexpandedL2Sqrt = 5, +}; + +}; // end namespace Distance +}; // end namespace ML
diff --git a/cpp/include/cuml/ensemble/randomforest.hpp b/cpp/include/cuml/ensemble/randomforest.hpp index 545c5decad..cda6e88125 100644 --- a/cpp/include/cuml/ensemble/randomforest.hpp +++ b/cpp/include/cuml/ensemble/randomforest.hpp @@ -57,9 +57,14 @@ struct RF_params { */ int n_trees; /** - * Control bootstrapping. If set, each tree in the forest is built on a - * bootstrapped sample with replacement. - * If false, sampling without replacement is done. + * Control bootstrapping. + * If bootstrapping is set to true, bootstrapped samples are used for building + * each tree. Bootstrapped sampling is done by randomly drawing + * round(rows_sample * n_samples) samples with replacement. More on + * bootstrapping: + * https://en.wikipedia.org/wiki/Bootstrap_aggregating + * If bootstrapping is set to false, the whole dataset is used to build each + * tree.
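The clarified doc above pins the sampling rule down: with `bootstrap == true`, each tree draws `round(rows_sample * n_samples)` row indices uniformly with replacement; with `bootstrap == false`, every tree trains on the full dataset. That sampling step in isolation, as a host-side sketch (the library itself performs this differently, on the GPU):

```cpp
#include <cmath>
#include <cstdint>
#include <random>
#include <vector>

// Draw per-tree row indices the way the doc above describes:
// round(rows_sample * n_samples) draws, uniform, with replacement.
std::vector<int> bootstrap_rows(int n_samples, float rows_sample,
                                std::uint64_t seed) {
  int n_draws = static_cast<int>(std::lround(rows_sample * n_samples));
  std::mt19937_64 rng(seed);
  std::uniform_int_distribution<int> pick(0, n_samples - 1);

  std::vector<int> rows(n_draws);
  for (int& r : rows) r = pick(rng);  // with replacement: repeats allowed
  return rows;
}
```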
*/ bool bootstrap; /** @@ -129,6 +134,9 @@ void print_rf_summary(const RandomForestMetaData* forest); template void print_rf_detailed(const RandomForestMetaData* forest); +template +std::string dump_rf_as_json(const RandomForestMetaData* forest); + template void build_treelite_forest(ModelHandle* model, const RandomForestMetaData* forest, @@ -143,37 +151,37 @@ void compare_concat_forest_to_subforests( typedef RandomForestMetaData RandomForestClassifierF; typedef RandomForestMetaData RandomForestClassifierD; -void fit(const cumlHandle& user_handle, RandomForestClassifierF*& forest, +void fit(const raft::handle_t& user_handle, RandomForestClassifierF*& forest, float* input, int n_rows, int n_cols, int* labels, int n_unique_labels, RF_params rf_params, int verbosity = CUML_LEVEL_INFO); -void fit(const cumlHandle& user_handle, RandomForestClassifierD*& forest, +void fit(const raft::handle_t& user_handle, RandomForestClassifierD*& forest, double* input, int n_rows, int n_cols, int* labels, int n_unique_labels, RF_params rf_params, int verbosity = CUML_LEVEL_INFO); -void predict(const cumlHandle& user_handle, +void predict(const raft::handle_t& user_handle, const RandomForestClassifierF* forest, const float* input, int n_rows, int n_cols, int* predictions, int verbosity = CUML_LEVEL_INFO); -void predict(const cumlHandle& user_handle, +void predict(const raft::handle_t& user_handle, const RandomForestClassifierD* forest, const double* input, int n_rows, int n_cols, int* predictions, int verbosity = CUML_LEVEL_INFO); -void predictGetAll(const cumlHandle& user_handle, +void predictGetAll(const raft::handle_t& user_handle, const RandomForestClassifierF* forest, const float* input, int n_rows, int n_cols, int* predictions, int verbosity = CUML_LEVEL_INFO); -void predictGetAll(const cumlHandle& user_handle, +void predictGetAll(const raft::handle_t& user_handle, const RandomForestClassifierD* forest, const double* input, int n_rows, int n_cols, int* predictions, int verbosity = CUML_LEVEL_INFO); -RF_metrics score(const cumlHandle& user_handle, +RF_metrics score(const raft::handle_t& user_handle, const RandomForestClassifierF* forest, const int* ref_labels, int n_rows, const int* predictions, int verbosity = CUML_LEVEL_INFO); -RF_metrics score(const cumlHandle& user_handle, +RF_metrics score(const raft::handle_t& user_handle, const RandomForestClassifierD* forest, const int* ref_labels, int n_rows, const int* predictions, int verbosity = CUML_LEVEL_INFO); @@ -183,34 +191,35 @@ RF_params set_rf_class_obj(int max_depth, int max_leaves, float max_features, float min_impurity_decrease, bool bootstrap_features, bool bootstrap, int n_trees, float rows_sample, int seed, CRITERION split_criterion, - bool quantile_per_tree, int cfg_n_streams); + bool quantile_per_tree, int cfg_n_streams, + bool use_experimental_backend, int max_batch_size); // ----------------------------- Regression ----------------------------------- // typedef RandomForestMetaData RandomForestRegressorF; typedef RandomForestMetaData RandomForestRegressorD; -void fit(const cumlHandle& user_handle, RandomForestRegressorF*& forest, +void fit(const raft::handle_t& user_handle, RandomForestRegressorF*& forest, float* input, int n_rows, int n_cols, float* labels, RF_params rf_params, int verbosity = CUML_LEVEL_INFO); -void fit(const cumlHandle& user_handle, RandomForestRegressorD*& forest, +void fit(const raft::handle_t& user_handle, RandomForestRegressorD*& forest, double* input, int n_rows, int n_cols, double* labels, RF_params rf_params, int 
verbosity = CUML_LEVEL_INFO); -void predict(const cumlHandle& user_handle, +void predict(const raft::handle_t& user_handle, const RandomForestRegressorF* forest, const float* input, int n_rows, int n_cols, float* predictions, int verbosity = CUML_LEVEL_INFO); -void predict(const cumlHandle& user_handle, +void predict(const raft::handle_t& user_handle, const RandomForestRegressorD* forest, const double* input, int n_rows, int n_cols, double* predictions, int verbosity = CUML_LEVEL_INFO); -RF_metrics score(const cumlHandle& user_handle, +RF_metrics score(const raft::handle_t& user_handle, const RandomForestRegressorF* forest, const float* ref_labels, int n_rows, const float* predictions, int verbosity = CUML_LEVEL_INFO); -RF_metrics score(const cumlHandle& user_handle, +RF_metrics score(const raft::handle_t& user_handle, const RandomForestRegressorD* forest, const double* ref_labels, int n_rows, const double* predictions, int verbosity = CUML_LEVEL_INFO); diff --git a/cpp/include/cuml/fil/fil.h b/cpp/include/cuml/fil/fil.h index 7642375255..b2f8b924be 100644 --- a/cpp/include/cuml/fil/fil.h +++ b/cpp/include/cuml/fil/fil.h @@ -79,8 +79,14 @@ enum storage_type_t { AUTO, /** import the forest as dense */ DENSE, - /** import the forest as sparse */ - SPARSE + /** import the forest as sparse (currently always with 16-byte nodes) */ + SPARSE, + /** (experimental) import the forest as sparse with 8-byte nodes; can fail if + 8-byte nodes are not enough to store the forest, e.g. there are too many + nodes in a tree or too many features; note that the number of bits used to + store the child or feature index can change in the future; this can affect + whether a particular forest can be imported as SPARSE8 */ + SPARSE8, }; /** val_t is the payload within a FIL leaf */ @@ -98,57 +104,93 @@ struct dense_node_t { int bits; }; -/** sparse_node_extra_data is what's missing from a dense node to store +/** sparse_node16_extra_data is what's missing from a dense node to store a sparse node, that is, extra indexing information due to compressing a sparse tree. */ -struct sparse_node_extra_data { +struct sparse_node16_extra_data { int left_idx; int dummy; // make alignment explicit and reserve for future use }; -/** sparse_node_t is a node in a sparsely-stored forest */ -struct sparse_node_t : dense_node_t, sparse_node_extra_data { - sparse_node_t() = default; - sparse_node_t(dense_node_t dn, sparse_node_extra_data ed) - : dense_node_t(dn), sparse_node_extra_data(ed) {} +/** sparse_node16_t is a 16-byte node in a sparsely-stored forest */ +struct sparse_node16_t : dense_node_t, sparse_node16_extra_data { + sparse_node16_t() = default; + sparse_node16_t(dense_node_t dn, sparse_node16_extra_data ed) + : dense_node_t(dn), sparse_node16_extra_data(ed) {} }; -/** leaf_value_t describes what the leaves in a FIL forest store (predict) */ -enum leaf_value_t { - /** storing a class probability or regression summand */ - FLOAT_SCALAR = 0, - /** storing a class label */ - INT_CLASS_LABEL = 1 +/** sparse_node8_t is a node of reduced size (8 bytes) + in a sparsely-stored forest */ +struct sparse_node8_t : dense_node_t { + sparse_node8_t() = default; + sparse_node8_t(dense_node_t dn) : dense_node_t(dn) {} +}; + +/** leaf_algo_t describes what the leaves in a FIL forest store (predict) + and how FIL aggregates them into class margins/regression result/best class +**/ +enum leaf_algo_t { + /** storing a class probability or regression summand. 
We add all margins + together and determine the regression result or use a threshold to determine + one of the two classes. **/ + FLOAT_UNARY_BINARY = 0, + /** storing a class label. Trees vote on the resulting class. + Probabilities are just normalized votes. */ + CATEGORICAL_LEAF = 1, + /** 1-vs-rest, or tree-per-class, where trees are assigned round-robin to + consecutive categories and predict a floating-point margin. Used in + Gradient Boosted Decision Trees. We sum margins for each group separately. + **/ + GROVE_PER_CLASS = 2, + /** 1-vs-rest, or tree-per-class, where trees are assigned round-robin to + consecutive categories and predict a floating-point margin. Used in + Gradient Boosted Decision Trees. We sum margins for each group separately. + This is a more specific version of GROVE_PER_CLASS. + _FEW_CLASSES means fewer (or as many) classes than threads. **/ + GROVE_PER_CLASS_FEW_CLASSES = 3, + /** 1-vs-rest, or tree-per-class, where trees are assigned round-robin to + consecutive categories and predict a floating-point margin. Used in + Gradient Boosted Decision Trees. We sum margins for each group separately. + This is a more specific version of GROVE_PER_CLASS. + _MANY_CLASSES means more classes than threads. **/ + GROVE_PER_CLASS_MANY_CLASSES = 4, // to be extended }; -template +template struct leaf_output_t {}; template <> -struct leaf_output_t { +struct leaf_output_t { typedef float T; }; template <> -struct leaf_output_t { +struct leaf_output_t { typedef int T; }; +template <> +struct leaf_output_t { + typedef float T; +}; +template <> +struct leaf_output_t { + typedef float T; +}; -/** dense_node_init initializes node from paramters */ -void dense_node_init(dense_node_t* n, val_t output, float thresh, int fid, - bool def_left, bool is_leaf); - -/** dense_node_decode extracts individual members from node */ -void dense_node_decode(const dense_node_t* node, val_t* output, float* thresh, - int* fid, bool* def_left, bool* is_leaf); - -/** sparse_node_init initializes node from parameters */ -void sparse_node_init(sparse_node_t* node, val_t output, float thresh, int fid, - bool def_left, bool is_leaf, int left_index); +/** node_init initializes node from parameters */ +void node_init(dense_node_t* n, val_t output, float thresh, int fid, + bool def_left, bool is_leaf); +void node_init(sparse_node16_t* node, val_t output, float thresh, int fid, + bool def_left, bool is_leaf, int left_index); +void node_init(sparse_node8_t* node, val_t output, float thresh, int fid, + bool def_left, bool is_leaf, int left_index); -/** sparse_node_decode extracts individual members from node */ -void sparse_node_decode(const sparse_node_t* node, val_t* output, float* thresh, - int* fid, bool* def_left, bool* is_leaf, - int* left_index); +/** node_decode extracts individual members from node */ +void node_decode(const dense_node_t* node, val_t* output, float* thresh, + int* fid, bool* def_left, bool* is_leaf); +void node_decode(const sparse_node16_t* node, val_t* output, float* thresh, + int* fid, bool* def_left, bool* is_leaf, int* left_index); +void node_decode(const sparse_node8_t* node, val_t* output, float* thresh, + int* fid, bool* def_left, bool* is_leaf, int* left_index); struct forest; @@ -165,20 +207,20 @@ struct forest_params_t { int num_trees; // num_cols is the number of columns in the data int num_cols; - // leaf_payload_type determines what the leaves store (predict) - leaf_value_t leaf_payload_type; + // leaf_algo determines what the leaves store (predict) + leaf_algo_t leaf_algo; // algo
is the inference algorithm; // sparse forests do not distinguish between NAIVE and TREE_REORG algo_t algo; // output is the desired output type output_t output; - // threshold is used to for classification if leaf_payload_type == FLOAT_SCALAR && (output & OUTPUT_CLASS) != 0 && !predict_proba, + // threshold is used for classification if leaf_algo == FLOAT_UNARY_BINARY && (output & OUTPUT_CLASS) != 0 && !predict_proba, // and is ignored otherwise float threshold; // global_bias is added to the sum of tree predictions // (after averaging, if it is used, but before any further transformations) float global_bias; - // only used for INT_CLASS_LABEL inference. since we're storing the + // only used for CATEGORICAL_LEAF inference. since we're storing the // labels in leaves instead of the whole vector, this keeps track // of the number of classes int num_classes; @@ -207,18 +249,30 @@ struct treelite_params_t { (2**(params->depth + 1) - 1) * params->ntrees * @param params pointer to parameters used to initialize the forest */ -void init_dense(const cumlHandle& h, forest_t* pf, const dense_node_t* nodes, - const forest_params_t* params); +void init_dense(const raft::handle_t& h, forest_t* pf, + const dense_node_t* nodes, const forest_params_t* params); + +/** init_sparse uses params, trees and nodes to initialize the sparse forest + * with 16-byte nodes stored in pf + * @param h cuML handle used by this function + * @param pf pointer to where to store the newly created forest + * @param trees indices of tree roots in the nodes array, of length params->ntrees + * @param nodes nodes for the forest, of length params->num_nodes + * @param params pointer to parameters used to initialize the forest + */ +void init_sparse(const raft::handle_t& h, forest_t* pf, const int* trees, + const sparse_node16_t* nodes, const forest_params_t* params); -/** init_sparse uses params, trees and nodes to initialize the sparse forest stored in pf +/** init_sparse uses params, trees and nodes to initialize the sparse forest + * with 8-byte nodes stored in pf * @param h cuML handle used by this function * @param pf pointer to where to store the newly created forest * @param trees indices of tree roots in the nodes array, of length params->ntrees * @param nodes nodes for the forest, of length params->num_nodes * @param params pointer to parameters used to initialize the forest */ -void init_sparse(const cumlHandle& h, forest_t* pf, const int* trees, - const sparse_node_t* nodes, const forest_params_t* params); +void init_sparse(const raft::handle_t& h, forest_t* pf, const int* trees, + const sparse_node8_t* nodes, const forest_params_t* params); /** from_treelite uses a treelite model to initialize the forest * @param handle cuML handle used by this function @@ -226,14 +280,14 @@ void init_sparse(const cumlHandle& h, forest_t* pf, const int* trees, * @param model treelite model used to initialize the forest * @param tl_params additional parameters for the forest */ -void from_treelite(const cumlHandle& handle, forest_t* pforest, +void from_treelite(const raft::handle_t& handle, forest_t* pforest, ModelHandle model, const treelite_params_t* tl_params); /** free deletes forest and all resources held by it; after this, forest is no longer usable * @param h cuML handle used by this function * @param f the forest to free; not usable after the call to this function */ -void free(const cumlHandle& h, forest_t f); +void free(const raft::handle_t& h, forest_t f); /** predict predicts on data (with n rows) using forest and
writes results into preds; * the number of columns is stored in forest, and both preds and data point to GPU memory @@ -247,8 +301,8 @@ void free(const cumlHandle& h, forest_t f); * @param predict_proba for classifier models, this forces to output both class probabilities * instead of binary class prediction. format matches scikit-learn API */ -void predict(const cumlHandle& h, forest_t f, float* preds, const float* data, - size_t num_rows, bool predict_proba = false); +void predict(const raft::handle_t& h, forest_t f, float* preds, + const float* data, size_t num_rows, bool predict_proba = false); } // namespace fil } // namespace ML diff --git a/cpp/include/cuml/linear_model/glm.hpp b/cpp/include/cuml/linear_model/glm.hpp index 90f08e40f8..6eff9f4e28 100644 --- a/cpp/include/cuml/linear_model/glm.hpp +++ b/cpp/include/cuml/linear_model/glm.hpp @@ -33,10 +33,10 @@ namespace GLM { * @param algo specifies which solver to use (0: SVD, 1: Eigendecomposition, 2: QR-decomposition) * @{ */ -void olsFit(const cumlHandle &handle, float *input, int n_rows, int n_cols, +void olsFit(const raft::handle_t &handle, float *input, int n_rows, int n_cols, float *labels, float *coef, float *intercept, bool fit_intercept, bool normalize, int algo = 0); -void olsFit(const cumlHandle &handle, double *input, int n_rows, int n_cols, +void olsFit(const raft::handle_t &handle, double *input, int n_rows, int n_cols, double *labels, double *coef, double *intercept, bool fit_intercept, bool normalize, int algo = 0); /** @} */ @@ -56,14 +56,14 @@ void olsFit(const cumlHandle &handle, double *input, int n_rows, int n_cols, * @param algo specifies which solver to use (0: SVD, 1: Eigendecomposition) * @{ */ -void ridgeFit(const cumlHandle &handle, float *input, int n_rows, int n_cols, - float *labels, float *alpha, int n_alpha, float *coef, +void ridgeFit(const raft::handle_t &handle, float *input, int n_rows, + int n_cols, float *labels, float *alpha, int n_alpha, float *coef, float *intercept, bool fit_intercept, bool normalize, int algo = 0); -void ridgeFit(const cumlHandle &handle, double *input, int n_rows, int n_cols, - double *labels, double *alpha, int n_alpha, double *coef, - double *intercept, bool fit_intercept, bool normalize, - int algo = 0); +void ridgeFit(const raft::handle_t &handle, double *input, int n_rows, + int n_cols, double *labels, double *alpha, int n_alpha, + double *coef, double *intercept, bool fit_intercept, + bool normalize, int algo = 0); /** @} */ /** @@ -76,21 +76,21 @@ void ridgeFit(const cumlHandle &handle, double *input, int n_rows, int n_cols, * @param preds device pointer to store predictions of size n_rows * @{ */ -void olsPredict(const cumlHandle &handle, const float *input, int n_rows, +void olsPredict(const raft::handle_t &handle, const float *input, int n_rows, int n_cols, const float *coef, float intercept, float *preds); -void olsPredict(const cumlHandle &handle, const double *input, int n_rows, +void olsPredict(const raft::handle_t &handle, const double *input, int n_rows, int n_cols, const double *coef, double intercept, double *preds); -void ridgePredict(const cumlHandle &handle, const float *input, int n_rows, +void ridgePredict(const raft::handle_t &handle, const float *input, int n_rows, int n_cols, const float *coef, float intercept, float *preds); -void ridgePredict(const cumlHandle &handle, const double *input, int n_rows, +void ridgePredict(const raft::handle_t &handle, const double *input, int n_rows, int n_cols, const double *coef, double intercept, double *preds); 
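As a usage sketch for the declarations above (assuming d_X, d_y, d_coef, and d_preds are device buffers the caller has already allocated and filled; the raft handle header path is an assumption):

```cpp
#include <cuml/linear_model/glm.hpp>
#include <raft/handle.hpp>  // assumed location of raft::handle_t

// Illustrative only: fit OLS on device data, then predict with the
// learned coefficients. Error checking is omitted.
void ols_roundtrip(float* d_X, float* d_y, float* d_coef, float* d_preds,
                   int n_rows, int n_cols) {
  raft::handle_t handle;
  float intercept = 0.0f;
  ML::GLM::olsFit(handle, d_X, n_rows, n_cols, d_y, d_coef, &intercept,
                  /*fit_intercept=*/true, /*normalize=*/false, /*algo=*/0);
  ML::GLM::olsPredict(handle, d_X, n_rows, n_cols, d_coef, intercept,
                      d_preds);
}
```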
/** @} */ /** * @defgroup qnFit to fit a GLM using quasi newton methods. - * @param cuml_handle reference to cumlHandle object + * @param cuml_handle reference to raft::handle_t object * @param X device pointer to feature matrix of dimension * NxD (row- or column major: see X_col_major param) * @param y device pointer to label vector of length N (for @@ -125,13 +125,13 @@ void ridgePredict(const cumlHandle &handle, const double *input, int n_rows, * normal/squared, 2: multinomial/softmax) * @{ */ -void qnFit(const cumlHandle &cuml_handle, float *X, float *y, int N, int D, +void qnFit(const raft::handle_t &cuml_handle, float *X, float *y, int N, int D, int C, bool fit_intercept, float l1, float l2, int max_iter, float grad_tol, int linesearch_max_iter, int lbfgs_memory, int verbosity, float *w0, float *f, int *num_iters, bool X_col_major, int loss_type); -void qnFit(const cumlHandle &cuml_handle, double *X, double *y, int N, int D, - int C, bool fit_intercept, double l1, double l2, int max_iter, +void qnFit(const raft::handle_t &cuml_handle, double *X, double *y, int N, + int D, int C, bool fit_intercept, double l1, double l2, int max_iter, double grad_tol, int linesearch_max_iter, int lbfgs_memory, int verbosity, double *w0, double *f, int *num_iters, bool X_col_major, int loss_type); @@ -139,7 +139,7 @@ void qnFit(const cumlHandle &cuml_handle, double *X, double *y, int N, int D, /** * @defgroup qnDecisionFunction to obtain the confidence scores of samples - * @param cuml_handle reference to cumlHandle object + * @param cuml_handle reference to raft::handle_t object * @param X device pointer to feature matrix of dimension NxD (row- or column major: see X_col_major param) * @param N number of examples * @param D number of features @@ -151,17 +151,17 @@ void qnFit(const cumlHandle &cuml_handle, double *X, double *y, int N, int D, * @param scores device pointer to confidence scores of length N (for binary logistic: [0,1], for multinomial: [0,...,C-1]) * @{ */ -void qnDecisionFunction(const cumlHandle &cuml_handle, float *X, int N, int D, - int C, bool fit_intercept, float *params, +void qnDecisionFunction(const raft::handle_t &cuml_handle, float *X, int N, + int D, int C, bool fit_intercept, float *params, bool X_col_major, int loss_type, float *scores); -void qnDecisionFunction(const cumlHandle &cuml_handle, double *X, int N, int D, - int C, bool fit_intercept, double *params, +void qnDecisionFunction(const raft::handle_t &cuml_handle, double *X, int N, + int D, int C, bool fit_intercept, double *params, bool X_col_major, int loss_type, double *scores); /** @} */ /** * @defgroup qnPredict to predict with a GLM fit using quasi newton methods.
- * @param cuml_handle reference to cumlHandle object + * @param cuml_handle reference to raft::handle_t object * @param X device pointer to feature matrix of dimension NxD (row- or column major: see X_col_major param) * @param N number of examples * @param D number of features @@ -173,11 +173,11 @@ void qnDecisionFunction(const cumlHandle &cuml_handle, double *X, int N, int D, * @param preds device pointer to predictions of length N (for binary logistic: [0,1], for multinomial: [0,...,C-1]) * @{ */ -void qnPredict(const cumlHandle &cuml_handle, float *X, int N, int D, int C, +void qnPredict(const raft::handle_t &cuml_handle, float *X, int N, int D, int C, bool fit_intercept, float *params, bool X_col_major, int loss_type, float *preds); -void qnPredict(const cumlHandle &cuml_handle, double *X, int N, int D, int C, - bool fit_intercept, double *params, bool X_col_major, +void qnPredict(const raft::handle_t &cuml_handle, double *X, int N, int D, + int C, bool fit_intercept, double *params, bool X_col_major, int loss_type, double *preds); /** @} */ diff --git a/cpp/include/cuml/linear_model/ols_mg.hpp b/cpp/include/cuml/linear_model/ols_mg.hpp index 5308acdca7..37dea89df8 100644 --- a/cpp/include/cuml/linear_model/ols_mg.hpp +++ b/cpp/include/cuml/linear_model/ols_mg.hpp @@ -39,14 +39,14 @@ namespace opg { * @param[in] algo: which algorithm is used for OLS. 0 is for SVD, 1 is for eig. * @param[in] verbose */ -void fit(cumlHandle &handle, +void fit(raft::handle_t &handle, std::vector *> &input_data, MLCommon::Matrix::PartDescriptor &input_desc, std::vector *> &labels, float *coef, float *intercept, bool fit_intercept, bool normalize, int algo, bool verbose); -void fit(cumlHandle &handle, +void fit(raft::handle_t &handle, std::vector *> &input_data, MLCommon::Matrix::PartDescriptor &input_desc, std::vector *> &labels, double *coef, @@ -66,14 +66,16 @@ void fit(cumlHandle &handle, * @param[out] preds: predictions * @param[in] verbose */ -void predict(cumlHandle &handle, MLCommon::Matrix::RankSizePair **rank_sizes, - size_t n_parts, MLCommon::Matrix::Data **input, - size_t n_rows, size_t n_cols, float *coef, float intercept, +void predict(raft::handle_t &handle, + MLCommon::Matrix::RankSizePair **rank_sizes, size_t n_parts, + MLCommon::Matrix::Data **input, size_t n_rows, + size_t n_cols, float *coef, float intercept, MLCommon::Matrix::Data **preds, bool verbose); -void predict(cumlHandle &handle, MLCommon::Matrix::RankSizePair **rank_sizes, - size_t n_parts, MLCommon::Matrix::Data **input, - size_t n_rows, size_t n_cols, double *coef, double intercept, +void predict(raft::handle_t &handle, + MLCommon::Matrix::RankSizePair **rank_sizes, size_t n_parts, + MLCommon::Matrix::Data **input, size_t n_rows, + size_t n_cols, double *coef, double intercept, MLCommon::Matrix::Data **preds, bool verbose); }; // end namespace opg diff --git a/cpp/include/cuml/linear_model/preprocess_mg.hpp b/cpp/include/cuml/linear_model/preprocess_mg.hpp index 46e79b5b48..a204648b14 100644 --- a/cpp/include/cuml/linear_model/preprocess_mg.hpp +++ b/cpp/include/cuml/linear_model/preprocess_mg.hpp @@ -17,16 +17,16 @@ #pragma once #include -#include #include #include #include +#include namespace ML { namespace GLM { namespace opg { -void preProcessData(cumlHandle &handle, +void preProcessData(raft::handle_t &handle, std::vector *> &input_data, MLCommon::Matrix::PartDescriptor &input_desc, std::vector *> &labels, @@ -34,7 +34,7 @@ void preProcessData(cumlHandle &handle, bool fit_intercept, bool normalize, cudaStream_t 
*streams, int n_streams, bool verbose); -void preProcessData(cumlHandle &handle, +void preProcessData(raft::handle_t &handle, std::vector *> &input_data, MLCommon::Matrix::PartDescriptor &input_desc, std::vector *> &labels, @@ -42,7 +42,7 @@ void preProcessData(cumlHandle &handle, bool fit_intercept, bool normalize, cudaStream_t *streams, int n_streams, bool verbose); -void postProcessData(cumlHandle &handle, +void postProcessData(raft::handle_t &handle, std::vector *> &input_data, MLCommon::Matrix::PartDescriptor &input_desc, std::vector *> &labels, @@ -51,7 +51,7 @@ void postProcessData(cumlHandle &handle, bool normalize, cudaStream_t *streams, int n_streams, bool verbose); -void postProcessData(cumlHandle &handle, +void postProcessData(raft::handle_t &handle, std::vector *> &input_data, MLCommon::Matrix::PartDescriptor &input_desc, std::vector *> &labels, diff --git a/cpp/include/cuml/linear_model/ridge_mg.hpp b/cpp/include/cuml/linear_model/ridge_mg.hpp index bc8ca50e09..b5cb23a47e 100644 --- a/cpp/include/cuml/linear_model/ridge_mg.hpp +++ b/cpp/include/cuml/linear_model/ridge_mg.hpp @@ -41,14 +41,14 @@ namespace opg { * @param[in] algo: the algorithm to use for fitting * @param[in] verbose */ -void fit(cumlHandle &handle, +void fit(raft::handle_t &handle, std::vector *> &input_data, MLCommon::Matrix::PartDescriptor &input_desc, std::vector *> &labels, float *alpha, int n_alpha, float *coef, float *intercept, bool fit_intercept, bool normalize, int algo, bool verbose); -void fit(cumlHandle &handle, +void fit(raft::handle_t &handle, std::vector *> &input_data, MLCommon::Matrix::PartDescriptor &input_desc, std::vector *> &labels, double *alpha, @@ -68,14 +68,16 @@ void fit(cumlHandle &handle, * @param[out] preds: predictions * @param[in] verbose */ -void predict(cumlHandle &handle, MLCommon::Matrix::RankSizePair **rank_sizes, - size_t n_parts, MLCommon::Matrix::Data **input, - size_t n_rows, size_t n_cols, float *coef, float intercept, +void predict(raft::handle_t &handle, + MLCommon::Matrix::RankSizePair **rank_sizes, size_t n_parts, + MLCommon::Matrix::Data **input, size_t n_rows, + size_t n_cols, float *coef, float intercept, MLCommon::Matrix::Data **preds, bool verbose); -void predict(cumlHandle &handle, MLCommon::Matrix::RankSizePair **rank_sizes, - size_t n_parts, MLCommon::Matrix::Data **input, - size_t n_rows, size_t n_cols, double *coef, double intercept, +void predict(raft::handle_t &handle, + MLCommon::Matrix::RankSizePair **rank_sizes, size_t n_parts, + MLCommon::Matrix::Data **input, size_t n_rows, + size_t n_cols, double *coef, double intercept, MLCommon::Matrix::Data **preds, bool verbose); }; // end namespace opg diff --git a/cpp/include/cuml/manifold/tsne.h b/cpp/include/cuml/manifold/tsne.h index 14ba176b07..e94d1dd4d7 100644 --- a/cpp/include/cuml/manifold/tsne.h +++ b/cpp/include/cuml/manifold/tsne.h @@ -59,7 +59,7 @@ namespace ML { * or >= 0 for reproducible outputs. * @param[in] verbosity verbosity level for logging messages during * execution - * @param[in] intialize_embeddings Whether to overwrite the current Y vector + * @param[in] initialize_embeddings Whether to overwrite the current Y vector * with random noise. * @param[in] barnes_hut Whether to use the fast Barnes Hut or use the * slower exact version. @@ -71,10 +71,11 @@ namespace ML { * approach is available in their article t-SNE-CUDA: GPU-Accelerated t-SNE and * its Applications to Modern Data (https://arxiv.org/abs/1807.11824). 
*/ -void TSNE_fit(const cumlHandle &handle, const float *X, float *Y, const int n, - const int p, const int dim = 2, int n_neighbors = 1023, - const float theta = 0.5f, const float epssq = 0.0025, - float perplexity = 50.0f, const int perplexity_max_iter = 100, +void TSNE_fit(const raft::handle_t &handle, const float *X, float *Y, + const int n, const int p, const int dim = 2, + int n_neighbors = 1023, const float theta = 0.5f, + const float epssq = 0.0025, float perplexity = 50.0f, + const int perplexity_max_iter = 100, const float perplexity_tol = 1e-5, const float early_exaggeration = 12.0f, const int exaggeration_iter = 250, const float min_gain = 0.01f, @@ -84,6 +85,6 @@ void TSNE_fit(const cumlHandle &handle, const float *X, float *Y, const int n, const float pre_momentum = 0.5, const float post_momentum = 0.8, const long long random_state = -1, int verbosity = CUML_LEVEL_INFO, - const bool intialize_embeddings = true, bool barnes_hut = true); + const bool initialize_embeddings = true, bool barnes_hut = true); } // namespace ML diff --git a/cpp/include/cuml/manifold/umap.hpp b/cpp/include/cuml/manifold/umap.hpp index af6f9d9966..d90464c9f9 100644 --- a/cpp/include/cuml/manifold/umap.hpp +++ b/cpp/include/cuml/manifold/umap.hpp @@ -22,20 +22,20 @@ namespace ML { -void transform(const cumlHandle &handle, float *X, int n, int d, +void transform(const raft::handle_t &handle, float *X, int n, int d, int64_t *knn_indices, float *knn_dists, float *orig_X, int orig_n, float *embedding, int embedding_n, UMAPParams *params, float *transformed); -void find_ab(const cumlHandle &handle, UMAPParams *params); +void find_ab(const raft::handle_t &handle, UMAPParams *params); -void fit(const cumlHandle &handle, +void fit(const raft::handle_t &handle, float *X, // input matrix float *y, // labels int n, int d, int64_t *knn_indices, float *knn_dists, UMAPParams *params, float *embeddings); -void fit(const cumlHandle &handle, +void fit(const raft::handle_t &handle, float *X, // input matrix int n, // rows int d, // cols @@ -45,11 +45,11 @@ void fit(const cumlHandle &handle, class UMAP_API { float *orig_X; int orig_n; - cumlHandle *handle; + raft::handle_t *handle; UMAPParams *params; public: - UMAP_API(const cumlHandle &handle, UMAPParams *params); + UMAP_API(const raft::handle_t &handle, UMAPParams *params); ~UMAP_API(); /** diff --git a/cpp/include/cuml/metrics/metrics.hpp b/cpp/include/cuml/metrics/metrics.hpp index e54e72f5bf..4b7fdd1070 100644 --- a/cpp/include/cuml/metrics/metrics.hpp +++ b/cpp/include/cuml/metrics/metrics.hpp @@ -16,6 +16,7 @@ #pragma once +#include #include namespace ML { @@ -32,13 +33,13 @@ namespace Metrics { * in a linear regression model. The larger the R-squared value, the * more variability is explained by the linear regression model. * -* @param handle: cumlHandle +* @param handle: raft::handle_t * @param y: Array of ground-truth response variables * @param y_hat: Array of predicted response variables * @param n: Number of elements in y and y_hat * @return: The R-squared value. */ -float r2_score_py(const cumlHandle &handle, float *y, float *y_hat, int n); +float r2_score_py(const raft::handle_t &handle, float *y, float *y_hat, int n); /** * Calculates the "Coefficient of Determination" (R-Squared) score @@ -50,27 +51,28 @@ float r2_score_py(const cumlHandle &handle, float *y, float *y_hat, int n); * in a linear regression model. The larger the R-squared value, the * more variability is explained by the linear regression model. 
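For reference, the quantity both r2_score_py overloads report can be written down in a few lines of host code (illustrative only; the actual computation runs on device):

```cpp
#include <cstddef>
#include <vector>

// R^2 = 1 - SS_res / SS_tot, as described in the docs above.
double r2_reference(const std::vector<double>& y,
                    const std::vector<double>& y_hat) {
  double mean = 0.0;
  for (double v : y) mean += v;
  mean /= static_cast<double>(y.size());
  double ss_res = 0.0, ss_tot = 0.0;
  for (std::size_t i = 0; i < y.size(); ++i) {
    ss_res += (y[i] - y_hat[i]) * (y[i] - y_hat[i]);  // residual sum of squares
    ss_tot += (y[i] - mean) * (y[i] - mean);          // total sum of squares
  }
  return 1.0 - ss_res / ss_tot;
}
```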
* -* @param handle: cumlHandle +* @param handle: raft::handle_t * @param y: Array of ground-truth response variables * @param y_hat: Array of predicted response variables * @param n: Number of elements in y and y_hat * @return: The R-squared value. */ -double r2_score_py(const cumlHandle &handle, double *y, double *y_hat, int n); +double r2_score_py(const raft::handle_t &handle, double *y, double *y_hat, + int n); /** * Calculates the "rand index" * * This metric is a measure of similarity between two data clusterings. * -* @param handle: cumlHandle +* @param handle: raft::handle_t * @param y: Array of response variables of the first clustering classifications * @param y_hat: Array of response variables of the second clustering classifications * @param n: Number of elements in y and y_hat * @return: The rand index value */ -double randIndex(const cumlHandle &handle, double *y, double *y_hat, int n); +double randIndex(const raft::handle_t &handle, double *y, double *y_hat, int n); /** * Calculates the "Silhouette Score" @@ -81,7 +83,7 @@ double randIndex(const cumlHandle &handle, double *y, double *y_hat, int n); * and the nearest cluster that the sample is not a part of. Note that Silhouette Coefficient * is only defined if number of labels is 2 <= n_labels <= n_samples - 1. * -* @param handle: cumlHandle +* @param handle: raft::handle_t * @param y: Array of data samples with dimensions (nRows x nCols) * @param nRows: number of data samples * @param nCols: number of features @@ -90,7 +92,7 @@ double silhouetteScore(const cumlHandle &handle, double *y, int nRows, * @param metric: the numerical value that maps to the type of distance metric to be used in the calculations * @param silScores: Array that is optionally taken in as input if required to be populated with the silhouette score for every sample (1 x nRows), else nullptr is passed */ -double silhouetteScore(const cumlHandle &handle, double *y, int nRows, +double silhouetteScore(const raft::handle_t &handle, double *y, int nRows, int nCols, int *labels, int nLabels, double *silScores, int metric); /** @@ -98,16 +100,16 @@ double silhouetteScore(const cumlHandle &handle, double *y, int nRows, * * This metric is the corrected-for-chance version of the rand index * -* @param handle: cumlHandle +* @param handle: raft::handle_t * @param y: Array of response variables of the first clustering classifications * @param y_hat: Array of response variables of the second clustering classifications * @param n: Number of elements in y and y_hat * @return: The adjusted rand index value * @{ */ -double adjustedRandIndex(const cumlHandle &handle, const int64_t *y, +double adjustedRandIndex(const raft::handle_t &handle, const int64_t *y, const int64_t *y_hat, const int64_t n); -double adjustedRandIndex(const cumlHandle &handle, const int *y, +double adjustedRandIndex(const raft::handle_t &handle, const int *y, const int *y_hat, const int n); /** @} */ @@ -118,13 +120,13 @@ double adjustedRandIndex(const cumlHandle &handle, const int *y, * approximates the probability distribution P * It is often also used as a 'distance metric' between two probability distributions (not symmetric) * -* @param handle: cumlHandle +* @param handle: raft::handle_t * @param y: Array of probabilities corresponding to distribution P * @param y_hat: Array of probabilities corresponding to distribution Q * @param n: Number of elements in y and y_hat * @return: The KL Divergence value */ -double klDivergence(const cumlHandle &handle, const double *y, +double klDivergence(const
raft::handle_t &handle, const double *y, const double *y_hat, int n); /** @@ -134,28 +136,28 @@ double klDivergence(const cumlHandle &handle, const double *y, * approximates the probability distribution P * It is often also used as a 'distance metric' between two probability distributions (not symmetric) * -* @param handle: cumlHandle +* @param handle: raft::handle_t * @param y: Array of probabilities corresponding to distribution P * @param y_hat: Array of probabilities corresponding to distribution Q * @param n: Number of elements in y and y_hat * @return: The KL Divergence value */ -float klDivergence(const cumlHandle &handle, const float *y, const float *y_hat, - int n); +float klDivergence(const raft::handle_t &handle, const float *y, + const float *y_hat, int n); /** * Calculates the "entropy" of a labelling * * This metric is a measure of the purity/polarity of the clustering * -* @param handle: cumlHandle +* @param handle: raft::handle_t * @param y: Array of response variables of the clustering * @param n: Number of elements in y * @param lower_class_range: the lowest value in the range of classes * @param upper_class_range: the highest value in the range of classes * @return: The entropy value of the clustering */ -double entropy(const cumlHandle &handle, const int *y, const int n, +double entropy(const raft::handle_t &handle, const int *y, const int n, const int lower_class_range, const int upper_class_range); /** @@ -164,7 +166,7 @@ double entropy(const cumlHandle &handle, const int *y, const int n, * Mutual Information is a measure of the similarity between two labels of * the same data. * -* @param handle: cumlHandle +* @param handle: raft::handle_t * @param y: Array of response variables of the first clustering classifications * @param y_hat: Array of response variables of the second clustering classifications * @param n: Number of elements in y and y_hat @@ -172,8 +174,9 @@ double entropy(const cumlHandle &handle, const int *y, const int n, * @param upper_class_range: the highest value in the range of classes * @return: The mutual information score */ -double mutualInfoScore(const cumlHandle &handle, const int *y, const int *y_hat, - const int n, const int lower_class_range, +double mutualInfoScore(const raft::handle_t &handle, const int *y, + const int *y_hat, const int n, + const int lower_class_range, const int upper_class_range); /** @@ -182,7 +185,7 @@ double mutualInfoScore(const cumlHandle &handle, const int *y, const int *y_hat, * A clustering result satisfies homogeneity if all of its clusters * contain only data points which are members of a single class. * -* @param handle: cumlHandle +* @param handle: raft::handle_t * @param y: truth labels * @param y_hat: predicted labels * @param n: Number of elements in y and y_hat @@ -190,7 +193,7 @@ double mutualInfoScore(const cumlHandle &handle, const int *y, const int *y_hat, * @param upper_class_range: the highest value in the range of classes * @return: The homogeneity score */ -double homogeneityScore(const cumlHandle &handle, const int *y, +double homogeneityScore(const raft::handle_t &handle, const int *y, const int *y_hat, const int n, const int lower_class_range, const int upper_class_range); @@ -201,7 +204,7 @@ double homogeneityScore(const cumlHandle &handle, const int *y, * A clustering result satisfies completeness if all the data points * that are members of a given class are elements of the same cluster.
* -* @param handle: cumlHandle +* @param handle: raft::handle_t * @param y: truth labels * @param y_hat: predicted labels * @param n: Number of elements in y and y_hat @@ -209,7 +212,7 @@ double homogeneityScore(const cumlHandle &handle, const int *y, * @param upper_class_range: the highest value in the range of classes * @return: The completeness score */ -double completenessScore(const cumlHandle &handle, const int *y, +double completenessScore(const raft::handle_t &handle, const int *y, const int *y_hat, const int n, const int lower_class_range, const int upper_class_range); @@ -220,7 +223,7 @@ double completenessScore(const cumlHandle &handle, const int *y, * v-measure is the harmonic mean between the homogeneity * and completeness scores of 2 cluster classifications * -* @param handle: cumlHandle +* @param handle: raft::handle_t * @param y: truth labels * @param y_hat: predicted labels * @param n: Number of elements in y and y_hat @@ -228,7 +231,7 @@ double completenessScore(const cumlHandle &handle, const int *y, * @param upper_class_range: the highest value in the range of classes * @return: The v-measure */ -double vMeasure(const cumlHandle &handle, const int *y, const int *y_hat, +double vMeasure(const raft::handle_t &handle, const int *y, const int *y_hat, const int n, const int lower_class_range, const int upper_class_range); @@ -238,13 +241,55 @@ double vMeasure(const cumlHandle &handle, const int *y, const int *y_hat, * The accuracy metric is used to calculate the accuracy of the predicted labels * -* @param handle: cumlHandle +* @param handle: raft::handle_t * @param predictions: predicted labels * @param ref_predictions: truth labels * @param n: Number of elements in y and y_hat * @return: The accuracy */ -float accuracy_score_py(const cumlHandle &handle, const int *predictions, +float accuracy_score_py(const raft::handle_t &handle, const int *predictions, const int *ref_predictions, int n); + +/** + * @brief Calculates the ij pairwise distances between two input arrays of + * double type + * + * @param handle raft::handle_t + * @param x pointer to the input data samples array (mRows x kCols) + * @param y pointer to the second input data samples array. Can use the same + * pointer as x (nRows x kCols) + * @param dist output pointer where the results will be stored (mRows x nCols) + * @param m number of rows in x + * @param n number of rows in y + * @param k number of cols in x and y (must be the same) + * @param metric the distance metric to use for the calculation + * @param isRowMajor specifies whether the x and y data pointers are row (C + * type array) or col (F type array) major + */ +void pairwiseDistance(const raft::handle_t &handle, const double *x, + const double *y, double *dist, int m, int n, int k, + ML::Distance::DistanceType metric, + bool isRowMajor = true); + +/** + * @brief Calculates the ij pairwise distances between two input arrays of float type + * + * @param handle raft::handle_t + * @param x pointer to the input data samples array (mRows x kCols) + * @param y pointer to the second input data samples array.
Can use the same + * pointer as x (nRows x kCols) + * @param dist output pointer where the results will be stored (mRows x nCols) + * @param m number of rows in x + * @param n number of rows in y + * @param k number of cols in x and y (must be the same) + * @param metric the distance metric to use for the calculation + * @param isRowMajor specifies whether the x and y data pointers are row (C + * type array) or col (F type array) major + */ +void pairwiseDistance(const raft::handle_t &handle, const float *x, + const float *y, float *dist, int m, int n, int k, + ML::Distance::DistanceType metric, + bool isRowMajor = true); + } // namespace Metrics } // namespace ML diff --git a/cpp/include/cuml/neighbors/knn.hpp b/cpp/include/cuml/neighbors/knn.hpp index 198fdfd091..ff52791b8b 100644 --- a/cpp/include/cuml/neighbors/knn.hpp +++ b/cpp/include/cuml/neighbors/knn.hpp @@ -41,25 +41,25 @@ enum MetricType { * a series of input arrays and combine the results into a single * output array for indexes and distances. * - * @param handle the cuml handle to use - * @param input vector of pointers to the input arrays - * @param sizes vector of sizes of input arrays - * @param D the dimensionality of the arrays - * @param search_items array of items to search of dimensionality D - * @param n number of rows in search_items - * @param res_I the resulting index array of size n * k - * @param res_D the resulting distance array of size n * k - * @param k the number of nearest neighbors to return - * @param rowMajorIndex are the index arrays in row-major order? - * @param rowMajorQuery are the query arrays in row-major order? - * @param metric distance metric to use. Euclidean (L2) is used by + * @param[in] handle the cuml handle to use + * @param[in] input vector of pointers to the input arrays + * @param[in] sizes vector of sizes of input arrays + * @param[in] D the dimensionality of the arrays + * @param[in] search_items array of items to search of dimensionality D + * @param[in] n number of rows in search_items + * @param[out] res_I the resulting index array of size n * k + * @param[out] res_D the resulting distance array of size n * k + * @param[in] k the number of nearest neighbors to return + * @param[in] rowMajorIndex are the index arrays in row-major order? + * @param[in] rowMajorQuery are the query arrays in row-major order? + * @param[in] metric distance metric to use. Euclidean (L2) is used by * default - * @param metric_arg the value of `p` for Minkowski (l-p) distances. This + * @param[in] metric_arg the value of `p` for Minkowski (l-p) distances. This * is ignored if the metric_type is not Minkowski. - * @param expanded should lp-based distances be returned in their expanded + * @param[in] expanded should lp-based distances be returned in their expanded * form (e.g., without raising to the 1/p power). */ -void brute_force_knn(cumlHandle &handle, std::vector &input, +void brute_force_knn(raft::handle_t &handle, std::vector &input, std::vector &sizes, int D, float *search_items, int n, int64_t *res_I, float *res_D, int k, bool rowMajorIndex = false, bool rowMajorQuery = false, @@ -72,17 +72,17 @@ void brute_force_knn(cumlHandle &handle, std::vector &input, * by classifying on multiple label arrays. Note that each label is * classified independently, as is done in scikit-learn. 
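Conceptually, for each output array knn_classify reduces to a majority vote over each query row's k neighbor labels; a host-side sketch of that vote (illustrative only, names hypothetical; the real function runs on device and repeats this independently for every label array):

```cpp
#include <cstdint>
#include <map>
#include <vector>

// Illustrative only: majority label among one query row's k neighbors.
int majority_label(const std::vector<std::int64_t>& neighbor_ids,
                   const std::vector<int>& labels) {
  std::map<int, int> votes;
  for (std::int64_t id : neighbor_ids) ++votes[labels[id]];
  int best_label = 0, best_count = -1;
  for (const auto& kv : votes) {
    if (kv.second > best_count) {
      best_label = kv.first;
      best_count = kv.second;
    }
  }
  return best_label;
}
```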
* - * @param handle the cuml handle to use - * @param out output array on device (size n_samples * size of y vector) - * @param knn_indices index array on device resulting from knn query (size n_samples * k) - * @param y vector of label arrays on device vector size is number of (size n_samples) - * @param n_index_rows number of vertices in index (eg. size of each y array) - * @param n_samples number of samples in knn_indices - * @param k number of nearest neighbors in knn_indices + * @param[in] handle the cuml handle to use + * @param[out] out output array on device (size n_samples * size of y vector) + * @param[in] knn_indices index array on device resulting from knn query (size n_samples * k) + * @param[in] y vector of label arrays on device vector size is number of (size n_samples) + * @param[in] n_index_rows number of vertices in index (eg. size of each y array) + * @param[in] n_query_rows number of samples in knn_indices + * @param[in] k number of nearest neighbors in knn_indices */ -void knn_classify(cumlHandle &handle, int *out, int64_t *knn_indices, - std::vector &y, size_t n_index_rows, size_t n_samples, - int k); +void knn_classify(raft::handle_t &handle, int *out, int64_t *knn_indices, + std::vector &y, size_t n_index_rows, + size_t n_query_rows, int k); /** * @brief Flat C++ API function to perform a knn regression using @@ -90,33 +90,33 @@ void knn_classify(cumlHandle &handle, int *out, int64_t *knn_indices, * regression by classifying on multiple label arrays. Note that * each label is classified independently, as is done in scikit-learn. * - * @param handle the cuml handle to use - * @param out output array on device (size n_samples) - * @param knn_indices array on device of knn indices (size n_samples * k) - * @param y array of labels on device (size n_samples) - * @param n_query_rows number of vertices in index (eg. size of each y array) - * @param n_samples number of samples in knn_indices and out - * @param k number of nearest neighbors in knn_indices + * @param[in] handle the cuml handle to use + * @param[out] out output array on device (size n_samples) + * @param[in] knn_indices array on device of knn indices (size n_samples * k) + * @param[in] y array of labels on device (size n_samples) + * @param[in] n_index_rows number of vertices in index (eg. size of each y array) + * @param[in] n_query_rows number of samples in knn_indices and out + * @param[in] k number of nearest neighbors in knn_indices */ -void knn_regress(cumlHandle &handle, float *out, int64_t *knn_indices, - std::vector &y, size_t n_query_rows, size_t n_samples, - int k); +void knn_regress(raft::handle_t &handle, float *out, int64_t *knn_indices, + std::vector &y, size_t n_index_rows, + size_t n_query_rows, int k); /** * @brief Flat C++ API function to compute knn class probabilities * using a vector of device arrays containing discrete class labels. * Note that the output is a vector, which is * - * @param handle the cuml handle to use - * @param out vector of output arrays on device. vector size = n_outputs. + * @param[in] handle the cuml handle to use + * @param[out] out vector of output arrays on device. vector size = n_outputs.
* Each array should have size(n_samples, n_classes) - * @param knn_indices array on device of knn indices (size n_samples * k) - * @param y array of labels on device (size n_samples) - * @param n_index_rows number of labels - * @param n_samples number of samples in knn_indices and out - * @param k number of nearest neighbors in knn_indices + * @param[in] knn_indices array on device of knn indices (size n_samples * k) + * @param[in] y array of labels on device (size n_samples) + * @param[in] n_index_rows number of labels in y + * @param[in] n_query_rows number of rows in knn_indices and out + * @param[in] k number of nearest neighbors in knn_indices */ -void knn_class_proba(cumlHandle &handle, std::vector &out, +void knn_class_proba(raft::handle_t &handle, std::vector &out, int64_t *knn_indices, std::vector &y, - size_t n_index_rows, size_t n_samples, int k); + size_t n_index_rows, size_t n_query_rows, int k); }; // namespace ML diff --git a/cpp/include/cuml/neighbors/knn_api.h b/cpp/include/cuml/neighbors/knn_api.h index 0b49fd4e12..a1ec1b20f7 100644 --- a/cpp/include/cuml/neighbors/knn_api.h +++ b/cpp/include/cuml/neighbors/knn_api.h @@ -27,24 +27,24 @@ extern "C" { * a series of input arrays and combine the results into a single * output array for indexes and distances. * - * @param handle the cuml handle to use - * @param input an array of pointers to the input arrays - * @param size an array of sizes of input arrays - * @param n_params array size of input and sizes - * @param D the dimensionality of the arrays - * @param search_items array of items to search of dimensionality D - * @param n number of rows in search_items - * @param res_I the resulting index array of size n * k - * @param res_D the resulting distance array of size n * k - * @param k the number of nearest neighbors to return - * @param rowMajorIndex is the index array in row major layout? - * @param rowMajorQuery is the query array in row major layout? - * @param metric_type the type of distance metric to use. This corresponds + * @param[in] handle the cuml handle to use + * @param[in] input an array of pointers to the input arrays + * @param[in] size an array of sizes of input arrays + * @param[in] n_params array size of input and sizes + * @param[in] D the dimensionality of the arrays + * @param[in] search_items array of items to search of dimensionality D + * @param[in] n number of rows in search_items + * @param[out] res_I the resulting index array of size n * k + * @param[out] res_D the resulting distance array of size n * k + * @param[in] k the number of nearest neighbors to return + * @param[in] rowMajorIndex is the index array in row major layout? + * @param[in] rowMajorQuery is the query array in row major layout? + * @param[in] metric_type the type of distance metric to use. This corresponds * to the value in the ML::MetricType enum. Default is * Euclidean (L2). - * @param metric_arg the value of `p` for Minkowski (l-p) distances. This + * @param[in] metric_arg the value of `p` for Minkowski (l-p) distances. This * is ignored if the metric_type is not Minkowski. - * @param expanded should lp-based distances be returned in their expanded + * @param[in] expanded should lp-based distances be returned in their expanded * form (e.g., without raising to the 1/p power). 
*/ cumlError_t knn_search(const cumlHandle_t handle, float **input, int *size, diff --git a/cpp/include/cuml/neighbors/knn_mg.hpp b/cpp/include/cuml/neighbors/knn_mg.hpp index 33fda79046..ddc6ad2108 100644 --- a/cpp/include/cuml/neighbors/knn_mg.hpp +++ b/cpp/include/cuml/neighbors/knn_mg.hpp @@ -31,7 +31,7 @@ namespace opg { /** * @brief Performs a multi-node multi-GPU brute force nearest neighbors. - * @param handle: the cumlHandle to use for managing resources + * @param handle: the raft::handle_t to use for managing resources * @param[out] out_I: vector of output index partitions. size should match the * number of local input partitions. * @param[out] out_D: vector of output distance partitions. size should match @@ -49,7 +49,7 @@ namespace opg { * @param[in] verbose: print extra logging info * */ -void brute_force_knn(ML::cumlHandle &handle, +void brute_force_knn(raft::handle_t &handle, std::vector *> &out_I, std::vector &out_D, std::vector &idx_data, @@ -62,31 +62,33 @@ void brute_force_knn(ML::cumlHandle &handle, /** * Performs a multi-node multi-GPU KNN classify. - * @param handle the cumlHandle to use for managing resources - * @param out vector of output labels partitions. size should match the + * @param[in] handle the raft::handle_t to use for managing resources + * @param[out] out vector of output labels partitions. size should match the * number of local input partitions. - * @param out_I vector of output index partitions. size should match the + * @param[out] out_I vector of output index partitions. size should match the * number of local input partitions. - * @param out_D vector of output distance partitions. size should match + * @param[out] out_D vector of output distance partitions. size should match * the number of local input partitions. - * @param probas (optional) pointer to a vector containing arrays of probabilities - * @param idx_data vector of local indices to query - * @param idx_desc describes how the index partitions are distributed + * @param[in] probas (optional) pointer to a vector containing arrays of probabilities + * @param[in] idx_data vector of local indices to query + * @param[in] idx_desc describes how the index partitions are distributed * across the ranks. - * @param query_data vector of local query partitions - * @param query_desc describes how the query partitions are distributed + * @param[in] query_data vector of local query partitions + * @param[in] query_desc describes how the query partitions are distributed * across the cluster. - * @param y vector of vector of label arrays. for multilabel classification, each + * @param[in] y vector of vector of label arrays. for multilabel classification, each * element in the vector is a different "output" array of labels corresponding * to the i'th output. size should match the number of local input partitions. - * @param uniq_labels vector of the sorted unique labels for each array in y - * @param n_unique vector of sizes for each array in uniq_labels - * @param probas_only return probas instead of performing complete knn_classify - * @param k the number of neighbors to query - * @param batch_size the max number of rows to broadcast at a time - * @param verbose print extra logging info + * @param[in] uniq_labels vector of the sorted unique labels for each array in y + * @param[in] n_unique vector of sizes for each array in uniq_labels + * @param[in] rowMajorIndex boolean indicating whether the index is row major. + * @param[in] rowMajorQuery boolean indicating whether the query is row major. 
+ * @param[in] probas_only return probas instead of performing complete knn_classify + * @param[in] k the number of neighbors to query + * @param[in] batch_size the max number of rows to broadcast at a time + * @param[in] verbose print extra logging info */ -void knn_classify(ML::cumlHandle &handle, std::vector *> *out, +void knn_classify(raft::handle_t &handle, std::vector *> *out, std::vector *> *out_I, std::vector *out_D, std::vector> *probas, @@ -102,28 +104,30 @@ void knn_classify(ML::cumlHandle &handle, std::vector *> *out, /** * Performs a multi-node multi-GPU KNN regress. - * @param handle the cumlHandle to use for managing resources - * @param out vector of output partitions. size should match the + * @param[in] handle the raft::handle_t to use for managing resources + * @param[out] out vector of output partitions. size should match the * number of local input partitions. - * @param out_I vector of output index partitions. size should match the + * @param[out] out_I vector of output index partitions. size should match the * number of local input partitions. - * @param out_D vector of output distance partitions. size should match + * @param[out] out_D vector of output distance partitions. size should match * the number of local input partitions. - * @param idx_data vector of local indices to query - * @param idx_desc describes how the index partitions are distributed + * @param[in] idx_data vector of local indices to query + * @param[in] idx_desc describes how the index partitions are distributed * across the ranks. - * @param query_data vector of local query partitions - * @param query_desc describes how the query partitions are distributed + * @param[in] query_data vector of local query partitions + * @param[in] query_desc describes how the query partitions are distributed * across the cluster. - * @param y vector of vector of output arrays. for multi-output regression, each + * @param[in] y vector of vector of output arrays. for multi-output regression, each * element in the vector is a different "output" array corresponding * to the i'th output. size should match the number of local input partitions. - * @param k the number of neighbors to query - * @param n_outputs number of outputs - * @param batch_size the max number of rows to broadcast at a time - * @param verbose print extra logging info + * @param[in] rowMajorIndex boolean indicating whether the index is row major. + * @param[in] rowMajorQuery boolean indicating whether the query is row major. 
+ * @param[in] k the number of neighbors to query + * @param[in] n_outputs number of outputs + * @param[in] batch_size the max number of rows to broadcast at a time + * @param[in] verbose print extra logging info */ -void knn_regress(ML::cumlHandle &handle, +void knn_regress(raft::handle_t &handle, std::vector *> *out, std::vector *> *out_I, std::vector *out_D, diff --git a/cpp/include/cuml/random_projection/rproj_c.h b/cpp/include/cuml/random_projection/rproj_c.h index 1ea3ff4dd0..ddeb682918 100644 --- a/cpp/include/cuml/random_projection/rproj_c.h +++ b/cpp/include/cuml/random_projection/rproj_c.h @@ -82,11 +82,11 @@ struct rand_mat { }; template -void RPROJfit(const cumlHandle &handle, rand_mat *random_matrix, +void RPROJfit(const raft::handle_t &handle, rand_mat *random_matrix, paramsRPROJ *params); template -void RPROJtransform(const cumlHandle &handle, math_t *input, +void RPROJtransform(const raft::handle_t &handle, math_t *input, rand_mat *random_matrix, math_t *output, paramsRPROJ *params); diff --git a/cpp/include/cuml/solvers/cd_mg.hpp b/cpp/include/cuml/solvers/cd_mg.hpp index 2a7fa26974..8ed64c924d 100644 --- a/cpp/include/cuml/solvers/cd_mg.hpp +++ b/cpp/include/cuml/solvers/cd_mg.hpp @@ -43,14 +43,14 @@ namespace opg { * @param[in] tol: tolerance for early stopping during fitting * @param[in] verbose */ -void fit(cumlHandle &handle, +void fit(raft::handle_t &handle, std::vector *> &input_data, MLCommon::Matrix::PartDescriptor &input_desc, std::vector *> &labels, float *coef, float *intercept, bool fit_intercept, bool normalize, int epochs, float alpha, float l1_ratio, bool shuffle, float tol, bool verbose); -void fit(cumlHandle &handle, +void fit(raft::handle_t &handle, std::vector *> &input_data, MLCommon::Matrix::PartDescriptor &input_desc, std::vector *> &labels, double *coef, @@ -70,14 +70,16 @@ void fit(cumlHandle &handle, * @param[out] preds: predictions * @param[in] verbose */ -void predict(cumlHandle &handle, MLCommon::Matrix::RankSizePair **rank_sizes, - size_t n_parts, MLCommon::Matrix::Data **input, - size_t n_rows, size_t n_cols, float *coef, float intercept, +void predict(raft::handle_t &handle, + MLCommon::Matrix::RankSizePair **rank_sizes, size_t n_parts, + MLCommon::Matrix::Data **input, size_t n_rows, + size_t n_cols, float *coef, float intercept, MLCommon::Matrix::Data **preds, bool verbose); -void predict(cumlHandle &handle, MLCommon::Matrix::RankSizePair **rank_sizes, - size_t n_parts, MLCommon::Matrix::Data **input, - size_t n_rows, size_t n_cols, double *coef, double intercept, +void predict(raft::handle_t &handle, + MLCommon::Matrix::RankSizePair **rank_sizes, size_t n_parts, + MLCommon::Matrix::Data **input, size_t n_rows, + size_t n_cols, double *coef, double intercept, MLCommon::Matrix::Data **preds, bool verbose); }; // end namespace opg diff --git a/cpp/include/cuml/solvers/solver.hpp b/cpp/include/cuml/solvers/solver.hpp index df4ea5de8c..2db32a4a03 100644 --- a/cpp/include/cuml/solvers/solver.hpp +++ b/cpp/include/cuml/solvers/solver.hpp @@ -21,47 +21,51 @@ namespace ML { namespace Solver { -void sgdFit(cumlHandle &handle, float *input, int n_rows, int n_cols, +void sgdFit(raft::handle_t &handle, float *input, int n_rows, int n_cols, float *labels, float *coef, float *intercept, bool fit_intercept, int batch_size, int epochs, int lr_type, float eta0, float power_t, int loss, int penalty, float alpha, float l1_ratio, bool shuffle, float tol, int n_iter_no_change); -void sgdFit(cumlHandle &handle, double *input, int n_rows, int n_cols, +void 
sgdFit(raft::handle_t &handle, double *input, int n_rows, int n_cols, double *labels, double *coef, double *intercept, bool fit_intercept, int batch_size, int epochs, int lr_type, double eta0, double power_t, int loss, int penalty, double alpha, double l1_ratio, bool shuffle, double tol, int n_iter_no_change); -void sgdPredict(cumlHandle &handle, const float *input, int n_rows, int n_cols, - const float *coef, float intercept, float *preds, int loss); +void sgdPredict(raft::handle_t &handle, const float *input, int n_rows, + int n_cols, const float *coef, float intercept, float *preds, + int loss); -void sgdPredict(cumlHandle &handle, const double *input, int n_rows, int n_cols, - const double *coef, double intercept, double *preds, int loss); +void sgdPredict(raft::handle_t &handle, const double *input, int n_rows, + int n_cols, const double *coef, double intercept, double *preds, + int loss); -void sgdPredictBinaryClass(cumlHandle &handle, const float *input, int n_rows, - int n_cols, const float *coef, float intercept, - float *preds, int loss); +void sgdPredictBinaryClass(raft::handle_t &handle, const float *input, + int n_rows, int n_cols, const float *coef, + float intercept, float *preds, int loss); -void sgdPredictBinaryClass(cumlHandle &handle, const double *input, int n_rows, - int n_cols, const double *coef, double intercept, - double *preds, int loss); +void sgdPredictBinaryClass(raft::handle_t &handle, const double *input, + int n_rows, int n_cols, const double *coef, + double intercept, double *preds, int loss); -void cdFit(cumlHandle &handle, float *input, int n_rows, int n_cols, +void cdFit(raft::handle_t &handle, float *input, int n_rows, int n_cols, float *labels, float *coef, float *intercept, bool fit_intercept, bool normalize, int epochs, int loss, float alpha, float l1_ratio, bool shuffle, float tol); -void cdFit(cumlHandle &handle, double *input, int n_rows, int n_cols, +void cdFit(raft::handle_t &handle, double *input, int n_rows, int n_cols, double *labels, double *coef, double *intercept, bool fit_intercept, bool normalize, int epochs, int loss, double alpha, double l1_ratio, bool shuffle, double tol); -void cdPredict(cumlHandle &handle, const float *input, int n_rows, int n_cols, - const float *coef, float intercept, float *preds, int loss); +void cdPredict(raft::handle_t &handle, const float *input, int n_rows, + int n_cols, const float *coef, float intercept, float *preds, + int loss); -void cdPredict(cumlHandle &handle, const double *input, int n_rows, int n_cols, - const double *coef, double intercept, double *preds, int loss); +void cdPredict(raft::handle_t &handle, const double *input, int n_rows, + int n_cols, const double *coef, double intercept, double *preds, + int loss); }; // namespace Solver }; // end namespace ML diff --git a/cpp/include/cuml/svm/svc.hpp b/cpp/include/cuml/svm/svc.hpp index 02b291a865..7b559356f9 100644 --- a/cpp/include/cuml/svm/svc.hpp +++ b/cpp/include/cuml/svm/svc.hpp @@ -44,12 +44,13 @@ namespace SVM { * @param [in] param parameters for training * @param [in] kernel_params parameters for the kernel function * @param [out] model parameters of the trained model + * @param [in] sample_weight optional sample weights, size [n_rows] */ template -void svcFit(const cumlHandle &handle, math_t *input, int n_rows, int n_cols, +void svcFit(const raft::handle_t &handle, math_t *input, int n_rows, int n_cols, math_t *labels, const svmParameter ¶m, MLCommon::Matrix::KernelParams &kernel_params, - svmModel &model); + svmModel &model, const 
math_t *sample_weight = nullptr); /** * @brief Predict classes or decision function value for samples in input. @@ -81,8 +82,8 @@ void svcFit(const cumlHandle &handle, math_t *input, int n_rows, int n_cols, * return the decision function value (false) */ template -void svcPredict(const cumlHandle &handle, math_t *input, int n_rows, int n_cols, - MLCommon::Matrix::KernelParams &kernel_params, +void svcPredict(const raft::handle_t &handle, math_t *input, int n_rows, + int n_cols, MLCommon::Matrix::KernelParams &kernel_params, const svmModel &model, math_t *preds, math_t buffer_size, bool predict_class = true); @@ -93,7 +94,7 @@ void svcPredict(const cumlHandle &handle, math_t *input, int n_rows, int n_cols, * @param [inout] m SVM model parameters */ template -void svmFreeBuffers(const cumlHandle &handle, svmModel &m); +void svmFreeBuffers(const raft::handle_t &handle, svmModel &m); /** * @brief C-Support Vector Classification @@ -133,7 +134,7 @@ class SVC { * @param nochange_steps number of steps with no change wrt convergence * @param verbosity verbosity level for logging messages during execution */ - SVC(cumlHandle &handle, math_t C = 1, math_t tol = 1.0e-3, + SVC(raft::handle_t &handle, math_t C = 1, math_t tol = 1.0e-3, MLCommon::Matrix::KernelParams kernel_params = MLCommon::Matrix::KernelParams{MLCommon::Matrix::LINEAR, 3, 1, 0}, math_t cache_size = 200, int max_iter = -1, int nochange_steps = 1000, @@ -151,8 +152,10 @@ class SVC { * @param n_rows number of rows * @param n_cols number of columns * @param labels device pointer for the labels. Size n_rows. + * @param [in] sample_weight optional sample weights, size [n_rows] */ - void fit(math_t *input, int n_rows, int n_cols, math_t *labels); + void fit(math_t *input, int n_rows, int n_cols, math_t *labels, + const math_t *sample_weight = nullptr); /** * @brief Predict classes for samples in input. @@ -177,7 +180,7 @@ class SVC { void decisionFunction(math_t *input, int n_rows, int n_cols, math_t *preds); private: - const cumlHandle &handle; + const raft::handle_t &handle; }; }; // end namespace SVM diff --git a/cpp/include/cuml/svm/svr.hpp b/cpp/include/cuml/svm/svr.hpp index 679f59ae4f..8ca308acac 100644 --- a/cpp/include/cuml/svm/svr.hpp +++ b/cpp/include/cuml/svm/svr.hpp @@ -43,12 +43,13 @@ namespace SVM { * @param [in] param parameters for training * @param [in] kernel_params parameters for the kernel function * @param [out] model parameters of the trained model + * @param [in] sample_weight optional sample weights, size [n_rows] */ template -void svrFit(const cumlHandle &handle, math_t *X, int n_rows, int n_cols, +void svrFit(const raft::handle_t &handle, math_t *X, int n_rows, int n_cols, math_t *y, const svmParameter ¶m, MLCommon::Matrix::KernelParams &kernel_params, - svmModel &model); + svmModel &model, const math_t *sample_weight = nullptr); // For prediction we use svcPredict diff --git a/cpp/include/cuml/tree/decisiontree.hpp b/cpp/include/cuml/tree/decisiontree.hpp index 2236dbba0c..a5b7a91c82 100644 --- a/cpp/include/cuml/tree/decisiontree.hpp +++ b/cpp/include/cuml/tree/decisiontree.hpp @@ -15,7 +15,8 @@ */ #pragma once -#include +#include +#include #include "algo_helper.h" #include "flatnode.h" @@ -61,14 +62,24 @@ struct DecisionTreeParams { * Node split criterion. GINI and Entropy for classification, MSE or MAE for regression. */ CRITERION split_criterion; - /** - * Weahther to fully reshuffle the features for subsampling at each tree node. 
Default is one shuffle per depth with random start point in the shuffled feature list per node - */ - bool shuffle_features; /** * Minimum impurity decrease required for splitting a node. If the impurity decrease is below this value, node is leafed out. Default is 0.0 */ float min_impurity_decrease = 0.0f; + + /** + * Maximum number of nodes that can be processed in a given batch. This is + * used only for the batched-level algorithm + */ + int max_batch_size; + /** + * If set to true and the following conditions are also met, the experimental + * decision tree training implementation will be used: + * split_algo = 1 (GLOBAL_QUANTILE) + * max_features = 1.0 (Feature sub-sampling disabled) + * quantile_per_tree = false (No per tree quantile computation) + */ + bool use_experimental_backend; }; /** @@ -86,7 +97,12 @@ struct DecisionTreeParams { * @param[in] cfg_split_criterion: split criterion; default CRITERION_END, * i.e., GINI for classification or MSE for regression * @param[in] cfg_quantile_per_tree: compute quantile per tree; default false - * @param[in] cfg_shuffle_features: whether to shuffle features or not + * @param[in] cfg_use_experimental_backend: If set to true, the experimental + * batched backend is used (provided the other conditions are met). Default is + false. + * @param[in] cfg_max_batch_size: Maximum number of nodes that can be processed + in a batch. This is used only for the batched-level algorithm. Default + value 128. */ void set_tree_params(DecisionTreeParams &params, int cfg_max_depth = -1, int cfg_max_leaves = -1, float cfg_max_features = 1.0f, @@ -96,7 +112,8 @@ void set_tree_params(DecisionTreeParams &params, int cfg_max_depth = -1, bool cfg_bootstrap_features = false, CRITERION cfg_split_criterion = CRITERION_END, bool cfg_quantile_per_tree = false, - bool cfg_shuffle_features = false); + bool cfg_use_experimental_backend = false, + int cfg_max_batch_size = 128); /** * @brief Check validity of all decision tree hyper-parameters. @@ -138,6 +155,9 @@ void print_tree_summary(const TreeMetaDataNode *tree); template void print_tree(const TreeMetaDataNode *tree); +template +std::string dump_tree_as_json(const TreeMetaDataNode *tree); + // ----------------------------- Classification ----------------------------------- // typedef TreeMetaDataNode TreeClassifierF; @@ -146,7 +166,7 @@ typedef TreeMetaDataNode TreeClassifierD; /** * @defgroup DecisionTreeClassifierFit Fit functions * @brief Build (i.e., fit, train) Decision Tree classifier for input data. - * @param[in] handle: cumlHandle + * @param[in] handle: raft::handle_t * @param[in, out] tree: CPU pointer to TreeMetaDataNode. User allocated. * @param[in] data: train data (nrows samples, ncols features) in column major format, * excluding labels. Device pointer. @@ -166,13 +186,13 @@ typedef TreeMetaDataNode TreeClassifierD; * @param[in] tree_params: Decision Tree training hyper parameter struct.
* @{ */ -void decisionTreeClassifierFit(const ML::cumlHandle &handle, +void decisionTreeClassifierFit(const raft::handle_t &handle, TreeClassifierF *&tree, float *data, const int ncols, const int nrows, int *labels, unsigned int *rowids, const int n_sampled_rows, int unique_labels, DecisionTree::DecisionTreeParams tree_params); -void decisionTreeClassifierFit(const ML::cumlHandle &handle, +void decisionTreeClassifierFit(const raft::handle_t &handle, TreeClassifierD *&tree, double *data, const int ncols, const int nrows, int *labels, unsigned int *rowids, const int n_sampled_rows, @@ -184,7 +204,7 @@ void decisionTreeClassifierFit(const ML::cumlHandle &handle, * @defgroup DecisionTreeClassifierPredict Predict functions * @brief Predict target feature for input data; n-ary classification for * single feature supported. Inference of trees is CPU only for now. - * @param[in] handle: cumlHandle (currently unused; API placeholder) + * @param[in] handle: raft::handle_t (currently unused; API placeholder) * @param[in] tree: CPU pointer to TreeMetaDataNode. * @param[in] rows: test data (n_rows samples, n_cols features) in row major format. * Current impl. expects a CPU pointer. TODO future API change. @@ -198,12 +218,12 @@ void decisionTreeClassifierFit(const ML::cumlHandle &handle, * the caller itself might have set. * @{ */ -void decisionTreeClassifierPredict(const ML::cumlHandle &handle, +void decisionTreeClassifierPredict(const raft::handle_t &handle, const TreeClassifierF *tree, const float *rows, const int n_rows, const int n_cols, int *predictions, int verbosity = -1); -void decisionTreeClassifierPredict(const ML::cumlHandle &handle, +void decisionTreeClassifierPredict(const raft::handle_t &handle, const TreeClassifierD *tree, const double *rows, const int n_rows, const int n_cols, int *predictions, @@ -218,7 +238,7 @@ typedef TreeMetaDataNode TreeRegressorD; /** * @defgroup DecisionTreeRegressorFit Fit functions * @brief Build (i.e., fit, train) Decision Tree regressor for input data. - * @param[in] handle: cumlHandle + * @param[in] handle: raft::handle_t * @param[in, out] tree: CPU pointer to TreeMetaDataNode. User allocated. * @param[in] data: train data (nrows samples, ncols features) in column major format, * excluding labels. Device pointer. @@ -234,12 +254,12 @@ typedef TreeMetaDataNode TreeRegressorD; * @param[in] tree_params: Decision Tree training hyper parameter struct. * @{ */ -void decisionTreeRegressorFit(const ML::cumlHandle &handle, +void decisionTreeRegressorFit(const raft::handle_t &handle, TreeRegressorF *&tree, float *data, const int ncols, const int nrows, float *labels, unsigned int *rowids, const int n_sampled_rows, DecisionTree::DecisionTreeParams tree_params); -void decisionTreeRegressorFit(const ML::cumlHandle &handle, +void decisionTreeRegressorFit(const raft::handle_t &handle, TreeRegressorD *&tree, double *data, const int ncols, const int nrows, double *labels, unsigned int *rowids, const int n_sampled_rows, @@ -250,7 +270,7 @@ void decisionTreeRegressorFit(const ML::cumlHandle &handle, * @defgroup DecisionTreeRegressorPredict Predict functions * @brief Predict target feature for input data; regression for single feature supported. * Inference of trees is CPU only for now. - * @param[in] handle: cumlHandle (currently unused; API placeholder) + * @param[in] handle: raft::handle_t (currently unused; API placeholder) * @param[in] tree: CPU pointer to TreeMetaDataNode. * @param[in] rows: test data (n_rows samples, n_cols features) in row major format. * Current impl. 
expects a CPU pointer. TODO future API change. @@ -264,11 +284,11 @@ void decisionTreeRegressorFit(const ML::cumlHandle &handle, * the caller itself might have set. * @{ */ -void decisionTreeRegressorPredict(const ML::cumlHandle &handle, +void decisionTreeRegressorPredict(const raft::handle_t &handle, const TreeRegressorF *tree, const float *rows, const int n_rows, const int n_cols, float *predictions, int verbosity = -1); -void decisionTreeRegressorPredict(const ML::cumlHandle &handle, +void decisionTreeRegressorPredict(const raft::handle_t &handle, const TreeRegressorD *tree, const double *rows, const int n_rows, const int n_cols, double *predictions, diff --git a/cpp/include/cuml/tree/flatnode.h b/cpp/include/cuml/tree/flatnode.h index 6604c1b411..74eba5e235 100644 --- a/cpp/include/cuml/tree/flatnode.h +++ b/cpp/include/cuml/tree/flatnode.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -13,16 +13,22 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + #pragma once -/* sparse node same tree node in Decsion Tree. -* This however used an index instead of pointer to left child -* Right child index is left_child_id + 1 -*/ -template <typename T, typename L> + +/** + * A node in a Decision Tree. + * It uses an index instead of a pointer to the left child; the right child + * index is assumed to be `left_child_id + 1` + * @tparam DataT data type + * @tparam LabelT label type + * @tparam IdxT type used for indexing operations + */ +template <typename DataT, typename LabelT, typename IdxT> struct SparseTreeNode { - L prediction; - int colid = -1; - T quesval; - T best_metric_val; - int left_child_id = -1; + LabelT prediction; + IdxT colid = IdxT(-1); + DataT quesval; + DataT best_metric_val; + IdxT left_child_id = IdxT(-1); }; diff --git a/cpp/include/cuml/tsa/arima_common.h b/cpp/include/cuml/tsa/arima_common.h index afcd61c397..e2220727e8 100644 --- a/cpp/include/cuml/tsa/arima_common.h +++ b/cpp/include/cuml/tsa/arima_common.h @@ -20,6 +20,7 @@ #include +#include #include #include #include @@ -39,13 +40,13 @@ struct ARIMAOrder { int s; // Seasonal period int k; // Fit intercept?
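A brief aside before the reworked ARIMAOrder helpers below: the new n_diff/n_phi/n_theta/r/rd methods all derive from the fields above. A minimal, hypothetical worked example (the order p=d=q=P=D=Q=1, s=12, k=1 is chosen for illustration, not taken from the PR) for a seasonal ARIMA(1,1,1)(1,1,1) model with period 12 and an intercept:

```cpp
#include <algorithm>
#include <cstdio>

// Hedged sketch mirroring the ARIMAOrder helper formulas; the order values
// below are illustrative only.
int main() {
  int p = 1, d = 1, q = 1, P = 1, D = 1, Q = 1, s = 12, k = 1;
  int n_diff = d + s * D;                  // 13 observations lost to differencing
  int n_phi = p + s * P;                   // 13 combined AR coefficients
  int n_theta = q + s * Q;                 // 13 combined MA coefficients
  int r = std::max(n_phi, n_theta + 1);    // 14: reduced state-space size
  int rd = n_diff + r;                     // 27: state size with differencing folded in
  int complexity = p + P + q + Q + k + 1;  // 6 free parameters
  std::printf("n_diff=%d r=%d rd=%d complexity=%d\n", n_diff, r, rd, complexity);
  return 0;
}
```

The new rd() helper is what the batched Kalman code later in this diff dispatches on: when differencing is folded into the state space rather than applied as a preprocessing pass, the state has rd rather than r components.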
- inline int r() const { return std::max(p + s * P, q + s * Q + 1); } - inline int complexity() const { return p + P + q + Q + k + 1; } - inline int lost_in_diff() const { return d + s * D; } + inline int n_diff() const { return d + s * D; } inline int n_phi() const { return p + s * P; } inline int n_theta() const { return q + s * Q; } - - inline bool need_prep() const { return static_cast<bool>(d + D); } + inline int r() const { return std::max(n_phi(), n_theta() + 1); } + inline int rd() const { return n_diff() + r(); } + inline int complexity() const { return p + P + q + Q + k + 1; } + inline bool need_diff() const { return static_cast<bool>(d + D); } }; /** diff --git a/cpp/include/cuml/tsa/auto_arima.h b/cpp/include/cuml/tsa/auto_arima.h index de3ef1c218..cd9a2fa357 100644 --- a/cpp/include/cuml/tsa/auto_arima.h +++ b/cpp/include/cuml/tsa/auto_arima.h @@ -30,7 +30,7 @@ namespace ML { * @param[in] batch_size Batch size * @return The number of 'true' series in the mask */ -int divide_by_mask_build_index(const cumlHandle& handle, const bool* d_mask, +int divide_by_mask_build_index(const raft::handle_t& handle, const bool* d_mask, int* d_index, int batch_size); /** @@ -46,15 +46,15 @@ int divide_by_mask_build_index(const cumlHandle& handle, const bool* d_mask, * @param[in] batch_size Batch size * @param[in] n_obs Number of data points per series */ -void divide_by_mask_execute(const cumlHandle& handle, const float* d_in, +void divide_by_mask_execute(const raft::handle_t& handle, const float* d_in, const bool* d_mask, const int* d_index, float* d_out0, float* d_out1, int batch_size, int n_obs); -void divide_by_mask_execute(const cumlHandle& handle, const double* d_in, +void divide_by_mask_execute(const raft::handle_t& handle, const double* d_in, const bool* d_mask, const int* d_index, double* d_out0, double* d_out1, int batch_size, int n_obs); -void divide_by_mask_execute(const cumlHandle& handle, const int* d_in, +void divide_by_mask_execute(const raft::handle_t& handle, const int* d_in, const bool* d_mask, const int* d_index, int* d_out0, int* d_out1, int batch_size, int n_obs); @@ -72,12 +72,14 @@ void divide_by_mask_execute(const cumlHandle& handle, const int* d_in, * @param[in] batch_size Batch size * @param[in] n_sub Number of sub-batches */ -void divide_by_min_build_index(const cumlHandle& handle, const float* d_matrix, - int* d_batch, int* d_index, int* h_size, - int batch_size, int n_sub); -void divide_by_min_build_index(const cumlHandle& handle, const double* d_matrix, - int* d_batch, int* d_index, int* h_size, - int batch_size, int n_sub); +void divide_by_min_build_index(const raft::handle_t& handle, + const float* d_matrix, int* d_batch, + int* d_index, int* h_size, int batch_size, + int n_sub); +void divide_by_min_build_index(const raft::handle_t& handle, + const double* d_matrix, int* d_batch, + int* d_index, int* h_size, int batch_size, + int n_sub); /** * Batch division by minimum value step 2: create all the sub-batches @@ -92,15 +94,15 @@ void divide_by_min_build_index(const cumlHandle& handle, const double* d_matrix, * @param[in] batch_size Batch size * @param[in] n_sub Number of sub-batches * @param[in] n_obs Number of data points per series */ -void divide_by_min_execute(const cumlHandle& handle, const float* d_in, +void divide_by_min_execute(const raft::handle_t& handle, const float* d_in, const int* d_batch, const int* d_index, float** hd_out, int batch_size, int n_sub, int n_obs); -void divide_by_min_execute(const cumlHandle& handle, const double* d_in, +void divide_by_min_execute(const raft::handle_t& handle,
const double* d_in, const int* d_batch, const int* d_index, double** hd_out, int batch_size, int n_sub, int n_obs); -void divide_by_min_execute(const cumlHandle& handle, const int* d_in, +void divide_by_min_execute(const raft::handle_t& handle, const int* d_in, const int* d_batch, const int* d_index, int** hd_out, int batch_size, int n_sub, int n_obs); @@ -119,7 +121,7 @@ void divide_by_min_execute(const cumlHandle& handle, const int* d_in, * @param[in] batch_size Batch size * @param[in] n_sub Number of sub-batches */ -void build_division_map(const cumlHandle& handle, const int* const* hd_id, +void build_division_map(const raft::handle_t& handle, const int* const* hd_id, const int* h_size, int* d_id_to_pos, int* d_id_to_model, int batch_size, int n_sub); @@ -140,10 +142,10 @@ void build_division_map(const cumlHandle& handle, const int* const* hd_id, * @param[in] n_sub Number of sub-batches * @param[in] n_obs Number of observations (or forecasts) per series */ -void merge_series(const cumlHandle& handle, const float* const* hd_in, +void merge_series(const raft::handle_t& handle, const float* const* hd_in, const int* d_id_to_pos, const int* d_id_to_sub, float* d_out, int batch_size, int n_sub, int n_obs); -void merge_series(const cumlHandle& handle, const double* const* hd_in, +void merge_series(const raft::handle_t& handle, const double* const* hd_in, const int* d_id_to_pos, const int* d_id_to_sub, double* d_out, int batch_size, int n_sub, int n_obs); diff --git a/cpp/include/cuml/tsa/batched_arima.hpp b/cpp/include/cuml/tsa/batched_arima.hpp index 5cdd442236..569515b45b 100644 --- a/cpp/include/cuml/tsa/batched_arima.hpp +++ b/cpp/include/cuml/tsa/batched_arima.hpp @@ -33,7 +33,7 @@ enum LoglikeMethod { CSS, MLE }; * @param[in] n_obs Number of observations * @param[in] order ARIMA order */ -void batched_diff(cumlHandle& handle, double* d_y_diff, const double* d_y, +void batched_diff(raft::handle_t& handle, double* d_y_diff, const double* d_y, int batch_size, int n_obs, const ARIMAOrder& order); /** @@ -59,13 +59,18 @@ void batched_diff(cumlHandle& handle, double* d_y_diff, const double* d_y, * number of observations * @param[in] fc_steps Number of steps to forecast * @param[in] d_fc Array to store the forecast + * @param[in] level Confidence level for prediction intervals. 0 to + * skip the computation. Else 0 < level < 1 + * @param[out] d_lower Lower limit of the prediction interval + * @param[out] d_upper Upper limit of the prediction interval */ -void batched_loglike(cumlHandle& handle, const double* d_y, int batch_size, +void batched_loglike(raft::handle_t& handle, const double* d_y, int batch_size, int n_obs, const ARIMAOrder& order, const double* d_params, double* loglike, double* d_vs, bool trans = true, bool host_loglike = true, LoglikeMethod method = MLE, - int truncate = 0, int fc_steps = 0, - double* d_fc = nullptr); + int truncate = 0, int fc_steps = 0, double* d_fc = nullptr, + double level = 0, double* d_lower = nullptr, + double* d_upper = nullptr); /** * Compute the loglikelihood of the given parameter on the given time series @@ -92,13 +97,18 @@ void batched_loglike(cumlHandle& handle, const double* d_y, int batch_size, * number of observations * @param[in] fc_steps Number of steps to forecast * @param[in] d_fc Array to store the forecast + * @param[in] level Confidence level for prediction intervals. 0 to + * skip the computation. 
Else 0 < level < 1 + * @param[out] d_lower Lower limit of the prediction interval + * @param[out] d_upper Upper limit of the prediction interval */ -void batched_loglike(cumlHandle& handle, const double* d_y, int batch_size, +void batched_loglike(raft::handle_t& handle, const double* d_y, int batch_size, int n_obs, const ARIMAOrder& order, const ARIMAParams& params, double* loglike, double* d_vs, bool trans = true, bool host_loglike = true, LoglikeMethod method = MLE, int truncate = 0, - int fc_steps = 0, double* d_fc = nullptr); + int fc_steps = 0, double* d_fc = nullptr, double level = 0, + double* d_lower = nullptr, double* d_upper = nullptr); /** * Compute the gradient of the log-likelihood @@ -117,10 +127,11 @@ void batched_loglike(cumlHandle& handle, const double* d_y, int batch_size, * @param[in] truncate For CSS, start the sum-of-squares after a given * number of observations */ -void batched_loglike_grad(cumlHandle& handle, const double* d_y, int batch_size, - int n_obs, const ARIMAOrder& order, const double* d_x, - double* d_grad, double h, bool trans = true, - LoglikeMethod method = MLE, int truncate = 0); +void batched_loglike_grad(raft::handle_t& handle, const double* d_y, + int batch_size, int n_obs, const ARIMAOrder& order, + const double* d_x, double* d_grad, double h, + bool trans = true, LoglikeMethod method = MLE, + int truncate = 0); /** * Batched in-sample and out-of-sample prediction of a time-series given all @@ -136,12 +147,18 @@ void batched_loglike_grad(cumlHandle& handle, const double* d_y, int batch_size, * @param[in] end Index to end the prediction (excluded) * @param[in] order ARIMA hyper-parameters * @param[in] params ARIMA parameters (device) - * @param[out] d_vs Residual output (device) * @param[out] d_y_p Prediction output (device) + * @param[in] pre_diff Whether to use pre-differencing + * @param[in] level Confidence level for prediction intervals. 0 to + * skip the computation. Else 0 < level < 1 + * @param[out] d_lower Lower limit of the prediction interval + * @param[out] d_upper Upper limit of the prediction interval */ -void predict(cumlHandle& handle, const double* d_y, int batch_size, int n_obs, - int start, int end, const ARIMAOrder& order, - const ARIMAParams& params, double* d_vs, double* d_y_p); +void predict(raft::handle_t& handle, const double* d_y, int batch_size, + int n_obs, int start, int end, const ARIMAOrder& order, + const ARIMAParams& params, double* d_y_p, + bool pre_diff = true, double level = 0, double* d_lower = nullptr, + double* d_upper = nullptr); /** * Compute an information criterion (AIC, AICc, BIC) @@ -159,7 +176,7 @@ void predict(cumlHandle& handle, const double* d_y, int batch_size, int n_obs, * @param[in] ic_type Type of information criterion wanted. 
* 0: AIC, 1: AICc, 2: BIC */ -void information_criterion(cumlHandle& handle, const double* d_y, +void information_criterion(raft::handle_t& handle, const double* d_y, int batch_size, int n_obs, const ARIMAOrder& order, const ARIMAParams& params, double* ic, int ic_type); @@ -176,7 +193,7 @@ void information_criterion(cumlHandle& handle, const double* d_y, * (all series must be identical) * @param[in] order ARIMA hyper-parameters */ -void estimate_x0(cumlHandle& handle, ARIMAParams& params, +void estimate_x0(raft::handle_t& handle, ARIMAParams& params, const double* d_y, int batch_size, int n_obs, const ARIMAOrder& order); diff --git a/cpp/include/cuml/tsa/batched_kalman.hpp b/cpp/include/cuml/tsa/batched_kalman.hpp index f5935b3752..1cec632aa3 100644 --- a/cpp/include/cuml/tsa/batched_kalman.hpp +++ b/cpp/include/cuml/tsa/batched_kalman.hpp @@ -38,12 +38,18 @@ namespace ML { * shape=(nobs-d-s*D, batch_size) (device) * @param[in] fc_steps Number of steps to forecast * @param[in] d_fc Array to store the forecast + * @param[in] level Confidence level for prediction intervals. 0 to + * skip the computation. Else 0 < level < 1 + * @param[out] d_lower Lower limit of the prediction interval + * @param[out] d_upper Upper limit of the prediction interval */ -void batched_kalman_filter(cumlHandle& handle, const double* d_ys_b, int nobs, - const ARIMAParams& params, +void batched_kalman_filter(raft::handle_t& handle, const double* d_ys_b, + int nobs, const ARIMAParams& params, const ARIMAOrder& order, int batch_size, double* d_loglike, double* d_vs, int fc_steps = 0, - double* d_fc = nullptr); + double* d_fc = nullptr, double level = 0, + double* d_lower = nullptr, + double* d_upper = nullptr); /** * Convenience function for batched "jones transform" used in ARIMA to ensure @@ -59,7 +65,7 @@ void batched_kalman_filter(cumlHandle& handle, const double* d_ys_b, int nobs, * (expects pre-allocated array of size * (p+q)*batch_size) (host) */ -void batched_jones_transform(cumlHandle& handle, const ARIMAOrder& order, +void batched_jones_transform(raft::handle_t& handle, const ARIMAOrder& order, int batch_size, bool isInv, const double* h_params, double* h_Tparams); } // namespace ML diff --git a/cpp/include/cuml/tsa/holtwinters.h b/cpp/include/cuml/tsa/holtwinters.h index 2f6a5d08b8..bd3c4f0f28 100644 --- a/cpp/include/cuml/tsa/holtwinters.h +++ b/cpp/include/cuml/tsa/holtwinters.h @@ -75,11 +75,11 @@ void buffer_size(int n, int batch_size, int frequency, * @param[out] error_d * device pointer to array which will hold training SSE error */ -void fit(const ML::cumlHandle &handle, int n, int batch_size, int frequency, +void fit(const raft::handle_t &handle, int n, int batch_size, int frequency, int start_periods, ML::SeasonalType seasonal, float epsilon, float *data, float *level_d, float *trend_d, float *season_d, float *error_d); -void fit(const ML::cumlHandle &handle, int n, int batch_size, int frequency, +void fit(const raft::handle_t &handle, int n, int batch_size, int frequency, int start_periods, ML::SeasonalType seasonal, double epsilon, double *data, double *level_d, double *trend_d, double *season_d, double *error_d); @@ -107,10 +107,10 @@ void fit(const ML::cumlHandle &handle, int n, int batch_size, int frequency, * @param[out] forecast_d * device pointer to array which will hold the forecast points */ -void forecast(const ML::cumlHandle &handle, int n, int batch_size, +void forecast(const raft::handle_t &handle, int n, int batch_size, int frequency, int h, ML::SeasonalType seasonal, float 
*level_d, float *trend_d, float *season_d, float *forecast_d); -void forecast(const ML::cumlHandle &handle, int n, int batch_size, +void forecast(const raft::handle_t &handle, int n, int batch_size, int frequency, int h, ML::SeasonalType seasonal, double *level_d, double *trend_d, double *season_d, double *forecast_d); diff --git a/cpp/include/cuml/tsa/stationarity.h b/cpp/include/cuml/tsa/stationarity.h index 0b1240fbec..98dea34a6e 100644 --- a/cpp/include/cuml/tsa/stationarity.h +++ b/cpp/include/cuml/tsa/stationarity.h @@ -36,10 +36,10 @@ namespace Stationarity { * @param[in] pval_threshold P-value threshold above which a series is * considered stationary */ -void kpss_test(const cumlHandle& handle, const float* d_y, bool* results, +void kpss_test(const raft::handle_t& handle, const float* d_y, bool* results, int batch_size, int n_obs, int d, int D, int s, float pval_threshold); -void kpss_test(const cumlHandle& handle, const double* d_y, bool* results, +void kpss_test(const raft::handle_t& handle, const double* d_y, bool* results, int batch_size, int n_obs, int d, int D, int s, double pval_threshold); diff --git a/cpp/src/arima/batched_arima.cu b/cpp/src/arima/batched_arima.cu index 994d894151..3b7570fdf9 100644 --- a/cpp/src/arima/batched_arima.cu +++ b/cpp/src/arima/batched_arima.cu @@ -28,57 +28,71 @@ #include #include -#include +#include #include #include #include -#include #include -#include #include +#include +#include #include namespace ML { -void batched_diff(cumlHandle& handle, double* d_y_diff, const double* d_y, +void batched_diff(raft::handle_t& handle, double* d_y_diff, const double* d_y, int batch_size, int n_obs, const ARIMAOrder& order) { - const auto stream = handle.getStream(); + const auto stream = handle.get_stream(); MLCommon::TimeSeries::prepare_data(d_y_diff, d_y, batch_size, n_obs, order.d, order.D, order.s, stream); } -void predict(cumlHandle& handle, const double* d_y, int batch_size, int n_obs, - int start, int end, const ARIMAOrder& order, - const ARIMAParams& params, double* d_vs, double* d_y_p) { +void predict(raft::handle_t& handle, const double* d_y, int batch_size, + int n_obs, int start, int end, const ARIMAOrder& order, + const ARIMAParams& params, double* d_y_p, bool pre_diff, + double level, double* d_lower, double* d_upper) { ML::PUSH_RANGE(__func__); - auto allocator = handle.getDeviceAllocator(); - const auto stream = handle.getStream(); + auto allocator = handle.get_device_allocator(); + const auto stream = handle.get_stream(); + + bool diff = order.need_diff() && pre_diff && level == 0; // Prepare data - int diff_obs = order.lost_in_diff(); - int ld_yprep = n_obs - diff_obs; - double* d_y_prep = (double*)allocator->allocate( - ld_yprep * batch_size * sizeof(double), stream); - MLCommon::TimeSeries::prepare_data(d_y_prep, d_y, batch_size, n_obs, order.d, - order.D, order.s, stream); + int n_obs_kf; + const double* d_y_kf; + MLCommon::device_buffer diff_buffer(allocator, stream); + ARIMAOrder order_after_prep = order; + if (diff) { + n_obs_kf = n_obs - order.n_diff(); + diff_buffer.resize(n_obs_kf * batch_size, stream); + MLCommon::TimeSeries::prepare_data(diff_buffer.data(), d_y, batch_size, + n_obs, order.d, order.D, order.s, + stream); + d_y_kf = diff_buffer.data(); + order_after_prep.d = 0; + order_after_prep.D = 0; + } else { + n_obs_kf = n_obs; + d_y_kf = d_y; + } + + // Create temporary array for the residuals + MLCommon::device_buffer v_buffer(allocator, stream, + n_obs_kf * batch_size); + double* d_vs = v_buffer.data(); // Create 
temporary array for the forecasts int num_steps = std::max(end - n_obs, 0); - double* d_y_fc = nullptr; - if (num_steps) { - d_y_fc = (double*)allocator->allocate( - num_steps * batch_size * sizeof(double), stream); - } + MLCommon::device_buffer fc_buffer(allocator, stream, + num_steps * batch_size); + double* d_y_fc = fc_buffer.data(); - // Compute the residual and forecast - provide already prepared data and - // extracted parameters - ARIMAOrder order_after_prep = {order.p, 0, order.q, order.P, - 0, order.Q, order.s, order.k}; + // Compute the residual and forecast std::vector loglike = std::vector(batch_size); /// TODO: use device loglike to avoid useless copy ; part of #2233 - batched_loglike(handle, d_y_prep, batch_size, n_obs - diff_obs, - order_after_prep, params, loglike.data(), d_vs, false, true, - MLE, 0, num_steps, d_y_fc); + batched_loglike(handle, d_y_kf, batch_size, n_obs_kf, order_after_prep, + params, loglike.data(), d_vs, false, true, MLE, 0, num_steps, + d_y_fc, level, d_lower, d_upper); auto counting = thrust::make_counting_iterator(0); int predict_ld = end - start; @@ -87,7 +101,8 @@ void predict(cumlHandle& handle, const double* d_y, int batch_size, int n_obs, // In-sample prediction // - int p_start = std::max(start, diff_obs); + int res_offset = diff ? order.d + order.s * order.D : 0; + int p_start = std::max(start, res_offset); int p_end = std::min(n_obs, end); // The prediction loop starts by filling undefined predictions with NaN, @@ -96,13 +111,13 @@ void predict(cumlHandle& handle, const double* d_y, int batch_size, int n_obs, thrust::for_each(thrust::cuda::par.on(stream), counting, counting + batch_size, [=] __device__(int bid) { d_y_p[0] = 0.0; - for (int i = 0; i < diff_obs - start; i++) { + for (int i = 0; i < res_offset - start; i++) { d_y_p[bid * predict_ld + i] = nan(""); } for (int i = p_start; i < p_end; i++) { d_y_p[bid * predict_ld + i - start] = d_y[bid * n_obs + i] - - d_vs[bid * ld_yprep + i - diff_obs]; + d_vs[bid * n_obs_kf + i - res_offset]; } }); } @@ -112,10 +127,11 @@ void predict(cumlHandle& handle, const double* d_y, int batch_size, int n_obs, // if (num_steps) { - // Add trend and/or undiff - MLCommon::TimeSeries::finalize_forecast(d_y_fc, d_y, num_steps, batch_size, - n_obs, n_obs, order.d, order.D, - order.s, stream); + if (diff) { + MLCommon::TimeSeries::finalize_forecast(d_y_fc, d_y, num_steps, + batch_size, n_obs, n_obs, order.d, + order.D, order.s, stream); + } // Copy forecast in d_y_p thrust::for_each(thrust::cuda::par.on(stream), counting, @@ -125,13 +141,9 @@ void predict(cumlHandle& handle, const double* d_y, int batch_size, int n_obs, d_y_fc[num_steps * bid + i]; } }); - - allocator->deallocate(d_y_fc, num_steps * batch_size * sizeof(double), - stream); + /// TODO: 2D copy kernel? } - allocator->deallocate(d_y_prep, ld_yprep * batch_size * sizeof(double), - stream); ML::POP_RANGE(); } @@ -199,7 +211,7 @@ __global__ void sum_of_squares_kernel(const DataT* d_y, const DataT* d_mu, threadIdx.x < n_phi ? phi * b_y[i - threadIdx.x - 1 - start_y] : (DataT)0; res -= threadIdx.x < n_theta ? 
theta * b_vs[i - threadIdx.x - 1 - start_v] : (DataT)0; - res = MLCommon::blockReduce(res, temp_smem); + res = raft::blockReduce(res, temp_smem); if (threadIdx.x == 0) { res += b_y[i - start_y] - mu; b_vs[i - start_v] = res; @@ -211,7 +223,7 @@ __global__ void sum_of_squares_kernel(const DataT* d_y, const DataT* d_mu, if (threadIdx.x == 0) { d_loglike[blockIdx.x] = -0.5 * static_cast(n_obs) * - MLCommon::myLog(ssq / static_cast(n_obs - start_sum)); + raft::myLog(ssq / static_cast(n_obs - start_sum)); } } @@ -227,13 +239,13 @@ __global__ void sum_of_squares_kernel(const DataT* d_y, const DataT* d_mu, * @param[out] d_loglike Evaluated log-likelihood (device) * @param[in] truncate Number of observations to skip in the sum */ -void conditional_sum_of_squares(cumlHandle& handle, const double* d_y, +void conditional_sum_of_squares(raft::handle_t& handle, const double* d_y, int batch_size, int n_obs, const ARIMAOrder& order, const ARIMAParams& Tparams, double* d_loglike, int truncate) { ML::PUSH_RANGE(__func__); - auto stream = handle.getStream(); + auto stream = handle.get_stream(); int n_phi = order.n_phi(); int n_theta = order.n_theta(); @@ -243,7 +255,7 @@ void conditional_sum_of_squares(cumlHandle& handle, const double* d_y, int start_v = start_sum - n_theta; // Compute the sum-of-squares and the log-likelihood - int n_warps = std::max(MLCommon::ceildiv(max_lags, 32), 1); + int n_warps = std::max(raft::ceildiv(max_lags, 32), 1); size_t shared_mem_size = (2 * n_obs - start_y - start_v + n_warps) * sizeof(double); sum_of_squares_kernel<<>>( @@ -255,22 +267,21 @@ void conditional_sum_of_squares(cumlHandle& handle, const double* d_y, ML::POP_RANGE(); } -void batched_loglike(cumlHandle& handle, const double* d_y, int batch_size, +void batched_loglike(raft::handle_t& handle, const double* d_y, int batch_size, int n_obs, const ARIMAOrder& order, const ARIMAParams& params, double* loglike, double* d_vs, bool trans, bool host_loglike, LoglikeMethod method, int truncate, int fc_steps, - double* d_fc) { + double* d_fc, double level, double* d_lower, + double* d_upper) { ML::PUSH_RANGE(__func__); - auto allocator = handle.getDeviceAllocator(); - auto stream = handle.getStream(); + auto allocator = handle.get_device_allocator(); + auto stream = handle.get_stream(); ARIMAParams Tparams; - if (method != MLE && fc_steps) { - /// TODO: add warning when solving #2232 - method = MLE; - } + ASSERT(method == MLE || fc_steps == 0, + "Only MLE method is valid for forecasting"); /* Create log-likelihood device array if host pointer is provided */ double* d_loglike; @@ -294,36 +305,18 @@ void batched_loglike(cumlHandle& handle, const double* d_y, int batch_size, Tparams = params; } - if (!order.need_prep()) { - if (method == CSS) { - conditional_sum_of_squares(handle, d_y, batch_size, n_obs, order, Tparams, - d_loglike, truncate); - } else { - batched_kalman_filter(handle, d_y, n_obs, Tparams, order, batch_size, - d_loglike, d_vs, fc_steps, d_fc); - } + if (method == CSS) { + conditional_sum_of_squares(handle, d_y, batch_size, n_obs, order, Tparams, + d_loglike, truncate); } else { - MLCommon::device_buffer y_prep( - allocator, stream, batch_size * (n_obs - order.lost_in_diff())); - double* d_y_prep = y_prep.data(); - - MLCommon::TimeSeries::prepare_data(d_y_prep, d_y, batch_size, n_obs, - order.d, order.D, order.s, stream); - - if (method == CSS) { - conditional_sum_of_squares(handle, d_y_prep, batch_size, - n_obs - order.lost_in_diff(), order, Tparams, - d_loglike, truncate); - } else { - 
batched_kalman_filter(handle, d_y_prep, n_obs - order.lost_in_diff(), - Tparams, order, batch_size, d_loglike, d_vs, - fc_steps, d_fc); - } + batched_kalman_filter(handle, d_y, n_obs, Tparams, order, batch_size, + d_loglike, d_vs, fc_steps, d_fc, level, d_lower, + d_upper); } if (host_loglike) { /* Transfer log-likelihood device -> host */ - MLCommon::updateHost(loglike, d_loglike, batch_size, stream); + raft::update_host(loglike, d_loglike, batch_size, stream); } if (trans) { @@ -332,49 +325,55 @@ ML::POP_RANGE(); } -void batched_loglike(cumlHandle& handle, const double* d_y, int batch_size, +void batched_loglike(raft::handle_t& handle, const double* d_y, int batch_size, int n_obs, const ARIMAOrder& order, const double* d_params, double* loglike, double* d_vs, bool trans, bool host_loglike, LoglikeMethod method, int truncate, - int fc_steps, double* d_fc) { + int fc_steps, double* d_fc, double level, double* d_lower, + double* d_upper) { ML::PUSH_RANGE(__func__); // unpack parameters - auto allocator = handle.getDeviceAllocator(); - auto stream = handle.getStream(); + auto allocator = handle.get_device_allocator(); + auto stream = handle.get_stream(); ARIMAParams<double> params; params.allocate(order, batch_size, allocator, stream, false); params.unpack(order, batch_size, d_params, stream); batched_loglike(handle, d_y, batch_size, n_obs, order, params, loglike, d_vs, - trans, host_loglike, method, truncate, fc_steps, d_fc); + trans, host_loglike, method, truncate, fc_steps, d_fc, level, + d_lower, d_upper); params.deallocate(order, batch_size, allocator, stream, false); + ML::POP_RANGE(); } -void batched_loglike_grad(cumlHandle& handle, const double* d_y, int batch_size, - int n_obs, const ARIMAOrder& order, const double* d_x, - double* d_grad, double h, bool trans, - LoglikeMethod method, int truncate) { +void batched_loglike_grad(raft::handle_t& handle, const double* d_y, + int batch_size, int n_obs, const ARIMAOrder& order, + const double* d_x, double* d_grad, double h, + bool trans, LoglikeMethod method, int truncate) { ML::PUSH_RANGE(__func__); - auto allocator = handle.getDeviceAllocator(); - auto stream = handle.getStream(); + auto allocator = handle.get_device_allocator(); + auto stream = handle.get_stream(); auto counting = thrust::make_counting_iterator(0); int N = order.complexity(); // Initialize the perturbed x vector MLCommon::device_buffer<double> x_pert(allocator, stream, N * batch_size); double* d_x_pert = x_pert.data(); - MLCommon::copy(d_x_pert, d_x, N * batch_size, stream); + raft::copy(d_x_pert, d_x, N * batch_size, stream); // Create buffers for the log-likelihood and residuals - MLCommon::device_buffer<double> ll_pos(allocator, stream, batch_size); - MLCommon::device_buffer<double> ll_neg(allocator, stream, batch_size); - MLCommon::device_buffer<double> res( - allocator, stream, (n_obs - order.lost_in_diff()) * batch_size); - double* d_ll_pos = ll_pos.data(); - double* d_ll_neg = ll_neg.data(); + MLCommon::device_buffer<double> ll_base(allocator, stream, batch_size); + MLCommon::device_buffer<double> ll_pert(allocator, stream, batch_size); + MLCommon::device_buffer<double> res(allocator, stream, n_obs * batch_size); + double* d_ll_base = ll_base.data(); + double* d_ll_pert = ll_pert.data(); + + // Evaluate the log-likelihood with the given parameter vector + batched_loglike(handle, d_y, batch_size, n_obs, order, d_x, d_ll_base, + res.data(), trans, false, method, truncate); for (int i = 0; i < N; i++) { // Add the perturbation to the i-th parameter @@
-384,24 +383,14 @@ void batched_loglike_grad(cumlHandle& handle, const double* d_y, int batch_size, }); // Evaluate the log-likelihood with the positive perturbation - batched_loglike(handle, d_y, batch_size, n_obs, order, d_x_pert, d_ll_pos, - res.data(), trans, false, method, truncate); - - // Subtract the perturbation to the i-th parameter - thrust::for_each(thrust::cuda::par.on(stream), counting, - counting + batch_size, [=] __device__(int bid) { - d_x_pert[N * bid + i] = d_x[N * bid + i] - h; - }); - - // Evaluate the log-likelihood with the negative perturbation - batched_loglike(handle, d_y, batch_size, n_obs, order, d_x_pert, d_ll_neg, + batched_loglike(handle, d_y, batch_size, n_obs, order, d_x_pert, d_ll_pert, res.data(), trans, false, method, truncate); - // First derivative with a second-order accuracy + // First derivative with a first-order accuracy thrust::for_each(thrust::cuda::par.on(stream), counting, counting + batch_size, [=] __device__(int bid) { d_grad[N * bid + i] = - (d_ll_pos[bid] - d_ll_neg[bid]) / (2.0 * h); + (d_ll_pert[bid] - d_ll_base[bid]) / h; }); // Reset the i-th parameter @@ -413,27 +402,26 @@ void batched_loglike_grad(cumlHandle& handle, const double* d_y, int batch_size, ML::POP_RANGE(); } -void information_criterion(cumlHandle& handle, const double* d_y, +void information_criterion(raft::handle_t& handle, const double* d_y, int batch_size, int n_obs, const ARIMAOrder& order, const ARIMAParams& params, double* d_ic, int ic_type) { ML::PUSH_RANGE(__func__); - auto allocator = handle.getDeviceAllocator(); - auto stream = handle.getStream(); - double* d_vs = (double*)allocator->allocate( - sizeof(double) * (n_obs - order.lost_in_diff()) * batch_size, stream); + auto allocator = handle.get_device_allocator(); + auto stream = handle.get_stream(); + + MLCommon::device_buffer v_buffer(allocator, stream, + n_obs * batch_size); /* Compute log-likelihood in d_ic */ - batched_loglike(handle, d_y, batch_size, n_obs, order, params, d_ic, d_vs, - false, false); + batched_loglike(handle, d_y, batch_size, n_obs, order, params, d_ic, + v_buffer.data(), false, false, MLE); /* Compute information criterion from log-likelihood and base term */ MLCommon::Metrics::Batched::information_criterion( d_ic, d_ic, static_cast(ic_type), - order.complexity(), batch_size, n_obs - order.lost_in_diff(), stream); + order.complexity(), batch_size, n_obs - order.n_diff(), stream); - allocator->deallocate( - d_vs, sizeof(double) * (n_obs - order.lost_in_diff()) * batch_size, stream); ML::POP_RANGE(); } @@ -481,15 +469,15 @@ DI bool test_invparams(const double* params, int pq) { * ARMA model (with or without seasonality) * @note: in this function the non-seasonal case has s=1, not s=0! 
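Stepping back to the batched_loglike_grad change above: the former central difference (two log-likelihood evaluations per parameter) is replaced by a one-sided forward difference that reuses a single base evaluation, i.e. grad_i ≈ (ll(x + h·e_i) − ll(x)) / h. A minimal scalar sketch of the scheme; the quadratic stand-in for the log-likelihood is illustrative only:

```cpp
#include <cstdio>
#include <vector>

// Hedged sketch of the forward-difference scheme now used by
// batched_loglike_grad: evaluate ll(x) once, then ll(x + h*e_i) once per
// parameter. The lambda is a stand-in objective, not the ARIMA likelihood.
int main() {
  auto ll = [](const std::vector<double>& x) {
    return -(x[0] - 1.0) * (x[0] - 1.0) - 2.0 * (x[1] + 0.5) * (x[1] + 0.5);
  };
  std::vector<double> x = {0.3, 0.2};
  const double h = 1e-6;
  const double base = ll(x);  // single base evaluation, reused for all i
  for (std::size_t i = 0; i < x.size(); i++) {
    std::vector<double> xp = x;
    xp[i] += h;  // perturb only the i-th parameter
    std::printf("grad[%zu] ~= %.4f\n", i, (ll(xp) - base) / h);
  }
  return 0;
}
```

This halves the number of log-likelihood evaluations per optimizer step, at the cost of dropping from O(h^2) to O(h) truncation error, which is exactly what the updated "first-order accuracy" comment records.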
*/ -void _arma_least_squares(cumlHandle& handle, double* d_ar, double* d_ma, +void _arma_least_squares(raft::handle_t& handle, double* d_ar, double* d_ma, double* d_sigma2, const MLCommon::LinAlg::Batched::Matrix& bm_y, int p, int q, int s, bool estimate_sigma2, int k = 0, double* d_mu = nullptr) { - const auto& handle_impl = handle.getImpl(); - auto stream = handle_impl.getStream(); - auto cublas_handle = handle_impl.getCublasHandle(); - auto allocator = handle_impl.getDeviceAllocator(); + const auto& handle_impl = handle; + auto stream = handle_impl.get_stream(); + auto cublas_handle = handle_impl.get_cublas_handle(); + auto allocator = handle_impl.get_device_allocator(); auto counting = thrust::make_counting_iterator(0); int batch_size = bm_y.batches(); @@ -581,8 +569,8 @@ void _arma_least_squares(cumlHandle& handle, double* d_ar, double* d_ma, MLCommon::LinAlg::Batched::Matrix bm_final_residual( n_obs - r, 1, batch_size, cublas_handle, allocator, stream, false); if (estimate_sigma2) { - MLCommon::copy(bm_final_residual.raw_data(), bm_arma_fit.raw_data(), - (n_obs - r) * batch_size, stream); + raft::copy(bm_final_residual.raw_data(), bm_arma_fit.raw_data(), + (n_obs - r) * batch_size, stream); } // ARMA fit @@ -655,7 +643,7 @@ void _arma_least_squares(cumlHandle& handle, double* d_ar, double* d_ma, * Auxiliary function of estimate_x0: compute the starting parameters for * the series pre-processed by estimate_x0 */ -void _start_params(cumlHandle& handle, ARIMAParams& params, +void _start_params(raft::handle_t& handle, ARIMAParams& params, const MLCommon::LinAlg::Batched::Matrix& bm_y, const ARIMAOrder& order) { // Estimate an ARMA fit without seasonality @@ -670,14 +658,14 @@ void _start_params(cumlHandle& handle, ARIMAParams& params, order.p + order.q + order.k == 0); } -void estimate_x0(cumlHandle& handle, ARIMAParams& params, +void estimate_x0(raft::handle_t& handle, ARIMAParams& params, const double* d_y, int batch_size, int n_obs, const ARIMAOrder& order) { ML::PUSH_RANGE(__func__); - const auto& handle_impl = handle.getImpl(); - auto stream = handle_impl.getStream(); - auto cublas_handle = handle_impl.getCublasHandle(); - auto allocator = handle_impl.getDeviceAllocator(); + const auto& handle_impl = handle; + auto stream = handle_impl.get_stream(); + auto cublas_handle = handle_impl.get_cublas_handle(); + auto allocator = handle_impl.get_device_allocator(); // Difference if necessary, copy otherwise MLCommon::LinAlg::Batched::Matrix bm_yd( diff --git a/cpp/src/arima/batched_kalman.cu b/cpp/src/arima/batched_kalman.cu index 7c1304c716..932f50d07c 100644 --- a/cpp/src/arima/batched_kalman.cu +++ b/cpp/src/arima/batched_kalman.cu @@ -24,42 +24,55 @@ #include #include -#include -#include +#include +#include #include +#include #include -#include #include -#include +#include +#include #include #include namespace ML { //! Thread-local Matrix-Vector multiplication. 
-template -__device__ void Mv_l(const double* A, const double* v, double* out) { - for (int i = 0; i < r; i++) { +template +DI void Mv_l(const double* A, const double* v, double* out) { + for (int i = 0; i < n; i++) { double sum = 0.0; - for (int j = 0; j < r; j++) { - sum += A[i + j * r] * v[j]; + for (int j = 0; j < n; j++) { + sum += A[i + j * n] * v[j]; } out[i] = sum; } } +template +DI void Mv_l(double alpha, const double* A, const double* v, double beta, + double* out) { + for (int i = 0; i < n; i++) { + double sum = 0.0; + for (int j = 0; j < n; j++) { + sum += A[i + j * n] * v[j]; + } + out[i] = alpha * sum + beta * out[i]; + } +} + //! Thread-local Matrix-Matrix multiplication. -template -__device__ void MM_l(const double* A, const double* B, double* out) { - for (int i = 0; i < r; i++) { - for (int j = 0; j < r; j++) { +template +DI void MM_l(const double* A, const double* B, double* out) { + for (int i = 0; i < n; i++) { + for (int j = 0; j < n; j++) { double sum = 0.0; - for (int k = 0; k < r; k++) { - double Aik = aT ? A[k + i * r] : A[i + k * r]; - double Bkj = bT ? B[j + k * r] : B[k + j * r]; + for (int k = 0; k < n; k++) { + double Aik = aT ? A[k + i * n] : A[i + k * n]; + double Bkj = bT ? B[j + k * n] : B[k + j * n]; sum += Aik * Bkj; } - out[i + j * r] = sum; + out[i + j * n] = sum; } } } @@ -73,7 +86,7 @@ __device__ void MM_l(const double* A, const double* B, double* out) { * @param[in] nobs Number of observation per series * @param[in] T Batched transition matrix. (r x r) * @param[in] Z Batched "design" vector (1 x r) - * @param[in] RRT Batched R*R.T (R="selection" vector) (r x r) + * @param[in] RQR Batched R*Q*R' (r x r) * @param[in] P Batched P (r x r) * @param[in] alpha Batched state vector (r x 1) * @param[in] intercept Do we fit an intercept? @@ -82,40 +95,44 @@ __device__ void MM_l(const double* A, const double* B, double* out) { * @param[out] vs Batched residuals (nobs) * @param[out] Fs Batched variance of prediction errors (nobs) * @param[out] sum_logFs Batched sum of the logs of Fs (1) + * @param[in] n_diff d + s*D * @param[in] fc_steps Number of steps to forecast - * @param[in] d_fc Array to store the forecast + * @param[out] d_fc Array to store the forecast + * @param[in] conf_int Whether to compute confidence intervals + * @param[in] d_F_fc Batched variance of forecast errors (fc_steps) */ -template +template __global__ void batched_kalman_loop_kernel( const double* ys, int nobs, const double* T, const double* Z, - const double* RRT, const double* P, const double* alpha, bool intercept, + const double* RQR, const double* P, const double* alpha, bool intercept, const double* d_mu, int batch_size, double* vs, double* Fs, double* sum_logFs, - int fc_steps = 0, double* d_fc = nullptr) { - constexpr int r2 = r * r; - double l_RRT[r2]; - double l_T[r2]; - // double l_Z[r]; // note: will be used when introducing exogeneous var. 
- double l_P[r2]; - double l_alpha[r]; - double l_K[r]; - double l_tmp[r2]; - double l_TP[r2]; + int n_diff, int fc_steps = 0, double* d_fc = nullptr, bool conf_int = false, + double* d_F_fc = nullptr) { + constexpr int rd2 = rd * rd; + double l_RQR[rd2]; + double l_T[rd2]; + double l_Z[rd]; + double l_P[rd2]; + double l_alpha[rd]; + double l_K[rd]; + double l_tmp[rd2]; + double l_TP[rd2]; int bid = blockDim.x * blockIdx.x + threadIdx.x; if (bid < batch_size) { // Load global mem into registers { - int b_r_offset = bid * r; - int b_r2_offset = bid * r2; - for (int i = 0; i < r2; i++) { - l_RRT[i] = RRT[b_r2_offset + i]; - l_T[i] = T[b_r2_offset + i]; - l_P[i] = P[b_r2_offset + i]; + int b_rd_offset = bid * rd; + int b_rd2_offset = bid * rd2; + for (int i = 0; i < rd2; i++) { + l_RQR[i] = RQR[b_rd2_offset + i]; + l_T[i] = T[b_rd2_offset + i]; + l_P[i] = P[b_rd2_offset + i]; } - for (int i = 0; i < r; i++) { - // l_Z[i] = Z[b_r_offset + i]; - l_alpha[i] = alpha[b_r_offset + i]; + for (int i = 0; i < rd; i++) { + if (n_diff > 0) l_Z[i] = Z[b_rd_offset + i]; + l_alpha[i] = alpha[b_rd_offset + i]; } } @@ -127,65 +144,128 @@ __global__ void batched_kalman_loop_kernel( double mu = intercept ? d_mu[bid] : 0.0; for (int it = 0; it < nobs; it++) { - // 1. & 2. - double vs_it; - double _Fs = l_P[0]; - vs_it = b_ys[it] - l_alpha[0]; + // 1. v = y - Z*alpha + double vs_it = b_ys[it]; + if (n_diff == 0) + vs_it -= l_alpha[0]; + else { + for (int i = 0; i < rd; i++) { + vs_it -= l_alpha[i] * l_Z[i]; + } + } b_vs[it] = vs_it; + + // 2. F = Z*P*Z' + double _Fs; + if (n_diff == 0) + _Fs = l_P[0]; + else { + _Fs = 0.0; + for (int i = 0; i < rd; i++) { + for (int j = 0; j < rd; j++) { + _Fs += l_P[j * rd + i] * l_Z[i] * l_Z[j]; + } + } + } b_Fs[it] = _Fs; - b_sum_logFs += log(_Fs); + if (it >= n_diff) b_sum_logFs += log(_Fs); // 3. K = 1/Fs[it] * T*P*Z' // TP = T*P - MM_l(l_T, l_P, l_TP); - // K = 1/Fs[it] * TP*Z' ; optimized for Z = (1 0 ... 0) + MM_l(l_T, l_P, l_TP); + // K = 1/Fs[it] * TP*Z' double _1_Fs = 1.0 / _Fs; - for (int i = 0; i < r; i++) { - l_K[i] = _1_Fs * l_TP[i]; - } + if (n_diff == 0) { + for (int i = 0; i < rd; i++) { + l_K[i] = _1_Fs * l_TP[i]; + } + } else + Mv_l(_1_Fs, l_TP, l_Z, 0.0, l_K); // 4. alpha = T*alpha + K*vs[it] + c // tmp = T*alpha - Mv_l(l_T, l_alpha, l_tmp); + Mv_l(l_T, l_alpha, l_tmp); // alpha = tmp + K*vs[it] - for (int i = 0; i < r; i++) { + for (int i = 0; i < rd; i++) { l_alpha[i] = l_tmp[i] + l_K[i] * vs_it; } - // alpha_0 = alpha_0 + mu - l_alpha[0] += mu; + // alpha = alpha + c + l_alpha[n_diff] += mu; // 5. L = T - K * Z // L = T (L is tmp) - for (int i = 0; i < r2; i++) { + for (int i = 0; i < rd2; i++) { l_tmp[i] = l_T[i]; } - // L = L - K * Z ; optimized for Z = (1 0 ... 0): - // substract K to the first column of L - for (int i = 0; i < r; i++) { - l_tmp[i] -= l_K[i]; + // L = L - K * Z + if (n_diff == 0) { + for (int i = 0; i < rd; i++) { + l_tmp[i] -= l_K[i]; + } + } else { + for (int i = 0; i < rd; i++) { + for (int j = 0; j < rd; j++) { + l_tmp[j * rd + i] -= l_K[i] * l_Z[j]; + } + } } - // 6. P = T*P*L' + R*R' + // 6. P = T*P*L' + R*Q*R' // P = TP*L' - MM_l(l_TP, l_tmp, l_P); - // P = P + RRT - for (int i = 0; i < r2; i++) { - l_P[i] += l_RRT[i]; + MM_l(l_TP, l_tmp, l_P); + // P = P + RQR + for (int i = 0; i < rd2; i++) { + l_P[i] += l_RQR[i]; } } sum_logFs[bid] = b_sum_logFs; // Forecast - double* b_fc = fc_steps ? 
d_fc + bid * fc_steps : nullptr; - for (int i = 0; i < fc_steps; i++) { - b_fc[i] = l_alpha[0]; - - // alpha = T*alpha + c - Mv_l(l_T, l_alpha, l_tmp); - for (int i = 0; i < r; i++) { - l_alpha[i] = l_tmp[i]; + { + double* b_fc = fc_steps ? d_fc + bid * fc_steps : nullptr; + double* b_F_fc = conf_int ? d_F_fc + bid * fc_steps : nullptr; + for (int it = 0; it < fc_steps; it++) { + if (n_diff == 0) + b_fc[it] = l_alpha[0]; + else { + double pred = 0.0; + for (int i = 0; i < rd; i++) { + pred += l_alpha[i] * l_Z[i]; + } + b_fc[it] = pred; + } + + // alpha = T*alpha + c + Mv_l(l_T, l_alpha, l_tmp); + for (int i = 0; i < rd; i++) { + l_alpha[i] = l_tmp[i]; + } + l_alpha[n_diff] += mu; + + if (conf_int) { + if (n_diff == 0) + b_F_fc[it] = l_P[0]; + else { + double _Fs = 0.0; + for (int i = 0; i < rd; i++) { + for (int j = 0; j < rd; j++) { + _Fs += l_P[j * rd + i] * l_Z[i] * l_Z[j]; + } + } + b_F_fc[it] = _Fs; + } + + // P = T*P*T' + RR' + // TP = T*P + MM_l(l_T, l_P, l_TP); + // P = TP*T' + MM_l(l_TP, l_T, l_P); + // P = P + RR' + for (int i = 0; i < rd2; i++) { + l_P[i] += l_RQR[i]; + } + } } - l_alpha[0] += mu; } } } @@ -198,7 +278,7 @@ __global__ void batched_kalman_loop_kernel( * @param[in] T Batched transition matrix. (r x r) * @param[in] T_sparse Batched sparse matrix T (r x r) * @param[in] Z Batched "design" vector (1 x r) - * @param[in] RRT Batched R*R' (R="selection" vector) (r x r) + * @param[in] RQR Batched R*Q*R' (r x r) * @param[in] P Batched P (r x r) * @param[in] alpha Batched state vector (r x 1) * @param[in] intercept Do we fit an intercept? @@ -207,37 +287,42 @@ __global__ void batched_kalman_loop_kernel( * @param[out] d_vs Batched residuals (nobs) * @param[out] d_Fs Batched variance of prediction errors (nobs) * @param[out] d_sum_logFs Batched sum of the logs of Fs (1) + * @param[in] n_diff d + s*D * @param[in] fc_steps Number of steps to forecast - * @param[in] d_fc Array to store the forecast + * @param[out] d_fc Array to store the forecast + * @param[in] conf_int Whether to compute confidence intervals + * @param[out] d_F_fc Batched variance of forecast errors (fc_steps) */ void _batched_kalman_loop_large( const double* d_ys, int nobs, const MLCommon::LinAlg::Batched::Matrix& T, const MLCommon::Sparse::Batched::CSR& T_sparse, const MLCommon::LinAlg::Batched::Matrix& Z, - const MLCommon::LinAlg::Batched::Matrix& RRT, + const MLCommon::LinAlg::Batched::Matrix& RQR, MLCommon::LinAlg::Batched::Matrix& P, MLCommon::LinAlg::Batched::Matrix& alpha, bool intercept, - const double* d_mu, int r, double* d_vs, double* d_Fs, double* d_sum_logFs, - int fc_steps = 0, double* d_fc = nullptr) { + const double* d_mu, int rd, double* d_vs, double* d_Fs, double* d_sum_logFs, + int n_diff, int fc_steps = 0, double* d_fc = nullptr, bool conf_int = false, + double* d_F_fc = nullptr) { auto stream = T.stream(); auto allocator = T.allocator(); auto cublasHandle = T.cublasHandle(); int nb = T.batches(); - int r2 = r * r; + int rd2 = rd * rd; auto counting = thrust::make_counting_iterator(0); // Temporary matrices and vectors - MLCommon::LinAlg::Batched::Matrix v_tmp(r, 1, nb, cublasHandle, + MLCommon::LinAlg::Batched::Matrix v_tmp(rd, 1, nb, cublasHandle, allocator, stream, false); - MLCommon::LinAlg::Batched::Matrix m_tmp(r, r, nb, cublasHandle, + MLCommon::LinAlg::Batched::Matrix m_tmp(rd, rd, nb, cublasHandle, allocator, stream, false); - MLCommon::LinAlg::Batched::Matrix K(r, 1, nb, cublasHandle, allocator, - stream, false); - MLCommon::LinAlg::Batched::Matrix TP(r, r, nb, cublasHandle, + 
MLCommon::LinAlg::Batched::Matrix K(rd, 1, nb, cublasHandle, + allocator, stream, false); + MLCommon::LinAlg::Batched::Matrix TP(rd, rd, nb, cublasHandle, allocator, stream, false); // Shortcuts + const double* d_Z = Z.raw_data(); double* d_P = P.raw_data(); double* d_alpha = alpha.raw_data(); double* d_K = K.raw_data(); @@ -251,157 +336,263 @@ void _batched_kalman_loop_large( // 1. & 2. thrust::for_each(thrust::cuda::par.on(stream), counting, counting + nb, [=] __device__(int bid) { - d_vs[bid * nobs + it] = - d_ys[bid * nobs + it] - d_alpha[bid * r]; - double l_P = d_P[bid * r2]; - d_Fs[bid * nobs + it] = l_P; - d_sum_logFs[bid] += log(l_P); + const double* b_P = d_P + bid * rd2; + const double* b_Z = d_Z + bid * rd; + const double* b_alpha = d_alpha + bid * rd; + + double vt = d_ys[bid * nobs + it]; + if (n_diff == 0) { + vt -= b_alpha[0]; + } else { + for (int i = 0; i < rd; i++) { + vt -= b_alpha[i] * b_Z[i]; + } + } + d_vs[bid * nobs + it] = vt; + + double _F; + if (n_diff == 0) + _F = b_P[0]; + else { + _F = 0.0; + for (int i = 0; i < rd; i++) { + for (int j = 0; j < rd; j++) { + _F += b_P[j * rd + i] * b_Z[i] * b_Z[j]; + } + } + } + d_Fs[bid * nobs + it] = _F; + if (it >= n_diff) d_sum_logFs[bid] += log(_F); }); // 3. K = 1/Fs[it] * T*P*Z' // TP = T*P (also used later) - if (r <= 32) + if (rd <= 32) MLCommon::Sparse::Batched::b_spmm(1.0, T_sparse, P, 0.0, TP); else - MLCommon::LinAlg::Batched::b_gemm(false, false, r, r, r, 1.0, T, P, 0.0, - TP); - // K = 1/Fs[it] * TP*Z' ; optimized for Z = (1 0 ... 0) + MLCommon::LinAlg::Batched::b_gemm(false, false, rd, rd, rd, 1.0, T, P, + 0.0, TP); + // K = 1/Fs[it] * TP*Z' thrust::for_each(thrust::cuda::par.on(stream), counting, counting + nb, [=] __device__(int bid) { + const double* b_TP = d_TP + bid * rd2; + double* b_K = d_K + bid * rd; + double _1_Fs = 1.0 / d_Fs[bid * nobs + it]; - for (int i = 0; i < r; i++) { - d_K[bid * r + i] = _1_Fs * d_TP[bid * r2 + i]; + if (n_diff == 0) { + for (int i = 0; i < rd; i++) { + b_K[i] = _1_Fs * b_TP[i]; + } + } else { + const double* b_Z = d_Z + bid * rd; + for (int i = 0; i < rd; i++) { + double acc = 0.0; + for (int j = 0; j < rd; j++) { + acc += b_TP[rd * j + i] * b_Z[j]; + } + b_K[i] = _1_Fs * acc; + } } }); // 4. alpha = T*alpha + K*vs[it] + c // v_tmp = T*alpha MLCommon::Sparse::Batched::b_spmv(1.0, T_sparse, alpha, 0.0, v_tmp); - // alpha = v_tmp + K*vs[it] - // alpha_0 = alpha_0 + mu + // alpha = v_tmp + K*vs[it] + c thrust::for_each(thrust::cuda::par.on(stream), counting, counting + nb, [=] __device__(int bid) { + const double* b_Talpha = d_v_tmp + bid * rd; + const double* b_K = d_K + bid * rd; + double* b_alpha = d_alpha + bid * rd; + double _vs = d_vs[bid * nobs + it]; - for (int i = 0; i < r; i++) { - double mu = (intercept && i == 0) ? d_mu[bid] : 0.0; - d_alpha[bid * r + i] = - d_v_tmp[bid * r + i] + _vs * d_K[bid * r + i] + mu; + for (int i = 0; i < rd; i++) { + double mu = + (intercept && i == n_diff) ? d_mu[bid] : 0.0; + b_alpha[i] = b_Talpha[i] + b_K[i] * _vs + mu; } }); // 5. L = T - K * Z // L = T (L is m_tmp) - MLCommon::copy(m_tmp.raw_data(), T.raw_data(), nb * r2, stream); - // L = L - K * Z ; optimized for Z = (1 0 ... 
0): - // substract K to the first column of L + raft::copy(m_tmp.raw_data(), T.raw_data(), nb * rd2, stream); + // L = L - K * Z thrust::for_each(thrust::cuda::par.on(stream), counting, counting + nb, [=] __device__(int bid) { - for (int i = 0; i < r; i++) { - d_m_tmp[bid * r2 + i] -= d_K[bid * r + i]; + const double* b_K = d_K + bid * rd; + double* b_L = d_m_tmp + bid * rd2; + + if (n_diff == 0) { + for (int i = 0; i < rd; i++) { + b_L[i] -= b_K[i]; + } + } else { + const double* b_Z = d_Z + bid * rd; + for (int i = 0; i < rd; i++) { + for (int j = 0; j < rd; j++) { + b_L[j * rd + i] -= b_K[i] * b_Z[j]; + } + } } }); - // MLCommon::LinAlg::Batched::b_gemm(false, false, r, r, 1, -1.0, K, Z, 1.0, + // MLCommon::LinAlg::Batched::b_gemm(false, false, rd, rd, 1, -1.0, K, Z, 1.0, // m_tmp); // generic - // 6. P = T*P*L' + R*R' + // 6. P = T*P*L' + R*Q*R' // P = TP*L' - MLCommon::LinAlg::Batched::b_gemm(false, true, r, r, r, 1.0, TP, m_tmp, 0.0, - P); - // P = P + R*R' - MLCommon::LinAlg::binaryOp( - d_P, d_P, RRT.raw_data(), r2 * nb, + MLCommon::LinAlg::Batched::b_gemm(false, true, rd, rd, rd, 1.0, TP, m_tmp, + 0.0, P); + // P = P + R*Q*R' + raft::linalg::binaryOp( + d_P, d_P, RQR.raw_data(), rd2 * nb, [=] __device__(double a, double b) { return a + b; }, stream); } // Forecast - for (int i = 0; i < fc_steps; i++) { - thrust::for_each( - thrust::cuda::par.on(stream), counting, counting + nb, - [=] __device__(int bid) { d_fc[bid * fc_steps + i] = d_alpha[bid * r]; }); + for (int it = 0; it < fc_steps; it++) { + thrust::for_each(thrust::cuda::par.on(stream), counting, counting + nb, + [=] __device__(int bid) { + const double* b_alpha = d_alpha + bid * rd; - MLCommon::Sparse::Batched::b_spmv(1.0, T_sparse, alpha, 0.0, v_tmp); - MLCommon::copy(d_alpha, v_tmp.raw_data(), r * nb, stream); + double pred; + if (n_diff == 0) { + pred = b_alpha[0]; + } else { + const double* b_Z = d_Z + bid * rd; + pred = 0.0; + for (int i = 0; i < rd; i++) { + pred += b_alpha[i] * b_Z[i]; + } + } + d_fc[bid * fc_steps + it] = pred; + }); + + // alpha = T*alpha + c + // alpha = T*alpha + MLCommon::Sparse::Batched::b_spmv(1.0, T_sparse, alpha, 0.0, v_tmp); + raft::copy(d_alpha, v_tmp.raw_data(), rd * nb, stream); + // alpha += c if (intercept) { thrust::for_each( thrust::cuda::par.on(stream), counting, counting + nb, - [=] __device__(int bid) { d_alpha[bid * r] += d_mu[bid]; }); + [=] __device__(int bid) { d_alpha[bid * rd + n_diff] += d_mu[bid]; }); + } + + if (conf_int) { + thrust::for_each(thrust::cuda::par.on(stream), counting, counting + nb, + [=] __device__(int bid) { + const double* b_P = d_P + bid * rd2; + + double Ft; + if (n_diff == 0) + Ft = b_P[0]; + else { + const double* b_Z = d_Z + bid * rd; + Ft = 0.0; + for (int i = 0; i < rd; i++) { + for (int j = 0; j < rd; j++) { + Ft += b_P[j * rd + i] * b_Z[i] * b_Z[j]; + } + } + } + + d_F_fc[bid * fc_steps + it] = Ft; + }); + + // P = T*P*T' + R*Q*R' + // TP = T*P + if (rd <= 32) + MLCommon::Sparse::Batched::b_spmm(1.0, T_sparse, P, 0.0, TP); + else + MLCommon::LinAlg::Batched::b_gemm(false, false, rd, rd, rd, 1.0, T, P, + 0.0, TP); + // P = TP*T' + MLCommon::LinAlg::Batched::b_gemm(false, true, rd, rd, rd, 1.0, TP, T, + 0.0, P); + // P = P + R*Q*R' + raft::linalg::binaryOp( + d_P, d_P, RQR.raw_data(), rd2 * nb, + [=] __device__(double a, double b) { return a + b; }, stream); } } } /// Wrapper around functions that execute the Kalman loop (for performance) -void batched_kalman_loop(cumlHandle& handle, const double* ys, int nobs, +void 
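[Editor's note] Steps 1–6 of the loop above form one textbook Kalman filter update; the `n_diff == 0` branches are just the special case Z = [1 0 … 0]. A single-batch CPU reference of the same math, assuming column-major storage and writing the batched GEMMs out as naive loops (`kalman_update` is a hypothetical name, not cuML API):

```cpp
#include <vector>

// Hypothetical single-batch CPU reference for one observation of the loop
// above (steps 1-6); column-major rd x rd matrices. Returns the innovation
// v and writes F through F_out. Pass mu = 0 when no intercept is fitted.
double kalman_update(double y, std::vector<double>& alpha,
                     std::vector<double>& P, const std::vector<double>& Z,
                     const std::vector<double>& T,
                     const std::vector<double>& RQR, double mu, int rd,
                     int n_diff, double* F_out) {
  auto at = [rd](const std::vector<double>& M, int i, int j) {
    return M[j * rd + i];
  };
  // 1. v = y - Z*alpha
  double v = y;
  for (int i = 0; i < rd; i++) v -= Z[i] * alpha[i];
  // 2. F = Z*P*Z'
  double F = 0.0;
  for (int j = 0; j < rd; j++)
    for (int i = 0; i < rd; i++) F += at(P, i, j) * Z[i] * Z[j];
  *F_out = F;
  // TP = T*P (reused by steps 3 and 6, as in the device code)
  std::vector<double> TP(rd * rd, 0.0);
  for (int j = 0; j < rd; j++)
    for (int k = 0; k < rd; k++)
      for (int i = 0; i < rd; i++) TP[j * rd + i] += at(T, i, k) * at(P, k, j);
  // 3. K = (1/F) * TP*Z'
  std::vector<double> K(rd, 0.0);
  for (int i = 0; i < rd; i++) {
    for (int j = 0; j < rd; j++) K[i] += TP[j * rd + i] * Z[j];
    K[i] /= F;
  }
  // 4. alpha = T*alpha + K*v + c, with mu entering at state index n_diff
  std::vector<double> Ta(rd, 0.0);
  for (int j = 0; j < rd; j++)
    for (int i = 0; i < rd; i++) Ta[i] += at(T, i, j) * alpha[j];
  for (int i = 0; i < rd; i++)
    alpha[i] = Ta[i] + K[i] * v + (i == n_diff ? mu : 0.0);
  // 5. L = T - K*Z
  std::vector<double> L(T);
  for (int j = 0; j < rd; j++)
    for (int i = 0; i < rd; i++) L[j * rd + i] -= K[i] * Z[j];
  // 6. P = TP*L' + R*Q*R'
  std::vector<double> Pn(rd * rd, 0.0);
  for (int j = 0; j < rd; j++)
    for (int k = 0; k < rd; k++)
      for (int i = 0; i < rd; i++)
        Pn[j * rd + i] += TP[k * rd + i] * at(L, j, k);  // TP * L'
  for (int i = 0; i < rd * rd; i++) P[i] = Pn[i] + RQR[i];
  return v;
}
```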
batched_kalman_loop(raft::handle_t& handle, const double* ys, int nobs, const MLCommon::LinAlg::Batched::Matrix& T, const MLCommon::LinAlg::Batched::Matrix& Z, - const MLCommon::LinAlg::Batched::Matrix& RRT, + const MLCommon::LinAlg::Batched::Matrix& RQR, MLCommon::LinAlg::Batched::Matrix& P0, MLCommon::LinAlg::Batched::Matrix& alpha, std::vector& T_mask, bool intercept, - const double* d_mu, int r, double* vs, double* Fs, - double* sum_logFs, int fc_steps = 0, - double* d_fc = nullptr) { + const double* d_mu, const ARIMAOrder& order, + double* vs, double* Fs, double* sum_logFs, + int fc_steps = 0, double* d_fc = nullptr, + bool conf_int = false, double* d_F_fc = nullptr) { const int batch_size = T.batches(); auto stream = T.stream(); + int rd = order.rd(); + int n_diff = order.n_diff(); dim3 numThreadsPerBlock(32, 1); - dim3 numBlocks(MLCommon::ceildiv(batch_size, numThreadsPerBlock.x), 1); - if (r <= 8) { - switch (r) { + dim3 numBlocks(raft::ceildiv(batch_size, numThreadsPerBlock.x), 1); + if (rd <= 8) { + switch (rd) { case 1: batched_kalman_loop_kernel<1> <<>>( - ys, nobs, T.raw_data(), Z.raw_data(), RRT.raw_data(), P0.raw_data(), + ys, nobs, T.raw_data(), Z.raw_data(), RQR.raw_data(), P0.raw_data(), alpha.raw_data(), intercept, d_mu, batch_size, vs, Fs, sum_logFs, - fc_steps, d_fc); + n_diff, fc_steps, d_fc, conf_int, d_F_fc); break; case 2: batched_kalman_loop_kernel<2> <<>>( - ys, nobs, T.raw_data(), Z.raw_data(), RRT.raw_data(), P0.raw_data(), + ys, nobs, T.raw_data(), Z.raw_data(), RQR.raw_data(), P0.raw_data(), alpha.raw_data(), intercept, d_mu, batch_size, vs, Fs, sum_logFs, - fc_steps, d_fc); + n_diff, fc_steps, d_fc, conf_int, d_F_fc); break; case 3: batched_kalman_loop_kernel<3> <<>>( - ys, nobs, T.raw_data(), Z.raw_data(), RRT.raw_data(), P0.raw_data(), + ys, nobs, T.raw_data(), Z.raw_data(), RQR.raw_data(), P0.raw_data(), alpha.raw_data(), intercept, d_mu, batch_size, vs, Fs, sum_logFs, - fc_steps, d_fc); + n_diff, fc_steps, d_fc, conf_int, d_F_fc); break; case 4: batched_kalman_loop_kernel<4> <<>>( - ys, nobs, T.raw_data(), Z.raw_data(), RRT.raw_data(), P0.raw_data(), + ys, nobs, T.raw_data(), Z.raw_data(), RQR.raw_data(), P0.raw_data(), alpha.raw_data(), intercept, d_mu, batch_size, vs, Fs, sum_logFs, - fc_steps, d_fc); + n_diff, fc_steps, d_fc, conf_int, d_F_fc); break; case 5: batched_kalman_loop_kernel<5> <<>>( - ys, nobs, T.raw_data(), Z.raw_data(), RRT.raw_data(), P0.raw_data(), + ys, nobs, T.raw_data(), Z.raw_data(), RQR.raw_data(), P0.raw_data(), alpha.raw_data(), intercept, d_mu, batch_size, vs, Fs, sum_logFs, - fc_steps, d_fc); + n_diff, fc_steps, d_fc, conf_int, d_F_fc); break; case 6: batched_kalman_loop_kernel<6> <<>>( - ys, nobs, T.raw_data(), Z.raw_data(), RRT.raw_data(), P0.raw_data(), + ys, nobs, T.raw_data(), Z.raw_data(), RQR.raw_data(), P0.raw_data(), alpha.raw_data(), intercept, d_mu, batch_size, vs, Fs, sum_logFs, - fc_steps, d_fc); + n_diff, fc_steps, d_fc, conf_int, d_F_fc); break; case 7: batched_kalman_loop_kernel<7> <<>>( - ys, nobs, T.raw_data(), Z.raw_data(), RRT.raw_data(), P0.raw_data(), + ys, nobs, T.raw_data(), Z.raw_data(), RQR.raw_data(), P0.raw_data(), alpha.raw_data(), intercept, d_mu, batch_size, vs, Fs, sum_logFs, - fc_steps, d_fc); + n_diff, fc_steps, d_fc, conf_int, d_F_fc); break; case 8: batched_kalman_loop_kernel<8> <<>>( - ys, nobs, T.raw_data(), Z.raw_data(), RRT.raw_data(), P0.raw_data(), + ys, nobs, T.raw_data(), Z.raw_data(), RQR.raw_data(), P0.raw_data(), alpha.raw_data(), intercept, d_mu, batch_size, vs, Fs, sum_logFs, - 
fc_steps, d_fc); + n_diff, fc_steps, d_fc, conf_int, d_F_fc); break; } CUDA_CHECK(cudaPeekAtLastError()); @@ -409,31 +600,29 @@ void batched_kalman_loop(cumlHandle& handle, const double* ys, int nobs, // Note: not always used MLCommon::Sparse::Batched::CSR T_sparse = MLCommon::Sparse::Batched::CSR::from_dense( - T, T_mask, handle.getImpl().getcusolverSpHandle()); - _batched_kalman_loop_large(ys, nobs, T, T_sparse, Z, RRT, P0, alpha, - intercept, d_mu, r, vs, Fs, sum_logFs, fc_steps, - d_fc); + T, T_mask, handle.get_cusolver_sp_handle()); + _batched_kalman_loop_large(ys, nobs, T, T_sparse, Z, RQR, P0, alpha, + intercept, d_mu, rd, vs, Fs, sum_logFs, n_diff, + fc_steps, d_fc, conf_int, d_F_fc); } } template -__global__ void batched_kalman_loglike_kernel(const double* d_vs, - const double* d_Fs, - const double* d_sumLogFs, - int nobs, int batch_size, - double* loglike) { +__global__ void batched_kalman_loglike_kernel( + const double* d_vs, const double* d_Fs, const double* d_sumLogFs, int nobs, + int batch_size, double* d_loglike, double* d_sigma2, int n_diff, + double level) { using BlockReduce = cub::BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; int tid = threadIdx.x; int bid = blockIdx.x; - int num_threads = blockDim.x; double bid_sigma2 = 0.0; - for (int it = 0; it < nobs; it += num_threads) { + for (int it = 0; it < nobs; it += NUM_THREADS) { // vs and Fs are in time-major order (memory layout: column major) int idx = (it + tid) + bid * nobs; double d_vs2_Fs = 0.0; - if (it + tid < nobs) { + if (it + tid >= n_diff && it + tid < nobs) { double _vi = d_vs[idx]; d_vs2_Fs = _vi * _vi / d_Fs[idx]; } @@ -442,79 +631,134 @@ __global__ void batched_kalman_loglike_kernel(const double* d_vs, bid_sigma2 += partial_sum; } if (tid == 0) { - double nobs_f = static_cast(nobs); - bid_sigma2 /= nobs_f; - loglike[bid] = - -.5 * (d_sumLogFs[bid] + nobs_f * bid_sigma2 + nobs_f * (log(2 * M_PI))); + double nobs_diff_f = static_cast(nobs - n_diff); + bid_sigma2 /= nobs_diff_f; + if (level != 0) d_sigma2[bid] = bid_sigma2; + d_loglike[bid] = -.5 * (d_sumLogFs[bid] + nobs_diff_f * bid_sigma2 + + nobs_diff_f * (log(2 * M_PI))); } } -void batched_kalman_loglike(const double* d_vs, const double* d_Fs, - const double* d_sumLogFs, int nobs, int batch_size, - double* loglike, cudaStream_t stream) { - constexpr int NUM_THREADS = 128; - batched_kalman_loglike_kernel - <<>>(d_vs, d_Fs, d_sumLogFs, nobs, - batch_size, loglike); - CUDA_CHECK(cudaGetLastError()); +/** + * Kernel to finalize the computation of confidence intervals + * + * @note: One block per batch member, one thread per forecast time step + * + * @param[in] d_fc Mean forecasts + * @param[in] d_sigma2 sum(v_t * v_t / F_t) / n_obs_diff + * @param[inout] d_lower Input: F_{n+t} + * Output: lower bound of the confidence intervals + * @param[out] d_upper Upper bound of the confidence intervals + * @param[in] fc_steps Number of forecast steps + * @param[in] multiplier Coefficient associated with the confidence level + */ +__global__ void confidence_intervals(const double* d_fc, const double* d_sigma2, + double* d_lower, double* d_upper, + int fc_steps, double multiplier) { + int idx = blockIdx.x * fc_steps + threadIdx.x; + double fc = d_fc[idx]; + double margin = multiplier * sqrt(d_lower[idx] * d_sigma2[blockIdx.x]); + d_lower[idx] = fc - margin; + d_upper[idx] = fc + margin; } /// Internal Kalman filter implementation that assumes data exists on GPU. 
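[Editor's note] Once the loop has produced v_t and F_t, the two kernels above finalize everything per batch member: the first n_diff observations are skipped as diffuse, sigma2 = mean(v_t²/F_t) serves as both the log-likelihood scale and the width of the prediction intervals, and the multiplier sqrt(2)·erfinv(level) is the usual Gaussian quantile. A scalar sketch of the same math; standard C++ has no erfinv, so a Newton iteration on std::erf stands in for it (an assumption, not the device implementation):

```cpp
#include <cmath>
#include <vector>

// Scalar sketch of batched_kalman_loglike_kernel / confidence_intervals for
// one batch member; the first n_diff observations are skipped as diffuse.
double loglike_and_sigma2(const std::vector<double>& v,
                          const std::vector<double>& F, int n_diff,
                          double* sigma2_out) {
  constexpr double kTwoPi = 6.283185307179586;
  const int nobs = static_cast<int>(v.size());
  double sum_logF = 0.0, sigma2 = 0.0;
  for (int t = n_diff; t < nobs; t++) {
    sum_logF += std::log(F[t]);
    sigma2 += v[t] * v[t] / F[t];
  }
  const double n = static_cast<double>(nobs - n_diff);
  sigma2 /= n;
  *sigma2_out = sigma2;
  return -0.5 * (sum_logF + n * sigma2 + n * std::log(kTwoPi));
}

// Newton iteration on std::erf standing in for erfinv (assumption;
// converges monotonically from below for the usual levels in (0, 1)).
double erfinv_approx(double y) {
  constexpr double kTwoOverSqrtPi = 1.1283791670955126;  // erf'(0)
  double x = 0.0;
  for (int i = 0; i < 50; i++)
    x -= (std::erf(x) - y) / (kTwoOverSqrtPi * std::exp(-x * x));
  return x;
}

// Interval bounds, matching the kernel: fc +/- z * sqrt(sigma2 * F_fc)
// with z = sqrt(2) * erfinv(level).
void interval(double fc, double F_fc, double sigma2, double level,
              double* lower, double* upper) {
  const double margin =
    std::sqrt(2.0) * erfinv_approx(level) * std::sqrt(sigma2 * F_fc);
  *lower = fc - margin;
  *upper = fc + margin;
}
```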
-void _batched_kalman_filter(cumlHandle& handle, const double* d_ys, int nobs, +void _batched_kalman_filter(raft::handle_t& handle, const double* d_ys, + int nobs, const ARIMAOrder& order, const MLCommon::LinAlg::Batched::Matrix& Zb, const MLCommon::LinAlg::Batched::Matrix& Tb, const MLCommon::LinAlg::Batched::Matrix& Rb, - std::vector& T_mask, int r, double* d_vs, + std::vector& T_mask, double* d_vs, double* d_Fs, double* d_loglike, const double* d_sigma2, bool intercept, - const double* d_mu, int fc_steps = 0, - double* d_fc = nullptr) { + const double* d_mu, int fc_steps, double* d_fc, + double level, double* d_lower, double* d_upper) { const size_t batch_size = Zb.batches(); - auto stream = handle.getStream(); - auto cublasHandle = handle.getImpl().getCublasHandle(); - auto allocator = handle.getDeviceAllocator(); + auto stream = handle.get_stream(); + auto cublasHandle = handle.get_cublas_handle(); + auto allocator = handle.get_device_allocator(); auto counting = thrust::make_counting_iterator(0); - MLCommon::LinAlg::Batched::Matrix RQb(r, 1, batch_size, cublasHandle, + int n_diff = order.n_diff(); + int rd = order.rd(); + int r = order.r(); + + MLCommon::LinAlg::Batched::Matrix RQb(rd, 1, batch_size, cublasHandle, allocator, stream, true); double* d_RQ = RQb.raw_data(); const double* d_R = Rb.raw_data(); thrust::for_each(thrust::cuda::par.on(stream), counting, counting + batch_size, [=] __device__(int bid) { double sigma2 = d_sigma2[bid]; - for (int i = 0; i < r; i++) { - d_RQ[bid * r + i] = d_R[bid * r + i] * sigma2; + for (int i = 0; i < rd; i++) { + d_RQ[bid * rd + i] = d_R[bid * rd + i] * sigma2; } }); - MLCommon::LinAlg::Batched::Matrix RRT = + MLCommon::LinAlg::Batched::Matrix RQR = MLCommon::LinAlg::Batched::b_gemm(RQb, Rb, false, true); // Durbin Koopman "Time Series Analysis" pg 138 ML::PUSH_RANGE("Init P"); - MLCommon::LinAlg::Batched::Matrix P = - MLCommon::LinAlg::Batched::b_lyapunov(Tb, RRT); + MLCommon::LinAlg::Batched::Matrix P(rd, rd, batch_size, cublasHandle, + allocator, stream, true); + { + double* d_P = P.raw_data(); + + if (n_diff > 0) { + // Initialize the diffuse part with a large variance + /// TODO: pass this as a parameter + constexpr double kappa = 1e6; + thrust::for_each(thrust::cuda::par.on(stream), counting, + counting + batch_size, [=] __device__(int bid) { + double* b_P = d_P + rd * rd * bid; + for (int i = 0; i < n_diff; i++) { + b_P[(rd + 1) * i] = kappa; + } + }); + + // Initialize the stationary part by solving a Lyapunov equation + /// TODO: reduce amount of memory copies + MLCommon::LinAlg::Batched::Matrix Ts = + MLCommon::LinAlg::Batched::b_2dcopy(Tb, n_diff, n_diff, r, r); + MLCommon::LinAlg::Batched::Matrix RQRs = + MLCommon::LinAlg::Batched::b_2dcopy(RQR, n_diff, n_diff, r, r); + MLCommon::LinAlg::Batched::Matrix Ps = + MLCommon::LinAlg::Batched::b_lyapunov(Ts, RQRs); + MLCommon::LinAlg::Batched::b_2dcopy(Ps, P, 0, 0, r, r, n_diff, n_diff); + } else { + // Initialize by solving a Lyapunov equation + /// TODO: avoid copy + P = MLCommon::LinAlg::Batched::b_lyapunov(Tb, RQR); + } + } ML::POP_RANGE(); - // Initialize the state alpha as the solution of (I - T) x = c - // Note: optimized as c = (mu 0 ... 0)' + // Initialize the state alpha by solving (I - T*) x* = c with: + // | mu | + // c = | 0 | + // | . 
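[Editor's note] The P initialization below splits the state by block: the d + s·D differencing states are diffuse and receive a large variance kappa on the diagonal, while the stationary r×r ARMA block solves the Lyapunov equation Ps = Ts·Ps·Ts' + RQRs. A toy single-batch sketch, with a naive fixed-point iteration standing in for the batched b_lyapunov solver (an assumption; it converges when Ts is stable, which holds for stationary parameters):

```cpp
#include <vector>

// Sketch of the P0 layout below: diffuse kappa on the first n_diff diagonal
// entries, stationary r x r block from a Lyapunov solve (column-major).
std::vector<double> init_P(const std::vector<double>& T,
                           const std::vector<double>& RQR, int rd, int n_diff,
                           double kappa = 1e6) {
  const int r = rd - n_diff;
  std::vector<double> P(rd * rd, 0.0);
  for (int i = 0; i < n_diff; i++) P[(rd + 1) * i] = kappa;

  // Stationary block: solve Ps = Ts*Ps*Ts' + RQRs by fixed-point iteration
  // (a naive stand-in for b_lyapunov, only for illustration).
  auto Ts = [&](int i, int j) { return T[(j + n_diff) * rd + i + n_diff]; };
  auto RQRs = [&](int i, int j) { return RQR[(j + n_diff) * rd + i + n_diff]; };
  std::vector<double> Ps(r * r, 0.0), tmp(r * r);
  for (int iter = 0; iter < 1000; iter++) {
    for (int j = 0; j < r; j++)
      for (int i = 0; i < r; i++) {
        double acc = RQRs(i, j);
        for (int k = 0; k < r; k++)
          for (int l = 0; l < r; l++)
            acc += Ts(i, k) * Ps[l * r + k] * Ts(j, l);  // (Ts*Ps*Ts')_{ij}
        tmp[j * r + i] = acc;
      }
    Ps.swap(tmp);
  }
  for (int j = 0; j < r; j++)
    for (int i = 0; i < r; i++)
      P[(j + n_diff) * rd + i + n_diff] = Ps[j * r + i];
  return P;
}
```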
| + // | 0 | + // T* = T[d+s*D:, d+s*D:] + // x* = alpha_0[d+s*D:] MLCommon::LinAlg::Batched::Matrix alpha( - r, 1, batch_size, handle.getImpl().getCublasHandle(), - handle.getDeviceAllocator(), stream, true); + rd, 1, batch_size, handle.get_cublas_handle(), + handle.get_device_allocator(), stream, false); if (intercept) { - // Compute I-T + // Compute I-T* MLCommon::LinAlg::Batched::Matrix ImT( r, r, batch_size, cublasHandle, allocator, stream, false); const double* d_T = Tb.raw_data(); double* d_ImT = ImT.raw_data(); thrust::for_each(thrust::cuda::par.on(stream), counting, counting + batch_size, [=] __device__(int bid) { - const double* b_T = d_T + r * r * bid; + const double* b_T = d_T + rd * rd * bid; double* b_ImT = d_ImT + r * r * bid; for (int i = 0; i < r; i++) { for (int j = 0; j < r; j++) { b_ImT[r * j + i] = - (i == j ? 1.0 : 0.0) - b_T[r * j + i]; + (i == j ? 1.0 : 0.0) - + b_T[rd * (j + n_diff) + i + n_diff]; } } }); @@ -524,169 +768,250 @@ void _batched_kalman_filter(cumlHandle& handle, const double* d_ys, int nobs, thrust::for_each(thrust::cuda::par.on(stream), counting, counting + batch_size, [=] __device__(int bid) { if (abs(d_ImT[bid]) < 1e-3) - d_ImT[bid] = MLCommon::signPrim(d_ImT[bid]) * 1e-3; + d_ImT[bid] = raft::signPrim(d_ImT[bid]) * 1e-3; }); } - // Compute (I-T)^-1 + // Compute (I-T*)^-1 MLCommon::LinAlg::Batched::Matrix ImT_inv = ImT.inv(); - // Compute (I-T)^-1 * c -> multiply 1st column by mu + // Compute (I-T*)^-1 * c -> multiply 1st column by mu const double* d_ImT_inv = ImT_inv.raw_data(); double* d_alpha = alpha.raw_data(); thrust::for_each(thrust::cuda::par.on(stream), counting, counting + batch_size, [=] __device__(int bid) { const double* b_ImT_inv = d_ImT_inv + r * r * bid; - double* b_alpha = d_alpha + r * bid; + double* b_alpha = d_alpha + rd * bid; double mu = d_mu[bid]; + for (int i = 0; i < n_diff; i++) { + b_alpha[i] = 0; + } for (int i = 0; i < r; i++) { - b_alpha[i] = b_ImT_inv[i] * mu; + b_alpha[i + n_diff] = b_ImT_inv[i] * mu; } }); + } else { + // Memset alpha to 0 + CUDA_CHECK(cudaMemsetAsync(alpha.raw_data(), 0, + sizeof(double) * rd * batch_size, stream)); } - // init vs, Fs - // In batch-major format. 
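[Editor's note] With an intercept, the stationary part of alpha_0 is the steady state of alpha = T*·alpha + c, i.e. the solution of (I − T*)·x = c with c = (mu, 0, …, 0)'; since c has a single nonzero entry, the device code only needs the first column of (I − T*)⁻¹. A scalar sketch solving the same system directly, with unpivoted Gaussian elimination standing in for the batched inverse (an assumption, for brevity):

```cpp
#include <vector>

// Sketch of the intercept initialization above: solve (I - T*) x = c with
// c = (mu, 0, ..., 0)'; T* is the stationary r x r block of T (column-major).
std::vector<double> steady_state_alpha(const std::vector<double>& T, int rd,
                                       int n_diff, double mu) {
  const int r = rd - n_diff;
  // Build A = I - T* and b = mu * e1
  std::vector<double> A(r * r), b(r, 0.0);
  for (int j = 0; j < r; j++)
    for (int i = 0; i < r; i++)
      A[j * r + i] = (i == j ? 1.0 : 0.0) - T[(j + n_diff) * rd + i + n_diff];
  b[0] = mu;
  // Unpivoted Gaussian elimination (illustration only)
  for (int k = 0; k < r; k++) {
    for (int i = k + 1; i < r; i++) {
      double f = A[k * r + i] / A[k * r + k];
      for (int j = k; j < r; j++) A[j * r + i] -= f * A[j * r + k];
      b[i] -= f * b[k];
    }
  }
  for (int i = r - 1; i >= 0; i--) {
    for (int j = i + 1; j < r; j++) b[i] -= A[j * r + i] * b[j];
    b[i] /= A[i * r + i];
  }
  // Full state: zeros for the n_diff differencing states, then x
  std::vector<double> alpha(rd, 0.0);
  for (int i = 0; i < r; i++) alpha[i + n_diff] = b[i];
  return alpha;
}
```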
- double* d_sumlogFs; - - d_sumlogFs = (double*)handle.getDeviceAllocator()->allocate( - sizeof(double) * batch_size, stream); - - batched_kalman_loop(handle, d_ys, nobs, Tb, Zb, RRT, P, alpha, T_mask, - intercept, d_mu, r, d_vs, d_Fs, d_sumlogFs, fc_steps, - d_fc); + MLCommon::device_buffer sumLogF_buffer(allocator, stream, batch_size); - // Finalize loglikelihood - batched_kalman_loglike(d_vs, d_Fs, d_sumlogFs, nobs, batch_size, d_loglike, - stream); + batched_kalman_loop(handle, d_ys, nobs, Tb, Zb, RQR, P, alpha, T_mask, + intercept, d_mu, order, d_vs, d_Fs, sumLogF_buffer.data(), + fc_steps, d_fc, level > 0, d_lower); - handle.getDeviceAllocator()->deallocate(d_sumlogFs, - sizeof(double) * batch_size, stream); + // Finalize loglikelihood and prediction intervals + MLCommon::device_buffer sigma2_buffer(allocator, stream, batch_size); + constexpr int NUM_THREADS = 128; + batched_kalman_loglike_kernel + <<>>( + d_vs, d_Fs, sumLogF_buffer.data(), nobs, batch_size, d_loglike, + sigma2_buffer.data(), n_diff, level); + CUDA_CHECK(cudaPeekAtLastError()); + if (level > 0) { + confidence_intervals<<>>( + d_fc, sigma2_buffer.data(), d_lower, d_upper, fc_steps, + sqrt(2.0) * erfinv(level)); + CUDA_CHECK(cudaPeekAtLastError()); + } } -void init_batched_kalman_matrices(cumlHandle& handle, const double* d_ar, +void init_batched_kalman_matrices(raft::handle_t& handle, const double* d_ar, const double* d_ma, const double* d_sar, const double* d_sma, int nb, - const ARIMAOrder& order, int r, double* d_Z_b, - double* d_R_b, double* d_T_b, + const ARIMAOrder& order, int rd, + double* d_Z_b, double* d_R_b, double* d_T_b, std::vector& T_mask) { ML::PUSH_RANGE(__func__); - auto stream = handle.getStream(); + auto stream = handle.get_stream(); // Note: Z is unused yet but kept to avoid reintroducing it later when // adding support for exogeneous variables - cudaMemsetAsync(d_Z_b, 0.0, r * nb * sizeof(double), stream); - cudaMemsetAsync(d_R_b, 0.0, r * nb * sizeof(double), stream); - cudaMemsetAsync(d_T_b, 0.0, r * r * nb * sizeof(double), stream); + cudaMemsetAsync(d_Z_b, 0.0, rd * nb * sizeof(double), stream); + cudaMemsetAsync(d_R_b, 0.0, rd * nb * sizeof(double), stream); + cudaMemsetAsync(d_T_b, 0.0, rd * rd * nb * sizeof(double), stream); - auto counting = thrust::make_counting_iterator(0); - thrust::for_each(thrust::cuda::par.on(stream), counting, counting + nb, - [=] __device__(int bid) { - // See TSA pg. 54 for Z,R,T matrices - // Z = [1 0 0 0 ... 0] - d_Z_b[bid * r] = 1.0; - - // |1.0 | - // R = |theta_1 | - // | ... | - // |theta_{r-1}| - // - d_R_b[bid * r] = 1.0; - for (int i = 0; i < r - 1; i++) { - d_R_b[bid * r + i + 1] = - MLCommon::TimeSeries::reduced_polynomial( - bid, d_ma, order.q, d_sma, order.Q, order.s, i + 1); - } + int n_diff = order.n_diff(); + int r = order.r(); - // |phi_1 1.0 0.0 ... 0.0| - // | . 1.0 | - // | . . | - // T = | . . 0.0| - // | . . | - // | . 1.0| - // |phi_r 0.0 0.0 ... 0.0| - // - double* batch_T = d_T_b + bid * r * r; - for (int i = 0; i < r; i++) { - batch_T[i] = - MLCommon::TimeSeries::reduced_polynomial( - bid, d_ar, order.p, d_sar, order.P, order.s, i + 1); - } - // shifted identity - for (int i = 0; i < r - 1; i++) { - batch_T[(i + 1) * r + i] = 1.0; - } + auto counting = thrust::make_counting_iterator(0); + auto n_theta = order.n_theta(); + auto n_phi = order.n_phi(); + thrust::for_each( + thrust::cuda::par.on(stream), counting, counting + nb, + [=] __device__(int bid) { + // See TSA pg. 54 for Z, R, T matrices + + // Z = [ 1 | 0 . . 0 1 0 . . 0 1 | 1 0 . . 
0 ] + // d | s*D | r + for (int i = 0; i < order.d; i++) d_Z_b[bid * rd + i] = 1.0; + for (int i = 1; i <= order.D; i++) + d_Z_b[bid * rd + order.d + i * order.s - 1] = 1.0; + d_Z_b[bid * rd + n_diff] = 1.0; + + // | 0 | + // | . | d + s*D + // | 0 |_ _ + // R = | 1 | + // | theta_1 | r + // | . | + // |theta_{r-1}| + // + d_R_b[bid * rd + n_diff] = 1.0; + for (int i = 0; i < n_theta; i++) { + d_R_b[bid * rd + n_diff + i + 1] = + MLCommon::TimeSeries::reduced_polynomial( + bid, d_ma, order.q, d_sma, order.Q, order.s, i + 1); + } - // If r=2 and phi_2=-1, I-TxT is singular - if (r == 2 && order.p == 2 && abs(batch_T[1] + 1) < 0.01) { - batch_T[1] = -0.99; - } - }); + // | 1 | 0 .. 0 1 | 1 | d + // |_ _|_ _ _ _ _ |_ _ _ _ _ _ _ _ _ |_ _ + // | | 0 .. 0 1 | 1 | + // | | 1 0 | | + // | | . . | | s*D + // | | . . | | + // T = | | 0 1 0 | | + // |_ _|_ _ _ _ _ |_ _ _ _ _ _ _ _ _ |_ _ + // | | | phi_1 1 | + // | | | . 1 0 | + // | | | . . | r + // | | | . 0 . | + // | | | . 1 | + // | | | phi_r 0 . . 0 | + // + // (non-comprehensive example with d=1 and D=1) + // + double* batch_T = d_T_b + bid * rd * rd; + // 1. Differencing component + for (int i = 0; i < order.d; i++) { + for (int j = i; j < order.d; j++) { + batch_T[j * rd + i] = 1.0; + } + } + for (int id = 0; id < order.d; id++) { + batch_T[n_diff * rd + id] = 1.0; + for (int iD = 1; iD <= order.D; iD++) { + batch_T[(order.d + order.s * iD - 1) * rd + id] = 1.0; + } + } + // 2. Seasonal differencing component + for (int iD = 0; iD < order.D; iD++) { + int offset = order.d + iD * order.s; + for (int i = 0; i < order.s - 1; i++) { + batch_T[(offset + i) * rd + offset + i + 1] = 1.0; + } + batch_T[(offset + order.s - 1) * rd + offset] = 1.0; + batch_T[n_diff * rd + offset] = 1.0; + } + if (order.D == 2) { + batch_T[(n_diff - 1) * rd + order.d] = 1.0; + } + // 3. Auto-Regressive component + for (int i = 0; i < n_phi; i++) { + batch_T[n_diff * (rd + 1) + i] = + MLCommon::TimeSeries::reduced_polynomial( + bid, d_ar, order.p, d_sar, order.P, order.s, i + 1); + } + for (int i = 0; i < r - 1; i++) { + batch_T[(n_diff + i + 1) * rd + n_diff + i] = 1.0; + } - T_mask.resize(r * r, false); + // If rd=2 and phi_2=-1, I-TxT is singular + if (rd == 2 && order.p == 2 && abs(batch_T[1] + 1) < 0.01) { + batch_T[1] = -0.99; + } + }); + + // T density/sparsity mask + T_mask.resize(rd * rd, false); + // 1. Differencing component + for (int i = 0; i < order.d; i++) { + for (int j = i; j < order.d; j++) { + T_mask[j * rd + i] = true; + } + } + for (int id = 0; id < order.d; id++) { + T_mask[n_diff * rd + id] = true; + for (int iD = 1; iD <= order.D; iD++) { + T_mask[(order.d + order.s * iD - 1) * rd + id] = true; + } + } + // 2. Seasonal differencing component + for (int iD = 0; iD < order.D; iD++) { + int offset = order.d + iD * order.s; + for (int i = 0; i < order.s - 1; i++) { + T_mask[(offset + i) * rd + offset + i + 1] = true; + } + T_mask[(offset + order.s - 1) * rd + offset] = true; + T_mask[n_diff * rd + offset] = true; + } + if (order.D == 2) { + T_mask[(n_diff - 1) * rd + order.d] = true; + } + // 3. 
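[Editor's note] The block layout used for Z, R, and T above follows directly from the order: rd = d + s·D + r states, of which the first n_diff = d + s·D carry the differencing. A tiny host-side sketch of the Z layout (`build_Z` is a hypothetical helper mirroring the device loops above):

```cpp
#include <vector>

// Z has d ones for the differencing states, a one at each seasonal lag, and
// a leading one for the r-dimensional ARMA block: rd = d + s*D + r.
std::vector<double> build_Z(int d, int D, int s, int r) {
  const int n_diff = d + s * D;
  const int rd = n_diff + r;
  std::vector<double> Z(rd, 0.0);
  for (int i = 0; i < d; i++) Z[i] = 1.0;               // differencing
  for (int i = 1; i <= D; i++) Z[d + i * s - 1] = 1.0;  // seasonal diff
  Z[n_diff] = 1.0;                                      // ARMA block
  return Z;
}
// e.g. d=1, D=1, s=4, r=2  ->  Z = [1, 0, 0, 0, 1, 1, 0], rd = 7
```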
Auto-Regressive component for (int iP = 0; iP < order.P + 1; iP++) { for (int ip = 0; ip < order.p + 1; ip++) { int i = iP * order.s + ip - 1; - if (i >= 0) T_mask[i] = true; + if (i >= 0) T_mask[n_diff * (rd + 1) + i] = true; } } for (int i = 0; i < r - 1; i++) { - T_mask[(i + 1) * r + i] = true; + T_mask[(n_diff + i + 1) * rd + n_diff + i] = true; } ML::POP_RANGE(); } -void batched_kalman_filter(cumlHandle& handle, const double* d_ys, int nobs, +void batched_kalman_filter(raft::handle_t& handle, const double* d_ys, int nobs, const ARIMAParams& params, const ARIMAOrder& order, int batch_size, double* d_loglike, double* d_vs, int fc_steps, - double* d_fc) { + double* d_fc, double level, double* d_lower, + double* d_upper) { ML::PUSH_RANGE(__func__); - const size_t ys_len = nobs; - - auto cublasHandle = handle.getImpl().getCublasHandle(); - auto stream = handle.getStream(); - auto allocator = handle.getDeviceAllocator(); + auto cublasHandle = handle.get_cublas_handle(); + auto stream = handle.get_stream(); + auto allocator = handle.get_device_allocator(); // see (3.18) in TSA by D&K - int r = order.r(); + int rd = order.rd(); - MLCommon::LinAlg::Batched::Matrix Zb(1, r, batch_size, cublasHandle, + MLCommon::LinAlg::Batched::Matrix Zb(1, rd, batch_size, cublasHandle, allocator, stream, false); - MLCommon::LinAlg::Batched::Matrix Tb(r, r, batch_size, cublasHandle, + MLCommon::LinAlg::Batched::Matrix Tb(rd, rd, batch_size, cublasHandle, allocator, stream, false); - MLCommon::LinAlg::Batched::Matrix Rb(r, 1, batch_size, cublasHandle, + MLCommon::LinAlg::Batched::Matrix Rb(rd, 1, batch_size, cublasHandle, allocator, stream, false); std::vector T_mask; init_batched_kalman_matrices(handle, params.ar, params.ma, params.sar, - params.sma, batch_size, order, r, Zb.raw_data(), + params.sma, batch_size, order, rd, Zb.raw_data(), Rb.raw_data(), Tb.raw_data(), T_mask); //////////////////////////////////////////////////////////// // Computation - double* d_Fs = - (double*)allocator->allocate(ys_len * batch_size * sizeof(double), stream); - - _batched_kalman_filter(handle, d_ys, nobs, Zb, Tb, Rb, T_mask, r, d_vs, d_Fs, - d_loglike, params.sigma2, static_cast(order.k), - params.mu, fc_steps, d_fc); + MLCommon::device_buffer F_buffer(allocator, stream, + nobs * batch_size); - allocator->deallocate(d_Fs, ys_len * batch_size * sizeof(double), stream); + _batched_kalman_filter(handle, d_ys, nobs, order, Zb, Tb, Rb, T_mask, d_vs, + F_buffer.data(), d_loglike, params.sigma2, + static_cast(order.k), params.mu, fc_steps, d_fc, + level, d_lower, d_upper); ML::POP_RANGE(); } -void batched_jones_transform(cumlHandle& handle, const ARIMAOrder& order, +void batched_jones_transform(raft::handle_t& handle, const ARIMAOrder& order, int batch_size, bool isInv, const double* h_params, double* h_Tparams) { int N = order.complexity(); - auto allocator = handle.getDeviceAllocator(); - auto stream = handle.getStream(); + auto allocator = handle.get_device_allocator(); + auto stream = handle.get_stream(); double* d_params = (double*)allocator->allocate(N * batch_size * sizeof(double), stream); double* d_Tparams = @@ -695,7 +1020,7 @@ void batched_jones_transform(cumlHandle& handle, const ARIMAOrder& order, params.allocate(order, batch_size, allocator, stream, false); Tparams.allocate(order, batch_size, allocator, stream, true); - MLCommon::updateDevice(d_params, h_params, N * batch_size, stream); + raft::update_device(d_params, h_params, N * batch_size, stream); params.unpack(order, batch_size, d_params, stream); @@ -705,7 
+1030,7 @@ void batched_jones_transform(cumlHandle& handle, const ARIMAOrder& order, Tparams.pack(order, batch_size, d_Tparams, stream); - MLCommon::updateHost(h_Tparams, d_Tparams, N * batch_size, stream); + raft::update_host(h_Tparams, d_Tparams, N * batch_size, stream); allocator->deallocate(d_params, N * batch_size * sizeof(double), stream); allocator->deallocate(d_Tparams, N * batch_size * sizeof(double), stream); diff --git a/cpp/src/common/allocatorAdapter.hpp b/cpp/src/common/allocatorAdapter.hpp index 302da642a2..06aaf879d7 100644 --- a/cpp/src/common/allocatorAdapter.hpp +++ b/cpp/src/common/allocatorAdapter.hpp @@ -22,7 +22,7 @@ #include -#include +#include #include namespace ML { @@ -95,9 +95,9 @@ class stdAllocatorAdapter { /** * @todo: Complete doxygen documentation * @code{.cpp} - * void foo( const cumlHandle_impl& h, ... , cudaStream_t stream ) + * void foo( const raft::handle_t& h, ... , cudaStream_t stream ) * { - * auto execution_policy = ML::thrust_exec_policy(h.getDeviceAllocator(),stream); + * auto execution_policy = ML::thrust_exec_policy(h.get_device_allocator(),stream); * thrust::for_each(execution_policy->on(stream), ... ); * } * @endcode diff --git a/cpp/src/common/cuML_comms_impl.cpp b/cpp/src/common/cuML_comms_impl.cpp deleted file mode 100644 index 41bb9e0dc2..0000000000 --- a/cpp/src/common/cuML_comms_impl.cpp +++ /dev/null @@ -1,137 +0,0 @@ -/* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include - -#include -#include -#include - -namespace MLCommon { - -cumlCommunicator::cumlCommunicator(std::unique_ptr impl) - : _impl(impl.release()) { - ASSERT(nullptr != _impl.get(), "ERROR: Invalid cumlCommunicator_iface used!"); -} - -int cumlCommunicator::getSize() const { return _impl->getSize(); } - -int cumlCommunicator::getRank() const { return _impl->getRank(); } - -cumlCommunicator cumlCommunicator::commSplit(int color, int key) const { - return cumlCommunicator(_impl->commSplit(color, key)); -} - -void cumlCommunicator::barrier() const { _impl->barrier(); } - -cumlCommunicator::status_t cumlCommunicator::syncStream( - cudaStream_t stream) const { - return _impl->syncStream(stream); -} - -void cumlCommunicator::isend(const void* buf, int size, int dest, int tag, - request_t* request) const { - _impl->isend(buf, size, dest, tag, request); -} - -void cumlCommunicator::irecv(void* buf, int size, int source, int tag, - request_t* request) const { - _impl->irecv(buf, size, source, tag, request); -} - -void cumlCommunicator::waitall(int count, request_t array_of_requests[]) const { - _impl->waitall(count, array_of_requests); -} - -void cumlCommunicator::allreduce(const void* sendbuff, void* recvbuff, - int count, datatype_t datatype, op_t op, - cudaStream_t stream) const { - _impl->allreduce(sendbuff, recvbuff, count, datatype, op, stream); -} - -void cumlCommunicator::bcast(void* buff, int count, datatype_t datatype, - int root, cudaStream_t stream) const { - _impl->bcast(buff, count, datatype, root, stream); -} - -void cumlCommunicator::reduce(const void* sendbuff, void* recvbuff, int count, - datatype_t datatype, op_t op, int root, - cudaStream_t stream) const { - _impl->reduce(sendbuff, recvbuff, count, datatype, op, root, stream); -} - -void cumlCommunicator::allgather(const void* sendbuff, void* recvbuff, - int sendcount, datatype_t datatype, - cudaStream_t stream) const { - _impl->allgather(sendbuff, recvbuff, sendcount, datatype, stream); -} - -void cumlCommunicator::allgatherv(const void* sendbuf, void* recvbuf, - const int recvcounts[], const int displs[], - datatype_t datatype, - cudaStream_t stream) const { - _impl->allgatherv(sendbuf, recvbuf, recvcounts, displs, datatype, stream); -} - -void cumlCommunicator::reducescatter(const void* sendbuff, void* recvbuff, - int recvcount, datatype_t datatype, - op_t op, cudaStream_t stream) const { - _impl->reducescatter(sendbuff, recvbuff, recvcount, datatype, op, stream); -} - -template <> -cumlCommunicator::datatype_t cumlCommunicator::getDataType() const { - return cumlCommunicator::CHAR; -} - -template <> -cumlCommunicator::datatype_t cumlCommunicator::getDataType() const { - return cumlCommunicator::UINT8; -} - -template <> -cumlCommunicator::datatype_t cumlCommunicator::getDataType() const { - return cumlCommunicator::INT; -} - -template <> -cumlCommunicator::datatype_t cumlCommunicator::getDataType() const { - return cumlCommunicator::UINT; -} - -template <> -cumlCommunicator::datatype_t cumlCommunicator::getDataType() const { - return cumlCommunicator::INT64; -} - -template <> -cumlCommunicator::datatype_t cumlCommunicator::getDataType() const { - return cumlCommunicator::UINT64; -} - -template <> -cumlCommunicator::datatype_t cumlCommunicator::getDataType() const { - return cumlCommunicator::FLOAT; -} - -template <> -cumlCommunicator::datatype_t cumlCommunicator::getDataType() const { - return cumlCommunicator::DOUBLE; -} - -cumlCommunicator_iface::~cumlCommunicator_iface() {} - -} // namespace MLCommon diff --git 
a/cpp/src/common/cumlHandle.cpp b/cpp/src/common/cumlHandle.cpp index c0b1bfe50e..c4697c14b5 100644 --- a/cpp/src/common/cumlHandle.cpp +++ b/cpp/src/common/cumlHandle.cpp @@ -15,235 +15,22 @@ */ #include "cumlHandle.hpp" -#include -#include -#include -#include +#include +#include +#include +#include #include #include namespace ML { -int cumlHandle::getDefaultNumInternalStreams() { - return _default_num_internal_streams; -} - -cumlHandle::cumlHandle(int n_streams) : _impl(new cumlHandle_impl(n_streams)) {} -cumlHandle::cumlHandle() : _impl(new cumlHandle_impl()) {} -cumlHandle::~cumlHandle() {} - -void cumlHandle::setStream(cudaStream_t stream) { _impl->setStream(stream); } - -cudaStream_t cumlHandle::getStream() const { return _impl->getStream(); } - -const cudaDeviceProp& cumlHandle::getDeviceProperties() const { - return _impl->getDeviceProperties(); -} - -std::vector cumlHandle::getInternalStreams() const { - return _impl->getInternalStreams(); -} - -void cumlHandle::setDeviceAllocator( - std::shared_ptr allocator) { - _impl->setDeviceAllocator(allocator); -} - -std::shared_ptr cumlHandle::getDeviceAllocator() const { - return _impl->getDeviceAllocator(); -} - -void cumlHandle::setHostAllocator(std::shared_ptr allocator) { - _impl->setHostAllocator(allocator); -} - -std::shared_ptr cumlHandle::getHostAllocator() const { - return _impl->getHostAllocator(); -} -int cumlHandle::getNumInternalStreams() { - return _impl->getNumInternalStreams(); -} -const cumlHandle_impl& cumlHandle::getImpl() const { return *_impl.get(); } - -cumlHandle_impl& cumlHandle::getImpl() { return *_impl.get(); } - -using MLCommon::defaultDeviceAllocator; -using MLCommon::defaultHostAllocator; - -cumlHandle_impl::cumlHandle_impl(int n_streams) - : _cublas_handle(), - _cusolverDn_handle(), - _cusolverSp_handle(), - _cusparse_handle(), - _userStream(nullptr), - _event(), - _deviceAllocator(std::make_shared()), - _hostAllocator(std::make_shared()), - _communicator(), - _streams(), - _prop(), - _dev_id([]() -> int { - int cur_dev = -1; - CUDA_CHECK(cudaGetDevice(&cur_dev)); - return cur_dev; - }()), - _num_streams(n_streams), - _cublasInitialized(false), - _cusolverDnInitialized(false), - _cusolverSpInitialized(false), - _cusparseInitialized(false), - _devicePropInitialized(false) { - createResources(); -} - -cumlHandle_impl::~cumlHandle_impl() { destroyResources(); } - -int cumlHandle_impl::getDevice() const { return _dev_id; } - -void cumlHandle_impl::setStream(cudaStream_t stream) { _userStream = stream; } - -cudaStream_t cumlHandle_impl::getStream() const { return _userStream; } - -const cudaDeviceProp& cumlHandle_impl::getDeviceProperties() const { - if (!_devicePropInitialized) { - CUDA_CHECK(cudaGetDeviceProperties(&_prop, _dev_id)); - _devicePropInitialized = true; - } - return _prop; -} - -void cumlHandle_impl::setDeviceAllocator( - std::shared_ptr allocator) { - _deviceAllocator = allocator; -} - -std::shared_ptr cumlHandle_impl::getDeviceAllocator() const { - return _deviceAllocator; -} - -void cumlHandle_impl::setHostAllocator( - std::shared_ptr allocator) { - _hostAllocator = allocator; -} - -std::shared_ptr cumlHandle_impl::getHostAllocator() const { - return _hostAllocator; -} - -cublasHandle_t cumlHandle_impl::getCublasHandle() const { - if (!_cublasInitialized) { - CUBLAS_CHECK(cublasCreate(&_cublas_handle)); - _cublasInitialized = true; - } - return _cublas_handle; -} - -cusolverDnHandle_t cumlHandle_impl::getcusolverDnHandle() const { - if (!_cusolverDnInitialized) { - 
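[Editor's note] The deleted cumlHandle/cumlHandle_impl accessors map one-to-one onto raft::handle_t. A sketch of the replacement pattern, using only raft::handle_t calls that appear elsewhere in this diff (the include path is an assumption):

```cpp
#include <raft/handle.hpp>  // assumed include path for raft::handle_t

// Replacement pattern for the deleted cumlHandle accessors above.
void use_handle(raft::handle_t& handle, cudaStream_t user_stream) {
  handle.set_stream(user_stream);                           // was setStream
  cudaStream_t stream = handle.get_stream();                // was getStream
  cublasHandle_t cublas = handle.get_cublas_handle();       // was getCublasHandle
  cusolverDnHandle_t dn = handle.get_cusolver_dn_handle();  // was getcusolverDnHandle
  cusolverSpHandle_t sp = handle.get_cusolver_sp_handle();  // was getcusolverSpHandle
  auto dev_alloc = handle.get_device_allocator();           // was getDeviceAllocator
  auto host_alloc = handle.get_host_allocator();            // was getHostAllocator
  (void)stream; (void)cublas; (void)dn; (void)sp;
  (void)dev_alloc; (void)host_alloc;
}
```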
CUSOLVER_CHECK(cusolverDnCreate(&_cusolverDn_handle)); - _cusolverDnInitialized = true; - } - return _cusolverDn_handle; -} - -cusolverSpHandle_t cumlHandle_impl::getcusolverSpHandle() const { - if (!_cusolverSpInitialized) { - CUSOLVER_CHECK(cusolverSpCreate(&_cusolverSp_handle)); - _cusolverSpInitialized = true; - } - return _cusolverSp_handle; -} - -cusparseHandle_t cumlHandle_impl::getcusparseHandle() const { - if (!_cusparseInitialized) { - CUSPARSE_CHECK(cusparseCreate(&_cusparse_handle)); - _cusparseInitialized = true; - } - return _cusparse_handle; -} - -cudaStream_t cumlHandle_impl::getInternalStream(int sid) const { - return _streams[sid]; -} - -int cumlHandle_impl::getNumInternalStreams() const { return _num_streams; } - -std::vector cumlHandle_impl::getInternalStreams() const { - std::vector int_streams_vec(_num_streams); - for (auto s : _streams) { - int_streams_vec.push_back(s); - } - return int_streams_vec; -} - -void cumlHandle_impl::waitOnUserStream() const { - CUDA_CHECK(cudaEventRecord(_event, _userStream)); - for (auto s : _streams) { - CUDA_CHECK(cudaStreamWaitEvent(s, _event, 0)); - } -} - -void cumlHandle_impl::waitOnInternalStreams() const { - for (auto s : _streams) { - CUDA_CHECK(cudaEventRecord(_event, s)); - CUDA_CHECK(cudaStreamWaitEvent(_userStream, _event, 0)); - } -} - -void cumlHandle_impl::setCommunicator( - std::shared_ptr communicator) { - _communicator = communicator; -} - -const MLCommon::cumlCommunicator& cumlHandle_impl::getCommunicator() const { - ASSERT(nullptr != _communicator.get(), - "ERROR: Communicator was not initialized\n"); - return *_communicator; -} - -bool cumlHandle_impl::commsInitialized() const { - return (nullptr != _communicator.get()); -} - -void cumlHandle_impl::createResources() { - cudaStream_t stream; - CUDA_CHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); - _streams.push_back(stream); - for (int i = 1; i < _num_streams; ++i) { - cudaStream_t stream; - CUDA_CHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); - _streams.push_back(stream); - } - CUDA_CHECK(cudaEventCreateWithFlags(&_event, cudaEventDisableTiming)); -} - -void cumlHandle_impl::destroyResources() { - if (_cusparseInitialized) { - CUSPARSE_CHECK_NO_THROW(cusparseDestroy(_cusparse_handle)); - } - if (_cusolverDnInitialized) { - CUSOLVER_CHECK_NO_THROW(cusolverDnDestroy(_cusolverDn_handle)); - } - if (_cusolverSpInitialized) { - CUSOLVER_CHECK_NO_THROW(cusolverSpDestroy(_cusolverSp_handle)); - } - if (_cublasInitialized) { - CUBLAS_CHECK_NO_THROW(cublasDestroy(_cublas_handle)); - } - while (!_streams.empty()) { - CUDA_CHECK_NO_THROW(cudaStreamDestroy(_streams.back())); - _streams.pop_back(); - } - CUDA_CHECK_NO_THROW(cudaEventDestroy(_event)); -} - HandleMap handleMap; std::pair HandleMap::createAndInsertHandle() { cumlError_t status = CUML_SUCCESS; cumlHandle_t chosen_handle; try { - auto handle_ptr = new ML::cumlHandle(); + auto handle_ptr = new raft::handle_t(); bool inserted; { std::lock_guard guard(_mapMutex); @@ -274,19 +61,20 @@ std::pair HandleMap::createAndInsertHandle() { return std::pair(chosen_handle, status); } -std::pair HandleMap::lookupHandlePointer( +std::pair HandleMap::lookupHandlePointer( cumlHandle_t handle) const { std::lock_guard guard(_mapMutex); auto it = _handleMap.find(handle); if (it == _handleMap.end()) { - return std::pair(nullptr, CUML_INVALID_HANDLE); + return std::pair(nullptr, + CUML_INVALID_HANDLE); } else { - return std::pair(it->second, CUML_SUCCESS); + return std::pair(it->second, CUML_SUCCESS); } } 
cumlError_t HandleMap::removeAndDestroyHandle(cumlHandle_t handle) { - ML::cumlHandle* handle_ptr; + raft::handle_t* handle_ptr; { std::lock_guard guard(_mapMutex); auto it = _handleMap.find(handle); diff --git a/cpp/src/common/cumlHandle.hpp b/cpp/src/common/cumlHandle.hpp index 0b732f4ccd..864d2c8cc2 100644 --- a/cpp/src/common/cumlHandle.hpp +++ b/cpp/src/common/cumlHandle.hpp @@ -26,10 +26,11 @@ #include #include -#include +#include +#include #include -#include +#include #include @@ -38,65 +39,6 @@ namespace ML { using MLCommon::deviceAllocator; using MLCommon::hostAllocator; -/** - * @todo: Add doxygen documentation - */ -class cumlHandle_impl { - public: - cumlHandle_impl(int n_streams = cumlHandle::getDefaultNumInternalStreams()); - ~cumlHandle_impl(); - int getDevice() const; - void setStream(cudaStream_t stream); - cudaStream_t getStream() const; - void setDeviceAllocator(std::shared_ptr allocator); - std::shared_ptr getDeviceAllocator() const; - void setHostAllocator(std::shared_ptr allocator); - std::shared_ptr getHostAllocator() const; - - cublasHandle_t getCublasHandle() const; - cusolverDnHandle_t getcusolverDnHandle() const; - cusolverSpHandle_t getcusolverSpHandle() const; - cusparseHandle_t getcusparseHandle() const; - - cudaStream_t getInternalStream(int sid) const; - int getNumInternalStreams() const; - - std::vector getInternalStreams() const; - - void waitOnUserStream() const; - void waitOnInternalStreams() const; - - void setCommunicator( - std::shared_ptr communicator); - const MLCommon::cumlCommunicator& getCommunicator() const; - bool commsInitialized() const; - - const cudaDeviceProp& getDeviceProperties() const; - - private: - mutable cublasHandle_t _cublas_handle; - mutable cusolverDnHandle_t _cusolverDn_handle; - mutable cusolverSpHandle_t _cusolverSp_handle; - mutable cusparseHandle_t _cusparse_handle; - cudaStream_t _userStream; - cudaEvent_t _event; - std::shared_ptr _deviceAllocator; - std::shared_ptr _hostAllocator; - std::shared_ptr _communicator; - std::vector _streams; - mutable cudaDeviceProp _prop; - const int _dev_id; - const int _num_streams; - mutable bool _cublasInitialized; - mutable bool _cusolverDnInitialized; - mutable bool _cusolverSpInitialized; - mutable bool _cusparseInitialized; - mutable bool _devicePropInitialized; - - void createResources(); - void destroyResources(); -}; - /** * Map from integral cumlHandle_t identifiers to cumlHandle pointer protected * by a mutex for thread-safe access. @@ -118,7 +60,7 @@ class HandleMap { * the handle is INVALID_HANDLE. Error code CUML_INAVLID_HANDLE * is returned if the provided `handle` is invald. 
*/ - std::pair lookupHandlePointer( + std::pair lookupHandlePointer( cumlHandle_t handle) const; /** @@ -134,7 +76,7 @@ class HandleMap { -1; //!< sentinel value for invalid ID private: - std::unordered_map + std::unordered_map _handleMap; //!< map from ID to pointer mutable std::mutex _mapMutex; //!< mutex protecting the map cumlHandle_t _nextHandle; //!< value of next handle ID @@ -143,25 +85,4 @@ class HandleMap { /// Static handle map instance (see cumlHandle.cpp) extern HandleMap handleMap; -namespace detail { - -/** - * @todo: Add doxygen documentation - */ -class streamSyncer { - public: - streamSyncer(const cumlHandle_impl& handle) : _handle(handle) { - _handle.waitOnUserStream(); - } - ~streamSyncer() { _handle.waitOnInternalStreams(); } - - streamSyncer(const streamSyncer& other) = delete; - streamSyncer& operator=(const streamSyncer& other) = delete; - - private: - const cumlHandle_impl& _handle; -}; - -} // end namespace detail - } // end namespace ML diff --git a/cpp/src/common/cuml_api.cpp b/cpp/src/common/cuml_api.cpp index d41e721951..fb66ff78ab 100644 --- a/cpp/src/common/cuml_api.cpp +++ b/cpp/src/common/cuml_api.cpp @@ -14,16 +14,18 @@ * limitations under the License. */ -#include #include +#include #include #include +#include +#include #include "cumlHandle.hpp" namespace ML { namespace detail { -class hostAllocatorFunctionWrapper : public MLCommon::hostAllocator { +class hostAllocatorFunctionWrapper : public raft::mr::host::allocator { public: hostAllocatorFunctionWrapper(cuml_allocate allocate_fn, cuml_deallocate deallocate_fn) @@ -44,7 +46,8 @@ class hostAllocatorFunctionWrapper : public MLCommon::hostAllocator { const std::function _deallocate_fn; }; -class deviceAllocatorFunctionWrapper : public MLCommon::deviceAllocator { +class deviceAllocatorFunctionWrapper + : public raft::mr::device::default_allocator { public: deviceAllocatorFunctionWrapper(cuml_allocate allocate_fn, cuml_deallocate deallocate_fn) @@ -87,11 +90,11 @@ extern "C" cumlError_t cumlCreate(cumlHandle_t* handle) { extern "C" cumlError_t cumlSetStream(cumlHandle_t handle, cudaStream_t stream) { cumlError_t status; - ML::cumlHandle* handle_ptr; + raft::handle_t* handle_ptr; std::tie(handle_ptr, status) = ML::handleMap.lookupHandlePointer(handle); if (status == CUML_SUCCESS) { try { - handle_ptr->setStream(stream); + handle_ptr->set_stream(stream); } //TODO: Implement this //catch (const MLCommon::Exception& e) @@ -109,11 +112,11 @@ extern "C" cumlError_t cumlSetStream(cumlHandle_t handle, cudaStream_t stream) { extern "C" cumlError_t cumlGetStream(cumlHandle_t handle, cudaStream_t* stream) { cumlError_t status; - ML::cumlHandle* handle_ptr; + raft::handle_t* handle_ptr; std::tie(handle_ptr, status) = ML::handleMap.lookupHandlePointer(handle); if (status == CUML_SUCCESS) { try { - *stream = handle_ptr->getStream(); + *stream = handle_ptr->get_stream(); } //TODO: Implement this //catch (const MLCommon::Exception& e) @@ -132,14 +135,14 @@ extern "C" cumlError_t cumlSetDeviceAllocator(cumlHandle_t handle, cuml_allocate allocate_fn, cuml_deallocate deallocate_fn) { cumlError_t status; - ML::cumlHandle* handle_ptr; + raft::handle_t* handle_ptr; std::tie(handle_ptr, status) = ML::handleMap.lookupHandlePointer(handle); if (status == CUML_SUCCESS) { try { std::shared_ptr allocator( new ML::detail::deviceAllocatorFunctionWrapper(allocate_fn, deallocate_fn)); - handle_ptr->setDeviceAllocator(allocator); + handle_ptr->set_device_allocator(allocator); } //TODO: Implement this //catch (const MLCommon::Exception& e) 
@@ -158,14 +161,14 @@ extern "C" cumlError_t cumlSetHostAllocator(cumlHandle_t handle, cuml_allocate allocate_fn, cuml_deallocate deallocate_fn) { cumlError_t status; - ML::cumlHandle* handle_ptr; + raft::handle_t* handle_ptr; std::tie(handle_ptr, status) = ML::handleMap.lookupHandlePointer(handle); if (status == CUML_SUCCESS) { try { std::shared_ptr allocator( new ML::detail::hostAllocatorFunctionWrapper(allocate_fn, deallocate_fn)); - handle_ptr->setHostAllocator(allocator); + handle_ptr->set_host_allocator(allocator); } //TODO: Implement this //catch (const MLCommon::Exception& e) diff --git a/cpp/src/common/logger.cpp b/cpp/src/common/logger.cpp index 8da2c5384b..2a01754e2b 100644 --- a/cpp/src/common/logger.cpp +++ b/cpp/src/common/logger.cpp @@ -18,7 +18,9 @@ #include // NOLINT #include +#include #include +#include namespace ML { @@ -48,7 +50,10 @@ Logger& Logger::get() { return logger; } -Logger::Logger() : logger{spdlog::stdout_color_mt("cuml")}, currPattern() { +Logger::Logger() + : sink{std::make_shared()}, + logger{std::make_shared("cuml", sink)}, + currPattern() { setPattern(DefaultPattern); setLevel(CUML_LEVEL_INFO); } @@ -63,6 +68,12 @@ void Logger::setPattern(const std::string& pattern) { logger->set_pattern(pattern); } +void Logger::setCallback(spdlog::sinks::LogCallback callback) { + sink->set_callback(callback); +} + +void Logger::setFlush(void (*flush)()) { sink->set_flush(flush); } + bool Logger::shouldLogFor(int level) const { level = convert_level_to_spdlog(level); auto level_e = static_cast(level); @@ -87,6 +98,8 @@ void Logger::log(int level, const char* fmt, ...) { } } +void Logger::flush() { logger->flush(); } + PatternSetter::PatternSetter(const std::string& pattern) : prevPattern() { prevPattern = Logger::get().getPattern(); Logger::get().setPattern(pattern); diff --git a/cpp/src/common/nvtx.cu b/cpp/src/common/nvtx.cu index a2f8fb9be9..a1cad6420c 100644 --- a/cpp/src/common/nvtx.cu +++ b/cpp/src/common/nvtx.cu @@ -54,7 +54,7 @@ uint32_t hsv2rgb(float h, float s, float v) { } // convert hue from [0, 1] range to [0, 360] float h_deg = h * 360.f; - if (0.f < h_deg || h_deg >= 360.f) h_deg = 0.f; + if (0.f > h_deg || h_deg >= 360.f) h_deg = 0.f; h_deg /= 60.f; int h_range = (int)h_deg; float h_mod = h_deg - h_range; @@ -147,16 +147,7 @@ void PUSH_RANGE(const char *name) { nvtxRangePushEx(&eventAttrib); } -#ifdef ENABLE_EMPTY_MARKER_KERNEL -__global__ void emptyMarkerKernel() {} -#endif // ENABLE_EMPTY_MARKER_KERNEL - -void POP_RANGE() { - nvtxRangePop(); -#ifdef ENABLE_EMPTY_MARKER_KERNEL - emptyMarkerKernel<<<1, 1>>>(); -#endif -} +void POP_RANGE() { nvtxRangePop(); } #else // NVTX_ENABLED diff --git a/cpp/src/common/tensor.hpp b/cpp/src/common/tensor.hpp index 18fe6d2b0f..5646d421c2 100644 --- a/cpp/src/common/tensor.hpp +++ b/cpp/src/common/tensor.hpp @@ -16,6 +16,7 @@ #pragma once +#include #include #include diff --git a/cpp/src/comms/cuML_comms_test.cpp b/cpp/src/comms/cuML_comms_test.cpp deleted file mode 100644 index a15cc77647..0000000000 --- a/cpp/src/comms/cuML_comms_test.cpp +++ /dev/null @@ -1,175 +0,0 @@ -/* - * Copyright (c) 2019, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
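[Editor's note] The one-character nvtx.cu change above fixes an inverted comparison: the old guard `0.f < h_deg || h_deg >= 360.f` was true for every positive hue, so hsv2rgb zeroed almost all hues before conversion. Isolated for clarity:

```cpp
// Corrected out-of-range guard from hsv2rgb: clamp only hues outside [0, 360).
float clamp_hue_deg(float h /* hue in [0, 1] */) {
  float h_deg = h * 360.f;
  if (0.f > h_deg || h_deg >= 360.f) h_deg = 0.f;  // was: 0.f < h_deg || ...
  return h_deg;
}
```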
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "cuML_comms_test.hpp" - -#include -#include -#include -#include - -namespace ML { -namespace Comms { - -bool test_collective_allreduce(const ML::cumlHandle& h) { - const cumlHandle_impl& handle = h.getImpl(); - ML::detail::streamSyncer _(handle); - const MLCommon::cumlCommunicator& communicator = handle.getCommunicator(); - - const int send = 1; - - cudaStream_t stream = handle.getStream(); - - MLCommon::device_buffer temp_d(handle.getDeviceAllocator(), stream); - temp_d.resize(1, stream); - CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), &send, sizeof(int), - cudaMemcpyHostToDevice, stream)); - communicator.allreduce(temp_d.data(), temp_d.data(), 1, - MLCommon::cumlCommunicator::SUM, stream); - int temp_h = 0; - CUDA_CHECK(cudaMemcpyAsync(&temp_h, temp_d.data(), sizeof(int), - cudaMemcpyDeviceToHost, stream)); - CUDA_CHECK(cudaStreamSynchronize(stream)); - communicator.barrier(); - - std::cout << "Clique size: " << communicator.getSize() << std::endl; - std::cout << "final_size: " << temp_h << std::endl; - - return temp_h == communicator.getSize(); -} - -bool test_pointToPoint_simple_send_recv(const ML::cumlHandle& h, - int numTrials) { - const cumlHandle_impl& handle = h.getImpl(); - const MLCommon::cumlCommunicator& communicator = handle.getCommunicator(); - const int rank = communicator.getRank(); - - bool ret = true; - for (int i = 0; i < numTrials; i++) { - std::vector received_data((communicator.getSize() - 1), -1); - - std::vector requests; - requests.resize(2 * (communicator.getSize() - 1)); - int request_idx = 0; - //post receives - for (int r = 0; r < communicator.getSize(); ++r) { - if (r != rank) { - communicator.irecv(received_data.data() + request_idx, 1, r, 0, - requests.data() + request_idx); - ++request_idx; - } - } - - for (int r = 0; r < communicator.getSize(); ++r) { - if (r != rank) { - communicator.isend(&rank, 1, r, 0, requests.data() + request_idx); - ++request_idx; - } - } - - communicator.waitall(requests.size(), requests.data()); - communicator.barrier(); - - if (communicator.getRank() == 0) { - std::cout << "=========================" << std::endl; - std::cout << "Trial " << i << std::endl; - } - - for (int printrank = 0; printrank < communicator.getSize(); ++printrank) { - if (communicator.getRank() == printrank) { - std::cout << "Rank " << communicator.getRank() << " received: ["; - for (int i = 0; i < received_data.size(); i++) { - auto rec = received_data[i]; - std::cout << rec; - if (rec == -1) ret = false; - communicator.barrier(); - if (i < received_data.size() - 1) std::cout << ", "; - } - std::cout << "]" << std::endl; - } - - communicator.barrier(); - } - - if (communicator.getRank() == 0) - std::cout << "=========================" << std::endl; - } - - return ret; -} - -bool test_pointToPoint_recv_any_rank(const ML::cumlHandle& h, int numTrials) { - const cumlHandle_impl& handle = h.getImpl(); - const MLCommon::cumlCommunicator& communicator = handle.getCommunicator(); - const int rank = communicator.getRank(); - - bool ret = true; - for (int i = 0; i < numTrials; i++) { - std::vector received_data((communicator.getSize() 
- 1), -1); - - std::vector requests; - requests.resize(2 * (communicator.getSize() - 1)); - int request_idx = 0; - //post receives - for (int r = 0; r < communicator.getSize(); ++r) { - if (r != rank) { - communicator.irecv(received_data.data() + request_idx, 1, - MLCommon::cumlCommunicator::CUML_ANY_SOURCE, 0, - requests.data() + request_idx); - ++request_idx; - } - } - - for (int r = 0; r < communicator.getSize(); ++r) { - if (r != rank) { - communicator.isend(&rank, 1, r, 0, requests.data() + request_idx); - ++request_idx; - } - } - - std::cout << "Waiting..." << std::endl; - communicator.waitall(requests.size(), requests.data()); - communicator.barrier(); - - if (communicator.getRank() == 0) { - std::cout << "=========================" << std::endl; - std::cout << "Trial " << i << std::endl; - } - - for (int printrank = 0; printrank < communicator.getSize(); ++printrank) { - if (communicator.getRank() == printrank) { - std::cout << "Rank " << communicator.getRank() << " received: ["; - for (int i = 0; i < received_data.size(); i++) { - auto rec = received_data[i]; - std::cout << rec; - if (rec == -1) ret = false; - communicator.barrier(); - if (i < received_data.size() - 1) std::cout << ", "; - } - std::cout << "]" << std::endl; - } - communicator.barrier(); - } - - if (communicator.getRank() == 0) - std::cout << "=========================" << std::endl; - } - - return ret; -} - -}; // namespace Comms -}; // end namespace ML diff --git a/cpp/src/comms/cuML_comms_test.hpp b/cpp/src/comms/cuML_comms_test.hpp deleted file mode 100644 index f363d5d47c..0000000000 --- a/cpp/src/comms/cuML_comms_test.hpp +++ /dev/null @@ -1,44 +0,0 @@ -/* - * Copyright (c) 2019, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -namespace ML { -namespace Comms { - -/** - * @brief Simple allreduce test for single integer value of 1. Each rank - * evaluates whether their allreduced value equals the size of the clique. - * @param[in] handle cumlHandle instance with initialized cumlCommunicator - */ -bool test_collective_allreduce(const ML::cumlHandle& handle); - -/** - * @brief Simple point-to-point test. Each rank passes its rank to all other - * ranks and verifies that it received messages from all other ranks. 
- * @param[in] handle cumlHandle instance with initialized cumlCommunicator - * @param[in] n_trials number of iterations to pass messages - */ -bool test_pointToPoint_simple_send_recv(const ML::cumlHandle& handle, - int n_trials); - -bool test_pointToPoint_recv_any_rank(const ML::cumlHandle& handle, - int numTrials); - -}; // namespace Comms -}; // end namespace ML diff --git a/cpp/src/datasets/make_arima.cu b/cpp/src/datasets/make_arima.cu index dac40d1fb4..f309828d4b 100644 --- a/cpp/src/datasets/make_arima.cu +++ b/cpp/src/datasets/make_arima.cu @@ -22,25 +22,25 @@ namespace ML { namespace Datasets { template -inline void make_arima_helper(const cumlHandle& handle, DataT* out, +inline void make_arima_helper(const raft::handle_t& handle, DataT* out, IdxT batch_size, IdxT n_obs, ARIMAOrder order, DataT scale, DataT noise_scale, DataT intercept_scale, uint64_t seed) { - auto stream = handle.getStream(); - auto allocator = handle.getImpl().getDeviceAllocator(); + auto stream = handle.get_stream(); + auto allocator = handle.get_device_allocator(); MLCommon::Random::make_arima(out, batch_size, n_obs, order, allocator, stream, scale, noise_scale, intercept_scale, seed); } -void make_arima(const cumlHandle& handle, float* out, int batch_size, int n_obs, - ARIMAOrder order, float scale, float noise_scale, +void make_arima(const raft::handle_t& handle, float* out, int batch_size, + int n_obs, ARIMAOrder order, float scale, float noise_scale, float intercept_scale, uint64_t seed) { make_arima_helper(handle, out, batch_size, n_obs, order, scale, noise_scale, intercept_scale, seed); } -void make_arima(const cumlHandle& handle, double* out, int batch_size, +void make_arima(const raft::handle_t& handle, double* out, int batch_size, int n_obs, ARIMAOrder order, double scale, double noise_scale, double intercept_scale, uint64_t seed) { make_arima_helper(handle, out, batch_size, n_obs, order, scale, noise_scale, diff --git a/cpp/src/datasets/make_blobs.cu b/cpp/src/datasets/make_blobs.cu index 0c3713a0ee..8059d2bd89 100644 --- a/cpp/src/datasets/make_blobs.cu +++ b/cpp/src/datasets/make_blobs.cu @@ -21,48 +21,48 @@ namespace ML { namespace Datasets { -void make_blobs(const cumlHandle& handle, float* out, int64_t* labels, +void make_blobs(const raft::handle_t& handle, float* out, int64_t* labels, int64_t n_rows, int64_t n_cols, int64_t n_clusters, bool row_major, const float* centers, const float* cluster_std, const float cluster_std_scalar, bool shuffle, float center_box_min, float center_box_max, uint64_t seed) { MLCommon::Random::make_blobs( - out, labels, n_rows, n_cols, n_clusters, handle.getDeviceAllocator(), - handle.getStream(), row_major, centers, cluster_std, cluster_std_scalar, + out, labels, n_rows, n_cols, n_clusters, handle.get_device_allocator(), + handle.get_stream(), row_major, centers, cluster_std, cluster_std_scalar, shuffle, center_box_min, center_box_max, seed); } -void make_blobs(const cumlHandle& handle, double* out, int64_t* labels, +void make_blobs(const raft::handle_t& handle, double* out, int64_t* labels, int64_t n_rows, int64_t n_cols, int64_t n_clusters, bool row_major, const double* centers, const double* cluster_std, const double cluster_std_scalar, bool shuffle, double center_box_min, double center_box_max, uint64_t seed) { MLCommon::Random::make_blobs( - out, labels, n_rows, n_cols, n_clusters, handle.getDeviceAllocator(), - handle.getStream(), row_major, centers, cluster_std, cluster_std_scalar, + out, labels, n_rows, n_cols, n_clusters, handle.get_device_allocator(), + 
diff --git a/cpp/src/datasets/make_blobs.cu b/cpp/src/datasets/make_blobs.cu
index 0c3713a0ee..8059d2bd89 100644
--- a/cpp/src/datasets/make_blobs.cu
+++ b/cpp/src/datasets/make_blobs.cu
@@ -21,48 +21,48 @@ namespace ML {
 namespace Datasets {
 
-void make_blobs(const cumlHandle& handle, float* out, int64_t* labels,
+void make_blobs(const raft::handle_t& handle, float* out, int64_t* labels,
                 int64_t n_rows, int64_t n_cols, int64_t n_clusters,
                 bool row_major, const float* centers, const float* cluster_std,
                 const float cluster_std_scalar, bool shuffle,
                 float center_box_min, float center_box_max, uint64_t seed) {
   MLCommon::Random::make_blobs(
-    out, labels, n_rows, n_cols, n_clusters, handle.getDeviceAllocator(),
-    handle.getStream(), row_major, centers, cluster_std, cluster_std_scalar,
+    out, labels, n_rows, n_cols, n_clusters, handle.get_device_allocator(),
+    handle.get_stream(), row_major, centers, cluster_std, cluster_std_scalar,
     shuffle, center_box_min, center_box_max, seed);
 }
 
-void make_blobs(const cumlHandle& handle, double* out, int64_t* labels,
+void make_blobs(const raft::handle_t& handle, double* out, int64_t* labels,
                 int64_t n_rows, int64_t n_cols, int64_t n_clusters,
                 bool row_major, const double* centers,
                 const double* cluster_std, const double cluster_std_scalar,
                 bool shuffle, double center_box_min, double center_box_max,
                 uint64_t seed) {
   MLCommon::Random::make_blobs(
-    out, labels, n_rows, n_cols, n_clusters, handle.getDeviceAllocator(),
-    handle.getStream(), row_major, centers, cluster_std, cluster_std_scalar,
+    out, labels, n_rows, n_cols, n_clusters, handle.get_device_allocator(),
+    handle.get_stream(), row_major, centers, cluster_std, cluster_std_scalar,
     shuffle, center_box_min, center_box_max, seed);
 }
 
-void make_blobs(const cumlHandle& handle, float* out, int* labels, int n_rows,
-                int n_cols, int n_clusters, bool row_major,
+void make_blobs(const raft::handle_t& handle, float* out, int* labels,
+                int n_rows, int n_cols, int n_clusters, bool row_major,
                 const float* centers, const float* cluster_std,
                 const float cluster_std_scalar, bool shuffle,
                 float center_box_min, float center_box_max, uint64_t seed) {
   MLCommon::Random::make_blobs(
-    out, labels, n_rows, n_cols, n_clusters, handle.getDeviceAllocator(),
-    handle.getStream(), row_major, centers, cluster_std, cluster_std_scalar,
+    out, labels, n_rows, n_cols, n_clusters, handle.get_device_allocator(),
+    handle.get_stream(), row_major, centers, cluster_std, cluster_std_scalar,
     shuffle, center_box_min, center_box_max, seed);
 }
 
-void make_blobs(const cumlHandle& handle, double* out, int* labels, int n_rows,
-                int n_cols, int n_clusters, bool row_major,
+void make_blobs(const raft::handle_t& handle, double* out, int* labels,
+                int n_rows, int n_cols, int n_clusters, bool row_major,
                 const double* centers, const double* cluster_std,
                 const double cluster_std_scalar, bool shuffle,
                 double center_box_min, double center_box_max, uint64_t seed) {
   MLCommon::Random::make_blobs(
-    out, labels, n_rows, n_cols, n_clusters, handle.getDeviceAllocator(),
-    handle.getStream(), row_major, centers, cluster_std, cluster_std_scalar,
+    out, labels, n_rows, n_cols, n_clusters, handle.get_device_allocator(),
+    handle.get_stream(), row_major, centers, cluster_std, cluster_std_scalar,
     shuffle, center_box_min, center_box_max, seed);
 }
 }  // namespace Datasets
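Caller-side, only the handle type changes: the argument lists above are untouched. A minimal usage sketch for the `int`-indexed overload; the `cuml/datasets/make_blobs.hpp` header path is an assumption:

```cpp
#include <cuda_runtime.h>
#include <raft/handle.hpp>
#include <cuml/datasets/make_blobs.hpp>  // assumed public header

void blobs_example() {
  raft::handle_t handle;  // replaces ML::cumlHandle
  int n_rows = 1000, n_cols = 16, n_clusters = 4;

  float* out = nullptr;
  int* labels = nullptr;
  cudaMalloc(&out, sizeof(float) * n_rows * n_cols);
  cudaMalloc(&labels, sizeof(int) * n_rows);

  // Same argument list as before the port; only the handle type changed.
  ML::Datasets::make_blobs(handle, out, labels, n_rows, n_cols, n_clusters,
                           /*row_major=*/true, /*centers=*/nullptr,
                           /*cluster_std=*/nullptr,
                           /*cluster_std_scalar=*/1.0f, /*shuffle=*/true,
                           /*center_box_min=*/-10.0f,
                           /*center_box_max=*/10.0f, /*seed=*/42ULL);

  cudaFree(out);
  cudaFree(labels);
}
```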
diff --git a/cpp/src/datasets/make_regression.cu b/cpp/src/datasets/make_regression.cu
index 0fd600f2d2..82b30428cf 100644
--- a/cpp/src/datasets/make_regression.cu
+++ b/cpp/src/datasets/make_regression.cu
@@ -22,24 +22,24 @@ namespace ML {
 namespace Datasets {
 
 template <typename DataT, typename IdxT>
-void make_regression_helper(const cumlHandle& handle, DataT* out, DataT* values,
-                            IdxT n_rows, IdxT n_cols, IdxT n_informative,
-                            DataT* coef, IdxT n_targets, DataT bias,
-                            IdxT effective_rank, DataT tail_strength,
-                            DataT noise, bool shuffle, uint64_t seed) {
-  const auto& handle_impl = handle.getImpl();
-  cudaStream_t stream = handle_impl.getStream();
-  cublasHandle_t cublas_handle = handle_impl.getCublasHandle();
-  cusolverDnHandle_t cusolver_handle = handle_impl.getcusolverDnHandle();
-  auto allocator = handle_impl.getDeviceAllocator();
+void make_regression_helper(const raft::handle_t& handle, DataT* out,
+                            DataT* values, IdxT n_rows, IdxT n_cols,
+                            IdxT n_informative, DataT* coef, IdxT n_targets,
+                            DataT bias, IdxT effective_rank,
+                            DataT tail_strength, DataT noise, bool shuffle,
+                            uint64_t seed) {
+  const auto& handle_impl = handle;
+  cudaStream_t stream = handle_impl.get_stream();
+  cublasHandle_t cublas_handle = handle_impl.get_cublas_handle();
+  cusolverDnHandle_t cusolver_handle = handle_impl.get_cusolver_dn_handle();
+  auto allocator = handle_impl.get_device_allocator();
 
   MLCommon::Random::make_regression(
-    out, values, n_rows, n_cols, n_informative, cublas_handle, cusolver_handle,
-    allocator, stream, coef, n_targets, bias, effective_rank, tail_strength,
-    noise, shuffle, seed);
+    handle, out, values, n_rows, n_cols, n_informative, stream, coef, n_targets,
+    bias, effective_rank, tail_strength, noise, shuffle, seed);
 }
 
-void make_regression(const cumlHandle& handle, float* out, float* values,
+void make_regression(const raft::handle_t& handle, float* out, float* values,
                      int64_t n_rows, int64_t n_cols, int64_t n_informative,
                      float* coef, int64_t n_targets, float bias,
                      int64_t effective_rank, float tail_strength, float noise,
@@ -49,7 +49,7 @@ void make_regression(const cumlHandle& handle, float* out, float* values,
                      noise, shuffle, seed);
 }
 
-void make_regression(const cumlHandle& handle, double* out, double* values,
+void make_regression(const raft::handle_t& handle, double* out, double* values,
                      int64_t n_rows, int64_t n_cols, int64_t n_informative,
                      double* coef, int64_t n_targets, double bias,
                      int64_t effective_rank, double tail_strength, double noise,
@@ -59,7 +59,7 @@ void make_regression(const cumlHandle& handle, double* out, double* values,
                      noise, shuffle, seed);
 }
 
-void make_regression(const cumlHandle& handle, float* out, float* values,
+void make_regression(const raft::handle_t& handle, float* out, float* values,
                      int n_rows, int n_cols, int n_informative, float* coef,
                      int n_targets, float bias, int effective_rank,
                      float tail_strength, float noise, bool shuffle,
@@ -69,7 +69,7 @@ void make_regression(const cumlHandle& handle, float* out, float* values,
                      noise, shuffle, seed);
 }
 
-void make_regression(const cumlHandle& handle, double* out, double* values,
+void make_regression(const raft::handle_t& handle, double* out, double* values,
                      int n_rows, int n_cols, int n_informative, double* coef,
                      int n_targets, double bias, int effective_rank,
                      double tail_strength, double noise, bool shuffle,
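One substantive change in `make_regression.cu` goes beyond renaming: the prim now takes the handle itself and derives the cuBLAS/cuSOLVER resources and allocator internally, rather than receiving them piecemeal. Abridged from the hunk above:

```cpp
// Before the port: raw library handles and the allocator threaded through.
//   MLCommon::Random::make_regression(out, values, n_rows, n_cols,
//     n_informative, cublas_handle, cusolver_handle, allocator, stream,
//     coef, n_targets, bias, effective_rank, tail_strength, noise,
//     shuffle, seed);
//
// After the port: the handle carries those resources.
//   MLCommon::Random::make_regression(handle, out, values, n_rows, n_cols,
//     n_informative, stream, coef, n_targets, bias, effective_rank,
//     tail_strength, noise, shuffle, seed);
```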
diff --git a/cpp/src/dbscan/adjgraph/algo.cuh b/cpp/src/dbscan/adjgraph/algo.cuh
index a9082b6134..f80a827d7c 100644
--- a/cpp/src/dbscan/adjgraph/algo.cuh
+++ b/cpp/src/dbscan/adjgraph/algo.cuh
@@ -20,7 +20,7 @@
 #include
 #include
 #include
-#include
+#include
 #include "../common.cuh"
 #include "pack.h"
@@ -41,12 +41,12 @@ static const int TPB_X = 256;
 * CSR row_ind_ptr array (adj_graph) and filters into a core_pts array based on min_pts.
 */
 template <typename Index_>
-void launcher(const ML::cumlHandle_impl &handle, Pack<Index_> data,
-              Index_ batchSize, cudaStream_t stream) {
+void launcher(const raft::handle_t &handle, Pack<Index_> data, Index_ batchSize,
+              cudaStream_t stream) {
   device_ptr<Index_> dev_vd = device_pointer_cast(data.vd);
   device_ptr<Index_> dev_ex_scan = device_pointer_cast(data.ex_scan);
 
-  ML::thrustAllocatorAdapter alloc(handle.getDeviceAllocator(), stream);
+  ML::thrustAllocatorAdapter alloc(handle.get_device_allocator(), stream);
   exclusive_scan(thrust::cuda::par(alloc).on(stream), dev_vd,
                  dev_vd + batchSize, dev_ex_scan);
diff --git a/cpp/src/dbscan/adjgraph/naive.cuh b/cpp/src/dbscan/adjgraph/naive.cuh
index dbffae372d..ae44f5a59e 100644
--- a/cpp/src/dbscan/adjgraph/naive.cuh
+++ b/cpp/src/dbscan/adjgraph/naive.cuh
@@ -16,10 +16,10 @@
 
 #pragma once
 
-#include
+#include
 #include
 #include
-#include
+#include
 #include "../common.cuh"
 #include "pack.h"
@@ -28,23 +28,23 @@ namespace AdjGraph {
 namespace Naive {
 
 template <typename Index_>
-void launcher(const ML::cumlHandle_impl& handle, Pack<Index_> data,
-              Index_ batchSize, cudaStream_t stream) {
+void launcher(const raft::handle_t& handle, Pack<Index_> data, Index_ batchSize,
+              cudaStream_t stream) {
   Index_ k = 0;
   Index_ N = data.N;
-  MLCommon::host_buffer<Index_> host_vd(handle.getHostAllocator(), stream,
+  MLCommon::host_buffer<Index_> host_vd(handle.get_host_allocator(), stream,
                                         batchSize + 1);
-  MLCommon::host_buffer<bool> host_core_pts(handle.getHostAllocator(), stream,
+  MLCommon::host_buffer<bool> host_core_pts(handle.get_host_allocator(), stream,
                                             batchSize);
-  MLCommon::host_buffer<bool> host_adj(handle.getHostAllocator(), stream,
+  MLCommon::host_buffer<bool> host_adj(handle.get_host_allocator(), stream,
                                        batchSize * N);
-  MLCommon::host_buffer<Index_> host_ex_scan(handle.getHostAllocator(), stream,
-                                             batchSize);
-  MLCommon::updateHost(host_adj.data(), data.adj, batchSize * N, stream);
-  MLCommon::updateHost(host_vd.data(), data.vd, batchSize + 1, stream);
+  MLCommon::host_buffer<Index_> host_ex_scan(handle.get_host_allocator(),
+                                             stream, batchSize);
+  raft::update_host(host_adj.data(), data.adj, batchSize * N, stream);
+  raft::update_host(host_vd.data(), data.vd, batchSize + 1, stream);
   CUDA_CHECK(cudaStreamSynchronize(stream));
   size_t adjgraph_size = size_t(host_vd[batchSize]);
-  MLCommon::host_buffer<Index_> host_adj_graph(handle.getHostAllocator(),
+  MLCommon::host_buffer<Index_> host_adj_graph(handle.get_host_allocator(),
                                                stream, adjgraph_size);
   for (Index_ i = 0; i < batchSize; i++) {
     for (Index_ j = 0; j < N; j++) {
@@ -59,11 +59,10 @@ void launcher(const ML::cumlHandle_impl& handle, Pack<Index_> data,
   host_ex_scan[0] = Index_(0);
   for (Index_ i = 1; i < batchSize; i++)
     host_ex_scan[i] = host_ex_scan[i - 1] + host_vd[i - 1];
-  MLCommon::updateDevice(data.adj_graph, host_adj_graph.data(), adjgraph_size,
-                         stream);
-  MLCommon::updateDevice(data.core_pts, host_core_pts.data(), batchSize,
-                         stream);
-  MLCommon::updateDevice(data.ex_scan, host_ex_scan.data(), batchSize, stream);
+  raft::update_device(data.adj_graph, host_adj_graph.data(), adjgraph_size,
+                      stream);
+  raft::update_device(data.core_pts, host_core_pts.data(), batchSize, stream);
+  raft::update_device(data.ex_scan, host_ex_scan.data(), batchSize, stream);
 }
 }  // namespace Naive
 }  // namespace AdjGraph
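`MLCommon::updateHost`/`updateDevice` become `raft::update_host`/`raft::update_device` with the same argument order (destination, source, count, stream) and the same asynchronous semantics, which is why the `cudaStreamSynchronize` above survives the port. A hedged round-trip sketch; the `raft/cudart_utils.h` header path is an assumption:

```cpp
#include <vector>
#include <raft/handle.hpp>
#include <raft/cudart_utils.h>  // assumed home of update_host/update_device

void roundtrip(const raft::handle_t& handle, int* d_vals, int n) {
  cudaStream_t stream = handle.get_stream();
  std::vector<int> h_vals(n);

  raft::update_host(h_vals.data(), d_vals, n, stream);  // device -> host, async
  CUDA_CHECK(cudaStreamSynchronize(stream));            // required before reads

  for (int& v : h_vals) v += 1;  // mutate on the host

  raft::update_device(d_vals, h_vals.data(), n, stream);  // host -> device
}
```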
-         Index_* adj_graph, Index_ adjnnz, Index_* ex_scan, Index_ N,
-         Index_ minpts, bool* core_pts, int algo, Index_ batchSize,
-         cudaStream_t stream) {
+void run(const raft::handle_t& handle, bool* adj, Index_* vd, Index_* adj_graph,
+         Index_ adjnnz, Index_* ex_scan, Index_ N, Index_ minpts,
+         bool* core_pts, int algo, Index_ batchSize, cudaStream_t stream) {
   Pack<Index_> data = {vd, adj, adj_graph, adjnnz, ex_scan, core_pts, N, minpts};
   switch (algo) {
diff --git a/cpp/src/dbscan/dbscan.cu b/cpp/src/dbscan/dbscan.cu
index 077e72212a..7f8ae9e286 100644
--- a/cpp/src/dbscan/dbscan.cu
+++ b/cpp/src/dbscan/dbscan.cu
@@ -13,8 +13,8 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-#include
 #include
+#include
 #include
 #include
 #include "dbscan.cuh"
@@ -24,70 +24,72 @@ namespace ML {
 
 using namespace Dbscan;
 
-void dbscanFit(const cumlHandle &handle, float *input, int n_rows, int n_cols,
-               float eps, int min_pts, int *labels, size_t max_bytes_per_batch,
-               int verbosity) {
-  dbscanFitImpl<float, int>(handle.getImpl(), input, n_rows, n_cols, eps,
-                            min_pts, labels, nullptr, max_bytes_per_batch,
-                            handle.getStream(), verbosity);
+void dbscanFit(const raft::handle_t &handle, float *input, int n_rows,
+               int n_cols, float eps, int min_pts, int *labels,
+               size_t max_bytes_per_batch, int verbosity) {
+  dbscanFitImpl<float, int>(handle, input, n_rows, n_cols, eps, min_pts, labels,
+                            nullptr, max_bytes_per_batch, handle.get_stream(),
+                            verbosity);
 }
 
-void dbscanFit(const cumlHandle &handle, double *input, int n_rows, int n_cols,
-               double eps, int min_pts, int *labels, size_t max_bytes_per_batch,
-               int verbosity) {
-  dbscanFitImpl<double, int>(handle.getImpl(), input, n_rows, n_cols, eps,
-                             min_pts, labels, nullptr, max_bytes_per_batch,
-                             handle.getStream(), verbosity);
+void dbscanFit(const raft::handle_t &handle, double *input, int n_rows,
+               int n_cols, double eps, int min_pts, int *labels,
+               size_t max_bytes_per_batch, int verbosity) {
+  dbscanFitImpl<double, int>(handle, input, n_rows, n_cols, eps, min_pts,
+                             labels, nullptr, max_bytes_per_batch,
+                             handle.get_stream(), verbosity);
 }
 
-void dbscanFit(const cumlHandle &handle, float *input, int n_rows, int n_cols,
-               float eps, int min_pts, int *labels, int *core_sample_indices,
-               size_t max_bytes_per_batch, int verbosity) {
-  dbscanFitImpl<float, int>(handle.getImpl(), input, n_rows, n_cols, eps,
-                            min_pts, labels, core_sample_indices,
-                            max_bytes_per_batch, handle.getStream(), verbosity);
+void dbscanFit(const raft::handle_t &handle, float *input, int n_rows,
+               int n_cols, float eps, int min_pts, int *labels,
+               int *core_sample_indices, size_t max_bytes_per_batch,
+               int verbosity) {
+  dbscanFitImpl<float, int>(handle, input, n_rows, n_cols, eps, min_pts, labels,
+                            core_sample_indices, max_bytes_per_batch,
+                            handle.get_stream(), verbosity);
 }
 
-void dbscanFit(const cumlHandle &handle, double *input, int n_rows, int n_cols,
-               double eps, int min_pts, int *labels, int *core_sample_indices,
-               size_t max_bytes_per_batch, int verbosity) {
-  dbscanFitImpl<double, int>(
-    handle.getImpl(), input, n_rows, n_cols, eps, min_pts, labels,
-    core_sample_indices, max_bytes_per_batch, handle.getStream(), verbosity);
+void dbscanFit(const raft::handle_t &handle, double *input, int n_rows,
+               int n_cols, double eps, int min_pts, int *labels,
+               int *core_sample_indices, size_t max_bytes_per_batch,
+               int verbosity) {
+  dbscanFitImpl<double, int>(handle, input, n_rows, n_cols, eps, min_pts,
+                             labels, core_sample_indices, max_bytes_per_batch,
+                             handle.get_stream(), verbosity);
 }
 
-void dbscanFit(const cumlHandle &handle, float *input, int64_t n_rows,
+void dbscanFit(const raft::handle_t &handle, float *input, int64_t n_rows,
                int64_t n_cols, float eps, int min_pts, int64_t *labels,
                size_t max_bytes_per_batch, int verbosity) {
-  dbscanFitImpl<float, int64_t>(handle.getImpl(), input, n_rows, n_cols, eps,
-                                min_pts, labels, nullptr, max_bytes_per_batch,
-                                handle.getStream(), verbosity);
+  dbscanFitImpl<float, int64_t>(handle, input, n_rows, n_cols, eps, min_pts,
+                                labels, nullptr, max_bytes_per_batch,
+                                handle.get_stream(), verbosity);
 }
 
-void dbscanFit(const cumlHandle &handle, double *input, int64_t n_rows,
+void dbscanFit(const raft::handle_t &handle, double *input, int64_t n_rows,
                int64_t n_cols, double eps, int min_pts, int64_t *labels,
                size_t max_bytes_per_batch, int verbosity) {
-  dbscanFitImpl<double, int64_t>(handle.getImpl(), input, n_rows, n_cols, eps,
-                                 min_pts, labels, nullptr, max_bytes_per_batch,
-                                 handle.getStream(), verbosity);
+  dbscanFitImpl<double, int64_t>(handle, input, n_rows, n_cols, eps, min_pts,
+                                 labels, nullptr, max_bytes_per_batch,
+                                 handle.get_stream(), verbosity);
 }
 
-void dbscanFit(const cumlHandle &handle, float *input, int64_t n_rows,
+void dbscanFit(const raft::handle_t &handle, float *input, int64_t n_rows,
                int64_t n_cols, float eps, int min_pts, int64_t *labels,
                int64_t *core_sample_indices, size_t max_bytes_per_batch,
                int verbosity) {
   dbscanFitImpl<float, int64_t>(
-    handle.getImpl(), input, n_rows, n_cols, eps, min_pts, labels,
-    core_sample_indices, max_bytes_per_batch, handle.getStream(), verbosity);
+    handle, input, n_rows, n_cols, eps, min_pts, labels, core_sample_indices,
+    max_bytes_per_batch, handle.get_stream(), verbosity);
 }
 
-void dbscanFit(const cumlHandle &handle, double *input, int64_t n_rows,
+void dbscanFit(const raft::handle_t &handle, double *input, int64_t n_rows,
                int64_t n_cols, double eps, int min_pts, int64_t *labels,
                int64_t *core_sample_indices, size_t max_bytes_per_batch,
                int verbosity) {
   dbscanFitImpl<double, int64_t>(
-    handle.getImpl(), input, n_rows, n_cols, eps, min_pts, labels,
-    core_sample_indices, max_bytes_per_batch, handle.getStream(), verbosity);
+    handle, input, n_rows, n_cols, eps, min_pts, labels, core_sample_indices,
+    max_bytes_per_batch, handle.get_stream(), verbosity);
 }
 
 };  // end namespace ML
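The public `dbscanFit` entry points keep their argument lists; only the handle type changes, and the stream is now read off the handle internally. A minimal caller sketch for the first single-precision overload; the `cuml/cluster/dbscan.hpp` header path is an assumption:

```cpp
#include <cuda_runtime.h>
#include <raft/handle.hpp>
#include <cuml/cluster/dbscan.hpp>  // assumed public header

void run_dbscan(float* d_input, int n_rows, int n_cols) {
  raft::handle_t handle;  // owns the stream dbscanFit now reads internally

  int* d_labels = nullptr;
  cudaMalloc(&d_labels, sizeof(int) * n_rows);

  // max_bytes_per_batch = 0 falls back to the default cap chosen by
  // computeBatchCount (see dbscan.cuh below); verbosity 0 keeps logging off.
  ML::dbscanFit(handle, d_input, n_rows, n_cols, /*eps=*/0.5f, /*min_pts=*/5,
                d_labels, /*max_bytes_per_batch=*/0, /*verbosity=*/0);

  cudaFree(d_labels);
}
```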
diff --git a/cpp/src/dbscan/dbscan.cuh b/cpp/src/dbscan/dbscan.cuh
index 04a9d642f8..af6420f1d1 100644
--- a/cpp/src/dbscan/dbscan.cuh
+++ b/cpp/src/dbscan/dbscan.cuh
@@ -60,12 +60,12 @@ Index_ computeBatchCount(size_t &estimated_memory, Index_ n_rows,
     max_mbytes_per_batch = DEFAULT_MAX_MEM_MBYTES;
   }
 
-  Index_ nBatches =
-    (Index_)ceildiv<size_t>(estimated_memory, max_mbytes_per_batch * 1000000);
+  Index_ nBatches = (Index_)raft::ceildiv<size_t>(
+    estimated_memory, max_mbytes_per_batch * 1000000);
 
   Index_ MAX_LABEL = std::numeric_limits<Index_>::max();
   // to avoid overflow, we need: batch_size <= MAX_LABEL / n_rows (floor div)
-  // -> num_batches >= ceildiv(n_rows / (MAX_LABEL / n_rows))
-  Index_ nBatchesPrec = ceildiv(n_rows, MAX_LABEL / n_rows);
+  // -> num_batches >= raft::ceildiv(n_rows / (MAX_LABEL / n_rows))
+  Index_ nBatchesPrec = raft::ceildiv(n_rows, MAX_LABEL / n_rows);
   // at some point, if nBatchesPrec is larger than nBatches
   // (or larger by a given factor) and we know that there are clear
   // performance benefits of using a smaller number of batches,
@@ -75,7 +75,7 @@ Index_ computeBatchCount(size_t &estimated_memory, Index_ n_rows,
   // actually improve performance, even when using >16.10^9 points per batch.
   // Much larger batches than 16.10^9 do not currently fit on GPU architectures
   if (sizeof(Index_) > sizeof(int) &&
-      (size_t)n_rows * ceildiv(n_rows, nBatches) <
+      (size_t)n_rows * raft::ceildiv(n_rows, nBatches) <
         std::numeric_limits<int>::max()) {
     CUML_LOG_WARN(
       "You are using an index type of size (%d bytes) but a smaller index "
@@ -92,7 +92,7 @@
 }
 
 template <typename T, typename Index_ = int>
-void dbscanFitImpl(const ML::cumlHandle_impl &handle, T *input, Index_ n_rows,
+void dbscanFitImpl(const raft::handle_t &handle, T *input, Index_ n_rows,
                    Index_ n_cols, T eps, Index_ min_pts, Index_ *labels,
                    Index_ *core_sample_indices, size_t max_mbytes_per_batch,
                    cudaStream_t stream, int verbosity) {
@@ -117,7 +117,7 @@ void dbscanFitImpl(const ML::cumlHandle_impl &handle, T *input, Index_ n_rows,
     handle, input, n_rows, n_cols, eps, min_pts, labels, core_sample_indices,
     algoVd, algoAdj, algoCcl, NULL, n_batches, stream);
 
-  MLCommon::device_buffer<char> workspace(handle.getDeviceAllocator(), stream,
+  MLCommon::device_buffer<char> workspace(handle.get_device_allocator(), stream,
                                           workspaceSize);
   Dbscan::run(handle, input, n_rows, n_cols, eps, min_pts, labels,
               core_sample_indices, algoVd, algoAdj, algoCcl, workspace.data(),
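`raft::ceildiv` is a drop-in rename of the old `ceildiv` prim: integer ceiling division, equivalent to `(a + b - 1) / b` for positive integers. A worked instance of the batch arithmetic above (the `raft/cuda_utils.cuh` header path is an assumption):

```cpp
#include <raft/cuda_utils.cuh>  // assumed home of raft::ceildiv

size_t example_batch_count() {
  size_t estimated_memory = 5000000000;  // ~5 GB estimated for the run
  size_t max_mbytes_per_batch = 2000;    // 2000 MB cap per batch
  // Ceiling division rounds up: 5e9 / 2e9 yields 3 batches, not a
  // truncated 2 that would overflow the per-batch budget.
  return raft::ceildiv<size_t>(estimated_memory,
                               max_mbytes_per_batch * 1000000);
}
```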
diff --git a/cpp/src/dbscan/dbscan_api.cpp b/cpp/src/dbscan/dbscan_api.cpp
index de7233fcf5..15cb1bf684 100644
--- a/cpp/src/dbscan/dbscan_api.cpp
+++ b/cpp/src/dbscan/dbscan_api.cpp
@@ -24,12 +24,12 @@ cumlError_t cumlSpDbscanFit(cumlHandle_t handle, float *input, int n_rows,
                             int *core_sample_indices, size_t max_bytes_per_batch,
                             int verbosity) {
   cumlError_t status;
-  ML::cumlHandle *handle_ptr;
+  raft::handle_t *handle_ptr;
   std::tie(handle_ptr, status) = ML::handleMap.lookupHandlePointer(handle);
   if (status == CUML_SUCCESS) {
     try {
-      dbscanFit(*handle_ptr, input, n_rows, n_cols, eps, min_pts, labels,
-                core_sample_indices, max_bytes_per_batch, verbosity);
+      ML::dbscanFit(*handle_ptr, input, n_rows, n_cols, eps, min_pts, labels,
+                    core_sample_indices, max_bytes_per_batch, verbosity);
     }
     //TODO: Implement this
     //catch (const MLCommon::Exception& e)
@@ -49,12 +49,12 @@ cumlError_t cumlDpDbscanFit(cumlHandle_t handle, double *input, int n_rows,
                             int *core_sample_indices, size_t max_bytes_per_batch,
                             int verbosity) {
   cumlError_t status;
-  ML::cumlHandle *handle_ptr;
+  raft::handle_t *handle_ptr;
   std::tie(handle_ptr, status) = ML::handleMap.lookupHandlePointer(handle);
   if (status == CUML_SUCCESS) {
     try {
-      dbscanFit(*handle_ptr, input, n_rows, n_cols, eps, min_pts, labels,
-                core_sample_indices, max_bytes_per_batch, verbosity);
+      ML::dbscanFit(*handle_ptr, input, n_rows, n_cols, eps, min_pts, labels,
+                    core_sample_indices, max_bytes_per_batch, verbosity);
     }
     //TODO: Implement this
     //catch (const MLCommon::Exception& e)
diff --git a/cpp/src/dbscan/runner.cuh b/cpp/src/dbscan/runner.cuh
index 749bc0f04e..1f0a5c7f8d 100644
--- a/cpp/src/dbscan/runner.cuh
+++ b/cpp/src/dbscan/runner.cuh
@@ -16,12 +16,12 @@
 
 #pragma once
 
-#include
+#include
 #include
 #include
 #include
-#include
 #include
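The `dbscan_api.cpp` hunks show the C API is insulated from the port: the opaque `cumlHandle_t` now resolves to a `raft::handle_t` through `ML::handleMap`, but callers are untouched. A sketch of that unchanged call path; `cumlCreate`/`cumlDestroy` and the header names are assumptions drawn from the cuML C API, not from this diff:

```cpp
#include <cuml/cuml_api.h>           // assumed: cumlCreate/cumlDestroy
#include <cuml/cluster/dbscan_api.h> // assumed: cumlSpDbscanFit

int run_dbscan_c(float* d_input, int n_rows, int n_cols, int* d_labels) {
  cumlHandle_t handle;
  if (cumlCreate(&handle) != CUML_SUCCESS) return 1;

  // Same signature as before the port; the raft::handle_t lookup happens
  // behind cumlSpDbscanFit via ML::handleMap.lookupHandlePointer.
  cumlError_t status =
    cumlSpDbscanFit(handle, d_input, n_rows, n_cols, /*eps=*/0.5f,
                    /*min_pts=*/5, d_labels, /*core_sample_indices=*/NULL,
                    /*max_bytes_per_batch=*/0, /*verbosity=*/0);

  cumlDestroy(handle);
  return status == CUML_SUCCESS ? 0 : 1;
}
```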