From e51419dd1515491d64d86b67f7c6704d46c9503a Mon Sep 17 00:00:00 2001 From: divyegala Date: Fri, 11 Oct 2024 12:27:15 -0700 Subject: [PATCH 1/3] passing tests --- .../all_cuda-118_arch-x86_64.yaml | 4 +++- .../all_cuda-125_arch-x86_64.yaml | 4 +++- dependencies.yaml | 10 ++++++++- .../cuml/cuml/cluster/hdbscan/prediction.pyx | 22 ++++++++++++------- .../cuml/cuml/tests/test_device_selection.py | 5 ++--- python/cuml/pyproject.toml | 2 +- 6 files changed, 32 insertions(+), 15 deletions(-) diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 6cfbc14a6b..01d00fedbd 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -24,7 +24,6 @@ dependencies: - fmt>=11.0.2,<12 - gcc_linux-64=11.* - graphviz -- hdbscan>=0.8.38,<0.8.39 - hypothesis>=6.0,<7 - ipykernel - ipython @@ -51,6 +50,7 @@ dependencies: - numpydoc - nvcc_linux-64=11.8 - packaging +- pip - pydata-sphinx-theme!=0.14.2 - pylibraft==24.12.*,>=0.0.0a0 - pynndescent @@ -78,4 +78,6 @@ dependencies: - sysroot_linux-64==2.17 - treelite==4.3.0 - umap-learn==0.5.6 +- pip: + - hdbscan @ git+https://github.com/scikit-learn-contrib/hdbscan.git@master name: all_cuda-118_arch-x86_64 diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml index e6e0d0c726..9c0993ea3a 100644 --- a/conda/environments/all_cuda-125_arch-x86_64.yaml +++ b/conda/environments/all_cuda-125_arch-x86_64.yaml @@ -26,7 +26,6 @@ dependencies: - fmt>=11.0.2,<12 - gcc_linux-64=11.* - graphviz -- hdbscan>=0.8.38,<0.8.39 - hypothesis>=6.0,<7 - ipykernel - ipython @@ -47,6 +46,7 @@ dependencies: - numpy>=1.23,<3.0a0 - numpydoc - packaging +- pip - pydata-sphinx-theme!=0.14.2 - pylibraft==24.12.*,>=0.0.0a0 - pynndescent @@ -74,4 +74,6 @@ dependencies: - sysroot_linux-64==2.17 - treelite==4.3.0 - umap-learn==0.5.6 +- pip: + - hdbscan @ git+https://github.com/scikit-learn-contrib/hdbscan.git@master name: all_cuda-125_arch-x86_64 diff --git a/dependencies.yaml b/dependencies.yaml index 8d6a028d90..0f6f71f97f 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -512,7 +512,6 @@ dependencies: packages: - *cython - dask-ml - - hdbscan>=0.8.38,<0.8.39 - hypothesis>=6.0,<7 - nltk - numpydoc @@ -527,6 +526,15 @@ dependencies: - umap-learn==0.5.6 - pynndescent - setuptools # Needed on Python 3.12 for dask-glm, which requires pkg_resources but Python 3.12 doesn't have setuptools by default + - output_types: conda + packages: + - pip + - pip: + - hdbscan @ git+https://github.com/scikit-learn-contrib/hdbscan.git@master + - output_types: pyproject + packages: + - hdbscan @ git+https://github.com/scikit-learn-contrib/hdbscan.git@master + test_notebooks: common: - output_types: [conda, requirements] diff --git a/python/cuml/cuml/cluster/hdbscan/prediction.pyx b/python/cuml/cuml/cluster/hdbscan/prediction.pyx index b3ef5b3587..169b26328b 100644 --- a/python/cuml/cuml/cluster/hdbscan/prediction.pyx +++ b/python/cuml/cuml/cluster/hdbscan/prediction.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2023, NVIDIA CORPORATION. +# Copyright (c) 2021-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -166,19 +166,25 @@ def all_points_membership_vectors(clusterer, batch_size=4096): # trained on gpu if not hasattr(clusterer, "_cpu_model"): - # the reference HDBSCAN implementations uses @property - # for attributes without setters available for them, - # so they can't be transferred from the GPU model - # to the CPU model - raise ValueError("Inferring on CPU is not supported yet when the " - "model has been trained on GPU") + clusterer.import_cpu_model() + clusterer.build_cpu_model() + clusterer.gpu_to_cpu() + # These attributes have to be reassigned to the CPU model + # as the raw arrays because the reference HDBSCAN implementation + # reconstructs the objects from the raw arrays + clusterer._cpu_model.condensed_tree_ = \ + clusterer.condensed_tree_._raw_tree + clusterer._cpu_model.single_linkage_tree_ = \ + clusterer.single_linkage_tree_._linkage + clusterer._cpu_model.minimum_spanning_tree_ = \ + clusterer.minimum_spanning_tree_._mst # this took a long debugging session to figure out, but # this method on cpu does not work without this copy for some reason clusterer._cpu_model.prediction_data_.raw_data = \ clusterer._cpu_model.prediction_data_.raw_data.copy() return cpu_all_points_membership_vectors(clusterer._cpu_model) - + # gpu infer, cpu/gpu train elif device_type == DeviceType.device: # trained on cpu if hasattr(clusterer, "_cpu_model"): diff --git a/python/cuml/cuml/tests/test_device_selection.py b/python/cuml/cuml/tests/test_device_selection.py index 449c032161..31c0f9aed6 100644 --- a/python/cuml/cuml/tests/test_device_selection.py +++ b/python/cuml/cuml/tests/test_device_selection.py @@ -932,9 +932,6 @@ def test_nn_methods(train_device, infer_device): @pytest.mark.parametrize("infer_device", ["cpu", "gpu"]) def test_hdbscan_methods(train_device, infer_device): - if train_device == "gpu" and infer_device == "cpu": - pytest.skip("Can't transfer attributes to cpu for now") - ref_model = refHDBSCAN( prediction_data=True, approx_min_span_tree=False, @@ -951,11 +948,13 @@ def test_hdbscan_methods(train_device, infer_device): ref_membership = cpu_all_points_membership_vectors(ref_model) ref_labels, ref_probs = cpu_approximate_predict(ref_model, X_test_blob) + gen_min_span_tree = train_device == "gpu" and infer_device == "cpu" model = HDBSCAN( prediction_data=True, approx_min_span_tree=False, max_cluster_size=0, min_cluster_size=30, + gen_min_span_tree=gen_min_span_tree, ) with using_device_type(train_device): trained_labels = model.fit_predict(X_train_blob) diff --git a/python/cuml/pyproject.toml b/python/cuml/pyproject.toml index 1b4bdeca47..66152e49fe 100644 --- a/python/cuml/pyproject.toml +++ b/python/cuml/pyproject.toml @@ -113,7 +113,7 @@ classifiers = [ test = [ "cython>=3.0.0", "dask-ml", - "hdbscan>=0.8.38,<0.8.39", + "hdbscan @ git+https://github.com/scikit-learn-contrib/hdbscan.git@master", "hypothesis>=6.0,<7", "nltk", "numpydoc", From d4b96106f69e96ae46a7b8734ace18ea80038354 Mon Sep 17 00:00:00 2001 From: divyegala Date: Wed, 16 Oct 2024 15:49:18 -0700 Subject: [PATCH 2/3] use new hdbscan release --- conda/environments/all_cuda-118_arch-x86_64.yaml | 2 +- conda/environments/all_cuda-125_arch-x86_64.yaml | 2 +- dependencies.yaml | 2 +- python/cuml/pyproject.toml | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 01d00fedbd..b53b161876 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -24,6 +24,7 @@ dependencies: - fmt>=11.0.2,<12 - gcc_linux-64=11.* - graphviz +- hdbscan>=0.8.39,<0.8.40 - hypothesis>=6.0,<7 - ipykernel - ipython @@ -69,7 +70,6 @@ dependencies: - scikit-learn==1.5 - scipy>=1.8.0 - seaborn -- setuptools - spdlog>=1.14.1,<1.15 - sphinx-copybutton - sphinx-markdown-tables diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml index 9c0993ea3a..e3306608b4 100644 --- a/conda/environments/all_cuda-125_arch-x86_64.yaml +++ b/conda/environments/all_cuda-125_arch-x86_64.yaml @@ -26,6 +26,7 @@ dependencies: - fmt>=11.0.2,<12 - gcc_linux-64=11.* - graphviz +- hdbscan>=0.8.39,<0.8.40 - hypothesis>=6.0,<7 - ipykernel - ipython @@ -65,7 +66,6 @@ dependencies: - scikit-learn==1.5 - scipy>=1.8.0 - seaborn -- setuptools - spdlog>=1.14.1,<1.15 - sphinx-copybutton - sphinx-markdown-tables diff --git a/dependencies.yaml b/dependencies.yaml index 0f6f71f97f..e6c16f9ce5 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -512,6 +512,7 @@ dependencies: packages: - *cython - dask-ml + - hdbscan>=0.8.39,<0.8.40 - hypothesis>=6.0,<7 - nltk - numpydoc @@ -525,7 +526,6 @@ dependencies: - statsmodels - umap-learn==0.5.6 - pynndescent - - setuptools # Needed on Python 3.12 for dask-glm, which requires pkg_resources but Python 3.12 doesn't have setuptools by default - output_types: conda packages: - pip diff --git a/python/cuml/pyproject.toml b/python/cuml/pyproject.toml index 66152e49fe..caea57dc71 100644 --- a/python/cuml/pyproject.toml +++ b/python/cuml/pyproject.toml @@ -114,6 +114,7 @@ test = [ "cython>=3.0.0", "dask-ml", "hdbscan @ git+https://github.com/scikit-learn-contrib/hdbscan.git@master", + "hdbscan>=0.8.39,<0.8.40", "hypothesis>=6.0,<7", "nltk", "numpydoc", @@ -125,7 +126,6 @@ test = [ "pytest==7.*", "scikit-learn==1.5", "seaborn", - "setuptools", "statsmodels", "umap-learn==0.5.6", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. From 83a156e9a47ccd31aa9039d679ffd66c5ec1753d Mon Sep 17 00:00:00 2001 From: divyegala Date: Wed, 16 Oct 2024 15:52:53 -0700 Subject: [PATCH 3/3] remove extra pip install --- conda/environments/all_cuda-118_arch-x86_64.yaml | 3 --- conda/environments/all_cuda-125_arch-x86_64.yaml | 3 --- dependencies.yaml | 9 --------- python/cuml/pyproject.toml | 1 - 4 files changed, 16 deletions(-) diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index b53b161876..fb5a8f118b 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -51,7 +51,6 @@ dependencies: - numpydoc - nvcc_linux-64=11.8 - packaging -- pip - pydata-sphinx-theme!=0.14.2 - pylibraft==24.12.*,>=0.0.0a0 - pynndescent @@ -78,6 +77,4 @@ dependencies: - sysroot_linux-64==2.17 - treelite==4.3.0 - umap-learn==0.5.6 -- pip: - - hdbscan @ git+https://github.com/scikit-learn-contrib/hdbscan.git@master name: all_cuda-118_arch-x86_64 diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml index e3306608b4..44a5164ba6 100644 --- a/conda/environments/all_cuda-125_arch-x86_64.yaml +++ b/conda/environments/all_cuda-125_arch-x86_64.yaml @@ -47,7 +47,6 @@ dependencies: - numpy>=1.23,<3.0a0 - numpydoc - packaging -- pip - pydata-sphinx-theme!=0.14.2 - pylibraft==24.12.*,>=0.0.0a0 - pynndescent @@ -74,6 +73,4 @@ dependencies: - sysroot_linux-64==2.17 - treelite==4.3.0 - umap-learn==0.5.6 -- pip: - - hdbscan @ git+https://github.com/scikit-learn-contrib/hdbscan.git@master name: all_cuda-125_arch-x86_64 diff --git a/dependencies.yaml b/dependencies.yaml index e6c16f9ce5..a53a8c4d1a 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -526,15 +526,6 @@ dependencies: - statsmodels - umap-learn==0.5.6 - pynndescent - - output_types: conda - packages: - - pip - - pip: - - hdbscan @ git+https://github.com/scikit-learn-contrib/hdbscan.git@master - - output_types: pyproject - packages: - - hdbscan @ git+https://github.com/scikit-learn-contrib/hdbscan.git@master - test_notebooks: common: - output_types: [conda, requirements] diff --git a/python/cuml/pyproject.toml b/python/cuml/pyproject.toml index caea57dc71..f31f5c61b1 100644 --- a/python/cuml/pyproject.toml +++ b/python/cuml/pyproject.toml @@ -113,7 +113,6 @@ classifiers = [ test = [ "cython>=3.0.0", "dask-ml", - "hdbscan @ git+https://github.com/scikit-learn-contrib/hdbscan.git@master", "hdbscan>=0.8.39,<0.8.40", "hypothesis>=6.0,<7", "nltk",