From c1123e09d1213d18d59daf01f7dc63b77c3bfdd1 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Wed, 4 Nov 2020 14:57:27 -0500 Subject: [PATCH 1/3] Speedup umap MNMG tests by lowering data sizes and removing parameters to test --- python/cuml/test/dask/test_label_encoder.py | 2 +- python/cuml/test/dask/test_umap.py | 61 ++++++++------------- 2 files changed, 23 insertions(+), 40 deletions(-) diff --git a/python/cuml/test/dask/test_label_encoder.py b/python/cuml/test/dask/test_label_encoder.py index 78fa9e002e..b8468fcea2 100644 --- a/python/cuml/test/dask/test_label_encoder.py +++ b/python/cuml/test/dask/test_label_encoder.py @@ -59,7 +59,7 @@ def test_labelencoder_transform(length, cardinality, client): encoder_arr = cp.asnumpy(encoded.compute().to_array()) encoded_arr = _arr_to_similarity_mat(encoder_arr) assert ( - (encoded_arr == encoded_arr.T) == (df_arr == df_arr.T) + (encoded_arr == encoded_arr.T) = r= (df_arr == df_arr.T) ).all() diff --git a/python/cuml/test/dask/test_umap.py b/python/cuml/test/dask/test_umap.py index 0f78457f04..1403c2dfb7 100644 --- a/python/cuml/test/dask/test_umap.py +++ b/python/cuml/test/dask/test_umap.py @@ -17,8 +17,8 @@ import cupy as cp import numpy as np +from cuml.common import logger from cuml.metrics import trustworthiness -from cuml.datasets import make_blobs import math @@ -28,32 +28,23 @@ def _load_dataset(dataset, n_rows): - if dataset == "make_blobs": - local_X, local_y = make_blobs(n_samples=n_rows, n_features=10, - centers=200, cluster_std=0.8, - random_state=42) + if dataset == "digits": + local_X, local_y = load_digits(return_X_y=True) - local_X = cp.asarray(local_X) - local_y = cp.asarray(local_y) + else: # dataset == "iris" + local_X, local_y = load_iris(return_X_y=True) - else: - if dataset == "digits": - local_X, local_y = load_digits(return_X_y=True) + local_X = cp.asarray(local_X) + local_y = cp.asarray(local_y) - else: # dataset == "iris" - local_X, local_y = load_iris(return_X_y=True) + local_X = 
local_X.repeat( + math.ceil(n_rows / len(local_X)), axis=0) + local_y = local_y.repeat( + math.ceil(n_rows / len(local_y)), axis=0) - local_X = cp.asarray(local_X) - local_y = cp.asarray(local_y) - - local_X = local_X.repeat( - math.ceil(n_rows / len(local_X)), axis=0) - local_y = local_y.repeat( - math.ceil(n_rows / len(local_y)), axis=0) - - # Add some gaussian noise - local_X += cp.random.standard_normal(local_X.shape, - dtype=cp.float32) + # Add some gaussian noise + local_X += cp.random.standard_normal(local_X.shape, + dtype=cp.float32) return local_X, local_y @@ -122,9 +113,9 @@ def _umap_mnmg_trustworthiness(local_X, local_y, @pytest.mark.mg -@pytest.mark.parametrize("n_parts", [2, 5, 10]) -@pytest.mark.parametrize("n_rows", [10000, 50000]) -@pytest.mark.parametrize("sampling_ratio", [0.1, 0.2, 0.4, 0.5]) +@pytest.mark.parametrize("n_parts", [2, 9]) +@pytest.mark.parametrize("n_rows", [100, 500]) +@pytest.mark.parametrize("sampling_ratio", [0.4, 0.9]) @pytest.mark.parametrize("supervised", [True, False]) @pytest.mark.parametrize("dataset", ["digits", "iris"]) @pytest.mark.parametrize("n_neighbors", [10]) @@ -139,19 +130,11 @@ def test_umap_mnmg(n_parts, n_rows, sampling_ratio, supervised, loc_umap = _local_umap_trustworthiness(local_X, local_y, n_neighbors, supervised) - print("\nLocal UMAP trustworthiness score : {:.2f}".format(loc_umap)) - print("UMAP MNMG trustworthiness score : {:.2f}".format(dist_umap)) + logger.debug("\nLocal UMAP trustworthiness score : {:.2f}" + .format(loc_umap)) + logger.debug("UMAP MNMG trustworthiness score : {:.2f}" + .format(dist_umap)) trust_diff = loc_umap - dist_umap - if sampling_ratio == 0.1: - assert trust_diff <= 0.4 - elif sampling_ratio == 0.2: - assert trust_diff <= 0.3 - elif sampling_ratio == 0.4: - assert trust_diff <= 0.2 - elif sampling_ratio == 0.5: - assert trust_diff <= 0.11 - else: - raise ValueError("No assertion for sampling ratio. 
" - "Please update.") + assert trust_diff <= 0.1 From b0d9ff98ce399597b53ccb17f2a6ed2a1af844ae Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Wed, 4 Nov 2020 15:01:01 -0500 Subject: [PATCH 2/3] Removing accidental change --- python/cuml/test/dask/test_label_encoder.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cuml/test/dask/test_label_encoder.py b/python/cuml/test/dask/test_label_encoder.py index b8468fcea2..78fa9e002e 100644 --- a/python/cuml/test/dask/test_label_encoder.py +++ b/python/cuml/test/dask/test_label_encoder.py @@ -59,7 +59,7 @@ def test_labelencoder_transform(length, cardinality, client): encoder_arr = cp.asnumpy(encoded.compute().to_array()) encoded_arr = _arr_to_similarity_mat(encoder_arr) assert ( - (encoded_arr == encoded_arr.T) = r= (df_arr == df_arr.T) + (encoded_arr == encoded_arr.T) == (df_arr == df_arr.T) ).all() From 68f1a92829c11c61bc7ba26b565e89850f751194 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Wed, 4 Nov 2020 15:02:19 -0500 Subject: [PATCH 3/3] Updating changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5f43b1be69..445d77c1d3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -20,6 +20,7 @@ - PR #3067: Deleting prims moved to RAFT and updating header paths - PR #3074: Reducing dask coordinate descent test runtime - PR #3052: Speeding up MNMG KNN Cl&Re testing +- PR #3115: Speeding up MNMG UMAP testing ## Bug Fixes - PR #3065: Refactoring prims metrics function names from camelcase to underscore format