Benchmarks

lmcinnes · Mar 7, 2019 · 5353060 · 5353060
1 parent 5a819b1
commit 5353060
Show file tree

Hide file tree

Showing 4 changed files with 338 additions and 0 deletions.
diff --git a/benchmark.py b/benchmark.py
@@ -0,0 +1,117 @@
+# To run the "beefy" experiment
+#
+# Run on a GCE instance
+# n1-standard-64 - 240GB mem, 100GB disk
+# gcloud beta compute --project=hca-scale instances create ll-knn --zone=us-east1-b --machine-type=n1-standard-64 --subnet=default --network-tier=PREMIUM --maintenance-policy=MIGRATE --service-account=218219996328-compute@developer.gserviceaccount.com --scopes=https://www.googleapis.com/auth/cloud-platform --image=debian-9-stretch-v20190213 --image-project=debian-cloud --boot-disk-size=100GB --boot-disk-type=pd-standard --boot-disk-device-name=ll-knn
+#
+# sudo apt-get update && sudo apt-get install -y git python3-pip
+# pip3 install numba numpy scipy scikit-learn
+# pip3 install git+https://github.com/tomwhite/pynndescent@benchmarks
+# pip3 list
+# git clone https://github.com/tomwhite/pynndescent
+# cd pynndescent
+# git checkout benchmarks
+# python3 benchmark.py
+
+import multiprocessing
+import os
+import time
+
+import numpy as np
+from sklearn.neighbors import NearestNeighbors
+
+from pynndescent import distances as pynndistances
+from pynndescent import NNDescent
+from pynndescent import threaded
+
+# Fixed seed so that every run benchmarks the same synthetic data.
+np.random.seed(42)
+
+# N rows of D uniform-random float32 features; each experiment runs on the
+# first `rows` rows of this array (dataset[:rows]).
+# NOTE(review): slicing past N silently returns only N rows, so experiments
+# that ask for more than N rows (e.g. 1000000) need N raised first.
+N = 100000
+D = 128
+dataset = np.random.rand(N, D).astype(np.float32)
+# Brute-force neighbor indices keyed by row count; filled in by
+# scikitlearn_brute and used as the reference when computing accuracy.
+gold = {}
+
+# Upper bound for the thread counts produced by cores_powers_of_two().
+n_cores = multiprocessing.cpu_count()
+
+def cores_powers_of_two():
+    """Yield 1, 2, 4, ... for every power of two that does not exceed n_cores.
+
+    1 is always yielded first, regardless of the value of n_cores.
+    """
+    power = 1
+    yield power
+    # Keep doubling for as long as the next value still fits in n_cores.
+    while power * 2 <= n_cores:
+        power *= 2
+        yield power
+
+def scikitlearn_brute(X, threads=1, n_neighbors=25, max_candidates=50):
+    """Time a brute-force scikit-learn k-NN search of X against itself.
+
+    The neighbor indices found are also stored in the module-level ``gold``
+    dict, keyed by the number of rows in X, so that later experiments on the
+    same number of rows can be scored against them.
+
+    ``threads`` and ``max_candidates`` are accepted so that every benchmark
+    function shares one call signature; they are not used here.
+
+    Returns a tuple ``(indices, distances, seconds)`` where ``seconds`` is the
+    elapsed time of fitting plus querying.
+    """
+    # perf_counter is monotonic, so the measured interval cannot be distorted
+    # by system clock adjustments the way time.time() can.
+    t0 = time.perf_counter()
+    nbrs = NearestNeighbors(n_neighbors=n_neighbors, algorithm='brute').fit(X)
+    distances, indices = nbrs.kneighbors(X)
+    t1 = time.perf_counter()
+    # Recorded after the clock is stopped so bookkeeping is not timed.
+    gold[X.shape[0]] = indices
+    return indices, distances, t1-t0
+
+def scikitlearn_ball_tree(X, threads=1, n_neighbors=25, max_candidates=50):
+    """Time a ball-tree scikit-learn k-NN search of X against itself.
+
+    ``threads`` and ``max_candidates`` are accepted so that every benchmark
+    function shares one call signature; they are not used here.
+
+    Returns a tuple ``(indices, distances, seconds)`` where ``seconds`` is the
+    elapsed time of fitting plus querying.
+    """
+    # perf_counter is monotonic, so the measured interval cannot be distorted
+    # by system clock adjustments the way time.time() can.
+    t0 = time.perf_counter()
+    nbrs = NearestNeighbors(n_neighbors=n_neighbors, algorithm='ball_tree').fit(X)
+    distances, indices = nbrs.kneighbors(X)
+    t1 = time.perf_counter()
+    return indices, distances, t1-t0
+
+def pynndescent_regular(X, threads=1, n_neighbors=25, max_candidates=50):
+    """Time building an NNDescent index over X without tree initialisation.
+
+    ``threads`` is exported through the ``NUMBA_NUM_THREADS`` environment
+    variable before the index is built.
+
+    Returns a tuple ``(indices, distances, seconds)`` where the first two come
+    from the index's neighbor graph and ``seconds`` is the elapsed build time.
+    """
+    # NOTE(review): pynndescent is already imported by the time this runs, so
+    # numba has presumably been imported and may already have read this
+    # variable -- confirm that changing it here actually affects the run.
+    os.environ["NUMBA_NUM_THREADS"] = str(threads)
+    # perf_counter is monotonic, so the measured interval cannot be distorted
+    # by system clock adjustments the way time.time() can.
+    t0 = time.perf_counter()
+    index = NNDescent(X, n_neighbors=n_neighbors, max_candidates=max_candidates, tree_init=False)
+    indices, distances = index._neighbor_graph
+    t1 = time.perf_counter()
+    return indices, distances, t1-t0
+
+def pynndescent_threaded(X, threads=1, n_neighbors=25, max_candidates=50):
+    """Time the threaded NN-descent implementation on X (euclidean distance).
+
+    The rows of X are split into chunks of ``X.shape[0] // threads`` rows and
+    ``threads`` is passed through to the threaded implementation.
+
+    Returns a tuple ``(indices, distances, seconds)``; ``seconds`` covers both
+    constructing the threaded function and running it.
+    """
+    # perf_counter is monotonic, so the measured interval cannot be distorted
+    # by system clock adjustments the way time.time() can.
+    t0 = time.perf_counter()
+    dist = pynndistances.named_distances["euclidean"]
+    dist_args = ()
+    threaded_nn_descent = threaded.make_nn_descent(dist, dist_args)
+    indices, distances = threaded_nn_descent(X, n_neighbors=n_neighbors, max_candidates=max_candidates, rng_state=None, chunk_size=X.shape[0]//threads, threads=threads)
+    t1 = time.perf_counter()
+    return indices, distances, t1-t0
+
+def accuracy(expected, actual):
+    """Return the fraction of expected neighbors that were actually found.
+
+    Rows of ``expected`` and ``actual`` are paired up; for each pair the number
+    of values common to both rows is counted, and the total is divided by the
+    number of entries in ``expected``.
+    """
+    overlap_per_row = [np.intersect1d(want, got).size for want, got in zip(expected, actual)]
+    return np.sum(overlap_per_row) / expected.size
+
+def all_experiments():
+    """Yield every benchmark as (function, rows, threads, n_neighbors, max_candidates).
+
+    Experiments are produced grouped by algorithm, in this order: brute force,
+    ball tree, regular NNDescent, threaded NN-descent.
+    """
+    n_neighbors = 25
+    max_candidates = 50
+    small_sizes = (1000, 5000, 10000, 20000)
+    medium_sizes = small_sizes + (50000, 100000)
+    all_sizes = medium_sizes + (1000000,)
+
+    # The scikit-learn baselines are only ever run with a single thread.
+    for rows in medium_sizes:
+        yield (scikitlearn_brute, rows, 1, n_neighbors, max_candidates)
+    for rows in small_sizes:
+        yield (scikitlearn_ball_tree, rows, 1, n_neighbors, max_candidates)
+
+    # Regular NNDescent: multiple thread counts only for the mid-sized inputs;
+    # the smallest and the largest inputs are run single-threaded only.
+    for rows in all_sizes:
+        vary_threads = 50000 <= rows < 1000000
+        for threads in cores_powers_of_two():
+            if threads == 1 or vary_threads:
+                yield (pynndescent_regular, rows, threads, n_neighbors, max_candidates)
+
+    # Threaded NN-descent: larger inputs require a minimum number of threads.
+    for rows in all_sizes:
+        if rows >= 1000000:
+            min_threads = max(4, n_cores)
+        elif rows >= 50000:
+            min_threads = 4
+        else:
+            min_threads = 1
+        for threads in cores_powers_of_two():
+            if threads >= min_threads:
+                yield (pynndescent_threaded, rows, threads, n_neighbors, max_candidates)
+
+def generate_experiments(predicate=None):
+    """Yield the experiments from all_experiments() accepted by ``predicate``.
+
+    ``predicate`` receives one experiment tuple and returns a truthy value to
+    keep it. When it is None, every experiment is yielded.
+    """
+    if predicate is None:
+        yield from all_experiments()
+    else:
+        yield from filter(predicate, all_experiments())
+
+# modify the predicate to run a subset of experiments
+#predicate = lambda exp: (exp[0] == scikitlearn_brute or exp[0] == pynndescent_threaded) and exp[1] >= 100000 and exp[2] == 8
+#predicate = lambda exp: (exp[0] == scikitlearn_brute and exp[1] == 50000) or (exp[0] == pynndescent_regular and exp[1] == 50000) or (exp[0] == pynndescent_threaded and exp[1] == 50000 and exp[2] == 8)
+predicate = lambda exp: (exp[0] == scikitlearn_brute or exp[0] == pynndescent_regular or exp[0] == pynndescent_threaded) and exp[1] == 20000
+
+# Collect the selected experiments up front so that a size problem is reported
+# before any (potentially very long) benchmark has been run.
+experiments = list(generate_experiments(predicate))
+
+# dataset[:rows] silently returns fewer rows when rows exceeds the dataset
+# size, which would produce timings labelled with the wrong row count.
+too_large = sorted({exp[1] for exp in experiments if exp[1] > dataset.shape[0]})
+if too_large:
+    raise ValueError("experiments request {} rows but the dataset only has {}; increase N".format(too_large, dataset.shape[0]))
+
+# One CSV line per experiment: algorithm,threads,rows,n_neighbors,max_candidates,duration,accuracy
+for algorithm, rows, threads, n_neighbors, max_candidates in experiments:
+    indices, distances, t = algorithm(dataset[:rows], threads, n_neighbors, max_candidates)
+    # -1 marks runs for which no brute-force reference of this size exists.
+    acc = accuracy(gold[rows], indices) if rows in gold else -1
+    print("{},{},{},{},{},{},{}".format(algorithm.__name__, threads, rows, n_neighbors, max_candidates, t, acc))
+
diff --git a/benchmark_results.ipynb b/benchmark_results.ipynb
diff --git a/benchmark_results.py b/benchmark_results.py
@@ -0,0 +1,15 @@
+import matplotlib.pyplot as plt
+import pandas as pd
+
+# Load the recorded benchmark timings.
+results = pd.read_csv('results/results_local_20190227.csv')
+
+# One curve per (algorithm, thread count) pair: duration against row count.
+series = (
+    ('scikitlearn_brute', 1),
+    ('scikitlearn_ball_tree', 1),
+    ('pynndescent_regular', 8),
+    ('pynndescent_threaded', 1),
+    ('pynndescent_threaded', 8),
+)
+for algorithm, threads in series:
+    selected = (results['algorithm'] == algorithm) & (results['threads'] == threads)
+    results_subset = results.loc[selected, ['rows', 'duration']]
+    plt.loglog('rows', 'duration', data=results_subset, marker='o', label='{} (threads={})'.format(algorithm, threads))
+
+# Axis labels, title and legend, then display the figure.
+plt.xlabel('Rows')
+plt.ylabel('Duration (s)')
+plt.title("Nearest neighbor algorithms (D=128, NN=25)")
+plt.legend()
+
+plt.show()
diff --git a/benchmark_results2.py b/benchmark_results2.py
@@ -0,0 +1,15 @@
+import matplotlib.pyplot as plt
+import pandas as pd
+
+# Load the recorded benchmark timings.
+results = pd.read_csv('results/results_beefy_20190227.csv')
+
+# One curve per (algorithm, row count) pair: duration against thread count.
+series = (
+    ('pynndescent_regular', 50000),
+    ('pynndescent_regular', 100000),
+    ('pynndescent_threaded', 50000),
+    ('pynndescent_threaded', 100000),
+)
+for algorithm, rows in series:
+    selected = (results['algorithm'] == algorithm) & (results['rows'] == rows)
+    results_subset = results.loc[selected, ['threads', 'duration']]
+    plt.loglog('threads', 'duration', data=results_subset, marker='o', label='{} {} rows'.format(algorithm, rows))
+
+# Axis labels, title and legend, then display the figure.
+plt.xlabel('Threads')
+plt.ylabel('Duration (s)')
+plt.title("Nearest neighbor algorithms (D=128, NN=25)")
+plt.legend()
+
+plt.show()