Skip to content

Commit

Permalink
Benchmarks
Browse files Browse the repository at this point in the history
  • Loading branch information
tomwhite committed Mar 7, 2019
1 parent 5a819b1 commit 5353060
Show file tree
Hide file tree
Showing 4 changed files with 338 additions and 0 deletions.
117 changes: 117 additions & 0 deletions benchmark.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
# To run the "beefy" experiment
#
# Run on a GCE instance
# n1-standard-64 - 240GB mem, 100GB disk
# gcloud beta compute --project=hca-scale instances create ll-knn --zone=us-east1-b --machine-type=n1-standard-64 --subnet=default --network-tier=PREMIUM --maintenance-policy=MIGRATE --service-account=218219996328-compute@developer.gserviceaccount.com --scopes=https://www.googleapis.com/auth/cloud-platform --image=debian-9-stretch-v20190213 --image-project=debian-cloud --boot-disk-size=100GB --boot-disk-type=pd-standard --boot-disk-device-name=ll-knn
#
# sudo apt-get update && sudo apt-get install -y git python3-pip
# pip3 install numba numpy scipy scikit-learn
# pip3 install git+https://github.com/tomwhite/pynndescent@benchmarks
# pip3 list
# git clone https://github.com/tomwhite/pynndescent
# cd pynndescent
# git checkout benchmarks
# python3 benchmark.py

import multiprocessing
import os
import sys
import time

import numpy as np
from sklearn.neighbors import NearestNeighbors

from pynndescent import distances as pynndistances
from pynndescent import NNDescent
from pynndescent import threaded

# Seed NumPy's global RNG so the synthetic dataset below is reproducible.
np.random.seed(42)

# The benchmark data: N rows of D uniform-random values, stored as float32.
# NOTE: experiments take the slice dataset[:rows]; asking for more than N rows
# silently yields only N rows, so row counts above N are not really measured.
N = 100000
D = 128
dataset = np.random.rand(N, D).astype(np.float32)
# Maps a row count to the neighbor indices computed by scikitlearn_brute;
# the driver loop uses these as the reference when scoring other algorithms.
gold = {}

# Number of CPUs reported by the OS; upper bound for the thread counts tried.
n_cores = multiprocessing.cpu_count()

def cores_powers_of_two(max_cores=None):
    """Yield 1, 2, 4, ... for every power of two up to the core limit.

    Args:
        max_cores: Inclusive upper bound for the yielded values. When
            omitted, the module-level ``n_cores`` is used, which is the
            behavior callers without arguments have always had.

    Yields:
        int: 1 first (unconditionally), then each subsequent power of two
        that does not exceed the limit.
    """
    if max_cores is None:
        # Resolved at call time so the default tracks the module constant.
        max_cores = n_cores
    threads = 1
    yield threads
    while threads * 2 <= max_cores:
        threads *= 2
        yield threads

def scikitlearn_brute(X, threads=1, n_neighbors=25, max_candidates=50):
    """Time scikit-learn's brute-force k-nearest-neighbors search on ``X``.

    As a side effect the resulting indices are stored in the module-level
    ``gold`` dict under the key ``X.shape[0]``, so later runs with the same
    row count can be scored against them.

    Args:
        X: Data to index and query (every row is queried against ``X``).
        threads: Unused; present so all benchmark functions share one
            call signature.
        n_neighbors: Number of neighbors to retrieve per row.
        max_candidates: Unused; present for signature compatibility.

    Returns:
        tuple: ``(indices, distances, elapsed_seconds)``.
    """
    # perf_counter is monotonic, unlike time.time(), so the measured interval
    # cannot be distorted by system clock adjustments.
    t0 = time.perf_counter()
    nbrs = NearestNeighbors(n_neighbors=n_neighbors, algorithm='brute').fit(X)
    distances, indices = nbrs.kneighbors(X)
    t1 = time.perf_counter()
    gold[X.shape[0]] = indices
    return indices, distances, t1 - t0

def scikitlearn_ball_tree(X, threads=1, n_neighbors=25, max_candidates=50):
    """Time scikit-learn's ball-tree k-nearest-neighbors search on ``X``.

    Args:
        X: Data to index and query (every row is queried against ``X``).
        threads: Unused; present so all benchmark functions share one
            call signature.
        n_neighbors: Number of neighbors to retrieve per row.
        max_candidates: Unused; present for signature compatibility.

    Returns:
        tuple: ``(indices, distances, elapsed_seconds)``.
    """
    # perf_counter is monotonic, unlike time.time(), so the measured interval
    # cannot be distorted by system clock adjustments.
    t0 = time.perf_counter()
    nbrs = NearestNeighbors(n_neighbors=n_neighbors, algorithm='ball_tree').fit(X)
    distances, indices = nbrs.kneighbors(X)
    t1 = time.perf_counter()
    return indices, distances, t1 - t0

def pynndescent_regular(X, threads=1, n_neighbors=25, max_candidates=50):
    """Time building an ``NNDescent`` index over ``X`` (no tree initialisation).

    Args:
        X: Data to build the neighbor graph for.
        threads: Value written to the ``NUMBA_NUM_THREADS`` environment
            variable before the index is built.
        n_neighbors: Number of neighbors per row.
        max_candidates: Passed through to ``NNDescent``.

    Returns:
        tuple: ``(indices, distances, elapsed_seconds)`` where the first two
        come from the index's ``_neighbor_graph`` attribute.
    """
    # NOTE(review): pynndescent is imported at module import time, before this
    # assignment runs. If numba only reads NUMBA_NUM_THREADS when it is first
    # imported, this has no effect on the thread count -- confirm.
    os.environ["NUMBA_NUM_THREADS"] = str(threads)
    # perf_counter is monotonic, unlike time.time(), so the measured interval
    # cannot be distorted by system clock adjustments.
    t0 = time.perf_counter()
    index = NNDescent(
        X,
        n_neighbors=n_neighbors,
        max_candidates=max_candidates,
        tree_init=False,
    )
    indices, distances = index._neighbor_graph
    t1 = time.perf_counter()
    return indices, distances, t1 - t0

def pynndescent_threaded(X, threads=1, n_neighbors=25, max_candidates=50):
    """Time the threaded NN-descent implementation on ``X``.

    The timed region includes constructing the descent function via
    ``threaded.make_nn_descent`` with the "euclidean" named distance.

    Args:
        X: Data to build the neighbor graph for.
        threads: Number of threads; also determines the chunk size
            (rows divided evenly across threads).
        n_neighbors: Number of neighbors per row.
        max_candidates: Passed through to the descent function.

    Returns:
        tuple: ``(indices, distances, elapsed_seconds)``.
    """
    # perf_counter is monotonic, unlike time.time(), so the measured interval
    # cannot be distorted by system clock adjustments.
    t0 = time.perf_counter()
    dist = pynndistances.named_distances["euclidean"]
    dist_args = ()
    threaded_nn_descent = threaded.make_nn_descent(dist, dist_args)
    # Integer division yields 0 when threads > rows; clamp so a chunk always
    # holds at least one row.
    chunk_size = max(1, X.shape[0] // threads)
    indices, distances = threaded_nn_descent(
        X,
        n_neighbors=n_neighbors,
        max_candidates=max_candidates,
        rng_state=None,
        chunk_size=chunk_size,
        threads=threads,
    )
    t1 = time.perf_counter()
    return indices, distances, t1 - t0

def accuracy(expected, actual):
    """Return the fraction of expected neighbor indices found in ``actual``.

    Rows are compared pairwise: for each row the number of values common to
    both (as computed by ``np.intersect1d``, which de-duplicates) is counted,
    and the total is divided by ``expected.size``.

    Args:
        expected: NumPy array of reference neighbor indices, one row per point.
        actual: Neighbor indices to score, one row per point.

    Returns:
        The total per-row overlap divided by the number of expected entries.

    Raises:
        ValueError: If the two inputs do not have the same number of rows.
            Without this check ``zip`` would silently ignore the surplus rows
            and report a misleading score.
    """
    if len(expected) != len(actual):
        raise ValueError(
            "row count mismatch: expected has {} rows, actual has {}".format(
                len(expected), len(actual)
            )
        )
    # Look at the size of corresponding row intersections
    matches_per_row = [np.intersect1d(x, y).size for x, y in zip(expected, actual)]
    return np.sum(matches_per_row) / expected.size

def all_experiments():
    """Yield every benchmark configuration, grouped by algorithm.

    Each item is a tuple
    ``(algorithm, rows, threads, n_neighbors, max_candidates)``.
    The scikit-learn baselines are emitted first (single-threaded only),
    followed by the regular and then the threaded pynndescent runs, whose
    thread counts come from ``cores_powers_of_two()``.
    """
    # (n_neighbors, max_candidates) shared by every experiment.
    knn_settings = (25, 50)

    small_sizes = (1000, 5000, 10000, 20000)
    medium_sizes = small_sizes + (50000, 100000)
    all_sizes = medium_sizes + (1000000,)

    # scikit-learn baselines: one thread; ball tree only on the small sizes.
    baselines = (
        (scikitlearn_brute, medium_sizes),
        (scikitlearn_ball_tree, small_sizes),
    )
    for algorithm, sizes in baselines:
        for rows in sizes:
            yield (algorithm, rows, 1) + knn_settings

    # Regular pynndescent: multiple threads only for the mid-range sizes.
    for rows in all_sizes:
        multi_thread_allowed = 50000 <= rows < 1000000
        for threads in cores_powers_of_two():
            if threads <= 1 or multi_thread_allowed:
                yield (pynndescent_regular, rows, threads) + knn_settings

    # Threaded pynndescent: larger inputs need at least 4 threads, and the
    # largest input is only run with at least n_cores threads.
    for rows in all_sizes:
        for threads in cores_powers_of_two():
            too_few_threads = (rows >= 50000 and threads < 4) or (
                rows >= 1000000 and threads < n_cores
            )
            if not too_few_threads:
                yield (pynndescent_threaded, rows, threads) + knn_settings

def generate_experiments(predicate=None):
    """Yield the experiments from ``all_experiments()`` accepted by ``predicate``.

    Args:
        predicate: Optional callable taking one experiment tuple and
            returning a truthy value to keep it. ``None`` keeps everything.

    Yields:
        The experiment tuples, in the order ``all_experiments()`` produces them.
    """
    if predicate is None:
        yield from all_experiments()
    else:
        yield from (exp for exp in all_experiments() if predicate(exp))

# Modify the predicate to run a subset of experiments. Each experiment is a
# tuple (algorithm, rows, threads, n_neighbors, max_candidates).
# Alternative selections kept for reference:
#predicate = lambda exp: (exp[0] == scikitlearn_brute or exp[0] == pynndescent_threaded) and exp[1] >= 100000 and exp[2] == 8
#predicate = lambda exp: (exp[0] == scikitlearn_brute and exp[1] == 50000) or (exp[0] == pynndescent_regular and exp[1] == 50000) or (exp[0] == pynndescent_threaded and exp[1] == 50000 and exp[2] == 8)
def predicate(exp):
    """Keep the 20000-row runs of brute force and both pynndescent variants."""
    selected = (scikitlearn_brute, pynndescent_regular, pynndescent_threaded)
    return exp[0] in selected and exp[1] == 20000


# Run each selected experiment and print one CSV line per run:
# algorithm,threads,rows,n_neighbors,max_candidates,duration,accuracy
for algorithm, rows, threads, n_neighbors, max_candidates in generate_experiments(predicate):
    if rows > dataset.shape[0]:
        # dataset[:rows] would silently return fewer rows than requested and
        # the result would be reported under the wrong row count, so skip it.
        # The message goes to stderr to keep the CSV on stdout clean.
        print(
            "skipping {} with {} rows: dataset only has {} rows".format(
                algorithm.__name__, rows, dataset.shape[0]
            ),
            file=sys.stderr,
        )
        continue
    indices, distances, t = algorithm(dataset[:rows], threads, n_neighbors, max_candidates)
    # Accuracy is only available once scikitlearn_brute has stored reference
    # indices for this row count; -1 marks "no reference available".
    acc = accuracy(gold[rows], indices) if rows in gold else -1
    print("{},{},{},{},{},{},{}".format(algorithm.__name__, threads, rows, n_neighbors, max_candidates, t, acc))

191 changes: 191 additions & 0 deletions benchmark_results.ipynb

Large diffs are not rendered by default.

15 changes: 15 additions & 0 deletions benchmark_results.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
import matplotlib.pyplot as plt
import pandas as pd

# Plot duration against row count on log-log axes, one curve per
# (algorithm, threads) combination, from the locally recorded results.
results = pd.read_csv('results/results_local_20190227.csv')

curves = (
    ('scikitlearn_brute', 1),
    ('scikitlearn_ball_tree', 1),
    ('pynndescent_regular', 8),
    ('pynndescent_threaded', 1),
    ('pynndescent_threaded', 8),
)

for algorithm, threads in curves:
    # Rows of the results table belonging to this curve.
    selected = (results['algorithm'] == algorithm) & (results['threads'] == threads)
    results_subset = results.loc[selected, ['rows', 'duration']]
    curve_label = '{} (threads={})'.format(algorithm, threads)
    plt.loglog('rows', 'duration', data=results_subset, marker='o', label=curve_label)

plt.title("Nearest neighbor algorithms (D=128, NN=25)")
plt.xlabel('Rows')
plt.ylabel('Duration (s)')
plt.legend()

plt.show()
15 changes: 15 additions & 0 deletions benchmark_results2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
import matplotlib.pyplot as plt
import pandas as pd

# Plot duration against thread count on log-log axes, one curve per
# (algorithm, rows) combination, from the results of the "beefy" run.
results = pd.read_csv('results/results_beefy_20190227.csv')

curves = (
    ('pynndescent_regular', 50000),
    ('pynndescent_regular', 100000),
    ('pynndescent_threaded', 50000),
    ('pynndescent_threaded', 100000),
)

for algorithm, rows in curves:
    # Rows of the results table belonging to this curve.
    selected = (results['algorithm'] == algorithm) & (results['rows'] == rows)
    results_subset = results.loc[selected, ['threads', 'duration']]
    curve_label = '{} {} rows'.format(algorithm, rows)
    plt.loglog('threads', 'duration', data=results_subset, marker='o', label=curve_label)

plt.title("Nearest neighbor algorithms (D=128, NN=25)")
plt.xlabel('Threads')
plt.ylabel('Duration (s)')
plt.legend()

plt.show()

0 comments on commit 5353060

Please sign in to comment.