-
Notifications
You must be signed in to change notification settings - Fork 106
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
4 changed files
with
338 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,117 @@ | ||
# To run the "beefy" experiment | ||
# | ||
# Run on a GCE instance | ||
# n1-standard-64 - 240GB mem, 100GB disk | ||
# gcloud beta compute --project=hca-scale instances create ll-knn --zone=us-east1-b --machine-type=n1-standard-64 --subnet=default --network-tier=PREMIUM --maintenance-policy=MIGRATE --service-account=218219996328-compute@developer.gserviceaccount.com --scopes=https://www.googleapis.com/auth/cloud-platform --image=debian-9-stretch-v20190213 --image-project=debian-cloud --boot-disk-size=100GB --boot-disk-type=pd-standard --boot-disk-device-name=ll-knn | ||
# | ||
# sudo apt-get update && sudo apt-get install -y git python3-pip | ||
# pip3 install numba numpy scipy scikit-learn | ||
# pip3 install git+https://github.com/tomwhite/pynndescent@benchmarks | ||
# pip3 list | ||
# git clone https://github.com/tomwhite/pynndescent | ||
# cd pynndescent | ||
# git checkout benchmarks | ||
# python3 benchmark.py | ||
|
||
import multiprocessing | ||
import os | ||
import time | ||
|
||
import numpy as np | ||
from sklearn.neighbors import NearestNeighbors | ||
|
||
from pynndescent import distances as pynndistances | ||
from pynndescent import NNDescent | ||
from pynndescent import threaded | ||
|
||
# Seed the global NumPy RNG first so the synthetic dataset below is the same
# on every run.
np.random.seed(42)

# N rows of D uniform random features, stored as float32.
N, D = 100000, 128
dataset = np.random.rand(N, D).astype(np.float32)

# Reference neighbor indices keyed by row count; filled in by
# scikitlearn_brute and read back when computing accuracy.
gold = dict()

n_cores = multiprocessing.cpu_count()
|
||
def cores_powers_of_two():
    """Yield 1, 2, 4, ... while the value does not exceed ``n_cores``.

    The first value (1) is always produced, whatever ``n_cores`` is.
    """
    count = 1
    yield count
    # Keep doubling as long as the next power of two still fits.
    while count * 2 <= n_cores:
        count *= 2
        yield count
|
||
def scikitlearn_brute(X, threads=1, n_neighbors=25, max_candidates=50):
    """Time scikit-learn's ``algorithm='brute'`` neighbor search on ``X``.

    ``threads`` and ``max_candidates`` are not used here; they are accepted so
    that every benchmark function can be called with the same arguments.

    Side effect: the resulting indices are stored in ``gold`` under the number
    of rows of ``X``, where the driver looks them up as the accuracy reference.

    Returns a tuple ``(indices, distances, elapsed_seconds)``.
    """
    started = time.time()
    model = NearestNeighbors(n_neighbors=n_neighbors, algorithm='brute')
    model.fit(X)
    distances, indices = model.kneighbors(X)
    elapsed = time.time() - started
    gold[X.shape[0]] = indices
    return indices, distances, elapsed
|
||
def scikitlearn_ball_tree(X, threads=1, n_neighbors=25, max_candidates=50):
    """Time scikit-learn's ``algorithm='ball_tree'`` neighbor search on ``X``.

    ``threads`` and ``max_candidates`` are not used here; they are accepted so
    that every benchmark function can be called with the same arguments.

    Returns a tuple ``(indices, distances, elapsed_seconds)``.
    """
    started = time.time()
    model = NearestNeighbors(n_neighbors=n_neighbors, algorithm='ball_tree')
    model.fit(X)
    distances, indices = model.kneighbors(X)
    elapsed = time.time() - started
    return indices, distances, elapsed
|
||
def pynndescent_regular(X, threads=1, n_neighbors=25, max_candidates=50):
    """Time building an ``NNDescent`` index (``tree_init=False``) over ``X``.

    The requested thread count is exported through the ``NUMBA_NUM_THREADS``
    environment variable before the timer starts.

    Returns a tuple ``(indices, distances, elapsed_seconds)`` where the first
    two items come from the index's ``_neighbor_graph`` attribute.
    """
    # NOTE(review): this is set after pynndescent has already been imported;
    # numba may only read NUMBA_NUM_THREADS at import time -- confirm that
    # the setting actually takes effect here.
    os.environ["NUMBA_NUM_THREADS"] = str(threads)

    started = time.time()
    index = NNDescent(
        X,
        n_neighbors=n_neighbors,
        max_candidates=max_candidates,
        tree_init=False,
    )
    indices, distances = index._neighbor_graph
    elapsed = time.time() - started
    return indices, distances, elapsed
|
||
def pynndescent_threaded(X, threads=1, n_neighbors=25, max_candidates=50):
    """Time the threaded NN-descent implementation on ``X``.

    Uses the "euclidean" entry of pynndescent's named distances with no extra
    distance arguments. The rows of ``X`` are divided into chunks of
    ``X.shape[0] // threads`` rows.

    Returns a tuple ``(indices, distances, elapsed_seconds)``; building the
    descent function is included in the measured time.
    """
    started = time.time()
    threaded_nn_descent = threaded.make_nn_descent(
        pynndistances.named_distances["euclidean"],
        (),
    )
    rows_per_chunk = X.shape[0] // threads
    indices, distances = threaded_nn_descent(
        X,
        n_neighbors=n_neighbors,
        max_candidates=max_candidates,
        rng_state=None,
        chunk_size=rows_per_chunk,
        threads=threads,
    )
    elapsed = time.time() - started
    return indices, distances, elapsed
|
||
def accuracy(expected, actual):
    """Return the fraction of expected neighbors that were actually found.

    Rows of ``expected`` and ``actual`` are compared pairwise: for each pair
    the number of distinct values present in both rows is counted, and the
    total is divided by ``expected.size``.
    """
    overlap_sizes = []
    for truth_row, found_row in zip(expected, actual):
        shared = np.intersect1d(truth_row, found_row)
        overlap_sizes.append(len(shared))
    return np.array(overlap_sizes).sum() / expected.size
|
||
def all_experiments():
    """Yield every benchmark configuration as a 5-tuple.

    Each item is ``(algorithm, rows, threads, n_neighbors, max_candidates)``.
    Experiments are grouped by algorithm, in this order: brute force, ball
    tree, regular pynndescent, threaded pynndescent.
    """
    n_neighbors, max_candidates = 25, 50

    small_sizes = (1000, 5000, 10000, 20000)
    medium_sizes = small_sizes + (50000, 100000)
    large_sizes = medium_sizes + (1000000,)

    # The scikit-learn baselines are only run single-threaded.
    for rows in medium_sizes:
        yield (scikitlearn_brute, rows, 1, n_neighbors, max_candidates)

    for rows in small_sizes:
        yield (scikitlearn_ball_tree, rows, 1, n_neighbors, max_candidates)

    # Regular pynndescent: more than one thread only for 50000 <= rows < 1000000.
    for rows in large_sizes:
        multi_threaded_allowed = 50000 <= rows < 1000000
        for threads in cores_powers_of_two():
            if threads <= 1 or multi_threaded_allowed:
                yield (pynndescent_regular, rows, threads, n_neighbors, max_candidates)

    # Threaded pynndescent: from 50000 rows on at least 4 threads are required,
    # and from 1000000 rows on additionally at least n_cores threads.
    for rows in large_sizes:
        if rows >= 1000000:
            min_threads = max(4, n_cores)
        elif rows >= 50000:
            min_threads = 4
        else:
            min_threads = 1
        for threads in cores_powers_of_two():
            if threads >= min_threads:
                yield (pynndescent_threaded, rows, threads, n_neighbors, max_candidates)
|
||
def generate_experiments(predicate=None):
    """Yield the experiments from ``all_experiments`` accepted by ``predicate``.

    ``predicate`` is called with each experiment tuple and the experiment is
    kept when the result is truthy. With ``predicate=None`` every experiment
    is yielded.
    """
    if predicate is None:
        yield from all_experiments()
        return
    yield from filter(predicate, all_experiments())
|
||
# modify the predicate to run a subset of experiments
#predicate = lambda exp: (exp[0] == scikitlearn_brute or exp[0] == pynndescent_threaded) and exp[1] >= 100000 and exp[2] == 8
#predicate = lambda exp: (exp[0] == scikitlearn_brute and exp[1] == 50000) or (exp[0] == pynndescent_regular and exp[1] == 50000) or (exp[0] == pynndescent_threaded and exp[1] == 50000 and exp[2] == 8)
predicate = lambda exp: (exp[0] == scikitlearn_brute or exp[0] == pynndescent_regular or exp[0] == pynndescent_threaded) and exp[1] == 20000

# Collect the selected experiments before running anything so that an
# impossible configuration is reported immediately instead of hours into a
# benchmark run.
experiments = list(generate_experiments(predicate))

# dataset[:rows] is silently clipped when rows exceeds the dataset length,
# which would report timings for fewer rows under the larger row count.
available_rows = dataset.shape[0]
oversized = sorted({exp[1] for exp in experiments if exp[1] > available_rows})
if oversized:
    raise ValueError(
        "experiments request {} rows but the dataset only has {}; "
        "increase N or narrow the predicate".format(oversized, available_rows)
    )

# One CSV line per experiment:
# algorithm,threads,rows,n_neighbors,max_candidates,duration,accuracy
for algorithm, rows, threads, n_neighbors, max_candidates in experiments:
    indices, distances, t = algorithm(dataset[:rows], threads, n_neighbors, max_candidates)
    # Accuracy is -1 when no reference result has been recorded for this size.
    acc = accuracy(gold[rows], indices) if rows in gold else -1
    print("{},{},{},{},{},{},{}".format(algorithm.__name__, threads, rows, n_neighbors, max_candidates, t, acc))
|
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
import matplotlib.pyplot as plt | ||
import pandas as pd | ||
|
||
# Plot duration against row count (log-log), one line per algorithm/threads pair.
results = pd.read_csv('results/results_local_20190227.csv')

series = (
    ('scikitlearn_brute', 1),
    ('scikitlearn_ball_tree', 1),
    ('pynndescent_regular', 8),
    ('pynndescent_threaded', 1),
    ('pynndescent_threaded', 8),
)

for algorithm, threads in series:
    selected = (results['algorithm'] == algorithm) & (results['threads'] == threads)
    results_subset = results.loc[selected, ['rows', 'duration']]
    plt.loglog(
        'rows',
        'duration',
        data=results_subset,
        marker='o',
        label='{} (threads={})'.format(algorithm, threads),
    )

plt.title("Nearest neighbor algorithms (D=128, NN=25)")
plt.xlabel('Rows')
plt.ylabel('Duration (s)')
plt.legend()

plt.show()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
import matplotlib.pyplot as plt | ||
import pandas as pd | ||
|
||
# Plot duration against thread count (log-log), one line per algorithm/rows pair.
results = pd.read_csv('results/results_beefy_20190227.csv')

series = (
    ('pynndescent_regular', 50000),
    ('pynndescent_regular', 100000),
    ('pynndescent_threaded', 50000),
    ('pynndescent_threaded', 100000),
)

for algorithm, rows in series:
    selected = (results['algorithm'] == algorithm) & (results['rows'] == rows)
    results_subset = results.loc[selected, ['threads', 'duration']]
    plt.loglog(
        'threads',
        'duration',
        data=results_subset,
        marker='o',
        label='{} {} rows'.format(algorithm, rows),
    )

plt.title("Nearest neighbor algorithms (D=128, NN=25)")
plt.xlabel('Threads')
plt.ylabel('Duration (s)')
plt.legend()

plt.show()