[MRG] Fix similarity bug in NMSLIB indexer + documentation fixes #2899

Merged: 14 commits, Jul 30, 2020
(Two of the changed files could not be rendered in the GitHub diff view.)
4 changes: 2 additions & 2 deletions docs/src/auto_examples/tutorials/run_annoy.ipynb
@@ -112,7 +112,7 @@
},
"outputs": [],
"source": [
"# Set up the model and vector that we are using in the comparison\nmodel.init_sims()\nannoy_index = AnnoyIndexer(model, 100)\n\n# Dry run to make sure both indexes are fully in RAM\nvector = model.wv.vectors_norm[0]\nmodel.wv.most_similar([vector], topn=5, indexer=annoy_index)\nmodel.wv.most_similar([vector], topn=5)\n\nimport time\nimport numpy as np\n\ndef avg_query_time(annoy_index=None, queries=1000):\n \"\"\"Average query time of a most_similar method over 1000 random queries.\"\"\"\n total_time = 0\n for _ in range(queries):\n rand_vec = model.wv.vectors_norm[np.random.randint(0, len(model.wv))]\n start_time = time.process_time()\n model.wv.most_similar([rand_vec], topn=5, indexer=annoy_index)\n total_time += time.process_time() - start_time\n return total_time / queries\n\nqueries = 1000\n\ngensim_time = avg_query_time(queries=queries)\nannoy_time = avg_query_time(annoy_index, queries=queries)\nprint(\"Gensim (s/query):\\t{0:.5f}\".format(gensim_time))\nprint(\"Annoy (s/query):\\t{0:.5f}\".format(annoy_time))\nspeed_improvement = gensim_time / annoy_time\nprint (\"\\nAnnoy is {0:.2f} times faster on average on this particular run\".format(speed_improvement))"
"# Set up the model and vector that we are using in the comparison\nmodel.init_sims()\nannoy_index = AnnoyIndexer(model, 100)\n\n# Dry run to make sure both indexes are fully in RAM\nnormed_vectors = model.wv.get_normed_vectors()\nvector = normed_vectors[0]\nmodel.wv.most_similar([vector], topn=5, indexer=annoy_index)\nmodel.wv.most_similar([vector], topn=5)\n\nimport time\nimport numpy as np\n\ndef avg_query_time(annoy_index=None, queries=1000):\n \"\"\"Average query time of a most_similar method over 1000 random queries.\"\"\"\n total_time = 0\n for _ in range(queries):\n rand_vec = normed_vectors[np.random.randint(0, len(model.wv))]\n start_time = time.process_time()\n model.wv.most_similar([rand_vec], topn=5, indexer=annoy_index)\n total_time += time.process_time() - start_time\n return total_time / queries\n\nqueries = 1000\n\ngensim_time = avg_query_time(queries=queries)\nannoy_time = avg_query_time(annoy_index, queries=queries)\nprint(\"Gensim (s/query):\\t{0:.5f}\".format(gensim_time))\nprint(\"Annoy (s/query):\\t{0:.5f}\".format(annoy_time))\nspeed_improvement = gensim_time / annoy_time\nprint (\"\\nAnnoy is {0:.2f} times faster on average on this particular run\".format(speed_improvement))"
]
},
{
@@ -234,7 +234,7 @@
},
"outputs": [],
"source": [
"exact_results = [element[0] for element in model.wv.most_similar([model.wv.vectors_norm[0]], topn=100)]\n\nx_values = []\ny_values_init = []\ny_values_accuracy = []\n\nfor x in range(1, 300, 10):\n x_values.append(x)\n start_time = time.time()\n annoy_index = AnnoyIndexer(model, x)\n y_values_init.append(time.time() - start_time)\n approximate_results = model.wv.most_similar([model.wv.vectors_norm[0]], topn=100, indexer=annoy_index)\n top_words = [result[0] for result in approximate_results]\n y_values_accuracy.append(len(set(top_words).intersection(exact_results)))"
"exact_results = [element[0] for element in model.wv.most_similar([normed_vectors[0]], topn=100)]\n\nx_values = []\ny_values_init = []\ny_values_accuracy = []\n\nfor x in range(1, 300, 10):\n x_values.append(x)\n start_time = time.time()\n annoy_index = AnnoyIndexer(model, x)\n y_values_init.append(time.time() - start_time)\n approximate_results = model.wv.most_similar([normed_vectors[0]], topn=100, indexer=annoy_index)\n top_words = [result[0] for result in approximate_results]\n y_values_accuracy.append(len(set(top_words).intersection(exact_results)))"
]
},
{
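Both tutorial files get the same migration: the KeyedVectors.vectors_norm attribute (dropped in gensim 4.x) is replaced by an explicit call to get_normed_vectors(). A minimal sketch of the migrated query pattern, assuming gensim 4.x; the corpus, vector size, and tree count below are illustrative and not part of this PR:

from gensim.models import Word2Vec
from gensim.similarities.annoy import AnnoyIndexer  # gensim 4.x path; 3.x used gensim.similarities.index

# Tiny illustrative corpus
sentences = [
    ["human", "interface", "computer"],
    ["survey", "user", "computer", "system", "response", "time"],
    ["graph", "minors", "trees"],
]
model = Word2Vec(sentences, vector_size=24, min_count=1, epochs=50)

# Old (removed): model.wv.vectors_norm[0]
# New: request the unit-normalized vectors explicitly
normed_vectors = model.wv.get_normed_vectors()
query = normed_vectors[0]

annoy_index = AnnoyIndexer(model, 10)  # 10 trees for the toy model; the tutorial uses 100
print(model.wv.most_similar([query], topn=3, indexer=annoy_index))  # approximate (Annoy)
print(model.wv.most_similar([query], topn=3))                       # exact (brute force)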
9 changes: 5 additions & 4 deletions docs/src/auto_examples/tutorials/run_annoy.py
@@ -120,7 +120,8 @@
annoy_index = AnnoyIndexer(model, 100)

# Dry run to make sure both indexes are fully in RAM
-vector = model.wv.vectors_norm[0]
+normed_vectors = model.wv.get_normed_vectors()
+vector = normed_vectors[0]
model.wv.most_similar([vector], topn=5, indexer=annoy_index)
model.wv.most_similar([vector], topn=5)

@@ -131,7 +132,7 @@ def avg_query_time(annoy_index=None, queries=1000):
"""Average query time of a most_similar method over 1000 random queries."""
total_time = 0
for _ in range(queries):
rand_vec = model.wv.vectors_norm[np.random.randint(0, len(model.wv))]
rand_vec = normed_vectors[np.random.randint(0, len(model.wv))]
start_time = time.process_time()
model.wv.most_similar([rand_vec], topn=5, indexer=annoy_index)
total_time += time.process_time() - start_time
@@ -286,7 +287,7 @@ def f(process_id):
# Build dataset of Initialization times and accuracy measures:
#

-exact_results = [element[0] for element in model.wv.most_similar([model.wv.vectors_norm[0]], topn=100)]
+exact_results = [element[0] for element in model.wv.most_similar([normed_vectors[0]], topn=100)]

x_values = []
y_values_init = []
@@ -297,7 +298,7 @@ def f(process_id):
    start_time = time.time()
    annoy_index = AnnoyIndexer(model, x)
    y_values_init.append(time.time() - start_time)
-    approximate_results = model.wv.most_similar([model.wv.vectors_norm[0]], topn=100, indexer=annoy_index)
+    approximate_results = model.wv.most_similar([normed_vectors[0]], topn=100, indexer=annoy_index)
    top_words = [result[0] for result in approximate_results]
    y_values_accuracy.append(len(set(top_words).intersection(exact_results)))

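The sweep in the hunks above collects x_values (Annoy tree counts), y_values_init (index build times), and y_values_accuracy (overlap of the approximate top-100 with the exact top-100). The tutorial's plotting code lies outside the changed hunks; a minimal sketch of one way to visualize the collected values, assuming matplotlib and the variables built in the loop above:

import matplotlib.pyplot as plt

plt.figure()
plt.plot(x_values, y_values_init)
plt.xlabel("num_trees")
plt.ylabel("Index build time (s)")

plt.figure()
plt.plot(x_values, y_values_accuracy)
plt.xlabel("num_trees")
plt.ylabel("Top-100 overlap with exact search")
plt.show()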
2 changes: 1 addition & 1 deletion docs/src/auto_examples/tutorials/run_annoy.py.md5
@@ -1 +1 @@
-a18f2e2cf524dea755eb70bb385bf7fe
+c6cd2a0225bbe49d97dc66c96d2b7f1c
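The run_annoy.py.md5 companion file appears to be the checksum sphinx-gallery compares against to decide whether the tutorial's generated outputs are stale, so it is bumped together with the script. A hedged sketch of recomputing it, assuming the value is a plain MD5 over the file bytes (sphinx-gallery's exact normalization may differ):

import hashlib

# Path as it appears in this PR
path = "docs/src/auto_examples/tutorials/run_annoy.py"
with open(path, "rb") as fh:
    print(hashlib.md5(fh.read()).hexdigest())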