diff --git a/docs/notebooks/soft_cosine_benchmark.ipynb b/docs/notebooks/soft_cosine_benchmark.ipynb index 9421b84c17..55673248d6 100644 --- a/docs/notebooks/soft_cosine_benchmark.ipynb +++ b/docs/notebooks/soft_cosine_benchmark.ipynb @@ -58,7 +58,7 @@ "from gensim.similarities import SparseTermSimilarityMatrix\n", "from gensim.similarities import UniformTermSimilarityIndex\n", "from gensim.similarities import LevenshteinSimilarityIndex\n", - "from gensim.models import WordEmbeddingSimilarityIndex\n", + "from gensim.similarities import WordEmbeddingSimilarityIndex\n", "from gensim.utils import simple_preprocess\n", "\n", "RANDOM_SEED = 12345\n", diff --git a/docs/notebooks/soft_cosine_tutorial.ipynb b/docs/notebooks/soft_cosine_tutorial.ipynb index aadbecf6a5..4c7fceb1df 100644 --- a/docs/notebooks/soft_cosine_tutorial.ipynb +++ b/docs/notebooks/soft_cosine_tutorial.ipynb @@ -143,8 +143,8 @@ "%%time\n", "import gensim.downloader as api\n", "\n", - "from gensim.models import WordEmbeddingSimilarityIndex\n", "from gensim.similarities import SparseTermSimilarityMatrix\n", + "from gensim.similarities import WordEmbeddingSimilarityIndex\n", "\n", "w2v_model = api.load(\"glove-wiki-gigaword-50\")\n", "similarity_index = WordEmbeddingSimilarityIndex(w2v_model)\n", @@ -296,7 +296,7 @@ "from gensim.corpora import Dictionary\n", "from gensim.models import TfidfModel\n", "from gensim.models import Word2Vec\n", - "from gensim.models import WordEmbeddingSimilarityIndex\n", + "from gensim.similarities import WordEmbeddingSimilarityIndex\n", "from gensim.similarities import SparseTermSimilarityMatrix\n", "\n", "dictionary = Dictionary(corpus)\n", diff --git a/docs/src/auto_examples/howtos/run_doc.ipynb b/docs/src/auto_examples/howtos/run_doc.ipynb index 48820b593a..7d3be48ac2 100644 --- a/docs/src/auto_examples/howtos/run_doc.ipynb +++ b/docs/src/auto_examples/howtos/run_doc.ipynb @@ -15,7 +15,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "\nHow to Author Gensim Documentation\n==================================\n\nHow to author documentation for Gensim.\n\n" + "\nHow to Author Gensim Documentation\n==================================\n\nHow to author documentation for Gensim.\n" ] }, { @@ -54,7 +54,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Authoring Workflow\n------------------\n\nThere are several ways to author documentation.\nThe simplest and most straightforward is to author your ``script.py`` from scratch.\nYou'll have the following cycle:\n\n1. Make changes\n2. Run ``python script.py``\n3. Check standard output, standard error and return code\n4. If everything works well, stop.\n5. 
Otherwise, go back to step 1).\n\nIf the above is not your cup of tea, you can also author your documentation as a Jupyter notebook.\nThis is a more flexible approach that enables you to tweak parts of the documentation and re-run them as necessary.\n\nOnce you're happy with the notebook, convert it to a script.py.\nThere's a helpful `script `__ that will do it for you.\nTo use it::\n\n python to_python.py < notebook.ipynb > script.py\n\nYou may have to touch up the resulting ``script.py``.\nMore specifically:\n\n- Update the title\n- Update the description\n- Fix any issues that the markdown-to-RST converter could not deal with\n\nOnce your script.py works, put it in a suitable subdirectory.\nPlease don't include your original Jupyter notebook in the repository - we won't be using it.\n\n" + "Authoring Workflow\n------------------\n\nThere are several ways to author documentation.\nThe simplest and most straightforward is to author your ``script.py`` from scratch.\nYou'll have the following cycle:\n\n1. Make changes\n2. Run ``python script.py``\n3. Check standard output, standard error and return code\n4. If everything works well, stop.\n5. Otherwise, go back to step 1).\n\nIf the above is not your cup of tea, you can also author your documentation as a Jupyter notebook.\nThis is a more flexible approach that enables you to tweak parts of the documentation and re-run them as necessary.\n\nOnce you're happy with the notebook, convert it to a script.py.\nThere's a helpful `script `__ that will do it for you.\nTo use it::\n\n python to_python.py < notebook.ipynb > script.py\n\nYou may have to touch up the resulting ``script.py``.\nMore specifically:\n\n- Update the title\n- Update the description\n- Fix any issues that the markdown-to-RST converter could not deal with\n\nOnce your script.py works, put it in a suitable subdirectory.\nPlease don't include your original Jupyter notebook in the repository - we won't be using it.\n\n" ] }, { @@ -81,7 +81,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.5" + "version": "3.7.3" } }, "nbformat": 4, diff --git a/docs/src/auto_examples/howtos/run_doc.py b/docs/src/auto_examples/howtos/run_doc.py index 7e07ab74bd..15e870f1be 100644 --- a/docs/src/auto_examples/howtos/run_doc.py +++ b/docs/src/auto_examples/howtos/run_doc.py @@ -111,7 +111,7 @@ # This is a more flexible approach that enables you to tweak parts of the documentation and re-run them as necessary. # # Once you're happy with the notebook, convert it to a script.py. -# There's a helpful `script `__ that will do it for you. +# There's a helpful `script `__ that will do it for you. # To use it:: # # python to_python.py < notebook.ipynb > script.py diff --git a/docs/src/auto_examples/howtos/run_doc.py.md5 b/docs/src/auto_examples/howtos/run_doc.py.md5 index d81fe9d241..979aa0eb5e 100644 --- a/docs/src/auto_examples/howtos/run_doc.py.md5 +++ b/docs/src/auto_examples/howtos/run_doc.py.md5 @@ -1 +1 @@ -b3db0b66859316de13e1a36fa6181657 \ No newline at end of file +512a76ce743dd12482d21784a76b60fe \ No newline at end of file diff --git a/docs/src/auto_examples/howtos/run_doc.rst b/docs/src/auto_examples/howtos/run_doc.rst index 88d1904295..c763ca1de0 100644 --- a/docs/src/auto_examples/howtos/run_doc.rst +++ b/docs/src/auto_examples/howtos/run_doc.rst @@ -1,12 +1,10 @@ -.. only:: html - - .. note:: - :class: sphx-glr-download-link-note +.. note:: + :class: sphx-glr-download-link-note - Click :ref:`here ` to download the full example code - .. 
rst-class:: sphx-glr-example-title + Click :ref:`here ` to download the full example code +.. rst-class:: sphx-glr-example-title - .. _sphx_glr_auto_examples_howtos_run_doc.py: +.. _sphx_glr_auto_examples_howtos_run_doc.py: How to Author Gensim Documentation @@ -80,15 +78,6 @@ At the very top, you need a docstring describing what your script does. -.. rst-class:: sphx-glr-script-out - - Out: - - .. code-block:: none - - - '\nTitle\n=====\n\nBrief description.\n' - The title is what will show up in the gallery. @@ -135,7 +124,7 @@ If the above is not your cup of tea, you can also author your documentation as a This is a more flexible approach that enables you to tweak parts of the documentation and re-run them as necessary. Once you're happy with the notebook, convert it to a script.py. -There's a helpful `script `__ that will do it for you. +There's a helpful `script `__ that will do it for you. To use it:: python to_python.py < notebook.ipynb > script.py @@ -207,9 +196,9 @@ At that stage, give yourself a pat on the back: you're done! .. rst-class:: sphx-glr-timing - **Total running time of the script:** ( 0 minutes 0.112 seconds) + **Total running time of the script:** ( 0 minutes 1.226 seconds) -**Estimated memory usage:** 6 MB +**Estimated memory usage:** 9 MB .. _sphx_glr_download_auto_examples_howtos_run_doc.py: @@ -222,13 +211,13 @@ At that stage, give yourself a pat on the back: you're done! - .. container:: sphx-glr-download sphx-glr-download-python + .. container:: sphx-glr-download :download:`Download Python source code: run_doc.py ` - .. container:: sphx-glr-download sphx-glr-download-jupyter + .. container:: sphx-glr-download :download:`Download Jupyter notebook: run_doc.ipynb ` diff --git a/docs/src/auto_examples/howtos/sg_execution_times.rst b/docs/src/auto_examples/howtos/sg_execution_times.rst index 628fe13c1f..ec9ea90bd7 100644 --- a/docs/src/auto_examples/howtos/sg_execution_times.rst +++ b/docs/src/auto_examples/howtos/sg_execution_times.rst @@ -5,14 +5,9 @@ Computation times ================= -**52:12.903** total execution time for **auto_examples_howtos** files: +**00:01.226** total execution time for **auto_examples_howtos** files: -+----------------------------------------------------------------------------------------+-----------+-----------+ -| :ref:`sphx_glr_auto_examples_howtos_run_doc2vec_imdb.py` (``run_doc2vec_imdb.py``) | 52:12.903 | 3494.0 MB | -+----------------------------------------------------------------------------------------+-----------+-----------+ -| :ref:`sphx_glr_auto_examples_howtos_run_compare_lda.py` (``run_compare_lda.py``) | 00:00.000 | 0.0 MB | -+----------------------------------------------------------------------------------------+-----------+-----------+ -| :ref:`sphx_glr_auto_examples_howtos_run_doc.py` (``run_doc.py``) | 00:00.000 | 0.0 MB | -+----------------------------------------------------------------------------------------+-----------+-----------+ -| :ref:`sphx_glr_auto_examples_howtos_run_downloader_api.py` (``run_downloader_api.py``) | 00:00.000 | 0.0 MB | -+----------------------------------------------------------------------------------------+-----------+-----------+ +- **00:01.226**: :ref:`sphx_glr_auto_examples_howtos_run_doc.py` (``run_doc.py``) +- **00:00.000**: :ref:`sphx_glr_auto_examples_howtos_run_compare_lda.py` (``run_compare_lda.py``) +- **00:00.000**: :ref:`sphx_glr_auto_examples_howtos_run_doc2vec_imdb.py` (``run_doc2vec_imdb.py``) +- **00:00.000**: :ref:`sphx_glr_auto_examples_howtos_run_downloader_api.py` 
(``run_downloader_api.py``) diff --git a/docs/src/auto_examples/index.rst b/docs/src/auto_examples/index.rst index ca3c1ec019..1ec9276242 100644 --- a/docs/src/auto_examples/index.rst +++ b/docs/src/auto_examples/index.rst @@ -13,7 +13,7 @@ If you're thinking about contributing documentation, please see :ref:`sphx_glr_a .. raw:: html -
+
@@ -33,10 +33,9 @@ Understanding this functionality is vital for using gensim effectively. .. only:: html - .. figure:: /auto_examples/core/images/thumb/sphx_glr_run_core_concepts_thumb.png - :alt: Core Concepts + .. figure:: /auto_examples/core/images/thumb/sphx_glr_run_core_concepts_thumb.png - :ref:`sphx_glr_auto_examples_core_run_core_concepts.py` + :ref:`sphx_glr_auto_examples_core_run_core_concepts.py` .. raw:: html @@ -54,10 +53,9 @@ Understanding this functionality is vital for using gensim effectively. .. only:: html - .. figure:: /auto_examples/core/images/thumb/sphx_glr_run_corpora_and_vector_spaces_thumb.png - :alt: Corpora and Vector Spaces + .. figure:: /auto_examples/core/images/thumb/sphx_glr_run_corpora_and_vector_spaces_thumb.png - :ref:`sphx_glr_auto_examples_core_run_corpora_and_vector_spaces.py` + :ref:`sphx_glr_auto_examples_core_run_corpora_and_vector_spaces.py` .. raw:: html @@ -71,14 +69,13 @@ Understanding this functionality is vital for using gensim effectively. .. raw:: html -
+
.. only:: html - .. figure:: /auto_examples/core/images/thumb/sphx_glr_run_topics_and_transformations_thumb.png - :alt: Topics and Transformations + .. figure:: /auto_examples/core/images/thumb/sphx_glr_run_topics_and_transformations_thumb.png - :ref:`sphx_glr_auto_examples_core_run_topics_and_transformations.py` + :ref:`sphx_glr_auto_examples_core_run_topics_and_transformations.py` .. raw:: html @@ -92,14 +89,13 @@ Understanding this functionality is vital for using gensim effectively. .. raw:: html -
+
.. only:: html - .. figure:: /auto_examples/core/images/thumb/sphx_glr_run_similarity_queries_thumb.png - :alt: Similarity Queries + .. figure:: /auto_examples/core/images/thumb/sphx_glr_run_similarity_queries_thumb.png - :ref:`sphx_glr_auto_examples_core_run_similarity_queries.py` + :ref:`sphx_glr_auto_examples_core_run_similarity_queries.py` .. raw:: html @@ -112,7 +108,7 @@ Understanding this functionality is vital for using gensim effectively. /auto_examples/core/run_similarity_queries .. raw:: html -
+
@@ -131,10 +127,9 @@ Learning-oriented lessons that introduce a particular gensim feature, e.g. a mod .. only:: html - .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_word2vec_thumb.png - :alt: Word2Vec Model + .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_word2vec_thumb.png - :ref:`sphx_glr_auto_examples_tutorials_run_word2vec.py` + :ref:`sphx_glr_auto_examples_tutorials_run_word2vec.py` .. raw:: html @@ -152,10 +147,9 @@ Learning-oriented lessons that introduce a particular gensim feature, e.g. a mod .. only:: html - .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_doc2vec_lee_thumb.png - :alt: Doc2Vec Model + .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_doc2vec_lee_thumb.png - :ref:`sphx_glr_auto_examples_tutorials_run_doc2vec_lee.py` + :ref:`sphx_glr_auto_examples_tutorials_run_doc2vec_lee.py` .. raw:: html @@ -169,14 +163,13 @@ Learning-oriented lessons that introduce a particular gensim feature, e.g. a mod .. raw:: html -
+
.. only:: html - .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_fasttext_thumb.png - :alt: FastText Model + .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_fasttext_thumb.png - :ref:`sphx_glr_auto_examples_tutorials_run_fasttext.py` + :ref:`sphx_glr_auto_examples_tutorials_run_fasttext.py` .. raw:: html @@ -190,14 +183,13 @@ Learning-oriented lessons that introduce a particular gensim feature, e.g. a mod .. raw:: html -
+
.. only:: html - .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_annoy_thumb.png - :alt: Fast Similarity Queries with Annoy and Word2Vec + .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_annoy_thumb.png - :ref:`sphx_glr_auto_examples_tutorials_run_annoy.py` + :ref:`sphx_glr_auto_examples_tutorials_run_annoy.py` .. raw:: html @@ -215,10 +207,9 @@ Learning-oriented lessons that introduce a particular gensim feature, e.g. a mod .. only:: html - .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_lda_thumb.png - :alt: LDA Model + .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_lda_thumb.png - :ref:`sphx_glr_auto_examples_tutorials_run_lda.py` + :ref:`sphx_glr_auto_examples_tutorials_run_lda.py` .. raw:: html @@ -236,10 +227,9 @@ Learning-oriented lessons that introduce a particular gensim feature, e.g. a mod .. only:: html - .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_wmd_thumb.png - :alt: Word Mover's Distance + .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_wmd_thumb.png - :ref:`sphx_glr_auto_examples_tutorials_run_wmd.py` + :ref:`sphx_glr_auto_examples_tutorials_run_wmd.py` .. raw:: html @@ -250,9 +240,29 @@ Learning-oriented lessons that introduce a particular gensim feature, e.g. a mod :hidden: /auto_examples/tutorials/run_wmd + +.. raw:: html + +
+ +.. only:: html + + .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_scm_thumb.png + + :ref:`sphx_glr_auto_examples_tutorials_run_scm.py` + +.. raw:: html + +
+ + +.. toctree:: + :hidden: + + /auto_examples/tutorials/run_scm .. raw:: html -
+
@@ -267,14 +277,13 @@ These **goal-oriented guides** demonstrate how to **solve a specific problem** u .. raw:: html -
+
.. only:: html - .. figure:: /auto_examples/howtos/images/thumb/sphx_glr_run_downloader_api_thumb.png - :alt: How to download pre-trained models and corpora + .. figure:: /auto_examples/howtos/images/thumb/sphx_glr_run_downloader_api_thumb.png - :ref:`sphx_glr_auto_examples_howtos_run_downloader_api.py` + :ref:`sphx_glr_auto_examples_howtos_run_downloader_api.py` .. raw:: html @@ -288,14 +297,13 @@ These **goal-oriented guides** demonstrate how to **solve a specific problem** u .. raw:: html -
+
.. only:: html - .. figure:: /auto_examples/howtos/images/thumb/sphx_glr_run_doc_thumb.png - :alt: How to Author Gensim Documentation + .. figure:: /auto_examples/howtos/images/thumb/sphx_glr_run_doc_thumb.png - :ref:`sphx_glr_auto_examples_howtos_run_doc.py` + :ref:`sphx_glr_auto_examples_howtos_run_doc.py` .. raw:: html @@ -313,10 +321,9 @@ These **goal-oriented guides** demonstrate how to **solve a specific problem** u .. only:: html - .. figure:: /auto_examples/howtos/images/thumb/sphx_glr_run_doc2vec_imdb_thumb.png - :alt: How to reproduce the doc2vec 'Paragraph Vector' paper + .. figure:: /auto_examples/howtos/images/thumb/sphx_glr_run_doc2vec_imdb_thumb.png - :ref:`sphx_glr_auto_examples_howtos_run_doc2vec_imdb.py` + :ref:`sphx_glr_auto_examples_howtos_run_doc2vec_imdb.py` .. raw:: html @@ -334,10 +341,9 @@ These **goal-oriented guides** demonstrate how to **solve a specific problem** u .. only:: html - .. figure:: /auto_examples/howtos/images/thumb/sphx_glr_run_compare_lda_thumb.png - :alt: How to Compare LDA Models + .. figure:: /auto_examples/howtos/images/thumb/sphx_glr_run_compare_lda_thumb.png - :ref:`sphx_glr_auto_examples_howtos_run_compare_lda.py` + :ref:`sphx_glr_auto_examples_howtos_run_compare_lda.py` .. raw:: html @@ -350,7 +356,7 @@ These **goal-oriented guides** demonstrate how to **solve a specific problem** u /auto_examples/howtos/run_compare_lda .. raw:: html -
+
@@ -393,7 +399,7 @@ Blog posts, tutorial videos, hackathons and other useful Gensim resources, from .. raw:: html -
+
@@ -403,15 +409,15 @@ Blog posts, tutorial videos, hackathons and other useful Gensim resources, from :class: sphx-glr-footer-gallery - .. container:: sphx-glr-download sphx-glr-download-python + .. container:: sphx-glr-download - :download:`Download all examples in Python source code: auto_examples_python.zip ` + :download:`Download all examples in Python source code: auto_examples_python.zip ` - .. container:: sphx-glr-download sphx-glr-download-jupyter + .. container:: sphx-glr-download - :download:`Download all examples in Jupyter notebooks: auto_examples_jupyter.zip ` + :download:`Download all examples in Jupyter notebooks: auto_examples_jupyter.zip ` .. only:: html diff --git a/docs/src/auto_examples/tutorials/images/sphx_glr_run_scm_001.png b/docs/src/auto_examples/tutorials/images/sphx_glr_run_scm_001.png new file mode 100644 index 0000000000..327697506e Binary files /dev/null and b/docs/src/auto_examples/tutorials/images/sphx_glr_run_scm_001.png differ diff --git a/docs/src/auto_examples/tutorials/images/sphx_glr_run_wmd_001.png b/docs/src/auto_examples/tutorials/images/sphx_glr_run_wmd_001.png index 779c1df369..7d7ea7db56 100644 Binary files a/docs/src/auto_examples/tutorials/images/sphx_glr_run_wmd_001.png and b/docs/src/auto_examples/tutorials/images/sphx_glr_run_wmd_001.png differ diff --git a/docs/src/auto_examples/tutorials/images/thumb/sphx_glr_run_scm_thumb.png b/docs/src/auto_examples/tutorials/images/thumb/sphx_glr_run_scm_thumb.png new file mode 100644 index 0000000000..f6a9fbd4b1 Binary files /dev/null and b/docs/src/auto_examples/tutorials/images/thumb/sphx_glr_run_scm_thumb.png differ diff --git a/docs/src/auto_examples/tutorials/images/thumb/sphx_glr_run_wmd_thumb.png b/docs/src/auto_examples/tutorials/images/thumb/sphx_glr_run_wmd_thumb.png index 78c25f381e..381d4a9d59 100644 Binary files a/docs/src/auto_examples/tutorials/images/thumb/sphx_glr_run_wmd_thumb.png and b/docs/src/auto_examples/tutorials/images/thumb/sphx_glr_run_wmd_thumb.png differ diff --git a/docs/src/auto_examples/tutorials/run_scm.ipynb b/docs/src/auto_examples/tutorials/run_scm.ipynb new file mode 100644 index 0000000000..91fb6f86bd --- /dev/null +++ b/docs/src/auto_examples/tutorials/run_scm.ipynb @@ -0,0 +1,176 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "%matplotlib inline" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\nSoft Cosine Measure\n===================\n\nDemonstrates using Gensim's implementation of the SCM.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Soft Cosine Measure (SCM) is a promising new tool in machine learning that\nallows us to submit a query and return the most relevant documents. This\ntutorial introduces SCM and shows how you can compute the SCM similarities\nbetween two documents using the ``inner_product`` method.\n\nSoft Cosine Measure basics\n--------------------------\n\nSoft Cosine Measure (SCM) is a method that allows us to assess the similarity\nbetween two documents in a meaningful way, even when they have no words in\ncommon. It uses a measure of similarity between words, which can be derived\n[2] using word2vec [4] vector embeddings of words. It has been shown to\noutperform many of the state-of-the-art methods in the semantic text\nsimilarity task in the context of community question answering [2].\n\n\nSCM is illustrated below for two very similar sentences. 
The sentences have\nno words in common, but by modeling synonymy, SCM is able to accurately\nmeasure the similarity between the two sentences. The method also uses the\nbag-of-words vector representation of the documents (simply put, the word's\nfrequencies in the documents). The intuition behind the method is that we\ncompute standard cosine similarity assuming that the document vectors are\nexpressed in a non-orthogonal basis, where the angle between two basis\nvectors is derived from the angle between the word2vec embeddings of the\ncorresponding words.\n\n\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "import matplotlib.pyplot as plt\nimport matplotlib.image as mpimg\nimg = mpimg.imread('scm-hello.png')\nimgplot = plt.imshow(img)\nplt.axis('off')\nplt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "This method was perhaps first introduced in the article \u201cSoft Similarity and\nSoft Cosine Measure: Similarity of Features in Vector Space Model\u201d by Grigori\nSidorov, Alexander Gelbukh, Helena Gomez-Adorno, and David Pinto.\n\nIn this tutorial, we will learn how to use Gensim's SCM functionality, which\nconsists of the ``inner_product`` method for one-off computation, and the\n``SoftCosineSimilarity`` class for corpus-based similarity queries.\n\n.. Important::\n If you use Gensim's SCM functionality, please consider citing [1], [2] and [3].\n\nComputing the Soft Cosine Measure\n---------------------------------\nTo use SCM, you need some existing word embeddings.\nYou could train your own Word2Vec model, but that is beyond the scope of this tutorial\n(check out `sphx_glr_auto_examples_tutorials_run_word2vec.py` if you're interested).\nFor this tutorial, we'll be using an existing Word2Vec model.\n\nLet's take some sentences to compute the similarity between.\n\n\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Initialize logging.\nimport logging\nlogging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)\n\nsentence_obama = 'Obama speaks to the media in Illinois'\nsentence_president = 'The president greets the press in Chicago'\nsentence_orange = 'Oranges are my favorite fruit'" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The first two sentences have very similar content, and as such the\nSCM should be high. 
By contrast, the third sentence is unrelated to the first\ntwo and the SCM should be low.\n\nBefore we compute the SCM, we want to remove stopwords (\"the\", \"to\", etc.),\nas these do not contribute a lot to the information in the sentences.\n\n\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Import and download stopwords from NLTK.\nfrom nltk.corpus import stopwords\nfrom nltk import download\ndownload('stopwords')  # Download stopwords list.\nstop_words = stopwords.words('english')\n\ndef preprocess(sentence):\n    return [w for w in sentence.lower().split() if w not in stop_words]\n\nsentence_obama = preprocess(sentence_obama)\nsentence_president = preprocess(sentence_president)\nsentence_orange = preprocess(sentence_orange)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Next, we will build a dictionary and a TF-IDF model, and we will convert the\nsentences to the bag-of-words format.\n\n\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "from gensim.corpora import Dictionary\ndocuments = [sentence_obama, sentence_president, sentence_orange]\ndictionary = Dictionary(documents)\n\nsentence_obama = dictionary.doc2bow(sentence_obama)\nsentence_president = dictionary.doc2bow(sentence_president)\nsentence_orange = dictionary.doc2bow(sentence_orange)\n\nfrom gensim.models import TfidfModel\ndocuments = [sentence_obama, sentence_president, sentence_orange]\ntfidf = TfidfModel(documents)\n\nsentence_obama = tfidf[sentence_obama]\nsentence_president = tfidf[sentence_president]\nsentence_orange = tfidf[sentence_orange]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Now, as mentioned earlier, we will be using some downloaded pre-trained\nembeddings. We load these into a Gensim Word2Vec model class and we build\na term similarity matrix using the embeddings.\n\n.. Important::\n  The embeddings we have chosen here require a lot of memory.\n\n\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "import gensim.downloader as api\nmodel = api.load('word2vec-google-news-300')\n\nfrom gensim.similarities import SparseTermSimilarityMatrix, WordEmbeddingSimilarityIndex\ntermsim_index = WordEmbeddingSimilarityIndex(model)\ntermsim_matrix = SparseTermSimilarityMatrix(termsim_index, dictionary, tfidf)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "So let's compute SCM using the ``inner_product`` method.\n\n\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "similarity = termsim_matrix.inner_product(sentence_obama, sentence_president, normalized=(True, True))\nprint('similarity = %.4f' % similarity)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Let's try the same thing with two completely unrelated sentences.\nNotice that the similarity is smaller.\n\n\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "similarity = termsim_matrix.inner_product(sentence_obama, sentence_orange, normalized=(True, True))\nprint('similarity = %.4f' % similarity)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "References\n----------\n\n1. 
Grigori Sidorov et al. *Soft Similarity and Soft Cosine Measure: Similarity of Features in Vector Space Model*, 2014.\n2. Delphine Charlet and Geraldine Damnati. *SimBow at SemEval-2017 Task 3: Soft-Cosine Semantic Similarity between Questions for Community Question Answering*, 2017.\n3. V\u00edt Novotn\u00fd. *Implementation Notes for the Soft Cosine Measure*, 2018.\n4. Tom\u00e1\u0161 Mikolov et al. *Efficient Estimation of Word Representations in Vector Space*, 2013.\n\n\n" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.3" } }, "nbformat": 4, "nbformat_minor": 0 +} \ No newline at end of file diff --git a/docs/src/auto_examples/tutorials/run_scm.py b/docs/src/auto_examples/tutorials/run_scm.py new file mode 100644 index 0000000000..d57c9f6a1e --- /dev/null +++ b/docs/src/auto_examples/tutorials/run_scm.py @@ -0,0 +1,153 @@ +r""" +Soft Cosine Measure +=================== + +Demonstrates using Gensim's implementation of the SCM. + +""" + +############################################################################### +# Soft Cosine Measure (SCM) is a promising new tool in machine learning that +# allows us to submit a query and return the most relevant documents. This +# tutorial introduces SCM and shows how you can compute the SCM similarities +# between two documents using the ``inner_product`` method. +# +# Soft Cosine Measure basics +# -------------------------- +# +# Soft Cosine Measure (SCM) is a method that allows us to assess the similarity +# between two documents in a meaningful way, even when they have no words in +# common. It uses a measure of similarity between words, which can be derived +# [2] using word2vec [4] vector embeddings of words. It has been shown to +# outperform many of the state-of-the-art methods in the semantic text +# similarity task in the context of community question answering [2]. +# +# +# SCM is illustrated below for two very similar sentences. The sentences have +# no words in common, but by modeling synonymy, SCM is able to accurately +# measure the similarity between the two sentences. The method also uses the +# bag-of-words vector representation of the documents (simply put, the word's +# frequencies in the documents). The intuition behind the method is that we +# compute standard cosine similarity assuming that the document vectors are +# expressed in a non-orthogonal basis, where the angle between two basis +# vectors is derived from the angle between the word2vec embeddings of the +# corresponding words. +# + +import matplotlib.pyplot as plt +import matplotlib.image as mpimg +img = mpimg.imread('scm-hello.png') +imgplot = plt.imshow(img) +plt.axis('off') +plt.show() + +############################################################################### +# This method was perhaps first introduced in the article “Soft Similarity and +# Soft Cosine Measure: Similarity of Features in Vector Space Model” by Grigori +# Sidorov, Alexander Gelbukh, Helena Gomez-Adorno, and David Pinto. +# +# In this tutorial, we will learn how to use Gensim's SCM functionality, which +# consists of the ``inner_product`` method for one-off computation, and the +# ``SoftCosineSimilarity`` class for corpus-based similarity queries. +# +# .. 
Important:: +#    If you use Gensim's SCM functionality, please consider citing [1], [2] and [3]. +# +# Computing the Soft Cosine Measure +# --------------------------------- +# To use SCM, you need some existing word embeddings. +# You could train your own Word2Vec model, but that is beyond the scope of this tutorial +# (check out :ref:`sphx_glr_auto_examples_tutorials_run_word2vec.py` if you're interested). +# For this tutorial, we'll be using an existing Word2Vec model. +# +# Let's take some sentences to compute the similarity between. +# + +# Initialize logging. +import logging +logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) + +sentence_obama = 'Obama speaks to the media in Illinois' +sentence_president = 'The president greets the press in Chicago' +sentence_orange = 'Oranges are my favorite fruit' + +############################################################################### +# The first two sentences have very similar content, and as such the +# SCM should be high. By contrast, the third sentence is unrelated to the first +# two and the SCM should be low. +# +# Before we compute the SCM, we want to remove stopwords ("the", "to", etc.), +# as these do not contribute a lot to the information in the sentences. +# + +# Import and download stopwords from NLTK. +from nltk.corpus import stopwords +from nltk import download +download('stopwords')  # Download stopwords list. +stop_words = stopwords.words('english') + +def preprocess(sentence): +    return [w for w in sentence.lower().split() if w not in stop_words] + +sentence_obama = preprocess(sentence_obama) +sentence_president = preprocess(sentence_president) +sentence_orange = preprocess(sentence_orange) + +############################################################################### +# Next, we will build a dictionary and a TF-IDF model, and we will convert the +# sentences to the bag-of-words format. +# +from gensim.corpora import Dictionary +documents = [sentence_obama, sentence_president, sentence_orange] +dictionary = Dictionary(documents) + +sentence_obama = dictionary.doc2bow(sentence_obama) +sentence_president = dictionary.doc2bow(sentence_president) +sentence_orange = dictionary.doc2bow(sentence_orange) + +from gensim.models import TfidfModel +documents = [sentence_obama, sentence_president, sentence_orange] +tfidf = TfidfModel(documents) + +sentence_obama = tfidf[sentence_obama] +sentence_president = tfidf[sentence_president] +sentence_orange = tfidf[sentence_orange] + +############################################################################### +# Now, as mentioned earlier, we will be using some downloaded pre-trained +# embeddings. We load these into a Gensim Word2Vec model class and we build +# a term similarity matrix using the embeddings. +# +# .. Important:: +#    The embeddings we have chosen here require a lot of memory. +# +import gensim.downloader as api +model = api.load('word2vec-google-news-300') + +from gensim.similarities import SparseTermSimilarityMatrix, WordEmbeddingSimilarityIndex +termsim_index = WordEmbeddingSimilarityIndex(model) +termsim_matrix = SparseTermSimilarityMatrix(termsim_index, dictionary, tfidf) + +############################################################################### +# So let's compute SCM using the ``inner_product`` method. 
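+# The call below evaluates the inner product x^T * S * y, where x and y are
+# the two bag-of-words vectors and S is the sparse term similarity matrix;
+# with ``normalized=(True, True)``, both vectors are normalized as well, so
+# the result is the soft cosine similarity, a value between -1.0 and 1.0.
+#
+# For ranking a whole corpus of documents against a query, a minimal sketch
+# using the ``SoftCosineSimilarity`` class mentioned above might look like
+# this, assuming ``tfidf_corpus`` is a list of TF-IDF bag-of-words vectors
+# and ``query`` is one such vector (both are placeholders, not defined in
+# this script):
+#
+#     from gensim.similarities import SoftCosineSimilarity
+#     docsim_index = SoftCosineSimilarity(tfidf_corpus, termsim_matrix, num_best=10)
+#     best_matches = docsim_index[query]  # (document index, similarity) pairs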
+# +similarity = termsim_matrix.inner_product(sentence_obama, sentence_president, normalized=(True, True)) +print('similarity = %.4f' % similarity) + +############################################################################### +# Let's try the same thing with two completely unrelated sentences. +# Notice that the similarity is smaller. +# +similarity = termsim_matrix.inner_product(sentence_obama, sentence_orange, normalized=(True, True)) +print('similarity = %.4f' % similarity) + +############################################################################### +# +# References +# ---------- +# +# 1. Grigori Sidorov et al. *Soft Similarity and Soft Cosine Measure: Similarity of Features in Vector Space Model*, 2014. +# 2. Delphine Charlet and Geraldine Damnati. *SimBow at SemEval-2017 Task 3: Soft-Cosine Semantic Similarity between Questions for Community Question Answering*, 2017. +# 3. Vít Novotný. *Implementation Notes for the Soft Cosine Measure*, 2018. +# 4. Tomáš Mikolov et al. *Efficient Estimation of Word Representations in Vector Space*, 2013. +# diff --git a/docs/src/auto_examples/tutorials/run_scm.py.md5 b/docs/src/auto_examples/tutorials/run_scm.py.md5 new file mode 100644 index 0000000000..c6a7a736cb --- /dev/null +++ b/docs/src/auto_examples/tutorials/run_scm.py.md5 @@ -0,0 +1 @@ +0bbd28e31784997fb257771856183106 \ No newline at end of file diff --git a/docs/src/auto_examples/tutorials/run_scm.rst b/docs/src/auto_examples/tutorials/run_scm.rst new file mode 100644 index 0000000000..ee5bd4fc6c --- /dev/null +++ b/docs/src/auto_examples/tutorials/run_scm.rst @@ -0,0 +1,313 @@ +.. note:: + :class: sphx-glr-download-link-note + + Click :ref:`here ` to download the full example code +.. rst-class:: sphx-glr-example-title + +.. _sphx_glr_auto_examples_tutorials_run_scm.py: + + +Soft Cosine Measure +=================== + +Demonstrates using Gensim's implementation of the SCM. + +Soft Cosine Measure (SCM) is a promising new tool in machine learning that +allows us to submit a query and return the most relevant documents. This +tutorial introduces SCM and shows how you can compute the SCM similarities +between two documents using the ``inner_product`` method. + +Soft Cosine Measure basics +-------------------------- + +Soft Cosine Measure (SCM) is a method that allows us to assess the similarity +between two documents in a meaningful way, even when they have no words in +common. It uses a measure of similarity between words, which can be derived +[2] using word2vec [4] vector embeddings of words. It has been shown to +outperform many of the state-of-the-art methods in the semantic text +similarity task in the context of community question answering [2]. + + +SCM is illustrated below for two very similar sentences. The sentences have +no words in common, but by modeling synonymy, SCM is able to accurately +measure the similarity between the two sentences. The method also uses the +bag-of-words vector representation of the documents (simply put, the word's +frequencies in the documents). The intuition behind the method is that we +compute standard cosine similarity assuming that the document vectors are +expressed in a non-orthogonal basis, where the angle between two basis +vectors is derived from the angle between the word2vec embeddings of the +corresponding words. + + + +.. code-block:: default + + + import matplotlib.pyplot as plt + import matplotlib.image as mpimg + img = mpimg.imread('scm-hello.png') + imgplot = plt.imshow(img) + plt.axis('off') + plt.show() + + + + +.. 
image:: /auto_examples/tutorials/images/sphx_glr_run_scm_001.png + :class: sphx-glr-single-img + + + + +This method was perhaps first introduced in the article “Soft Similarity and +Soft Cosine Measure: Similarity of Features in Vector Space Model” by Grigori +Sidorov, Alexander Gelbukh, Helena Gomez-Adorno, and David Pinto. + +In this tutorial, we will learn how to use Gensim's SCM functionality, which +consists of the ``inner_product`` method for one-off computation, and the +``SoftCosineSimilarity`` class for corpus-based similarity queries. + +.. Important:: + If you use Gensim's SCM functionality, please consider citing [1], [2] and [3]. + +Computing the Soft Cosine Measure +--------------------------------- +To use SCM, you need some existing word embeddings. +You could train your own Word2Vec model, but that is beyond the scope of this tutorial +(check out :ref:`sphx_glr_auto_examples_tutorials_run_word2vec.py` if you're interested). +For this tutorial, we'll be using an existing Word2Vec model. + +Let's take some sentences to compute the similarity between. + + + +.. code-block:: default + + + # Initialize logging. + import logging + logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) + + sentence_obama = 'Obama speaks to the media in Illinois' + sentence_president = 'The president greets the press in Chicago' + sentence_orange = 'Oranges are my favorite fruit' + + + + + + + +The first two sentences have very similar content, and as such the +SCM should be high. By contrast, the third sentence is unrelated to the first +two and the SCM should be low. + +Before we compute the SCM, we want to remove stopwords ("the", "to", etc.), +as these do not contribute a lot to the information in the sentences. + + + +.. code-block:: default + + + # Import and download stopwords from NLTK. + from nltk.corpus import stopwords + from nltk import download + download('stopwords')  # Download stopwords list. + stop_words = stopwords.words('english') + + def preprocess(sentence): + return [w for w in sentence.lower().split() if w not in stop_words] + + sentence_obama = preprocess(sentence_obama) + sentence_president = preprocess(sentence_president) + sentence_orange = preprocess(sentence_orange) + + + + + +.. rst-class:: sphx-glr-script-out + + Out: + + .. code-block:: none + + /home/witiko/.virtualenvs/gensim4/lib/python3.7/site-packages/sklearn/feature_extraction/image.py:167: DeprecationWarning: `np.int` is a deprecated alias for the builtin `int`. To silence this warning, use `int` by itself. Doing this will not modify any behavior and is safe. When replacing `np.int`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. If you wish to review your current use, check the release note link for additional information. + Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations + dtype=np.int): + /home/witiko/.virtualenvs/gensim4/lib/python3.7/site-packages/sklearn/linear_model/least_angle.py:30: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here. 
+ Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations + method='lar', copy_X=True, eps=np.finfo(np.float).eps, + /home/witiko/.virtualenvs/gensim4/lib/python3.7/site-packages/sklearn/linear_model/least_angle.py:167: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here. + Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations + method='lar', copy_X=True, eps=np.finfo(np.float).eps, + /home/witiko/.virtualenvs/gensim4/lib/python3.7/site-packages/sklearn/linear_model/least_angle.py:284: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here. + Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations + eps=np.finfo(np.float).eps, copy_Gram=True, verbose=0, + /home/witiko/.virtualenvs/gensim4/lib/python3.7/site-packages/sklearn/linear_model/least_angle.py:862: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here. + Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations + eps=np.finfo(np.float).eps, copy_X=True, fit_path=True, + /home/witiko/.virtualenvs/gensim4/lib/python3.7/site-packages/sklearn/linear_model/least_angle.py:1101: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here. + Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations + eps=np.finfo(np.float).eps, copy_X=True, fit_path=True, + /home/witiko/.virtualenvs/gensim4/lib/python3.7/site-packages/sklearn/linear_model/least_angle.py:1127: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here. + Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations + eps=np.finfo(np.float).eps, positive=False): + /home/witiko/.virtualenvs/gensim4/lib/python3.7/site-packages/sklearn/linear_model/least_angle.py:1362: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here. 
+ Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations + max_n_alphas=1000, n_jobs=None, eps=np.finfo(np.float).eps, + /home/witiko/.virtualenvs/gensim4/lib/python3.7/site-packages/sklearn/linear_model/least_angle.py:1602: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here. + Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations + max_n_alphas=1000, n_jobs=None, eps=np.finfo(np.float).eps, + /home/witiko/.virtualenvs/gensim4/lib/python3.7/site-packages/sklearn/linear_model/least_angle.py:1738: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here. + Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations + eps=np.finfo(np.float).eps, copy_X=True, positive=False): + [nltk_data] Downloading package stopwords to /home/witiko/nltk_data... + [nltk_data] Package stopwords is already up-to-date! + + + +Next, we will build a dictionary and a TF-IDF model, and we will convert the +sentences to the bag-of-words format. + + + +.. code-block:: default + + from gensim.corpora import Dictionary + documents = [sentence_obama, sentence_president, sentence_orange] + dictionary = Dictionary(documents) + + sentence_obama = dictionary.doc2bow(sentence_obama) + sentence_president = dictionary.doc2bow(sentence_president) + sentence_orange = dictionary.doc2bow(sentence_orange) + + from gensim.models import TfidfModel + documents = [sentence_obama, sentence_president, sentence_orange] + tfidf = TfidfModel(documents) + + sentence_obama = tfidf[sentence_obama] + sentence_president = tfidf[sentence_president] + sentence_orange = tfidf[sentence_orange] + + + + + + + +Now, as mentioned earlier, we will be using some downloaded pre-trained +embeddings. We load these into a Gensim Word2Vec model class and we build +a term similarity matrix using the embeddings. + +.. Important:: + The embeddings we have chosen here require a lot of memory. + + + +.. code-block:: default + + import gensim.downloader as api + model = api.load('word2vec-google-news-300') + + from gensim.similarities import SparseTermSimilarityMatrix, WordEmbeddingSimilarityIndex + termsim_index = WordEmbeddingSimilarityIndex(model) + termsim_matrix = SparseTermSimilarityMatrix(termsim_index, dictionary, tfidf) + + + + + + + +So let's compute SCM using the ``inner_product`` method. + + + +.. code-block:: default + + similarity = termsim_matrix.inner_product(sentence_obama, sentence_president, normalized=(True, True)) + print('similarity = %.4f' % similarity) + + + + + +.. rst-class:: sphx-glr-script-out + + Out: + + .. code-block:: none + + similarity = 0.2575 + + + +Let's try the same thing with two completely unrelated sentences. +Notice that the similarity is smaller. + + + +.. code-block:: default + + similarity = termsim_matrix.inner_product(sentence_obama, sentence_orange, normalized=(True, True)) + print('similarity = %.4f' % similarity) + + + + + +.. rst-class:: sphx-glr-script-out + + Out: + + .. 
code-block:: none + + similarity = 0.0000 + + + +References +---------- + +1. Grigori Sidorov et al. *Soft Similarity and Soft Cosine Measure: Similarity of Features in Vector Space Model*, 2014. +2. Delphine Charlet and Geraldine Damnati. *SimBow at SemEval-2017 Task 3: Soft-Cosine Semantic Similarity between Questions for Community Question Answering*, 2017. +3. Vít Novotný. *Implementation Notes for the Soft Cosine Measure*, 2018. +4. Tomáš Mikolov et al. *Efficient Estimation of Word Representations in Vector Space*, 2013. + + + +.. rst-class:: sphx-glr-timing + + **Total running time of the script:** ( 0 minutes 56.707 seconds) + +**Estimated memory usage:** 7701 MB + + +.. _sphx_glr_download_auto_examples_tutorials_run_scm.py: + + +.. only :: html + + .. container:: sphx-glr-footer + :class: sphx-glr-footer-example + + + + .. container:: sphx-glr-download + + :download:`Download Python source code: run_scm.py ` + + + + .. container:: sphx-glr-download + + :download:`Download Jupyter notebook: run_scm.ipynb ` + + +.. only:: html + + .. rst-class:: sphx-glr-signature + + `Gallery generated by Sphinx-Gallery `_ diff --git a/docs/src/auto_examples/tutorials/run_wmd.ipynb b/docs/src/auto_examples/tutorials/run_wmd.ipynb index bb80fc3bab..99711a9278 100644 --- a/docs/src/auto_examples/tutorials/run_wmd.ipynb +++ b/docs/src/auto_examples/tutorials/run_wmd.ipynb @@ -15,7 +15,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "\nWord Mover's Distance\n=====================\n\nDemonstrates using Gensim's implementation of the WMD.\n\n\n" + "\nWord Mover's Distance\n=====================\n\nDemonstrates using Gensim's implementation of the WMD.\n" ] }, { @@ -130,25 +130,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Normalizing word2vec vectors\n^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n\nWhen using the ``wmdistance`` method, it is beneficial to normalize the\nword2vec vectors first, so they all have equal length. To do this, simply\ncall ``model.init_sims(replace=True)`` and Gensim will take care of that for\nyou.\n\nUsually, one measures the distance between two Word2Vec vectors using the\ncosine distance (see `cosine similarity\n`_), which measures the\nangle between vectors. WMD, on the other hand, uses the Euclidean distance.\nThe Euclidean distance between two vectors might be large because their\nlengths differ, but the cosine distance is small because the angle between\nthem is small; we can mitigate some of this by normalizing the vectors.\n\n.. Important::\n  Note that normalizing the vectors can take some time, especially if you have\n  a large vocabulary and/or large vectors.\n\n\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "model.init_sims(replace=True)  # Normalizes the vectors in the word2vec class.\n\ndistance = model.wmdistance(sentence_obama, sentence_president)  # Compute WMD as normal.\nprint('distance: %r' % distance)\n\ndistance = model.wmdistance(sentence_obama, sentence_orange)\nprint('distance = %.4f' % distance)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "References\n----------\n\n1. Ofir Pele and Michael Werman, *A linear time histogram metric for improved SIFT matching*, 2008.\n2. Ofir Pele and Michael Werman, *Fast and robust earth mover's distances*, 2009.\n3. Matt Kusner et al. *From Embeddings To Document Distances*, 2015.\n4. Thomas Mikolov et al. 
*Efficient Estimation of Word Representations in Vector Space*, 2013.\n\n\n" + "References\n----------\n\n1. Ofir Pele and Michael Werman, *A linear time histogram metric for improved SIFT matching*, 2008.\n2. Ofir Pele and Michael Werman, *Fast and robust earth mover's distances*, 2009.\n3. Matt Kusner et al. *From Embeddings To Document Distances*, 2015.\n4. Tom\u00e1\u0161 Mikolov et al. *Efficient Estimation of Word Representations in Vector Space*, 2013.\n\n\n" ] } ], @@ -168,7 +150,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.5" + "version": "3.7.3" } }, "nbformat": 4, diff --git a/docs/src/auto_examples/tutorials/run_wmd.py b/docs/src/auto_examples/tutorials/run_wmd.py index ac471848b2..06e263063a 100644 --- a/docs/src/auto_examples/tutorials/run_wmd.py +++ b/docs/src/auto_examples/tutorials/run_wmd.py @@ -114,35 +114,6 @@ def preprocess(sentence): distance = model.wmdistance(sentence_obama, sentence_orange) print('distance = %.4f' % distance) -############################################################################### -# Normalizing word2vec vectors -# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -# -# When using the ``wmdistance`` method, it is beneficial to normalize the -# word2vec vectors first, so they all have equal length. To do this, simply -# call ``model.init_sims(replace=True)`` and Gensim will take care of that for -# you. -# -# Usually, one measures the distance between two Word2Vec vectors using the -# cosine distance (see `cosine similarity -# `_), which measures the -# angle between vectors. WMD, on the other hand, uses the Euclidean distance. -# The Euclidean distance between two vectors might be large because their -# lengths differ, but the cosine distance is small because the angle between -# them is small; we can mitigate some of this by normalizing the vectors. -# -# .. Important:: -# Note that normalizing the vectors can take some time, especially if you have -# a large vocabulary and/or large vectors. -# -model.init_sims(replace=True) # Normalizes the vectors in the word2vec class. - -distance = model.wmdistance(sentence_obama, sentence_president) # Compute WMD as normal. -print('distance: %r' % distance) - -distance = model.wmdistance(sentence_obama, sentence_orange) -print('distance = %.4f' % distance) - ############################################################################### # References # ---------- @@ -150,5 +121,5 @@ def preprocess(sentence): # 1. Ofir Pele and Michael Werman, *A linear time histogram metric for improved SIFT matching*, 2008. # 2. Ofir Pele and Michael Werman, *Fast and robust earth mover's distances*, 2009. # 3. Matt Kusner et al. *From Embeddings To Document Distances*, 2015. -# 4. Thomas Mikolov et al. *Efficient Estimation of Word Representations in Vector Space*, 2013. +# 4. Tomáš Mikolov et al. *Efficient Estimation of Word Representations in Vector Space*, 2013. 
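+#
+# For ranking many documents against a query by WMD, Gensim also provides the
+# ``WmdSimilarity`` class. A minimal sketch, assuming ``corpus`` is a list of
+# preprocessed (tokenized) documents — a placeholder not defined in this
+# script — and reusing the ``model`` and ``preprocess`` defined above:
+#
+#     from gensim.similarities import WmdSimilarity
+#     docsim_index = WmdSimilarity(corpus, model, num_best=10)
+#     best_matches = docsim_index[preprocess('The president greets the press in Chicago')]
+#
+# With ``num_best=10``, the query returns the ten most similar documents as
+# (document index, similarity) pairs.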
# diff --git a/docs/src/auto_examples/tutorials/run_wmd.py.md5 b/docs/src/auto_examples/tutorials/run_wmd.py.md5 index da8b056d38..382c5b9954 100644 --- a/docs/src/auto_examples/tutorials/run_wmd.py.md5 +++ b/docs/src/auto_examples/tutorials/run_wmd.py.md5 @@ -1 +1 @@ -3885f6df5a8f72a6d5dd1336c54560a4 \ No newline at end of file +45521a352637a0f53e62f3e19e61fc07 \ No newline at end of file diff --git a/docs/src/auto_examples/tutorials/run_wmd.rst b/docs/src/auto_examples/tutorials/run_wmd.rst index a5117f11ac..cc62e120dd 100644 --- a/docs/src/auto_examples/tutorials/run_wmd.rst +++ b/docs/src/auto_examples/tutorials/run_wmd.rst @@ -1,12 +1,10 @@ -.. only:: html - - .. note:: - :class: sphx-glr-download-link-note +.. note:: + :class: sphx-glr-download-link-note - Click :ref:`here ` to download the full example code - .. rst-class:: sphx-glr-example-title + Click :ref:`here ` to download the full example code +.. rst-class:: sphx-glr-example-title - .. _sphx_glr_auto_examples_tutorials_run_wmd.py: +.. _sphx_glr_auto_examples_tutorials_run_wmd.py: Word Mover's Distance @@ -14,7 +12,6 @@ Word Mover's Distance Demonstrates using Gensim's implementation of the WMD. - Word Mover's Distance (WMD) is a promising new tool in machine learning that allows us to submit a query and return the most relevant documents. This tutorial introduces WMD and shows how you can compute the WMD distance @@ -57,20 +54,9 @@ distribution of document 1 to the distribution of document 2. .. image:: /auto_examples/tutorials/images/sphx_glr_run_wmd_001.png - :alt: run wmd :class: sphx-glr-single-img -.. rst-class:: sphx-glr-script-out - - Out: - - .. code-block:: none - - /Volumes/work/workspace/vew/gensim3.6/lib/python3.6/site-packages/matplotlib/figure.py:445: UserWarning: Matplotlib is currently using agg, which is a non-GUI backend, so cannot show the figure. - % get_backend()) - This method was introduced in the article "From Word Embeddings To Document @@ -114,7 +100,6 @@ Let's take some sentences to compute the distance between. - These sentences have very similar content, and as such the WMD should be low. Before we compute the WMD, we want to remove stopwords ("the", "to", etc.), as these do not contribute a lot to the information in the sentences. @@ -146,13 +131,41 @@ as these do not contribute a lot to the information in the sentences. .. code-block:: none - [nltk_data] Downloading package stopwords to - [nltk_data] /Users/kofola3/nltk_data... + /home/witiko/.virtualenvs/gensim4/lib/python3.7/site-packages/sklearn/feature_extraction/image.py:167: DeprecationWarning: `np.int` is a deprecated alias for the builtin `int`. To silence this warning, use `int` by itself. Doing this will not modify any behavior and is safe. When replacing `np.int`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. If you wish to review your current use, check the release note link for additional information. + Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations + dtype=np.int): + /home/witiko/.virtualenvs/gensim4/lib/python3.7/site-packages/sklearn/linear_model/least_angle.py:30: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here. 
+    [nltk_data] Downloading package stopwords to /home/witiko/nltk_data...
     [nltk_data]   Package stopwords is already up-to-date!

-
 Now, as mentioned earlier, we will be using some downloaded pre-trained
 embeddings. We load these into a Gensim Word2Vec model class.
@@ -170,16 +183,6 @@ embeddings. We load these into a Gensim Word2Vec model class.

-.. rst-class:: sphx-glr-script-out
-
- Out:
-
- .. code-block:: none
-
-    2020-09-30 19:33:05,053 : INFO : loading projection weights from /Users/kofola3/gensim-data/word2vec-google-news-300/word2vec-google-news-300.gz
-    2020-09-30 19:34:10,211 : INFO : loaded (3000000, 300) matrix from /Users/kofola3/gensim-data/word2vec-google-news-300/word2vec-google-news-300.gz
-
-
 So let's compute WMD using the ``wmdistance`` method.
@@ -201,13 +204,10 @@ So let's compute WMD using the ``wmdistance`` method.

 .. code-block:: none

-    2020-09-30 19:34:11,705 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
-    2020-09-30 19:34:11,707 : INFO : built Dictionary(8 unique tokens: ['illinois', 'media', 'obama', 'speaks', 'chicago']...) from 2 documents (total 8 corpus positions)
     distance = 1.0175

-
 Let's try the same thing with two completely unrelated sentences. Notice that
 the distance is larger.
@@ -228,81 +228,25 @@ Let's try the same thing with two completely unrelated sentences. Notice that th

 .. code-block:: none

-    2020-09-30 19:34:23,254 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
-    2020-09-30 19:34:23,256 : INFO : built Dictionary(7 unique tokens: ['illinois', 'media', 'obama', 'speaks', 'favorite']...) from 2 documents (total 7 corpus positions)
-    distance = 1.3663
-
-
-
-
-Normalizing word2vec vectors
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-When using the ``wmdistance`` method, it is beneficial to normalize the
-word2vec vectors first, so they all have equal length. To do this, simply
-call ``model.init_sims(replace=True)`` and Gensim will take care of that for
-you.
-
-Usually, one measures the distance between two Word2Vec vectors using the
-cosine distance (see `cosine similarity
-`_), which measures the
-angle between vectors. WMD, on the other hand, uses the Euclidean distance.
-The Euclidean distance between two vectors might be large because their -lengths differ, but the cosine distance is small because the angle between -them is small; we can mitigate some of this by normalizing the vectors. - -.. Important:: - Note that normalizing the vectors can take some time, especially if you have - a large vocabulary and/or large vectors. - - - -.. code-block:: default - - model.init_sims(replace=True) # Normalizes the vectors in the word2vec class. - - distance = model.wmdistance(sentence_obama, sentence_president) # Compute WMD as normal. - print('distance: %r' % distance) - - distance = model.wmdistance(sentence_obama, sentence_orange) - print('distance = %.4f' % distance) - - - - - -.. rst-class:: sphx-glr-script-out - - Out: - - .. code-block:: none - - 2020-09-30 19:34:23,443 : WARNING : destructive init_sims(replace=True) deprecated & no longer required for space-efficiency - 2020-09-30 19:34:27,347 : INFO : adding document #0 to Dictionary(0 unique tokens: []) - 2020-09-30 19:34:27,348 : INFO : built Dictionary(8 unique tokens: ['illinois', 'media', 'obama', 'speaks', 'chicago']...) from 2 documents (total 8 corpus positions) - distance: 1.0174646259300113 - 2020-09-30 19:34:27,353 : INFO : adding document #0 to Dictionary(0 unique tokens: []) - 2020-09-30 19:34:27,353 : INFO : built Dictionary(7 unique tokens: ['illinois', 'media', 'obama', 'speaks', 'favorite']...) from 2 documents (total 7 corpus positions) distance = 1.3663 - References ---------- 1. Ofir Pele and Michael Werman, *A linear time histogram metric for improved SIFT matching*, 2008. 2. Ofir Pele and Michael Werman, *Fast and robust earth mover's distances*, 2009. 3. Matt Kusner et al. *From Embeddings To Document Distances*, 2015. -4. Thomas Mikolov et al. *Efficient Estimation of Word Representations in Vector Space*, 2013. +4. Tomáš Mikolov et al. *Efficient Estimation of Word Representations in Vector Space*, 2013. .. rst-class:: sphx-glr-timing - **Total running time of the script:** ( 1 minutes 25.936 seconds) + **Total running time of the script:** ( 0 minutes 55.983 seconds) -**Estimated memory usage:** 7158 MB +**Estimated memory usage:** 7537 MB .. _sphx_glr_download_auto_examples_tutorials_run_wmd.py: @@ -315,13 +259,13 @@ References - .. container:: sphx-glr-download sphx-glr-download-python + .. container:: sphx-glr-download :download:`Download Python source code: run_wmd.py ` - .. container:: sphx-glr-download sphx-glr-download-jupyter + .. 
container:: sphx-glr-download

      :download:`Download Jupyter notebook: run_wmd.ipynb `
diff --git a/docs/src/auto_examples/tutorials/sg_execution_times.rst b/docs/src/auto_examples/tutorials/sg_execution_times.rst
index 7003c2957e..4757a47562 100644
--- a/docs/src/auto_examples/tutorials/sg_execution_times.rst
+++ b/docs/src/auto_examples/tutorials/sg_execution_times.rst
@@ -5,18 +5,12 @@ Computation times
 =================

-**11:26.674** total execution time for **auto_examples_tutorials** files:
+**00:56.707** total execution time for **auto_examples_tutorials** files:

-+-------------------------------------------------------------------------------------+-----------+-----------+
-| :ref:`sphx_glr_auto_examples_tutorials_run_word2vec.py` (``run_word2vec.py``)       | 11:26.674 | 7177.5 MB |
-+-------------------------------------------------------------------------------------+-----------+-----------+
-| :ref:`sphx_glr_auto_examples_tutorials_run_annoy.py` (``run_annoy.py``)             | 00:00.000 | 0.0 MB    |
-+-------------------------------------------------------------------------------------+-----------+-----------+
-| :ref:`sphx_glr_auto_examples_tutorials_run_doc2vec_lee.py` (``run_doc2vec_lee.py``) | 00:00.000 | 0.0 MB    |
-+-------------------------------------------------------------------------------------+-----------+-----------+
-| :ref:`sphx_glr_auto_examples_tutorials_run_fasttext.py` (``run_fasttext.py``)       | 00:00.000 | 0.0 MB    |
-+-------------------------------------------------------------------------------------+-----------+-----------+
-| :ref:`sphx_glr_auto_examples_tutorials_run_lda.py` (``run_lda.py``)                 | 00:00.000 | 0.0 MB    |
-+-------------------------------------------------------------------------------------+-----------+-----------+
-| :ref:`sphx_glr_auto_examples_tutorials_run_wmd.py` (``run_wmd.py``)                 | 00:00.000 | 0.0 MB    |
-+-------------------------------------------------------------------------------------+-----------+-----------+
+- **00:56.707**: :ref:`sphx_glr_auto_examples_tutorials_run_scm.py` (``run_scm.py``)
+- **00:00.000**: :ref:`sphx_glr_auto_examples_tutorials_run_annoy.py` (``run_annoy.py``)
+- **00:00.000**: :ref:`sphx_glr_auto_examples_tutorials_run_doc2vec_lee.py` (``run_doc2vec_lee.py``)
+- **00:00.000**: :ref:`sphx_glr_auto_examples_tutorials_run_fasttext.py` (``run_fasttext.py``)
+- **00:00.000**: :ref:`sphx_glr_auto_examples_tutorials_run_lda.py` (``run_lda.py``)
+- **00:00.000**: :ref:`sphx_glr_auto_examples_tutorials_run_wmd.py` (``run_wmd.py``)
+- **00:00.000**: :ref:`sphx_glr_auto_examples_tutorials_run_word2vec.py` (``run_word2vec.py``)
diff --git a/docs/src/gallery/tutorials/run_scm.py b/docs/src/gallery/tutorials/run_scm.py
new file mode 100644
index 0000000000..d57c9f6a1e
--- /dev/null
+++ b/docs/src/gallery/tutorials/run_scm.py
@@ -0,0 +1,153 @@
+r"""
+Soft Cosine Measure
+===================
+
+Demonstrates using Gensim's implementation of the SCM.
+
+"""
+
+###############################################################################
+# Soft Cosine Measure (SCM) is a promising new tool in machine learning that
+# allows us to submit a query and return the most relevant documents. This
+# tutorial introduces SCM and shows how you can compute the SCM similarities
+# between two documents using the ``inner_product`` method.
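+#
+# Concretely, if :math:`x` and :math:`y` are the bag-of-words vectors of the
+# two documents and :math:`S` is a matrix of word-to-word similarities, the
+# soft cosine measure is computed as follows (a sketch of the standard
+# formula from [1] and [2]; the notation is ours, not Gensim's):
+#
+# .. math::
+#
+#    \mathrm{soft\_cosine}(x, y) = \frac{x^T S y}{\sqrt{x^T S x}\,\sqrt{y^T S y}}
+#
+# Setting :math:`S = I` recovers the ordinary cosine similarity.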
+#
+# Soft Cosine Measure basics
+# --------------------------
+#
+# Soft Cosine Measure (SCM) is a method that allows us to assess the similarity
+# between two documents in a meaningful way, even when they have no words in
+# common. It uses a measure of similarity between words, which can be derived
+# [2] using word2vec [4] vector embeddings of words. It has been shown to
+# outperform many of the state-of-the-art methods in the semantic text
+# similarity task in the context of community question answering [2].
+#
+#
+# SCM is illustrated below for two very similar sentences. The sentences have
+# no words in common, but by modeling synonymy, SCM is able to accurately
+# measure the similarity between the two sentences. The method also uses the
+# bag-of-words vector representation of the documents (simply put, the words'
+# frequencies in the documents). The intuition behind the method is that we
+# compute standard cosine similarity assuming that the document vectors are
+# expressed in a non-orthogonal basis, where the angle between two basis
+# vectors is derived from the angle between the word2vec embeddings of the
+# corresponding words (cf. the formula above).
+#

+import matplotlib.pyplot as plt
+import matplotlib.image as mpimg
+img = mpimg.imread('scm-hello.png')
+imgplot = plt.imshow(img)
+plt.axis('off')
+plt.show()

+###############################################################################
+# This method was perhaps first introduced in the article "Soft Similarity and
+# Soft Cosine Measure: Similarity of Features in Vector Space Model" by
+# Grigori Sidorov, Alexander Gelbukh, Helena Gómez-Adorno, and David Pinto [1].
+#
+# In this tutorial, we will learn how to use Gensim's SCM functionality, which
+# consists of the ``inner_product`` method for one-off computation, and the
+# ``SoftCosineSimilarity`` class for corpus-based similarity queries.
+#
+# .. Important::
+#   If you use Gensim's SCM functionality, please consider citing [1], [2] and [3].
+#
+# Computing the Soft Cosine Measure
+# ---------------------------------
+# To use SCM, you need some existing word embeddings.
+# You could train your own Word2Vec model, but that is beyond the scope of this tutorial
+# (check out :ref:`sphx_glr_auto_examples_tutorials_run_word2vec.py` if you're interested).
+# For this tutorial, we'll be using an existing Word2Vec model.
+#
+# Let's take some sentences to compute the distance between.
+#

+# Initialize logging.
+import logging
+logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

+sentence_obama = 'Obama speaks to the media in Illinois'
+sentence_president = 'The president greets the press in Chicago'
+sentence_orange = 'Oranges are my favorite fruit'

+###############################################################################
+# The first two sentences have very similar content, and as such the
+# SCM should be high. By contrast, the third sentence is unrelated to the first
+# two and the SCM should be low.
+#
+# Before we compute the SCM, we want to remove stopwords ("the", "to", etc.),
+# as these do not contribute a lot to the information in the sentences.
+#

+# Import and download stopwords from NLTK.
+from nltk.corpus import stopwords
+from nltk import download
+download('stopwords')  # Download stopwords list.
+stop_words = stopwords.words('english')

+def preprocess(sentence):
+    return [w for w in sentence.lower().split() if w not in stop_words]

+sentence_obama = preprocess(sentence_obama)
+sentence_president = preprocess(sentence_president)
+sentence_orange = preprocess(sentence_orange)

+###############################################################################
+# Next, we will build a dictionary and a TF-IDF model, and we will convert the
+# sentences to the bag-of-words format.
+#
+from gensim.corpora import Dictionary
+documents = [sentence_obama, sentence_president, sentence_orange]
+dictionary = Dictionary(documents)

+sentence_obama = dictionary.doc2bow(sentence_obama)
+sentence_president = dictionary.doc2bow(sentence_president)
+sentence_orange = dictionary.doc2bow(sentence_orange)

+from gensim.models import TfidfModel
+documents = [sentence_obama, sentence_president, sentence_orange]
+tfidf = TfidfModel(documents)

+sentence_obama = tfidf[sentence_obama]
+sentence_president = tfidf[sentence_president]
+sentence_orange = tfidf[sentence_orange]

+###############################################################################
+# Now, as mentioned earlier, we will be using some downloaded pre-trained
+# embeddings. We load these into a Gensim Word2Vec model class and we build
+# a term similarity matrix using the embeddings.
+#
+# .. Important::
+#   The embeddings we have chosen here require a lot of memory.
+#
+import gensim.downloader as api
+model = api.load('word2vec-google-news-300')

+from gensim.similarities import SparseTermSimilarityMatrix, WordEmbeddingSimilarityIndex
+termsim_index = WordEmbeddingSimilarityIndex(model)
+termsim_matrix = SparseTermSimilarityMatrix(termsim_index, dictionary, tfidf)

+###############################################################################
+# So let's compute SCM using the ``inner_product`` method.
+#
+similarity = termsim_matrix.inner_product(sentence_obama, sentence_president, normalized=(True, True))
+print('similarity = %.4f' % similarity)

+###############################################################################
+# Let's try the same thing with two completely unrelated sentences.
+# Notice that the similarity is smaller.
+#
+similarity = termsim_matrix.inner_product(sentence_obama, sentence_orange, normalized=(True, True))
+print('similarity = %.4f' % similarity)

+###############################################################################
+#
+# References
+# ----------
+#
+# 1. Grigori Sidorov et al. *Soft Similarity and Soft Cosine Measure: Similarity of Features in Vector Space Model*, 2014.
+# 2. Delphine Charlet and Géraldine Damnati, *SimBow at SemEval-2017 Task 3: Soft-Cosine Semantic Similarity between Questions for Community Question Answering*, 2017.
+# 3. Vít Novotný, *Implementation Notes for the Soft Cosine Measure*, 2018.
+# 4. Tomáš Mikolov et al. *Efficient Estimation of Word Representations in Vector Space*, 2013.
+# diff --git a/docs/src/gallery/tutorials/run_wmd.py b/docs/src/gallery/tutorials/run_wmd.py index ac471848b2..06e263063a 100644 --- a/docs/src/gallery/tutorials/run_wmd.py +++ b/docs/src/gallery/tutorials/run_wmd.py @@ -114,35 +114,6 @@ def preprocess(sentence): distance = model.wmdistance(sentence_obama, sentence_orange) print('distance = %.4f' % distance) -############################################################################### -# Normalizing word2vec vectors -# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -# -# When using the ``wmdistance`` method, it is beneficial to normalize the -# word2vec vectors first, so they all have equal length. To do this, simply -# call ``model.init_sims(replace=True)`` and Gensim will take care of that for -# you. -# -# Usually, one measures the distance between two Word2Vec vectors using the -# cosine distance (see `cosine similarity -# `_), which measures the -# angle between vectors. WMD, on the other hand, uses the Euclidean distance. -# The Euclidean distance between two vectors might be large because their -# lengths differ, but the cosine distance is small because the angle between -# them is small; we can mitigate some of this by normalizing the vectors. -# -# .. Important:: -# Note that normalizing the vectors can take some time, especially if you have -# a large vocabulary and/or large vectors. -# -model.init_sims(replace=True) # Normalizes the vectors in the word2vec class. - -distance = model.wmdistance(sentence_obama, sentence_president) # Compute WMD as normal. -print('distance: %r' % distance) - -distance = model.wmdistance(sentence_obama, sentence_orange) -print('distance = %.4f' % distance) - ############################################################################### # References # ---------- @@ -150,5 +121,5 @@ def preprocess(sentence): # 1. Ofir Pele and Michael Werman, *A linear time histogram metric for improved SIFT matching*, 2008. # 2. Ofir Pele and Michael Werman, *Fast and robust earth mover's distances*, 2009. # 3. Matt Kusner et al. *From Embeddings To Document Distances*, 2015. -# 4. Thomas Mikolov et al. *Efficient Estimation of Word Representations in Vector Space*, 2013. +# 4. Tomáš Mikolov et al. *Efficient Estimation of Word Representations in Vector Space*, 2013. 
#
diff --git a/docs/src/gallery/tutorials/scm-hello.png b/docs/src/gallery/tutorials/scm-hello.png
new file mode 100644
index 0000000000..d87eac9fff
Binary files /dev/null and b/docs/src/gallery/tutorials/scm-hello.png differ
diff --git a/gensim/similarities/docsim.py b/gensim/similarities/docsim.py
index bce59620b7..4dd0528f50 100755
--- a/gensim/similarities/docsim.py
+++ b/gensim/similarities/docsim.py
@@ -887,8 +887,9 @@ class SoftCosineSimilarity(interfaces.SimilarityABC):
        >>> from gensim.test.utils import common_texts
        >>> from gensim.corpora import Dictionary
-       >>> from gensim.models import Word2Vec, WordEmbeddingSimilarityIndex
+       >>> from gensim.models import Word2Vec
        >>> from gensim.similarities import SoftCosineSimilarity, SparseTermSimilarityMatrix
+       >>> from gensim.similarities import WordEmbeddingSimilarityIndex
        >>>
        >>> model = Word2Vec(common_texts, vector_size=20, min_count=1)  # train word-vectors
        >>> termsim_index = WordEmbeddingSimilarityIndex(model.wv)
@@ -900,12 +901,11 @@ class SoftCosineSimilarity(interfaces.SimilarityABC):
        >>> query = 'graph trees computer'.split()  # make a query
        >>> sims = docsim_index[dictionary.doc2bow(query)]  # calculate similarity of query to each doc from bow_corpus

-       Check out `Tutorial Notebook
-       `_
+       Check out `the Gallery `__
        for more examples.
        """

-    def __init__(self, corpus, similarity_matrix, num_best=None, chunksize=256):
+    def __init__(self, corpus, similarity_matrix, num_best=None, chunksize=256, normalized=(True, True)):
        """
        Parameters
@@ -918,14 +918,20 @@ def __init__(self, corpus, similarity_matrix, num_best=None, chunksize=256):
            The number of results to retrieve for a query, if None - return similarities with all elements from corpus.
        chunksize: int, optional
            Size of one corpus chunk.
+       normalized : tuple of {True, False, 'maintain'}, optional
+           The first/second value specifies whether the query/document vectors in the inner product
+           will be L2-normalized (True; corresponds to the soft cosine similarity measure; default),
+           will maintain their L2-norm during the change of basis ('maintain'; corresponds to query
+           expansion with partial membership), or will be kept as-is (False;
+           corresponds to query expansion).

        See Also
        --------
-       :class:`gensim.similarities.SparseTermSimilarityMatrix`
-           A sparse term similarity matrix build using a term similarity index.
-       :class:`gensim.similarities.LevenshteinSimilarityIndex`
+       :class:`~gensim.similarities.termsim.SparseTermSimilarityMatrix`
+           A sparse term similarity matrix built using a term similarity index.
+       :class:`~gensim.similarities.termsim.LevenshteinSimilarityIndex`
            A term similarity index that computes Levenshtein similarities between terms.
-       :class:`gensim.models.WordEmbeddingSimilarityIndex`
+       :class:`~gensim.similarities.termsim.WordEmbeddingSimilarityIndex`
            A term similarity index that computes cosine similarities between word embeddings.

        """
@@ -934,6 +940,7 @@ def __init__(self, corpus, similarity_matrix, num_best=None, chunksize=256):
        self.corpus = corpus
        self.num_best = num_best
        self.chunksize = chunksize
+       self.normalized = normalized

        # Normalization of features is undesirable, since soft cosine similarity requires special
        # normalization using the similarity matrix.
Therefore, we would just be normalizing twice, @@ -970,7 +977,7 @@ def get_similarities(self, query): is_corpus, query = utils.is_corpus(query) if not is_corpus and isinstance(query, numpy.ndarray): query = [self.corpus[i] for i in query] # convert document indexes to actual documents - result = self.similarity_matrix.inner_product(query, self.corpus, normalized=(True, True)) + result = self.similarity_matrix.inner_product(query, self.corpus, normalized=self.normalized) if scipy.sparse.issparse(result): return numpy.asarray(result.todense()) @@ -985,9 +992,8 @@ def __str__(self): class WmdSimilarity(interfaces.SimilarityABC): """Compute negative WMD similarity against a corpus of documents. - See :class:`~gensim.models.keyedvectors.KeyedVectors` for more information. - Also, tutorial `notebook - `_ for more examples. + Check out `the Gallery `__ + for more examples. When using this code, please consider citing the following papers: diff --git a/gensim/similarities/termsim.py b/gensim/similarities/termsim.py index c047587339..8f39e9b36c 100644 --- a/gensim/similarities/termsim.py +++ b/gensim/similarities/termsim.py @@ -35,7 +35,7 @@ class TermSimilarityIndex(SaveLoad): See Also -------- :class:`~gensim.similarities.termsim.SparseTermSimilarityMatrix` - Build a term similarity matrix and compute the Soft Cosine Measure. + A sparse term similarity matrix built using a term similarity index. """ def most_similar(self, term, topn=10): @@ -79,7 +79,7 @@ class UniformTermSimilarityIndex(TermSimilarityIndex): See Also -------- :class:`~gensim.similarities.termsim.SparseTermSimilarityMatrix` - Build a term similarity matrix and compute the Soft Cosine Measure. + A sparse term similarity matrix built using a term similarity index. Notes ----- @@ -120,7 +120,7 @@ class WordEmbeddingSimilarityIndex(TermSimilarityIndex): See Also -------- :class:`~gensim.similarities.termsim.SparseTermSimilarityMatrix` - Build a term similarity matrix and compute the Soft Cosine Measure. + A sparse term similarity matrix built using a term similarity index. """ def __init__(self, keyedvectors, threshold=0.0, exponent=2.0, kwargs=None): @@ -405,8 +405,8 @@ class SparseTermSimilarityMatrix(SaveLoad): -------- >>> from gensim.test.utils import common_texts >>> from gensim.corpora import Dictionary - >>> from gensim.models import Word2Vec, WordEmbeddingSimilarityIndex - >>> from gensim.similarities import SoftCosineSimilarity, SparseTermSimilarityMatrix + >>> from gensim.models import Word2Vec + >>> from gensim.similarities import SoftCosineSimilarity, SparseTermSimilarityMatrix, WordEmbeddingSimilarityIndex >>> from gensim.similarities.index import AnnoyIndexer >>> from scikits.sparse.cholmod import cholesky >>> @@ -423,8 +423,7 @@ class SparseTermSimilarityMatrix(SaveLoad): >>> >>> word_embeddings = cholesky(similarity_matrix.matrix).L() # obtain word embeddings from similarity matrix - Check out `Tutorial Notebook - `_ + Check out `the Gallery `_ for more examples. Parameters @@ -468,6 +467,15 @@ class SparseTermSimilarityMatrix(SaveLoad): ValueError If `dictionary` is empty. + See Also + -------- + :class:`~gensim.similarities.docsim.SoftCosineSimilarity` + A document similarity index using the soft cosine similarity over the term similarity matrix. + :class:`~gensim.similarities.termsim.LevenshteinSimilarityIndex` + A term similarity index that computes Levenshtein similarities between terms. 
+ :class:`~gensim.similarities.termsim.WordEmbeddingSimilarityIndex` + A term similarity index that computes cosine similarities between word embeddings. + """ def __init__(self, source, dictionary=None, tfidf=None, symmetric=True, dominant=False, nonzero_limit=100, dtype=np.float32):
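
The new ``normalized`` parameter of ``SoftCosineSimilarity`` shown in the patch above selects between the soft cosine measure and the two query-expansion variants. Below is a minimal usage sketch (our own, not part of the patch); it reuses the toy corpus from the ``SoftCosineSimilarity`` docstring above, and the variable names are illustrative only:

.. code-block:: python

    from gensim.test.utils import common_texts
    from gensim.corpora import Dictionary
    from gensim.models import Word2Vec
    from gensim.similarities import (
        SoftCosineSimilarity,
        SparseTermSimilarityMatrix,
        WordEmbeddingSimilarityIndex,
    )

    # Train toy word vectors and build a term similarity matrix from them.
    model = Word2Vec(common_texts, vector_size=20, min_count=1)
    termsim_index = WordEmbeddingSimilarityIndex(model.wv)
    dictionary = Dictionary(common_texts)
    bow_corpus = [dictionary.doc2bow(document) for document in common_texts]
    similarity_matrix = SparseTermSimilarityMatrix(termsim_index, dictionary)

    # Default: both query and document vectors are L2-normalized,
    # i.e. the result is the soft cosine similarity measure.
    docsim_index = SoftCosineSimilarity(bow_corpus, similarity_matrix, num_best=10)

    # Query expansion with partial membership: the query maintains its
    # L2-norm during the change of basis; documents are still normalized.
    expansion_index = SoftCosineSimilarity(
        bow_corpus, similarity_matrix, num_best=10, normalized=('maintain', True))

    query = dictionary.doc2bow('graph trees computer'.split())
    print(docsim_index[query])      # soft cosine similarities
    print(expansion_index[query])   # query-expansion scores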