From 8766edcd8e4baf3cfa08cdc22bb25cb9f2e0b55f Mon Sep 17 00:00:00 2001 From: Menshikh Ivan Date: Tue, 21 Nov 2017 02:32:21 +0500 Subject: [PATCH] Add tox and pytest to gensim, integration with Travis and Appveyor. Fix #1613, 1644 (#1721) * remove flake8 config from setup.cfg * create distinct test_env for win * ignore stuff from tox * basic tox config * add global env vars for full test run * force-recreate for envs * show top20 slowest tests * add upload/download wheels/docs * fix E501 [1] * fix E501 [2] * fix E501 [3] * fix E501 [4] * fix E501 [5] * fix E501 [6] * travis + tox * Install tox for travis * simplify travis file * more verbosity with tox * Fix numpy scipy versions * Try to avoid pip install hang * Fix tox * Add build_ext * Fix dtm test * remove install/run sh * Fix imports & indentation * remove flake-diff * Add docs building to Travis * join flake8 and docs to one job * add re-run for failed tests (to avoid FP) + calculate code coverage * fix WR segfault (veeeery buggy implementation) * attempt to make multiOS configuration * fix mistake with cython * Try to fix appveyor wheels problem * Remove commented parts & add cache for travis --- .gitignore | 2 + .travis.yml | 20 +- appveyor.yml | 51 +---- continuous_integration/travis/flake8_diff.sh | 159 -------------- continuous_integration/travis/install.sh | 13 -- continuous_integration/travis/run.sh | 11 - gensim/corpora/indexedcorpus.py | 7 +- gensim/corpora/lowcorpus.py | 3 +- gensim/corpora/sharded_corpus.py | 18 +- gensim/corpora/svmlightcorpus.py | 3 +- gensim/corpora/textcorpus.py | 5 +- gensim/corpora/wikicorpus.py | 11 +- gensim/examples/dmlcz/dmlcorpus.py | 9 +- gensim/examples/dmlcz/gensim_build.py | 2 +- gensim/examples/dmlcz/gensim_xml.py | 9 +- gensim/examples/dmlcz/sources.py | 9 +- gensim/interfaces.py | 8 +- gensim/matutils.py | 44 ++-- gensim/models/atmodel.py | 80 +++++-- gensim/models/callbacks.py | 10 +- gensim/models/coherencemodel.py | 7 +- gensim/models/doc2vec.py | 28 ++- gensim/models/fasttext.py | 51 +++-- gensim/models/hdpmodel.py | 13 +- gensim/models/keyedvectors.py | 13 +- gensim/models/lda_dispatcher.py | 12 +- gensim/models/lda_worker.py | 13 +- gensim/models/ldamodel.py | 38 ++-- gensim/models/ldamulticore.py | 13 +- gensim/models/ldaseqmodel.py | 94 ++++++--- gensim/models/lsi_worker.py | 3 +- gensim/models/lsimodel.py | 33 ++- gensim/models/phrases.py | 11 +- gensim/models/rpmodel.py | 3 +- gensim/models/translation_matrix.py | 21 +- gensim/models/word2vec.py | 131 ++++++++---- gensim/models/wrappers/dtmmodel.py | 6 +- gensim/models/wrappers/fasttext.py | 6 +- gensim/models/wrappers/ldamallet.py | 47 +++-- gensim/models/wrappers/varembed.py | 3 +- gensim/models/wrappers/wordrank.py | 18 +- gensim/scripts/glove2word2vec.py | 4 +- gensim/scripts/make_wiki_online.py | 6 +- gensim/scripts/make_wiki_online_lemma.py | 6 +- gensim/scripts/make_wiki_online_nodebug.py | 6 +- gensim/scripts/make_wikicorpus.py | 6 +- gensim/scripts/word2vec2tensor.py | 10 +- gensim/scripts/word2vec_standalone.py | 44 +++- gensim/similarities/docsim.py | 7 +- gensim/sklearn_api/atmodel.py | 3 +- gensim/sklearn_api/d2vmodel.py | 4 +- gensim/sklearn_api/hdp.py | 7 +- gensim/sklearn_api/ldamodel.py | 16 +- gensim/sklearn_api/ldaseqmodel.py | 8 +- gensim/sklearn_api/lsimodel.py | 10 +- gensim/sklearn_api/phrases.py | 20 +- gensim/sklearn_api/rpmodel.py | 7 +- gensim/sklearn_api/tfidf.py | 6 +- gensim/summarization/summarizer.py | 4 +- gensim/test/test_atmodel.py | 6 +- gensim/test/test_doc2vec.py | 13 +- 
gensim/test/test_dtm.py | 8 +- gensim/test/test_fasttext.py | 13 +- gensim/test/test_keras_integration.py | 8 +- gensim/test/test_ldamallet_wrapper.py | 7 +- gensim/test/test_ldamodel.py | 7 +- gensim/test/test_lsimodel.py | 9 +- gensim/test/test_normmodel.py | 6 +- gensim/test/test_parsing.py | 4 +- gensim/test/test_rpmodel.py | 3 +- gensim/test/test_similarities.py | 3 +- gensim/test/test_sklearn_api.py | 211 +++++++++++++++++-- gensim/test/test_tmdiff.py | 3 +- gensim/test/test_translation_matrix.py | 17 +- gensim/test/test_word2vec.py | 10 +- gensim/test/test_wordrank_wrapper.py | 2 +- gensim/topic_coherence/segmentation.py | 9 +- gensim/utils.py | 16 +- setup.cfg | 3 - setup.py | 23 +- tox.ini | 78 +++++++ 81 files changed, 1070 insertions(+), 601 deletions(-) delete mode 100755 continuous_integration/travis/flake8_diff.sh delete mode 100755 continuous_integration/travis/install.sh delete mode 100755 continuous_integration/travis/run.sh create mode 100644 tox.ini diff --git a/.gitignore b/.gitignore index f0ed3e97ff..6939309d25 100644 --- a/.gitignore +++ b/.gitignore @@ -40,6 +40,8 @@ Thumbs.db # Other # ######### +.tox/ +.cache/ .project .pydevproject .ropeproject diff --git a/.travis.yml b/.travis.yml index 211d9b0d1f..f97bac263f 100644 --- a/.travis.yml +++ b/.travis.yml @@ -5,18 +5,24 @@ cache: directories: - $HOME/.cache/pip - $HOME/.ccache - + - $HOME/.pip-cache dist: trusty language: python matrix: include: - - env: PYTHON_VERSION="2.7" NUMPY_VERSION="1.11.3" SCIPY_VERSION="0.18.1" ONLY_CODESTYLE="yes" - - env: PYTHON_VERSION="2.7" NUMPY_VERSION="1.11.3" SCIPY_VERSION="0.18.1" ONLY_CODESTYLE="no" - - env: PYTHON_VERSION="3.5" NUMPY_VERSION="1.11.3" SCIPY_VERSION="0.18.1" ONLY_CODESTYLE="no" - - env: PYTHON_VERSION="3.6" NUMPY_VERSION="1.11.3" SCIPY_VERSION="0.18.1" ONLY_CODESTYLE="no" + - python: '2.7' + env: TOXENV="flake8, docs" + + - python: '2.7' + env: TOXENV="py27-linux" + + - python: '3.5' + env: TOXENV="py35-linux" + - python: '3.6' + env: TOXENV="py36-linux" -install: source continuous_integration/travis/install.sh -script: bash continuous_integration/travis/run.sh +install: pip install tox +script: tox -vv diff --git a/appveyor.yml b/appveyor.yml index 50de6882d8..04da45cd43 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -13,29 +13,20 @@ environment: secure: qXqY3dFmLOqvxa3Om2gQi/BjotTOK+EP2IPLolBNo0c61yDtNWxbmE4wH3up72Be matrix: - # - PYTHON: "C:\\Python27" - # PYTHON_VERSION: "2.7.12" - # PYTHON_ARCH: "32" - - PYTHON: "C:\\Python27-x64" PYTHON_VERSION: "2.7.12" PYTHON_ARCH: "64" - - # - PYTHON: "C:\\Python35" - # PYTHON_VERSION: "3.5.2" - # PYTHON_ARCH: "32" + TOXENV: "py27-win" - PYTHON: "C:\\Python35-x64" PYTHON_VERSION: "3.5.2" PYTHON_ARCH: "64" - - # - PYTHON: "C:\\Python36" - # PYTHON_VERSION: "3.6.0" - # PYTHON_ARCH: "32" + TOXENV: "py35-win" - PYTHON: "C:\\Python36-x64" PYTHON_VERSION: "3.6.0" PYTHON_ARCH: "64" + TOXENV: "py36-win" init: - "ECHO %PYTHON% %PYTHON_VERSION% %PYTHON_ARCH%" @@ -57,48 +48,16 @@ install: # not already installed. - "powershell ./continuous_integration/appveyor/install.ps1" - "SET PATH=%PYTHON%;%PYTHON%\\Scripts;%PATH%" - - "python -m pip install -U pip" + - "python -m pip install -U pip tox" # Check that we have the expected version and architecture for Python - "python --version" - "python -c \"import struct; print(struct.calcsize('P') * 8)\"" - # Install the build and runtime dependencies of the project. 
- - "%CMD_IN_ENV% pip install --timeout=60 --trusted-host 28daf2247a33ed269873-7b1aad3fab3cc330e1fd9d109892382a.r6.cf2.rackcdn.com -r continuous_integration/appveyor/requirements.txt" - - "%CMD_IN_ENV% python setup.py bdist_wheel bdist_wininst" - - ps: "ls dist" - - # Install the genreated wheel package to test it - - "pip install --pre --no-index --find-links dist/ gensim" - -# Not a .NET project, we build scikit-learn in the install step instead build: false test_script: - # Change to a non-source folder to make sure we run the tests on the - # installed library. - - "mkdir empty_folder" - - "cd empty_folder" - - "pip install pyemd testfixtures sklearn Morfessor==2.0.2a4" - - "pip freeze" - - "python -c \"import nose; nose.main()\" -s -v gensim" - # Move back to the project folder - - "cd .." - -artifacts: - # Archive the generated wheel package in the ci.appveyor.com build report. - - path: dist\* -on_success: - # Upload the generated wheel package to Rackspace - # On Windows, Apache Libcloud cannot find a standard CA cert bundle so we - # disable the ssl checks. - - "python -m wheelhouse_uploader upload --no-ssl-check --local-folder=dist gensim-windows-wheels" - -notifications: - - provider: Webhook - url: https://webhooks.gitter.im/e/62c44ad26933cd7ed7e8 - on_build_success: false - on_build_failure: True + - tox -vv cache: # Use the appveyor cache to avoid re-downloading large archives such diff --git a/continuous_integration/travis/flake8_diff.sh b/continuous_integration/travis/flake8_diff.sh deleted file mode 100755 index c4a034ddbf..0000000000 --- a/continuous_integration/travis/flake8_diff.sh +++ /dev/null @@ -1,159 +0,0 @@ -#!/bin/bash -# This is a modified script from scikit-learn project. - -# This script is used in Travis to check that PRs do not add obvious -# flake8 violations. It relies on two things: -# - find common ancestor between branch and -# gensim remote -# - run flake8 --diff on the diff between the branch and the common -# ancestor -# -# Additional features: -# - the line numbers in Travis match the local branch on the PR -# author machine. -# - ./continuous_integration/travis/flake8_diff.sh can be run locally for quick -# turn-around - -set -e -# pipefail is necessary to propagate exit codes -set -o pipefail - -PROJECT=RaRe-Technologies/gensim -PROJECT_URL=https://github.com/${PROJECT}.git -FLAKE_CONFIG_FILE=setup.cfg - -# Find the remote with the project name (upstream in most cases) -REMOTE=$(git remote -v | grep ${PROJECT} | cut -f1 | head -1 || echo '') - -# Add a temporary remote if needed. For example this is necessary when -# Travis is configured to run in a fork. In this case 'origin' is the -# fork and not the reference repo we want to diff against. -if [[ -z "$REMOTE" ]]; then - TMP_REMOTE=tmp_reference_upstream - REMOTE=${TMP_REMOTE} - git remote add ${REMOTE} ${PROJECT_URL} -fi - -echo "Remotes:" -echo '--------------------------------------------------------------------------------' -git remote --verbose - -# Travis does the git clone with a limited depth (50 at the time of -# writing). 
This may not be enough to find the common ancestor with -# $REMOTE/develop so we unshallow the git checkout -if [[ -a .git/shallow ]]; then - echo -e '\nTrying to unshallow the repo:' - echo '--------------------------------------------------------------------------------' - git fetch --unshallow -fi - -if [[ "$TRAVIS" == "true" ]]; then - if [[ "$TRAVIS_PULL_REQUEST" == "false" ]] - then - # In main repo, using TRAVIS_COMMIT_RANGE to test the commits - # that were pushed into a branch - if [[ "$PROJECT" == "$TRAVIS_REPO_SLUG" ]]; then - if [[ -z "$TRAVIS_COMMIT_RANGE" ]]; then - echo "New branch, no commit range from Travis so passing this test by convention" - exit 0 - fi - COMMIT_RANGE=${TRAVIS_COMMIT_RANGE} - fi - else - # We want to fetch the code as it is in the PR branch and not - # the result of the merge into develop. This way line numbers - # reported by Travis will match with the local code. - LOCAL_BRANCH_REF=travis_pr_${TRAVIS_PULL_REQUEST} - # In Travis the PR target is always origin - git fetch origin pull/${TRAVIS_PULL_REQUEST}/head:refs/${LOCAL_BRANCH_REF} - fi -fi - -# If not using the commit range from Travis we need to find the common -# ancestor between $LOCAL_BRANCH_REF and $REMOTE/develop -if [[ -z "$COMMIT_RANGE" ]]; then - if [[ -z "$LOCAL_BRANCH_REF" ]]; then - LOCAL_BRANCH_REF=$(git rev-parse --abbrev-ref HEAD) - fi - echo -e "\nLast 2 commits in $LOCAL_BRANCH_REF:" - echo '--------------------------------------------------------------------------------' - git log -2 ${LOCAL_BRANCH_REF} - - REMOTE_MASTER_REF="$REMOTE/develop" - # Make sure that $REMOTE_MASTER_REF is a valid reference - echo -e "\nFetching $REMOTE_MASTER_REF" - echo '--------------------------------------------------------------------------------' - git fetch ${REMOTE} develop:refs/remotes/${REMOTE_MASTER_REF} - LOCAL_BRANCH_SHORT_HASH=$(git rev-parse --short ${LOCAL_BRANCH_REF}) - REMOTE_MASTER_SHORT_HASH=$(git rev-parse --short ${REMOTE_MASTER_REF}) - - COMMIT=$(git merge-base ${LOCAL_BRANCH_REF} ${REMOTE_MASTER_REF}) || \ - echo "No common ancestor found for $(git show ${LOCAL_BRANCH_REF} -q) and $(git show ${REMOTE_MASTER_REF} -q)" - - if [ -z "$COMMIT" ]; then - exit 1 - fi - - COMMIT_SHORT_HASH=$(git rev-parse --short ${COMMIT}) - - echo -e "\nCommon ancestor between $LOCAL_BRANCH_REF ($LOCAL_BRANCH_SHORT_HASH)"\ - "and $REMOTE_MASTER_REF ($REMOTE_MASTER_SHORT_HASH) is $COMMIT_SHORT_HASH:" - echo '--------------------------------------------------------------------------------' - git show --no-patch ${COMMIT_SHORT_HASH} - - COMMIT_RANGE="$COMMIT_SHORT_HASH..$LOCAL_BRANCH_SHORT_HASH" - - if [[ -n "$TMP_REMOTE" ]]; then - git remote remove ${TMP_REMOTE} - fi - -else - echo "Got the commit range from Travis: $COMMIT_RANGE" -fi - -echo -e '\nRunning flake8 on the diff in the range' "$COMMIT_RANGE" \ - "($(git rev-list ${COMMIT_RANGE} | wc -l) commit(s)):" -echo '--------------------------------------------------------------------------------' - -# We ignore files from sklearn/externals. 
-# Excluding vec files since they contain non-utf8 content and flake8 raises exception for non-utf8 input -# We need the following command to exit with 0 hence the echo in case -# there is no match -MODIFIED_PY_FILES="$(git diff --name-only ${COMMIT_RANGE} | grep '[a-zA-Z0-9]*.py$' || echo "no_match")" -MODIFIED_IPYNB_FILES="$(git diff --name-only ${COMMIT_RANGE} | grep '[a-zA-Z0-9]*.ipynb$' || echo "no_match")" - - -echo "*.py files: " ${MODIFIED_PY_FILES} -echo "*.ipynb files: " ${MODIFIED_IPYNB_FILES} - - -check_files() { - files="$1" - shift - options="$*" - if [ -n "$files" ]; then - # Conservative approach: diff without context (--unified=0) so that code - # that was not changed does not create failures - git diff --unified=0 ${COMMIT_RANGE} -- ${files} | flake8 --config ${FLAKE_CONFIG_FILE} --diff --show-source ${options} - fi -} - -if [[ "$MODIFIED_PY_FILES" == "no_match" ]]; then - echo "No .py files has been modified" -else - check_files "$(echo "$MODIFIED_PY_FILES" )" -fi -echo -e "No problem detected by flake8\n" - -if [[ "$MODIFIED_IPYNB_FILES" == "no_match" ]]; then - echo "No .ipynb file has been modified" -else - for fname in ${MODIFIED_IPYNB_FILES} - do - echo "File: $fname" - jupyter nbconvert --to script --stdout ${fname} | flake8 --config ${FLAKE_CONFIG_FILE} --show-source --builtins=get_ipython || true - done -fi - -echo "Build documentation" -pip install .[docs] && cd docs/src && make clean html diff --git a/continuous_integration/travis/install.sh b/continuous_integration/travis/install.sh deleted file mode 100755 index c14ac86925..0000000000 --- a/continuous_integration/travis/install.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash - -set -e - -deactivate -wget 'http://repo.continuum.io/miniconda/Miniconda-latest-Linux-x86_64.sh' -O miniconda.sh -chmod +x miniconda.sh && ./miniconda.sh -b -export PATH=/home/travis/miniconda2/bin:$PATH -conda update --yes conda - - -conda create --yes -n gensim-test python=${PYTHON_VERSION} pip atlas flake8 jupyter numpy==${NUMPY_VERSION} scipy==${SCIPY_VERSION} && source activate gensim-test -pip install . && pip install .[test] diff --git a/continuous_integration/travis/run.sh b/continuous_integration/travis/run.sh deleted file mode 100755 index 8203956c25..0000000000 --- a/continuous_integration/travis/run.sh +++ /dev/null @@ -1,11 +0,0 @@ -#!/bin/bash - -set -e - -pip freeze - -if [[ "$ONLY_CODESTYLE" == "yes" ]]; then - continuous_integration/travis/flake8_diff.sh -else - python setup.py test -fi diff --git a/gensim/corpora/indexedcorpus.py b/gensim/corpora/indexedcorpus.py index 0dd0e30d2a..559081b886 100644 --- a/gensim/corpora/indexedcorpus.py +++ b/gensim/corpora/indexedcorpus.py @@ -56,7 +56,8 @@ def __init__(self, fname, index_fname=None): self.length = None @classmethod - def serialize(serializer, fname, corpus, id2word=None, index_fname=None, progress_cnt=None, labels=None, metadata=False): + def serialize(serializer, fname, corpus, id2word=None, index_fname=None, + progress_cnt=None, labels=None, metadata=False): """ Iterate through the document stream `corpus`, saving the documents to `fname` and recording byte offset of each document. Save the resulting index @@ -93,7 +94,9 @@ def serialize(serializer, fname, corpus, id2word=None, index_fname=None, progres offsets = serializer.save_corpus(fname, corpus, id2word, **kwargs) if offsets is None: - raise NotImplementedError("Called serialize on class %s which doesn't support indexing!" 
% serializer.__name__) + raise NotImplementedError( + "Called serialize on class %s which doesn't support indexing!" % serializer.__name__ + ) # store offsets persistently, using pickle # we shouldn't have to worry about self.index being a numpy.ndarray as the serializer will return diff --git a/gensim/corpora/lowcorpus.py b/gensim/corpora/lowcorpus.py index d5265f6571..e293c998a1 100644 --- a/gensim/corpora/lowcorpus.py +++ b/gensim/corpora/lowcorpus.py @@ -77,7 +77,8 @@ def __init__(self, fname, id2word=None, line2words=split_on_space): for doc in self: all_terms.update(word for word, wordCnt in doc) all_terms = sorted(all_terms) # sort the list of all words; rank in that list = word's integer id - self.id2word = dict(izip(xrange(len(all_terms)), all_terms)) # build a mapping of word id(int) -> word (string) + # build a mapping of word id(int) -> word (string) + self.id2word = dict(izip(xrange(len(all_terms)), all_terms)) else: logger.info("using provided word mapping (%i ids)", len(id2word)) self.id2word = id2word diff --git a/gensim/corpora/sharded_corpus.py b/gensim/corpora/sharded_corpus.py index 4d0fde4999..049e22f226 100644 --- a/gensim/corpora/sharded_corpus.py +++ b/gensim/corpora/sharded_corpus.py @@ -456,7 +456,10 @@ def resize_shards(self, shardsize): for old_shard_n, old_shard_name in enumerate(old_shard_names): os.remove(old_shard_name) except Exception as e: - logger.error('Exception occurred during old shard no. %d removal: %s.\nAttempting to at least move new shards in.', old_shard_n, str(e)) + logger.error( + 'Exception occurred during old shard no. %d removal: %s.\nAttempting to at least move new shards in.', + old_shard_n, str(e) + ) finally: # If something happens with cleaning up - try to at least get the # new guys in. @@ -673,7 +676,10 @@ def __add_to_slice(self, s_result, result_start, result_stop, start, stop): Returns the resulting s_result. """ if (result_stop - result_start) != (stop - start): - raise ValueError('Result start/stop range different than stop/start range (%d - %d vs. %d - %d)'.format(result_start, result_stop, start, stop)) + raise ValueError( + 'Result start/stop range different than stop/start range (%d - %d vs. %d - %d)' + .format(result_start, result_stop, start, stop) + ) # Dense data: just copy using numpy's slice notation if not self.sparse_serialization: @@ -685,7 +691,10 @@ def __add_to_slice(self, s_result, result_start, result_stop, start, stop): # result. else: if s_result.shape != (result_start, self.dim): - raise ValueError('Assuption about sparse s_result shape invalid: {0} expected rows, {1} real rows.'.format(result_start, s_result.shape[0])) + raise ValueError( + 'Assuption about sparse s_result shape invalid: {0} expected rows, {1} real rows.' + .format(result_start, s_result.shape[0]) + ) tmp_matrix = self.current_shard[start:stop] s_result = sparse.vstack([s_result, tmp_matrix]) @@ -786,7 +795,8 @@ def save_corpus(fname, corpus, id2word=None, progress_cnt=1000, metadata=False, ShardedCorpus(fname, corpus, **kwargs) @classmethod - def serialize(serializer, fname, corpus, id2word=None, index_fname=None, progress_cnt=None, labels=None, metadata=False, **kwargs): + def serialize(serializer, fname, corpus, id2word=None, index_fname=None, progress_cnt=None, + labels=None, metadata=False, **kwargs): """ Iterate through the document stream `corpus`, saving the documents as a ShardedCorpus to `fname`. 
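A note on the tox environments referenced above: .travis.yml and appveyor.yml now only select environments by name (TOXENV="py27-linux", "py27-win", "flake8, docs", ...), while the tox.ini that defines them (78 new lines per the diffstat) lies outside the hunks shown here. The sketch below is only an illustration of what such a configuration could look like, pieced together from the env names, the commit bullets (top-20 slowest tests, re-run of failed tests, code coverage, build_ext) and the docs command from the deleted flake8_diff.sh; the dependency list, plugin choices (pytest-rerunfailures, pytest-cov) and exact commands are assumptions, not the contents of the real file.

# Illustrative sketch only -- not the tox.ini actually added by this PR.
[tox]
envlist = {py27,py35,py36}-{linux,win}, flake8, docs

[testenv]
# the -linux / -win factors only pin the platform an env is allowed to run on
platform =
    linux: linux
    win: win32
deps =
    numpy
    scipy
    pytest
    pytest-cov
    pytest-rerunfailures
# build the Cython extensions in place, then run the suite: --durations reports the
# slowest tests, --reruns retries failed tests (to avoid false positives), --cov
# collects code coverage (the last two flags assume pytest-rerunfailures / pytest-cov)
commands =
    python setup.py build_ext --inplace
    pytest {posargs:gensim} --durations=20 --reruns=2 --cov=gensim

[testenv:flake8]
deps = flake8
commands = flake8 gensim

# mirrors the docs build from the removed flake8_diff.sh ("pip install .[docs] && cd docs/src && make clean html")
[testenv:docs]
extras = docs
changedir = docs/src
whitelist_externals = make
commands = make clean html

With a file along these lines, a contributor can reproduce a single CI job locally with "pip install tox" followed by e.g. "tox -e py27-linux", which is what the simplified install/script lines in .travis.yml and the "tox -vv" test_script in appveyor.yml now delegate to.
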
diff --git a/gensim/corpora/svmlightcorpus.py b/gensim/corpora/svmlightcorpus.py index 290414836e..c19aa321e2 100644 --- a/gensim/corpora/svmlightcorpus.py +++ b/gensim/corpora/svmlightcorpus.py @@ -119,7 +119,8 @@ def line2doc(self, line): if not parts: raise ValueError('invalid line format in %s' % self.fname) target, fields = parts[0], [part.rsplit(':', 1) for part in parts[1:]] - doc = [(int(p1) - 1, float(p2)) for p1, p2 in fields if p1 != 'qid'] # ignore 'qid' features, convert 1-based feature ids to 0-based + # ignore 'qid' features, convert 1-based feature ids to 0-based + doc = [(int(p1) - 1, float(p2)) for p1, p2 in fields if p1 != 'qid'] return doc, target @staticmethod diff --git a/gensim/corpora/textcorpus.py b/gensim/corpora/textcorpus.py index e479b551c3..7f78f5ca91 100644 --- a/gensim/corpora/textcorpus.py +++ b/gensim/corpora/textcorpus.py @@ -112,8 +112,9 @@ class TextCorpus(interfaces.CorpusABC): 6. remove stopwords; see `gensim.parsing.preprocessing` for the list of stopwords """ - def __init__(self, input=None, dictionary=None, metadata=False, character_filters=None, tokenizer=None, - token_filters=None): + + def __init__(self, input=None, dictionary=None, metadata=False, character_filters=None, + tokenizer=None, token_filters=None): """ Args: input (str): path to top-level directory to traverse for corpus documents. diff --git a/gensim/corpora/wikicorpus.py b/gensim/corpora/wikicorpus.py index de04b36c62..0c1c229bac 100755 --- a/gensim/corpora/wikicorpus.py +++ b/gensim/corpora/wikicorpus.py @@ -105,7 +105,8 @@ def remove_markup(text): text = re.sub(RE_P13, '\n\\3', text) # leave only cell content # remove empty mark-up text = text.replace('[]', '') - if old == text or iters > 2: # stop if nothing changed between two iterations or after a fixed number of iterations + # stop if nothing changed between two iterations or after a fixed number of iterations + if old == text or iters > 2: break # the following is needed to make the tokenizer see '[[socialist]]s' as a single word 'socialists' @@ -243,7 +244,8 @@ def extract_pages(f, filter_namespaces=False): _extract_pages = extract_pages # for backward compatibility -def process_article(args, tokenizer_func=tokenize, token_min_len=TOKEN_MIN_LEN, token_max_len=TOKEN_MAX_LEN, lower=True): +def process_article(args, tokenizer_func=tokenize, token_min_len=TOKEN_MIN_LEN, + token_max_len=TOKEN_MAX_LEN, lower=True): """ Parse a wikipedia article, returning its content as a list of tokens (utf8-encoded strings). @@ -279,7 +281,8 @@ def _process_article(args): class WikiCorpus(TextCorpus): """ - Treat a wikipedia articles dump (wiki--pages-articles.xml.bz2 or wiki-latest-pages-articles.xml.bz2) as a (read-only) corpus. + Treat a wikipedia articles dump (wiki--pages-articles.xml.bz2 + or wiki-latest-pages-articles.xml.bz2) as a (read-only) corpus. The documents are extracted on-the-fly, so that the whole (massive) dump can stay compressed on disk. @@ -289,7 +292,7 @@ class WikiCorpus(TextCorpus): `_. 
>>> wiki = WikiCorpus('enwiki-20100622-pages-articles.xml.bz2') # create word->word_id mapping, takes almost 8h - >>> MmCorpus.serialize('wiki_en_vocab200k.mm', wiki) # another 8h, creates a file in MatrixMarket format plus file with id->word + >>> MmCorpus.serialize('wiki_en_vocab200k.mm', wiki) # another 8h, creates a file in MatrixMarket format and mapping """ def __init__(self, fname, processes=None, lemmatize=utils.has_pattern(), dictionary=None, diff --git a/gensim/examples/dmlcz/dmlcorpus.py b/gensim/examples/dmlcz/dmlcorpus.py index 07fc247f8b..a0d9007fa3 100644 --- a/gensim/examples/dmlcz/dmlcorpus.py +++ b/gensim/examples/dmlcz/dmlcorpus.py @@ -36,7 +36,7 @@ class DmlConfig(object): def __init__(self, configId, resultDir, acceptLangs=None): self.resultDir = resultDir # output files will be stored in this directory - self.configId = configId # configId is a string that is used as filename prefix for all files, so keep it simple + self.configId = configId self.sources = {} # all article sources; see sources.DmlSource class for an example of source if acceptLangs is None: # which languages to accept @@ -48,7 +48,7 @@ def resultFile(self, fname): return os.path.join(self.resultDir, self.configId + '_' + fname) def acceptArticle(self, metadata): - lang = metadata.get('language', 'unk') # if there was no language field in the article metadata, set language to 'unk' = unknown + lang = metadata.get('language', 'unk') if 'any' not in self.acceptLangs and lang not in self.acceptLangs: return False return True @@ -118,7 +118,10 @@ def buildDictionary(self): # convert to bag-of-words, but ignore the result -- here we only care about updating token ids _ = self.dictionary.doc2bow(words, allowUpdate=True) # noqa:F841 - logger.info("built %s from %i documents (total %i corpus positions)", self.dictionary, len(self.documents), numPositions) + logger.info( + "built %s from %i documents (total %i corpus positions)", + self.dictionary, len(self.documents), numPositions + ) def processConfig(self, config, shuffle=False): """ diff --git a/gensim/examples/dmlcz/gensim_build.py b/gensim/examples/dmlcz/gensim_build.py index bb62103109..873c7915ab 100755 --- a/gensim/examples/dmlcz/gensim_build.py +++ b/gensim/examples/dmlcz/gensim_build.py @@ -48,7 +48,7 @@ def buildDmlCorpus(config): dml.buildDictionary() dml.dictionary.filterExtremes(noBelow=5, noAbove=0.3) # ignore too (in)frequent words - dml.save(config.resultFile('.pkl')) # save the mappings as binary data (actual documents are not saved, only their URIs) + dml.save(config.resultFile('.pkl')) dml.saveAsText() # save id mappings and documents as text data (matrix market format) return dml diff --git a/gensim/examples/dmlcz/gensim_xml.py b/gensim/examples/dmlcz/gensim_xml.py index 0b8661ac77..9fbbc1d92f 100755 --- a/gensim/examples/dmlcz/gensim_xml.py +++ b/gensim/examples/dmlcz/gensim_xml.py @@ -60,7 +60,7 @@ def generateSimilar(corpus, index, method): articles = [] # collect similars in this list for docNo2, score in topSims: # for each most similar article - if score > MIN_SCORE and docNo != docNo2: # if similarity is above MIN_SCORE and not identity (=always maximum similarity, boring) + if score > MIN_SCORE and docNo != docNo2: source, (intId, pathId) = corpus.documents[docNo2] meta = corpus.getMeta(docNo2) suffix, author, title = '', meta.get('author', ''), meta.get('title', '') @@ -106,7 +106,8 @@ def generateSimilar(corpus, index, method): corpus = dmlcorpus.DmlCorpus.load(config.resultFile('.pkl')) input = 
MmCorpus(config.resultFile('_%s.mm' % method)) - assert len(input) == len(corpus), "corpus size mismatch (%i vs %i): run ./gensim_genmodel.py again" % (len(input), len(corpus)) + assert len(input) == len(corpus), \ + "corpus size mismatch (%i vs %i): run ./gensim_genmodel.py again" % (len(input), len(corpus)) # initialize structure for similarity queries if method == 'lsi' or method == 'rp': # for these methods, use dense vectors @@ -114,7 +115,7 @@ def generateSimilar(corpus, index, method): else: index = SparseMatrixSimilarity(input, num_best=MAX_SIMILAR + 1) - index.normalize = False # do not normalize query vectors during similarity queries (the index is already built normalized, so it would be a no-op) - generateSimilar(corpus, index, method) # for each document, print MAX_SIMILAR nearest documents to a xml file, in dml-cz specific format + index.normalize = False + generateSimilar(corpus, index, method) logging.info("finished running %s", program) diff --git a/gensim/examples/dmlcz/sources.py b/gensim/examples/dmlcz/sources.py index 8124101acd..c9782f80c4 100644 --- a/gensim/examples/dmlcz/sources.py +++ b/gensim/examples/dmlcz/sources.py @@ -103,7 +103,7 @@ def parseDmlMeta(cls, xmlfile): for line in xml: if line.find('') >= 0: # end of
, we're done break - p = re.search(PAT_TAG, line) # HAX assumes one element = one line; proper xml parsing probably better... but who cares + p = re.search(PAT_TAG, line) if p: name, cont = p.groups() name = name.split()[0] @@ -202,7 +202,8 @@ def isArticle(self, path): logger.info('missing dspace_id in %s', path) return False # and contain either fulltext.txt or fulltext_dspace.txt file - if not (os.path.exists(os.path.join(path, 'fulltext.txt')) or os.path.exists(os.path.join(path, 'fulltext-dspace.txt'))): + if not (os.path.exists(os.path.join(path, 'fulltext.txt')) + or os.path.exists(os.path.join(path, 'fulltext-dspace.txt'))): logger.info('missing fulltext in %s', path) return False # and contain the meta.xml file @@ -264,7 +265,9 @@ def endElement(self, name): def characters(self, text): # for text, we only care about tokens directly within the
<p>
tag if self.path[-1] == 'p': - tokens = [token.encode('utf8') for token in utils.tokenize(text, errors='ignore') if not token.isdigit()] + tokens = [ + token.encode('utf8') for token in utils.tokenize(text, errors='ignore') if not token.isdigit() + ] self.tokens.extend(tokens) # endclass ArxmlivHandler diff --git a/gensim/interfaces.py b/gensim/interfaces.py index 81f85a8527..6cc7e8d872 100644 --- a/gensim/interfaces.py +++ b/gensim/interfaces.py @@ -56,7 +56,10 @@ def __iter__(self): def save(self, *args, **kwargs): import warnings - warnings.warn("corpus.save() stores only the (tiny) iteration object; to serialize the actual corpus content, use e.g. MmCorpus.serialize(corpus)") + warnings.warn( + "corpus.save() stores only the (tiny) iteration object; " + "to serialize the actual corpus content, use e.g. MmCorpus.serialize(corpus)" + ) super(CorpusABC, self).save(*args, **kwargs) def __len__(self): @@ -104,7 +107,8 @@ def save_corpus(fname, corpus, id2word=None, metadata=False): class TransformedCorpus(CorpusABC): def __init__(self, obj, corpus, chunksize=None, **kwargs): self.obj, self.corpus, self.chunksize = obj, corpus, chunksize - for key, value in kwargs.items(): # add the new parameters like per_word_topics to base class object of LdaModel + # add the new parameters like per_word_topics to base class object of LdaModel + for key, value in kwargs.items(): setattr(self.obj, key, value) self.metadata = False diff --git a/gensim/matutils.py b/gensim/matutils.py index 717612cb0f..b7ade1be9f 100644 --- a/gensim/matutils.py +++ b/gensim/matutils.py @@ -188,7 +188,10 @@ def scipy2scipy_clipped(matrix, topn, eps=1e-9): matrix_indices = np.concatenate(matrix_indices).ravel() matrix_data = np.concatenate(matrix_data).ravel() # Instantiate and return a sparse csr_matrix which preserves the order of indices/data. - return scipy.sparse.csr.csr_matrix((matrix_data, matrix_indices, matrix_indptr), shape=(matrix.shape[0], np.max(matrix_indices) + 1)) + return scipy.sparse.csr.csr_matrix( + (matrix_data, matrix_indices, matrix_indptr), + shape=(matrix.shape[0], np.max(matrix_indices) + 1) + ) def scipy2sparse(vec, eps=1e-9): @@ -531,18 +534,21 @@ def jensen_shannon(vec1, vec2, num_features=None): def hellinger(vec1, vec2): """ Hellinger distance is a distance metric to quantify the similarity between two probability distributions. - Distance between distributions will be a number between <0,1>, where 0 is minimum distance (maximum similarity) and 1 is maximum distance (minimum similarity). + Distance between distributions will be a number between <0,1>, where 0 is minimum distance (maximum similarity) + and 1 is maximum distance (minimum similarity). 
""" if scipy.sparse.issparse(vec1): vec1 = vec1.toarray() if scipy.sparse.issparse(vec2): vec2 = vec2.toarray() if isbow(vec1) and isbow(vec2): - # if it is a bag of words format, instead of converting to dense we use dictionaries to calculate appropriate distance + # if it is a BoW format, instead of converting to dense we use dictionaries to calculate appropriate distance vec1, vec2 = dict(vec1), dict(vec2) if len(vec2) < len(vec1): vec1, vec2 = vec2, vec1 # swap references so that we iterate over the shorter vector - sim = np.sqrt(0.5 * sum((np.sqrt(value) - np.sqrt(vec2.get(index, 0.0)))**2 for index, value in iteritems(vec1))) + sim = np.sqrt( + 0.5 * sum((np.sqrt(value) - np.sqrt(vec2.get(index, 0.0)))**2 for index, value in iteritems(vec1)) + ) return sim else: sim = np.sqrt(0.5 * ((np.sqrt(vec1) - np.sqrt(vec2))**2).sum()) @@ -671,7 +677,10 @@ def write_headers(self, num_docs, num_terms, num_nnz): logger.info("saving sparse matrix to %s", self.fname) self.fout.write(utils.to_utf8(' ' * 50 + '\n')) # 48 digits must be enough for everybody else: - logger.info("saving sparse %sx%s matrix with %i non-zero entries to %s", num_docs, num_terms, num_nnz, self.fname) + logger.info( + "saving sparse %sx%s matrix with %i non-zero entries to %s", + num_docs, num_terms, num_nnz, self.fname + ) self.fout.write(utils.to_utf8('%s %s %s\n' % (num_docs, num_terms, num_nnz))) self.last_docno = -1 self.headers_written = True @@ -693,7 +702,8 @@ def write_vector(self, docno, vector): assert self.last_docno < docno, "documents %i and %i not in sequential order!" % (self.last_docno, docno) vector = sorted((i, w) for i, w in vector if abs(w) > 1e-12) # ignore near-zero entries for termid, weight in vector: # write term ids in sorted order - self.fout.write(utils.to_utf8("%i %i %s\n" % (docno + 1, termid + 1, weight))) # +1 because MM format starts counting from 1 + # +1 because MM format starts counting from 1 + self.fout.write(utils.to_utf8("%i %i %s\n" % (docno + 1, termid + 1, weight))) self.last_docno = docno return (vector[-1][0], len(vector)) if vector else (-1, 0) @@ -746,7 +756,10 @@ def write_corpus(fname, corpus, progress_cnt=1000, index=False, num_terms=None, num_terms = num_terms or _num_terms if num_docs * num_terms != 0: - logger.info("saved %ix%i matrix, density=%.3f%% (%i/%i)", num_docs, num_terms, 100.0 * num_nnz / (num_docs * num_terms), num_nnz, num_docs * num_terms) + logger.info( + "saved %ix%i matrix, density=%.3f%% (%i/%i)", + num_docs, num_terms, 100.0 * num_nnz / (num_docs * num_terms), num_nnz, num_docs * num_terms + ) # now write proper headers, by seeking and overwriting the spaces written earlier mw.fake_headers(num_docs, num_terms, num_nnz) @@ -798,8 +811,10 @@ def __init__(self, input, transposed=True): try: header = utils.to_unicode(next(lines)).strip() if not header.lower().startswith('%%matrixmarket matrix coordinate real general'): - raise ValueError("File %s not in Matrix Market format with coordinate real general; instead found: \n%s" % - (self.input, header)) + raise ValueError( + "File %s not in Matrix Market format with coordinate real general; instead found: \n%s" % + (self.input, header) + ) except StopIteration: pass @@ -812,7 +827,10 @@ def __init__(self, input, transposed=True): self.num_docs, self.num_terms = self.num_terms, self.num_docs break - logger.info("accepted corpus with %i documents, %i features, %i non-zero entries", self.num_docs, self.num_terms, self.num_nnz) + logger.info( + "accepted corpus with %i documents, %i features, %i non-zero entries", + 
self.num_docs, self.num_terms, self.num_nnz + ) def __len__(self): return self.num_docs @@ -848,7 +866,8 @@ def __iter__(self): docid, termid, val = utils.to_unicode(line).split() # needed for python3 if not self.transposed: termid, docid = docid, termid - docid, termid, val = int(docid) - 1, int(termid) - 1, float(val) # -1 because matrix market indexes are 1-based => convert to 0-based + # -1 because matrix market indexes are 1-based => convert to 0-based + docid, termid, val = int(docid) - 1, int(termid) - 1, float(val) assert previd <= docid, "matrix columns must come in ascending order" if docid != previd: # change of document: return the document read so far (its id is prevId) @@ -892,7 +911,8 @@ def docbyoffset(self, offset): docid, termid, val = line.split() if not self.transposed: termid, docid = docid, termid - docid, termid, val = int(docid) - 1, int(termid) - 1, float(val) # -1 because matrix market indexes are 1-based => convert to 0-based + # -1 because matrix market indexes are 1-based => convert to 0-based + docid, termid, val = int(docid) - 1, int(termid) - 1, float(val) assert previd <= docid, "matrix columns must come in ascending order" if docid != previd: if previd >= 0: diff --git a/gensim/models/atmodel.py b/gensim/models/atmodel.py index 6abcf6d34a..02b18984ac 100755 --- a/gensim/models/atmodel.py +++ b/gensim/models/atmodel.py @@ -22,9 +22,11 @@ Distributed computation and multiprocessing is not implemented at the moment, but may be coming in the future. -The model was introduced by Rosen-Zvi and co-authors in 2004 (https://mimno.infosci.cornell.edu/info6150/readings/398.pdf). +The model was introduced by Rosen-Zvi and co-authors in 2004 +(https://mimno.infosci.cornell.edu/info6150/readings/398.pdf). -A tutorial can be found at https://github.com/RaRe-Technologies/gensim/tree/develop/docs/notebooks/atmodel_tutorial.ipynb. +A tutorial can be found at +https://github.com/RaRe-Technologies/gensim/tree/develop/docs/notebooks/atmodel_tutorial.ipynb. """ @@ -201,7 +203,8 @@ def __init__(self, corpus=None, num_topics=100, id2word=None, author2doc=None, d >>> model = AuthorTopicModel(corpus, num_topics=100, author2doc=author2doc, id2word=id2word) # train model >>> model.update(corpus2) # update the author-topic model with additional documents - >>> model = AuthorTopicModel(corpus, num_topics=50, author2doc=author2doc, id2word=id2word, alpha='auto', eval_every=5) # train asymmetric alpha from data + >>> model = AuthorTopicModel( + ... corpus, num_topics=50, author2doc=author2doc, id2word=id2word, alpha='auto', eval_every=5) """ # NOTE: this doesn't call constructor of a base class, but duplicates most of this code @@ -209,7 +212,8 @@ def __init__(self, corpus=None, num_topics=100, id2word=None, author2doc=None, d self.dtype = np.float64 # NOTE: as distributed version of this model is not implemented, "distributed" is set to false. Some of the - # infrastructure to implement a distributed author-topic model is already in place, such as the AuthorTopicState. + # infrastructure to implement a distributed author-topic model is already in place, + # such as the AuthorTopicState. 
distributed = False self.dispatcher = None self.numworkers = 1 @@ -256,7 +260,10 @@ def __init__(self, corpus=None, num_topics=100, id2word=None, author2doc=None, d self.serialized = serialized if serialized and not serialization_path: - raise ValueError("If serialized corpora are used, a the path to a folder where the corpus should be saved must be provided (serialized_path).") + raise ValueError( + "If serialized corpora are used, a the path to a folder " + "where the corpus should be saved must be provided (serialized_path)." + ) if serialized and serialization_path: assert not isfile(serialization_path), \ "A file already exists at the serialization_path path; " \ @@ -330,11 +337,14 @@ def extend_corpus(self, corpus): # Re-serialize the entire corpus while appending the new documents. if isinstance(corpus, MmCorpus): # Check that we are not attempting to overwrite the serialized corpus. - assert self.corpus.input != corpus.input, 'Input corpus cannot have the same file path as the model corpus (serialization_path).' + assert self.corpus.input != corpus.input, \ + 'Input corpus cannot have the same file path as the model corpus (serialization_path).' corpus_chain = chain(self.corpus, corpus) # A generator with the old and new documents. - copyfile(self.serialization_path, self.serialization_path + '.tmp') # Make a temporary copy of the file where the corpus is serialized. + # Make a temporary copy of the file where the corpus is serialized. + copyfile(self.serialization_path, self.serialization_path + '.tmp') self.corpus.input = self.serialization_path + '.tmp' # Point the old corpus at this temporary file. - MmCorpus.serialize(self.serialization_path, corpus_chain) # Re-serialize the old corpus, and extend it with the new corpus. + # Re-serialize the old corpus, and extend it with the new corpus. + MmCorpus.serialize(self.serialization_path, corpus_chain) self.corpus = MmCorpus(self.serialization_path) # Store the new serialized corpus object in self.corpus. remove(self.serialization_path + '.tmp') # Remove the temporary file again. else: @@ -424,7 +434,8 @@ def inference(self, chunk, author2doc, doc2author, rhot, collect_sstats=False, c # Update gamma. # phi is computed implicitly below, for ai, a in enumerate(authors_d): - tilde_gamma[ai, :] = self.alpha + len(self.author2doc[self.id2author[a]]) * expElogthetad[ai, :] * np.dot(cts / phinorm, expElogbetad.T) + tilde_gamma[ai, :] = self.alpha + len(self.author2doc[self.id2author[a]])\ + * expElogthetad[ai, :] * np.dot(cts / phinorm, expElogbetad.T) # Update gamma. 
# Interpolation between document d's "local" gamma (tilde_gamma), @@ -504,15 +515,17 @@ def log_perplexity(self, chunk, chunk_doc_idx=None, total_docs=None): total_docs = len(chunk) corpus_words = sum(cnt for document in chunk for _, cnt in document) subsample_ratio = 1.0 * total_docs / len(chunk) - perwordbound = self.bound(chunk, chunk_doc_idx, subsample_ratio=subsample_ratio) / (subsample_ratio * corpus_words) + perwordbound = self.bound(chunk, chunk_doc_idx, subsample_ratio=subsample_ratio) / \ + (subsample_ratio * corpus_words) logger.info( "%.3f per-word bound, %.1f perplexity estimate based on a corpus of %i documents with %i words", perwordbound, np.exp2(-perwordbound), len(chunk), corpus_words ) return perwordbound - def update(self, corpus=None, author2doc=None, doc2author=None, chunksize=None, decay=None, offset=None, passes=None, - update_every=None, eval_every=None, iterations=None, gamma_threshold=None, chunks_as_numpy=False): + def update(self, corpus=None, author2doc=None, doc2author=None, chunksize=None, decay=None, offset=None, + passes=None, update_every=None, eval_every=None, iterations=None, + gamma_threshold=None, chunks_as_numpy=False): """ Train the model with new documents, by EM-iterating over `corpus` until the topics converge (or until the maximum number of allowed iterations @@ -598,7 +611,9 @@ def update(self, corpus=None, author2doc=None, doc2author=None, chunksize=None, num_input_authors = len(self.author2doc) else: if doc2author is None and author2doc is None: - raise ValueError('at least one of author2doc/doc2author must be specified, to establish input space dimensionality') + raise ValueError( + 'at least one of author2doc/doc2author must be specified, to establish input space dimensionality' + ) # If either doc2author or author2doc is missing, construct them from the other. 
if doc2author is None: @@ -689,14 +704,19 @@ def update(self, corpus=None, author2doc=None, doc2author=None, chunksize=None, updates_per_pass = max(1, lencorpus / updateafter) logger.info( - "running %s author-topic training, %s topics, %s authors, %i passes over the supplied corpus of %i documents, updating model once " - "every %i documents, evaluating perplexity every %i documents, iterating %ix with a convergence threshold of %f", + "running %s author-topic training, %s topics, %s authors, " + "%i passes over the supplied corpus of %i documents, updating model once " + "every %i documents, evaluating perplexity every %i documents, " + "iterating %ix with a convergence threshold of %f", updatetype, self.num_topics, num_input_authors, passes, lencorpus, updateafter, evalafter, iterations, gamma_threshold ) if updates_per_pass * passes < 10: - logger.warning("too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy") + logger.warning( + "too few updates, training might not converge; " + "consider increasing the number of passes or iterations to improve accuracy" + ) # rho is the "speed" of updating; TODO try other fncs # pass_ + num_updates handles increasing the starting t for each pass, @@ -714,7 +734,8 @@ def rho(): dirty = False reallen = 0 - for chunk_no, chunk_doc_idx in enumerate(utils.grouper(train_corpus_idx, chunksize, as_numpy=chunks_as_numpy)): + for chunk_no, chunk_doc_idx in enumerate( + utils.grouper(train_corpus_idx, chunksize, as_numpy=chunks_as_numpy)): chunk = [self.corpus[d] for d in chunk_doc_idx] reallen += len(chunk) # keep track of how many documents we've processed so far @@ -814,7 +835,10 @@ def bound(self, chunk, chunk_doc_idx=None, subsample_ratio=1.0, author2doc=None, if not chunk_doc_idx: # If author2doc and doc2author are not provided, chunk is assumed to be a subset of # self.corpus, and chunk_doc_idx is thus required. - raise ValueError('Either author dictionaries or chunk_doc_idx must be provided. Consult documentation of bound method.') + raise ValueError( + 'Either author dictionaries or chunk_doc_idx must be provided. ' + 'Consult documentation of bound method.' + ) elif author2doc is not None and doc2author is not None: # Training on held-out documents (documents not seen during training). # All authors in dictionaries must still be seen during training. @@ -823,9 +847,15 @@ def bound(self, chunk, chunk_doc_idx=None, subsample_ratio=1.0, author2doc=None, raise ValueError('bound cannot be called with authors not seen during training.') if chunk_doc_idx: - raise ValueError('Either author dictionaries or chunk_doc_idx must be provided, not both. Consult documentation of bound method.') + raise ValueError( + 'Either author dictionaries or chunk_doc_idx must be provided, not both. ' + 'Consult documentation of bound method.' + ) else: - raise ValueError('Either both author2doc and doc2author should be provided, or neither. Consult documentation of bound method.') + raise ValueError( + 'Either both author2doc and doc2author should be provided, or neither. ' + 'Consult documentation of bound method.' + ) Elogtheta = dirichlet_expectation(gamma) expElogtheta = np.exp(Elogtheta) @@ -884,7 +914,10 @@ def get_document_topics(self, word_id, minimum_probability=None): """ - raise NotImplementedError('Method "get_document_topics" is not valid for the author-topic model. 
Use the "get_author_topics" method.') + raise NotImplementedError( + 'Method "get_document_topics" is not valid for the author-topic model. ' + 'Use the "get_author_topics" method.' + ) def get_author_topics(self, author_name, minimum_probability=None): """ @@ -905,7 +938,10 @@ def get_author_topics(self, author_name, minimum_probability=None): topic_dist = self.state.gamma[author_id, :] / sum(self.state.gamma[author_id, :]) - author_topics = [(topicid, topicvalue) for topicid, topicvalue in enumerate(topic_dist) if topicvalue >= minimum_probability] + author_topics = [ + (topicid, topicvalue) for topicid, topicvalue in enumerate(topic_dist) + if topicvalue >= minimum_probability + ] return author_topics diff --git a/gensim/models/callbacks.py b/gensim/models/callbacks.py index ed1ed7aea5..824b9b0e1d 100644 --- a/gensim/models/callbacks.py +++ b/gensim/models/callbacks.py @@ -53,8 +53,8 @@ def __init__(self, corpus=None, texts=None, dictionary=None, coherence=None, texts : Tokenized texts. Needed for coherence models that use sliding window based probability estimator, dictionary : Gensim dictionary mapping of id word to create corpus. If model.id2word is present, this is not needed. If both are provided, dictionary will be used. - window_size : Is the size of the window to be used for coherence measures using boolean sliding window as their - probability estimator. For 'u_mass' this doesn't matter. + window_size : Is the size of the window to be used for coherence measures using boolean + sliding window as their probability estimator. For 'u_mass' this doesn't matter. If left 'None' the default window sizes are used which are: 'c_v' : 110 @@ -148,7 +148,8 @@ def __init__(self, distance="jaccard", num_words=100, n_ann_terms=10, diagonal=T `hellinger` `jaccard` num_words : is quantity of most relevant words that used if distance == `jaccard` (also used for annotation) - n_ann_terms : max quantity of words in intersection/symmetric difference between topics (used for annotation) + n_ann_terms : max quantity of words in intersection/symmetric difference + between topics (used for annotation) diagonal : difference between identical topic no.s annotation : intersection or difference of words between topics normed (bool) : If `true`, matrix/array Z will be normalized @@ -195,7 +196,8 @@ def __init__(self, distance="jaccard", num_words=100, n_ann_terms=10, diagonal=T `hellinger` `jaccard` num_words : is quantity of most relevant words that used if distance == `jaccard` (also used for annotation) - n_ann_terms : max quantity of words in intersection/symmetric difference between topics (used for annotation) + n_ann_terms : max quantity of words in intersection/symmetric difference + between topics (used for annotation) diagonal : difference between identical topic no.s annotation : intersection or difference of words between topics normed (bool) : If `true`, matrix/array Z will be normalized diff --git a/gensim/models/coherencemodel.py b/gensim/models/coherencemodel.py index 10cb1cce7a..75cfb9bd97 100644 --- a/gensim/models/coherencemodel.py +++ b/gensim/models/coherencemodel.py @@ -110,7 +110,8 @@ class CoherenceModel(interfaces.TransformationABC): topics = [['human', 'computer', 'system', 'interface'], ['graph', 'minors', 'trees', 'eps']] - cm = CoherenceModel(topics=topics, corpus=corpus, dictionary=dictionary, coherence='u_mass') # note that a dictionary has to be provided. + # note that a dictionary has to be provided. 
+ cm = CoherenceModel(topics=topics, corpus=corpus, dictionary=dictionary, coherence='u_mass') cm.get_coherence() Model persistency is achieved via its load/save methods. @@ -142,8 +143,8 @@ def __init__(self, model=None, topics=None, texts=None, corpus=None, dictionary= corpus : Gensim document corpus. dictionary : Gensim dictionary mapping of id word to create corpus. If model.id2word is present, this is not needed. If both are provided, dictionary will be used. - window_size : Is the size of the window to be used for coherence measures using boolean sliding window as their - probability estimator. For 'u_mass' this doesn't matter. + window_size : Is the size of the window to be used for coherence measures using boolean sliding window + as their probability estimator. For 'u_mass' this doesn't matter. If left 'None' the default window sizes are used which are: 'c_v' : 110 diff --git a/gensim/models/doc2vec.py b/gensim/models/doc2vec.py index c3e9b9e294..ca6bd5cba5 100644 --- a/gensim/models/doc2vec.py +++ b/gensim/models/doc2vec.py @@ -29,13 +29,16 @@ -.. [1] Quoc Le and Tomas Mikolov. Distributed Representations of Sentences and Documents. http://arxiv.org/pdf/1405.4053v2.pdf -.. [2] Tomas Mikolov, Kai Chen, Greg Corrado, and Jeffrey Dean. Efficient Estimation of Word Representations in Vector Space. In Proceedings of Workshop at ICLR, 2013. -.. [3] Tomas Mikolov, Ilya Sutskever, Kai Chen, Greg Corrado, and Jeffrey Dean. Distributed Representations of Words and Phrases and their Compositionality. - In Proceedings of NIPS, 2013. +.. [1] Quoc Le and Tomas Mikolov. Distributed Representations of Sentences and Documents. + http://arxiv.org/pdf/1405.4053v2.pdf +.. [2] Tomas Mikolov, Kai Chen, Greg Corrado, and Jeffrey Dean. + Efficient Estimation of Word Representations in Vector Space. In Proceedings of Workshop at ICLR, 2013. +.. [3] Tomas Mikolov, Ilya Sutskever, Kai Chen, Greg Corrado, and Jeffrey Dean. + Distributed Representations of Words and Phrases and their Compositionality. In Proceedings of NIPS, 2013. .. [blog] Optimizing word2vec in gensim, http://radimrehurek.com/2013/09/word2vec-in-python-part-two-optimizing/ -.. [#tutorial] Doc2vec in gensim tutorial, https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/doc2vec-lee.ipynb +.. [#tutorial] Doc2vec in gensim tutorial, + https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/doc2vec-lee.ipynb @@ -383,8 +386,12 @@ def estimated_lookup_memory(self): def reset_weights(self, model): length = max(len(self.doctags), self.count) if self.mapfile_path: - self.doctag_syn0 = np_memmap(self.mapfile_path + '.doctag_syn0', dtype=REAL, mode='w+', shape=(length, model.vector_size)) - self.doctag_syn0_lockf = np_memmap(self.mapfile_path + '.doctag_syn0_lockf', dtype=REAL, mode='w+', shape=(length,)) + self.doctag_syn0 = np_memmap( + self.mapfile_path + '.doctag_syn0', dtype=REAL, mode='w+', shape=(length, model.vector_size) + ) + self.doctag_syn0_lockf = np_memmap( + self.mapfile_path + '.doctag_syn0_lockf', dtype=REAL, mode='w+', shape=(length,) + ) self.doctag_syn0_lockf.fill(1.0) else: self.doctag_syn0 = empty((length, model.vector_size), dtype=REAL) @@ -626,7 +633,9 @@ def __init__(self, documents=None, dm_mean=None, dm=1, dbow_words=0, dm_concat=0 """ if 'sentences' in kwargs: - raise DeprecationWarning("'sentences' in doc2vec was renamed to 'documents'. Please use documents parameter.") + raise DeprecationWarning( + "'sentences' in doc2vec was renamed to 'documents'. Please use documents parameter." 
+ ) super(Doc2Vec, self).__init__( sg=(1 + dm) % 2, @@ -688,7 +697,8 @@ def scan_vocab(self, documents, progress_per=10000, trim_rule=None, update=False if not checked_string_types: if isinstance(document.words, string_types): logger.warning( - "Each 'words' should be a list of words (usually unicode strings). First 'words' here is instead plain %s.", + "Each 'words' should be a list of words (usually unicode strings). " + "First 'words' here is instead plain %s.", type(document.words) ) checked_string_types += 1 diff --git a/gensim/models/fasttext.py b/gensim/models/fasttext.py index 72e28344cc..6174754314 100644 --- a/gensim/models/fasttext.py +++ b/gensim/models/fasttext.py @@ -45,7 +45,8 @@ def train_batch_cbow(model, sentences, alpha, work=None, neu1=None): if (subwords_indices[0] or subwords_indices[1]) and model.cbow_mean: l1 /= (len(subwords_indices[0]) + len(subwords_indices[1])) - train_cbow_pair(model, word, subwords_indices, l1, alpha, is_ft=True) # train on the sliding window for target word + # train on the sliding window for target word + train_cbow_pair(model, word, subwords_indices, l1, alpha, is_ft=True) result += len(word_vocabs) return result @@ -75,11 +76,10 @@ def train_batch_sg(model, sentences, alpha, work=None): class FastText(Word2Vec): - def __init__( - self, sentences=None, sg=0, hs=0, size=100, alpha=0.025, window=5, min_count=5, - max_vocab_size=None, word_ngrams=1, loss='ns', sample=1e-3, seed=1, workers=3, min_alpha=0.0001, - negative=5, cbow_mean=1, hashfxn=hash, iter=5, null_word=0, min_n=3, max_n=6, sorted_vocab=1, bucket=2000000, - trim_rule=None, batch_words=MAX_WORDS_IN_BATCH): + def __init__(self, sentences=None, sg=0, hs=0, size=100, alpha=0.025, window=5, min_count=5, + max_vocab_size=None, word_ngrams=1, loss='ns', sample=1e-3, seed=1, workers=3, min_alpha=0.0001, + negative=5, cbow_mean=1, hashfxn=hash, iter=5, null_word=0, min_n=3, max_n=6, + sorted_vocab=1, bucket=2000000, trim_rule=None, batch_words=MAX_WORDS_IN_BATCH): # fastText specific params self.bucket = bucket @@ -89,10 +89,12 @@ def __init__( if self.word_ngrams <= 1 and self.max_n == 0: self.bucket = 0 - super(FastText, self).__init__(sentences=sentences, size=size, alpha=alpha, window=window, min_count=min_count, + super(FastText, self).__init__( + sentences=sentences, size=size, alpha=alpha, window=window, min_count=min_count, max_vocab_size=max_vocab_size, sample=sample, seed=seed, workers=workers, min_alpha=min_alpha, sg=sg, hs=hs, negative=negative, cbow_mean=cbow_mean, hashfxn=hashfxn, iter=iter, null_word=null_word, - trim_rule=trim_rule, sorted_vocab=sorted_vocab, batch_words=batch_words) + trim_rule=trim_rule, sorted_vocab=sorted_vocab, batch_words=batch_words + ) def initialize_word_vectors(self): self.wv = FastTextKeyedVectors() @@ -102,13 +104,16 @@ def initialize_word_vectors(self): def build_vocab(self, sentences, keep_raw_vocab=False, trim_rule=None, progress_per=10000, update=False): if update: if not len(self.wv.vocab): - raise RuntimeError("You cannot do an online vocabulary-update of a model which has no prior vocabulary. " - "First build the vocabulary of your model with a corpus " - "before doing an online update.") + raise RuntimeError( + "You cannot do an online vocabulary-update of a model which has no prior vocabulary. " + "First build the vocabulary of your model with a corpus before doing an online update." 
+ ) self.old_vocab_len = len(self.wv.vocab) self.old_hash2index_len = len(self.wv.hash2index) - super(FastText, self).build_vocab(sentences, keep_raw_vocab=keep_raw_vocab, trim_rule=trim_rule, progress_per=progress_per, update=update) + super(FastText, self).build_vocab( + sentences, keep_raw_vocab=keep_raw_vocab, trim_rule=trim_rule, progress_per=progress_per, update=update + ) self.init_ngrams(update=update) def init_ngrams(self, update=False): @@ -165,10 +170,18 @@ def init_ngrams(self, update=False): rand_obj = np.random rand_obj.seed(self.seed) - new_vocab_rows = rand_obj.uniform(-1.0 / self.vector_size, 1.0 / self.vector_size, (len(self.wv.vocab) - self.old_vocab_len, self.vector_size)) + new_vocab_rows = rand_obj.uniform( + -1.0 / self.vector_size, 1.0 / self.vector_size, + (len(self.wv.vocab) - self.old_vocab_len, self.vector_size) + ) new_vocab_lockf_rows = ones((len(self.wv.vocab) - self.old_vocab_len, self.vector_size), dtype=REAL) - new_ngram_rows = rand_obj.uniform(-1.0 / self.vector_size, 1.0 / self.vector_size, (len(self.wv.hash2index) - self.old_hash2index_len, self.vector_size)) - new_ngram_lockf_rows = ones((len(self.wv.hash2index) - self.old_hash2index_len, self.vector_size), dtype=REAL) + new_ngram_rows = rand_obj.uniform( + -1.0 / self.vector_size, 1.0 / self.vector_size, + (len(self.wv.hash2index) - self.old_hash2index_len, self.vector_size) + ) + new_ngram_lockf_rows = ones( + (len(self.wv.hash2index) - self.old_hash2index_len, self.vector_size), dtype=REAL + ) self.wv.syn0_vocab = vstack([self.wv.syn0_vocab, new_vocab_rows]) self.syn0_vocab_lockf = vstack([self.syn0_vocab_lockf, new_vocab_lockf_rows]) @@ -179,9 +192,13 @@ def reset_ngram_weights(self): rand_obj = np.random rand_obj.seed(self.seed) for index in range(len(self.wv.vocab)): - self.wv.syn0_vocab[index] = rand_obj.uniform(-1.0 / self.vector_size, 1.0 / self.vector_size, self.vector_size) + self.wv.syn0_vocab[index] = rand_obj.uniform( + -1.0 / self.vector_size, 1.0 / self.vector_size, self.vector_size + ) for index in range(len(self.wv.hash2index)): - self.wv.syn0_ngrams[index] = rand_obj.uniform(-1.0 / self.vector_size, 1.0 / self.vector_size, self.vector_size) + self.wv.syn0_ngrams[index] = rand_obj.uniform( + -1.0 / self.vector_size, 1.0 / self.vector_size, self.vector_size + ) def _do_train_job(self, sentences, alpha, inits): work, neu1 = inits diff --git a/gensim/models/hdpmodel.py b/gensim/models/hdpmodel.py index aeb771e637..884d6d01ed 100755 --- a/gensim/models/hdpmodel.py +++ b/gensim/models/hdpmodel.py @@ -391,7 +391,8 @@ def update_lambda(self, sstats, word_list, opt_o): self.m_rhot = rhot # Update appropriate columns of lambda based on documents. 
- self.m_lambda[:, word_list] = self.m_lambda[:, word_list] * (1 - rhot) + rhot * self.m_D * sstats.m_var_beta_ss / sstats.m_chunksize + self.m_lambda[:, word_list] = \ + self.m_lambda[:, word_list] * (1 - rhot) + rhot * self.m_D * sstats.m_var_beta_ss / sstats.m_chunksize self.m_lambda_sum = (1 - rhot) * self.m_lambda_sum + \ rhot * self.m_D * np.sum(sstats.m_var_beta_ss, axis=1) / sstats.m_chunksize @@ -399,7 +400,8 @@ def update_lambda(self, sstats, word_list, opt_o): self.m_timestamp[word_list] = self.m_updatect self.m_r.append(self.m_r[-1] + np.log(1 - rhot)) - self.m_varphi_ss = (1.0 - rhot) * self.m_varphi_ss + rhot * sstats.m_var_sticks_ss * self.m_D / sstats.m_chunksize + self.m_varphi_ss = \ + (1.0 - rhot) * self.m_varphi_ss + rhot * sstats.m_var_sticks_ss * self.m_D / sstats.m_chunksize if opt_o: self.optimal_ordering() @@ -429,7 +431,8 @@ def update_expectations(self): """ for w in xrange(self.m_W): self.m_lambda[:, w] *= np.exp(self.m_r[-1] - self.m_r[self.m_timestamp[w]]) - self.m_Elogbeta = psi(self.m_eta + self.m_lambda) - psi(self.m_W * self.m_eta + self.m_lambda_sum[:, np.newaxis]) + self.m_Elogbeta = \ + psi(self.m_eta + self.m_lambda) - psi(self.m_W * self.m_eta + self.m_lambda_sum[:, np.newaxis]) self.m_timestamp[:] = self.m_updatect self.m_status_up_to_date = True @@ -538,7 +541,9 @@ def suggested_lda_model(self): The num_topics is m_T (default is 150) so as to preserve the matrice shapes when we assign alpha and beta. """ alpha, beta = self.hdp_to_lda() - ldam = ldamodel.LdaModel(num_topics=self.m_T, alpha=alpha, id2word=self.id2word, random_state=self.random_state, dtype=np.float64) + ldam = ldamodel.LdaModel( + num_topics=self.m_T, alpha=alpha, id2word=self.id2word, random_state=self.random_state, dtype=np.float64 + ) ldam.expElogbeta[:] = beta return ldam diff --git a/gensim/models/keyedvectors.py b/gensim/models/keyedvectors.py index 183f45ada3..6b9ebc3145 100644 --- a/gensim/models/keyedvectors.py +++ b/gensim/models/keyedvectors.py @@ -5,7 +5,8 @@ # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html """ -Word vector storage and similarity look-ups. Common code independent of the way the vectors are trained(Word2Vec, FastText, WordRank, VarEmbed etc) +Word vector storage and similarity look-ups. +Common code independent of the way the vectors are trained(Word2Vec, FastText, WordRank, VarEmbed etc) The word vectors are considered read-only in this class. @@ -19,7 +20,8 @@ >>> word_vectors.save(fname) >>> word_vectors = KeyedVectors.load(fname) -The vectors can also be instantiated from an existing file on disk in the original Google's word2vec C format as a KeyedVectors instance:: +The vectors can also be instantiated from an existing file on disk +in the original Google's word2vec C format as a KeyedVectors instance:: >>> from gensim.models.keyedvectors import KeyedVectors >>> word_vectors = KeyedVectors.load_word2vec_format('/tmp/vectors.txt', binary=False) # C text format @@ -679,7 +681,9 @@ def accuracy(self, questions, restrict_vocab=30000, most_similar=most_similar, c """ Compute accuracy of the model. `questions` is a filename where lines are 4-tuples of words, split into sections by ": SECTION NAME" lines. - See questions-words.txt in https://storage.googleapis.com/google-code-archive-source/v2/code.google.com/word2vec/source-archive.zip for an example. + See questions-words.txt in + https://storage.googleapis.com/google-code-archive-source/v2/code.google.com/word2vec/source-archive.zip + for an example. 
The accuracy is reported (=printed to log and returned as a list) for each section separately, plus there's one aggregate summary at the end. @@ -762,7 +766,8 @@ def log_evaluate_word_pairs(pearson, spearman, oov, pairs): logger.info('Spearman rank-order correlation coefficient against %s: %.4f', pairs, spearman[0]) logger.info('Pairs with unknown words ratio: %.1f%%', oov) - def evaluate_word_pairs(self, pairs, delimiter='\t', restrict_vocab=300000, case_insensitive=True, dummy4unknown=False): + def evaluate_word_pairs(self, pairs, delimiter='\t', restrict_vocab=300000, + case_insensitive=True, dummy4unknown=False): """ Compute correlation of the model with human similarity judgments. `pairs` is a filename of a dataset where lines are 3-tuples, each consisting of a word pair and a similarity value, separated by `delimiter`. diff --git a/gensim/models/lda_dispatcher.py b/gensim/models/lda_dispatcher.py index 6b3bd53c44..1c0c695ee9 100755 --- a/gensim/models/lda_dispatcher.py +++ b/gensim/models/lda_dispatcher.py @@ -180,13 +180,21 @@ def exit(self): def main(): parser = argparse.ArgumentParser(description=__doc__) - parser.add_argument("--maxsize", help="How many jobs (=chunks of N documents) to keep 'pre-fetched' in a queue (default: %(default)s)", type=int, default=MAX_JOBS_QUEUE) + parser.add_argument( + "--maxsize", + help="How many jobs (=chunks of N documents) to keep 'pre-fetched' in a queue (default: %(default)s)", + type=int, default=MAX_JOBS_QUEUE + ) parser.add_argument("--host", help="Nameserver hostname (default: %(default)s)", default=None) parser.add_argument("--port", help="Nameserver port (default: %(default)s)", default=None, type=int) parser.add_argument("--no-broadcast", help="Disable broadcast (default: %(default)s)", action='store_const', default=True, const=False) parser.add_argument("--hmac", help="Nameserver hmac key (default: %(default)s)", default=None) - parser.add_argument('-v', '--verbose', help='Verbose flag', action='store_const', dest="loglevel", const=logging.INFO, default=logging.WARNING) + parser.add_argument( + '-v', '--verbose', + help='Verbose flag', + action='store_const', dest="loglevel", const=logging.INFO, default=logging.WARNING + ) args = parser.parse_args() logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=args.loglevel) diff --git a/gensim/models/lda_worker.py b/gensim/models/lda_worker.py index 8656672db6..13f93648d2 100755 --- a/gensim/models/lda_worker.py +++ b/gensim/models/lda_worker.py @@ -47,7 +47,8 @@ def __init__(self): def initialize(self, myid, dispatcher, **model_params): self.lock_update = threading.Lock() self.jobsdone = 0 # how many jobs has this worker completed? - self.myid = myid # id of this worker in the dispatcher; just a convenience var for easy access/logging TODO remove? + # id of this worker in the dispatcher; just a convenience var for easy access/logging TODO remove? 
+ self.myid = myid self.dispatcher = dispatcher self.finished = False logger.info("initializing worker #%s", myid) @@ -116,9 +117,15 @@ def main(): parser = argparse.ArgumentParser(description=__doc__) parser.add_argument("--host", help="Nameserver hostname (default: %(default)s)", default=None) parser.add_argument("--port", help="Nameserver port (default: %(default)s)", default=None, type=int) - parser.add_argument("--no-broadcast", help="Disable broadcast (default: %(default)s)", action='store_const', default=True, const=False) + parser.add_argument( + "--no-broadcast", help="Disable broadcast (default: %(default)s)", action='store_const', + default=True, const=False + ) parser.add_argument("--hmac", help="Nameserver hmac key (default: %(default)s)", default=None) - parser.add_argument('-v', '--verbose', help='Verbose flag', action='store_const', dest="loglevel", const=logging.INFO, default=logging.WARNING) + parser.add_argument( + '-v', '--verbose', help='Verbose flag', action='store_const', dest="loglevel", + const=logging.INFO, default=logging.WARNING + ) args = parser.parse_args() logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=args.loglevel) diff --git a/gensim/models/ldamodel.py b/gensim/models/ldamodel.py index c3dabb6ee2..1b50f0a9b8 100755 --- a/gensim/models/ldamodel.py +++ b/gensim/models/ldamodel.py @@ -236,7 +236,8 @@ def __init__(self, corpus=None, num_topics=100, id2word=None, asymmetric priors over the word distribution on a per-topic basis (can not be learned from data). - Turn on `distributed` to force distributed computing (see the `web tutorial `_ + Turn on `distributed` to force distributed computing + (see the `web tutorial `_ on how to set up a cluster of machines for gensim). Calculate and log perplexity estimate from the latest mini-batch every @@ -269,7 +270,9 @@ def __init__(self, corpus=None, num_topics=100, id2word=None, # store user-supplied parameters self.id2word = id2word if corpus is None and self.id2word is None: - raise ValueError('at least one of corpus/id2word must be specified, to establish input space dimensionality') + raise ValueError( + 'at least one of corpus/id2word must be specified, to establish input space dimensionality' + ) if self.id2word is None: logger.warning("no word id mapping provided; initializing from corpus, assuming identity") @@ -379,7 +382,8 @@ def init_dir_prior(self, prior, name): logger.info("using symmetric %s at %s", name, 1.0 / self.num_topics) init_prior = np.asarray([1.0 / self.num_topics for i in xrange(prior_shape)], dtype=self.dtype) elif prior == 'asymmetric': - init_prior = np.asarray([1.0 / (i + np.sqrt(prior_shape)) for i in xrange(prior_shape)], dtype=self.dtype) + init_prior = \ + np.asarray([1.0 / (i + np.sqrt(prior_shape)) for i in xrange(prior_shape)], dtype=self.dtype) init_prior /= init_prior.sum() logger.info("using asymmetric %s %s", name, list(init_prior)) elif prior == 'auto': @@ -1050,11 +1054,13 @@ def diff(self, other, distance="kullback_leibler", num_words=100, Available values: `kullback_leibler`, `hellinger`, `jaccard` and `jensen_shannon` `num_words` is quantity of most relevant words that used if distance == `jaccard` (also used for annotation) `n_ann_terms` is max quantity of words in intersection/symmetric difference between topics (used for annotation) - `diagonal` set to True if the difference is required only between the identical topic no.s (returns diagonal of diff matrix) + `diagonal` set to True if the difference is required only between the identical 
topic no.s + (returns diagonal of diff matrix) `annotation` whether the intersection or difference of words between two topics should be returned - Returns a matrix Z with shape (m1.num_topics, m2.num_topics), where Z[i][j] - difference between topic_i and topic_j + Returns a matrix Z with shape (m1.num_topics, m2.num_topics), + where Z[i][j] - difference between topic_i and topic_j and matrix annotation (if True) with shape (m1.num_topics, m2.num_topics, 2, None), - where: + where:: annotation[i][j] = [[`int_1`, `int_2`, ...], [`diff_1`, `diff_2`, ...]] and `int_k` is word from intersection of `topic_i` and `topic_j` and @@ -1098,7 +1104,8 @@ def diff(self, other, distance="kullback_leibler", num_words=100, if diagonal: assert t1_size == t2_size, \ - "Both input models should have same no. of topics, as the diagonal will only be valid in a square matrix" + "Both input models should have same no. of topics, " \ + "as the diagonal will only be valid in a square matrix" # initialize z and annotation array z = np.zeros(t1_size) if annotation: @@ -1169,7 +1176,7 @@ def save(self, fname, ignore=('state', 'dispatcher'), separately=None, *args, ** those ones that exceed `sep_limit` set in `gensim.utils.SaveLoad.save`. The main concern here is the `alpha` array if for instance using `alpha='auto'`. - Please refer to the wiki recipes section (https://github.com/piskvorky/gensim/wiki/Recipes-&-FAQ#q9-how-do-i-load-a-model-in-python-3-that-was-trained-and-saved-using-python-2) + Please refer to the wiki recipes section (goo.gl/qoje24) for an example on how to work around these issues. """ if self.state is not None: @@ -1193,9 +1200,11 @@ def save(self, fname, ignore=('state', 'dispatcher'), separately=None, *args, ** separately_explicit = ['expElogbeta', 'sstats'] # Also add 'alpha' and 'eta' to separately list if they are set 'auto' or some # array manually. - if (isinstance(self.alpha, six.string_types) and self.alpha == 'auto') or (isinstance(self.alpha, np.ndarray) and len(self.alpha.shape) != 1): + if (isinstance(self.alpha, six.string_types) and self.alpha == 'auto') or \ + (isinstance(self.alpha, np.ndarray) and len(self.alpha.shape) != 1): separately_explicit.append('alpha') - if (isinstance(self.eta, six.string_types) and self.eta == 'auto') or (isinstance(self.eta, np.ndarray) and len(self.eta.shape) != 1): + if (isinstance(self.eta, six.string_types) and self.eta == 'auto') or \ + (isinstance(self.eta, np.ndarray) and len(self.eta.shape) != 1): separately_explicit.append('eta') # Merge separately_explicit with separately. 
if separately: @@ -1222,7 +1231,8 @@ def load(cls, fname, *args, **kwargs): # check if `random_state` attribute has been set after main pickle load # if set -> the model to be loaded was saved using a >= 0.13.2 version of Gensim - # if not set -> the model to be loaded was saved using a < 0.13.2 version of Gensim, so set `random_state` as the default value + # if not set -> the model to be loaded was saved using a < 0.13.2 version of Gensim, + # so set `random_state` as the default value if not hasattr(result, 'random_state'): result.random_state = utils.get_random_state(None) # using default value `get_random_state(None)` logging.warning("random_state not set so using default value") @@ -1240,8 +1250,10 @@ def load(cls, fname, *args, **kwargs): id2word_fname = utils.smart_extension(fname, '.id2word') # check if `id2word_fname` file is present on disk - # if present -> the model to be loaded was saved using a >= 0.13.2 version of Gensim, so set `result.id2word` using the `id2word_fname` file - # if not present -> the model to be loaded was saved using a < 0.13.2 version of Gensim, so `result.id2word` already set after the main pickle load + # if present -> the model to be loaded was saved using a >= 0.13.2 version of Gensim, + # so set `result.id2word` using the `id2word_fname` file + # if not present -> the model to be loaded was saved using a < 0.13.2 version of Gensim, + # so `result.id2word` already set after the main pickle load if os.path.isfile(id2word_fname): try: result.id2word = utils.unpickle(id2word_fname) diff --git a/gensim/models/ldamulticore.py b/gensim/models/ldamulticore.py index 186029c971..2287b49584 100644 --- a/gensim/models/ldamulticore.py +++ b/gensim/models/ldamulticore.py @@ -193,8 +193,10 @@ def update(self, corpus, chunks_as_numpy=False): updates_per_pass = max(1, lencorpus / updateafter) logger.info( "running %s LDA training, %s topics, %i passes over the supplied corpus of %i documents, " - "updating every %i documents, evaluating every ~%i documents, iterating %ix with a convergence threshold of %f", - updatetype, self.num_topics, self.passes, lencorpus, updateafter, evalafter, self.iterations, self.gamma_threshold + "updating every %i documents, evaluating every ~%i documents, " + "iterating %ix with a convergence threshold of %f", + updatetype, self.num_topics, self.passes, lencorpus, updateafter, + evalafter, self.iterations, self.gamma_threshold ) if updates_per_pass * self.passes < 10: @@ -232,7 +234,9 @@ def process_result_queue(force=False): if (force and merged_new and queue_size[0] == 0) or (not self.batch and (other.numdocs >= updateafter)): self.do_mstep(rho(), other, pass_ > 0) other.reset() - if self.eval_every is not None and ((force and queue_size[0] == 0) or (self.eval_every != 0 and (self.num_updates / updateafter) % self.eval_every == 0)): + if self.eval_every is not None and \ + ((force and queue_size[0] == 0) or + (self.eval_every != 0 and (self.num_updates / updateafter) % self.eval_every == 0)): self.log_perplexity(chunk, total_docs=lencorpus) chunk_stream = utils.grouper(corpus, self.chunksize, as_numpy=chunks_as_numpy) @@ -247,7 +251,8 @@ def process_result_queue(force=False): chunk_put = True queue_size[0] += 1 logger.info( - "PROGRESS: pass %i, dispatched chunk #%i = documents up to #%i/%i, outstanding queue size %i", + "PROGRESS: pass %i, dispatched chunk #%i = documents up to #%i/%i, " + "outstanding queue size %i", pass_, chunk_no, chunk_no * self.chunksize + len(chunk), lencorpus, queue_size[0] ) except queue.Full: diff --git 
a/gensim/models/ldaseqmodel.py b/gensim/models/ldaseqmodel.py index 8173d0d292..467ab47c6c 100644 --- a/gensim/models/ldaseqmodel.py +++ b/gensim/models/ldaseqmodel.py @@ -57,19 +57,21 @@ def __init__(self, corpus=None, time_slice=None, id2word=None, alphas=0.01, num_ `time_slice` as described above is a list which contains the number of documents in each time-slice - `id2word` is a mapping from word ids (integers) to words (strings). It is used to determine the vocabulary size and printing topics. + `id2word` is a mapping from word ids (integers) to words (strings). + It is used to determine the vocabulary size and printing topics. `alphas` is a prior of your choice and should be a double or float value. default is 0.01 `num_topics` is the number of requested latent topics to be extracted from the training corpus. `initalize` allows the user to decide how he wants to initialise the DTM model. Default is through gensim LDA. - You can use your own sstats of an LDA model previously trained as well by specifying 'own' and passing a np matrix through sstats. + You can use your own sstats of an LDA model previously trained as well by specifying 'own' + and passing a np matrix through sstats. If you wish to just pass a previously used LDA model, pass it through `lda_model` Shape of sstats is (vocab_len, num_topics) - `chain_variance` is a constant which dictates how the beta values evolve - it is a gaussian parameter defined in the - beta distribution. + `chain_variance` is a constant which dictates how the beta values evolve - it is a gaussian parameter + defined in the beta distribution. `passes` is the number of passes of the initial LdaModel. @@ -77,7 +79,9 @@ def __init__(self, corpus=None, time_slice=None, id2word=None, alphas=0.01, num_ """ self.id2word = id2word if corpus is None and self.id2word is None: - raise ValueError('at least one of corpus/id2word must be specified, to establish input space dimensionality') + raise ValueError( + 'at least one of corpus/id2word must be specified, to establish input space dimensionality' + ) if self.id2word is None: logger.warning("no word id mapping provided; initializing from corpus, assuming identity") @@ -109,8 +113,10 @@ def __init__(self, corpus=None, time_slice=None, id2word=None, alphas=0.01, num_ self.num_time_slices = len(time_slice) self.alphas = np.full(num_topics, alphas) - # topic_chains contains for each topic a 'state space language model' object which in turn has information about each topic - # the sslm class is described below and contains information on topic-word probabilities and doc-topic probabilities. + # topic_chains contains for each topic a 'state space language model' object + # which in turn has information about each topic + # the sslm class is described below and contains information + # on topic-word probabilities and doc-topic probabilities. self.topic_chains = [] for topic in range(0, num_topics): sslm_ = sslm( @@ -204,7 +210,8 @@ def fit_lda_seq(self, corpus, lda_inference_max_iter, em_min_iter, em_max_iter, lhoods = np.resize(np.zeros(corpus_len * num_topics + 1), (corpus_len, num_topics + 1)) # compute the likelihood of a sequential corpus under an LDA # seq model and find the evidence lower bound. 
This is the E - Step - bound, gammas = self.lda_seq_infer(corpus, topic_suffstats, gammas, lhoods, iter_, lda_inference_max_iter, chunksize) + bound, gammas = \ + self.lda_seq_infer(corpus, topic_suffstats, gammas, lhoods, iter_, lda_inference_max_iter, chunksize) self.gammas = gammas logger.info("M Step") @@ -381,14 +388,18 @@ def doc_topics(self, doc_number): def dtm_vis(self, time, corpus): """ - returns term_frequency, vocab, doc_lengths, topic-term distributions and doc_topic distributions, specified by pyLDAvis format. + returns term_frequency, vocab, doc_lengths, topic-term distributions and doc_topic distributions, + specified by pyLDAvis format. all of these are needed to visualise topics for DTM for a particular time-slice via pyLDAvis. input parameter is the year to do the visualisation. """ doc_topic = np.copy(self.gammas) doc_topic /= doc_topic.sum(axis=1)[:, np.newaxis] - topic_term = [np.exp(np.transpose(chain.e_log_prob)[time]) / np.exp(np.transpose(chain.e_log_prob)[time]).sum() for k, chain in enumerate(self.topic_chains)] + topic_term = [ + np.exp(np.transpose(chain.e_log_prob)[time]) / np.exp(np.transpose(chain.e_log_prob)[time]).sum() + for k, chain in enumerate(self.topic_chains) + ] doc_lengths = [len(doc) for doc_no, doc in enumerate(corpus)] @@ -420,7 +431,8 @@ def __getitem__(self, doc): """ Similar to the LdaModel __getitem__ function, it returns topic proportions of a document passed. """ - lda_model = ldamodel.LdaModel(num_topics=self.num_topics, alpha=self.alphas, id2word=self.id2word, dtype=np.float64) + lda_model = \ + ldamodel.LdaModel(num_topics=self.num_topics, alpha=self.alphas, id2word=self.id2word, dtype=np.float64) lda_model.topics = np.array(np.split(np.zeros(self.vocab_len * self.num_topics), self.vocab_len)) ldapost = LdaPost(num_topics=self.num_topics, max_doc_len=len(doc), lda=lda_model, doc=doc) @@ -475,7 +487,8 @@ def __init__(self, vocab_len=None, num_time_slices=None, num_topics=None, obs_va def update_zeta(self): """ Updates the Zeta Variational Parameter. - Zeta is described in the appendix and is equal to sum (exp(mean[word] + Variance[word] / 2)), over every time-slice. + Zeta is described in the appendix and is equal + to sum (exp(mean[word] + Variance[word] / 2)), over every time-slice. It is the value of variational parameter zeta which maximizes the lower bound. 
""" for j, val in enumerate(self.zeta): @@ -484,17 +497,23 @@ def update_zeta(self): def compute_post_variance(self, word, chain_variance): """ - Based on the Variational Kalman Filtering approach for Approximate Inference [https://www.cs.princeton.edu/~blei/papers/BleiLafferty2006a.pdf] - This function accepts the word to compute variance for, along with the associated sslm class object, and returns variance and fwd_variance + Based on the Variational Kalman Filtering approach for Approximate Inference + [https://www.cs.princeton.edu/~blei/papers/BleiLafferty2006a.pdf] + This function accepts the word to compute variance for, along with the associated sslm class object, + and returns variance and fwd_variance Computes Var[\beta_{t,w}] for t = 1:T :math:: - fwd\_variance[t] \equiv E((beta_{t,w}-mean_{t,w})^2 |beta_{t}\ for\ 1:t) = (obs\_variance / fwd\_variance[t - 1] + chain\_variance + obs\_variance ) * (fwd\_variance[t - 1] + obs\_variance) + fwd\_variance[t] \equiv E((beta_{t,w}-mean_{t,w})^2 |beta_{t}\ for\ 1:t) = + (obs\_variance / fwd\_variance[t - 1] + chain\_variance + obs\_variance ) * + (fwd\_variance[t - 1] + obs\_variance) :math:: - variance[t] \equiv E((beta_{t,w}-mean\_cap_{t,w})^2 |beta\_cap_{t}\ for\ 1:t) = fwd\_variance[t - 1] + (fwd\_variance[t - 1] / fwd\_variance[t - 1] + obs\_variance)^2 * (variance[t - 1] - (fwd\_variance[t-1] + obs\_variance)) + variance[t] \equiv E((beta_{t,w}-mean\_cap_{t,w})^2 |beta\_cap_{t}\ for\ 1:t) = + fwd\_variance[t - 1] + (fwd\_variance[t - 1] / fwd\_variance[t - 1] + obs\_variance)^2 * + (variance[t - 1] - (fwd\_variance[t-1] + obs\_variance)) """ INIT_VARIANCE_CONST = 1000 @@ -524,15 +543,19 @@ def compute_post_variance(self, word, chain_variance): def compute_post_mean(self, word, chain_variance): """ - Based on the Variational Kalman Filtering approach for Approximate Inference [https://www.cs.princeton.edu/~blei/papers/BleiLafferty2006a.pdf] - This function accepts the word to compute mean for, along with the associated sslm class object, and returns mean and fwd_mean + Based on the Variational Kalman Filtering approach for Approximate Inference + [https://www.cs.princeton.edu/~blei/papers/BleiLafferty2006a.pdf] + This function accepts the word to compute mean for, along with the associated sslm class object, + and returns mean and fwd_mean Essentially a forward-backward to compute E[\beta_{t,w}] for t = 1:T. Fwd_Mean(t) ≡ E(beta_{t,w} | beta_ˆ 1:t ) - = (obs_variance / fwd_variance[t - 1] + chain_variance + obs_variance ) * fwd_mean[t - 1] + (1 - (obs_variance / fwd_variance[t - 1] + chain_variance + obs_variance)) * beta + = (obs_variance / fwd_variance[t - 1] + chain_variance + obs_variance ) * fwd_mean[t - 1] + + (1 - (obs_variance / fwd_variance[t - 1] + chain_variance + obs_variance)) * beta Mean(t) ≡ E(beta_{t,w} | beta_ˆ 1:T ) - = fwd_mean[t - 1] + (obs_variance / fwd_variance[t - 1] + obs_variance) + (1 - obs_variance / fwd_variance[t - 1] + obs_variance)) * mean[t] + = fwd_mean[t - 1] + (obs_variance / fwd_variance[t - 1] + obs_variance) + + (1 - obs_variance / fwd_variance[t - 1] + obs_variance)) * mean[t] """ T = self.num_time_slices @@ -561,7 +584,8 @@ def compute_expected_log_prob(self): """ Compute the expected log probability given values of m. The appendix describes the Expectation of log-probabilities in equation 5 of the DTM paper; - The below implementation is the result of solving the equation and is as implemented in the original Blei DTM code. 
+ The below implementation is the result of solving the equation and is as implemented + in the original Blei DTM code. """ for (w, t), val in np.ndenumerate(self.e_log_prob): self.e_log_prob[w][t] = self.mean[w][t + 1] - np.log(self.zeta[t]) @@ -570,7 +594,8 @@ def compute_expected_log_prob(self): def sslm_counts_init(self, obs_variance, chain_variance, sstats): """ Initialize State Space Language Model with LDA sufficient statistics. - Called for each topic-chain and initializes intial mean, variance and Topic-Word probabilities for the first time-slice. + Called for each topic-chain and initializes intial mean, variance and Topic-Word probabilities + for the first time-slice. """ W = self.vocab_len T = self.num_time_slices @@ -610,7 +635,8 @@ def fit_sslm(self, sstats): converged = sslm_fit_threshold + 1 # computing variance, fwd_variance - self.variance, self.fwd_variance = (np.array(x) for x in list(zip(*[self.compute_post_variance(w, self.chain_variance) for w in range(0, W)]))) + self.variance, self.fwd_variance = \ + (np.array(x) for x in list(zip(*[self.compute_post_variance(w, self.chain_variance) for w in range(0, W)]))) # column sum of sstats totals = sstats.sum(axis=0) @@ -657,7 +683,8 @@ def compute_bound(self, sstats, totals): chain_variance = self.chain_variance # computing mean, fwd_mean - self.mean, self.fwd_mean = (np.array(x) for x in zip(*[self.compute_post_mean(w, self.chain_variance) for w in range(0, w)])) + self.mean, self.fwd_mean = \ + (np.array(x) for x in zip(*[self.compute_post_mean(w, self.chain_variance) for w in range(0, w)])) self.zeta = self.update_zeta() for w in range(0, w): @@ -679,9 +706,11 @@ def compute_bound(self, sstats, totals): # w_phi_l is only used in Document Influence Model; the values are aleays zero in this case # w_phi_l = sslm.w_phi_l[w][t - 1] # exp_i = np.exp(-prev_m) - # term_1 += (np.power(m - prev_m - (w_phi_l * exp_i), 2) / (2 * chain_variance)) - (v / chain_variance) - np.log(chain_variance) + # term_1 += (np.power(m - prev_m - (w_phi_l * exp_i), 2) / (2 * chain_variance)) - + # (v / chain_variance) - np.log(chain_variance) - term_1 += (np.power(m - prev_m, 2) / (2 * chain_variance)) - (v / chain_variance) - np.log(chain_variance) + term_1 += \ + (np.power(m - prev_m, 2) / (2 * chain_variance)) - (v / chain_variance) - np.log(chain_variance) term_2 += sstats[w][t - 1] * m ent += np.log(v) / 2 # note the 2pi's cancel with term1 (see doc) @@ -877,7 +906,8 @@ def update_phi(self, doc_number, time): This is done based on the original Blei-LDA paper, where: log_phi := beta * exp(Ψ(gamma)), over every topic for every word. - TODO: incorporate lee-sueng trick used in **Lee, Seung: Algorithms for non-negative matrix factorization, NIPS 2001**. + TODO: incorporate lee-sueng trick used in + **Lee, Seung: Algorithms for non-negative matrix factorization, NIPS 2001**. 
""" num_topics = self.lda.num_topics # digamma values @@ -955,15 +985,19 @@ def compute_lda_lhood(self): # below code only to be used in DIM mode # if ldapost.doc_weight is not None and (model == "DIM" or model == "fixed"): # influence_topic = ldapost.doc_weight[k] - # influence_term = - ((influence_topic * influence_topic + sigma_l * sigma_l) / 2.0 / (sigma_d * sigma_d)) + # influence_term = \ + # - ((influence_topic * influence_topic + sigma_l * sigma_l) / 2.0 / (sigma_d * sigma_d)) e_log_theta_k = digamma(self.gamma[k]) - digsum - lhood_term = (self.lda.alpha[k] - self.gamma[k]) * e_log_theta_k + gammaln(self.gamma[k]) - gammaln(self.lda.alpha[k]) + lhood_term = \ + (self.lda.alpha[k] - self.gamma[k]) * e_log_theta_k + \ + gammaln(self.gamma[k]) - gammaln(self.lda.alpha[k]) # TODO: check why there's an IF n = 0 for word_id, count in self.doc: if self.phi[n][k] > 0: - lhood_term += count * self.phi[n][k] * (e_log_theta_k + self.lda.topics[word_id][k] - self.log_phi[n][k]) + lhood_term += \ + count * self.phi[n][k] * (e_log_theta_k + self.lda.topics[word_id][k] - self.log_phi[n][k]) n += 1 self.lhood[k] = lhood_term lhood += lhood_term diff --git a/gensim/models/lsi_worker.py b/gensim/models/lsi_worker.py index ffb31eafb9..ceca83d9e6 100755 --- a/gensim/models/lsi_worker.py +++ b/gensim/models/lsi_worker.py @@ -44,7 +44,8 @@ def __init__(self): def initialize(self, myid, dispatcher, **model_params): self.lock_update = threading.Lock() self.jobsdone = 0 # how many jobs has this worker completed? - self.myid = myid # id of this worker in the dispatcher; just a convenience var for easy access/logging TODO remove? + # id of this worker in the dispatcher; just a convenience var for easy access/logging TODO remove? + self.myid = myid self.dispatcher = dispatcher self.finished = False logger.info("initializing worker #%s", myid) diff --git a/gensim/models/lsimodel.py b/gensim/models/lsimodel.py index f2238a0e12..1ab3e68401 100644 --- a/gensim/models/lsimodel.py +++ b/gensim/models/lsimodel.py @@ -105,7 +105,8 @@ def ascarray(a, name=''): class Projection(utils.SaveLoad): - def __init__(self, m, k, docs=None, use_svdlibc=False, power_iters=P2_EXTRA_ITERS, extra_dims=P2_EXTRA_DIMS, dtype=np.float64): + def __init__(self, m, k, docs=None, use_svdlibc=False, power_iters=P2_EXTRA_ITERS, + extra_dims=P2_EXTRA_DIMS, dtype=np.float64): """ Construct the (U, S) projection from a corpus `docs`. The projection can be later updated by merging it with another Projection via `self.merge()`. @@ -133,7 +134,8 @@ def __init__(self, m, k, docs=None, use_svdlibc=False, power_iters=P2_EXTRA_ITER logger.info("computing sparse SVD of %s matrix", str(docs.shape)) if not scipy.sparse.issparse(docs): docs = matutils.corpus2csc(docs) - ut, s, vt = sparsesvd.sparsesvd(docs, k + 30) # ask for extra factors, because for some reason SVDLIBC sometimes returns fewer factors than requested + # ask for extra factors, because for some reason SVDLIBC sometimes returns fewer factors than requested + ut, s, vt = sparsesvd.sparsesvd(docs, k + 30) u = ut.T del ut, vt k = clip_spectrum(s**2, self.k) @@ -195,10 +197,14 @@ def merge(self, other, decay=1.0): # see http://www.mail-archive.com/np-discussion@scipy.org/msg07224.html and # bug ticket http://projects.scipy.org/np/ticket/706 # sdoering: replaced np's linalg.svd with scipy's linalg.svd: - u_k, s_k, _ = scipy.linalg.svd(k, full_matrices=False) # TODO *ugly overkill*!! only need first self.k SVD factors... 
but there is no LAPACK wrapper for partial svd/eigendecomp in np :( //sdoering: maybe there is one in scipy? + + # TODO *ugly overkill*!! only need first self.k SVD factors... but there is no LAPACK wrapper + # for partial svd/eigendecomp in np :( //sdoering: maybe there is one in scipy? + u_k, s_k, _ = scipy.linalg.svd(k, full_matrices=False) except scipy.linalg.LinAlgError: logger.error("SVD(A) failed; trying SVD(A * A^T)") - u_k, s_k, _ = scipy.linalg.svd(np.dot(k, k.T), full_matrices=False) # if this fails too, give up with an exception + # if this fails too, give up with an exception + u_k, s_k, _ = scipy.linalg.svd(np.dot(k, k.T), full_matrices=False) s_k = np.sqrt(s_k) # go back from eigen values to singular values k = clip_spectrum(s_k**2, self.k) @@ -291,7 +297,9 @@ def __init__(self, corpus=None, num_topics=200, id2word=None, chunksize=20000, self.dtype = dtype if corpus is None and self.id2word is None: - raise ValueError('at least one of corpus/id2word must be specified, to establish input space dimensionality') + raise ValueError( + 'at least one of corpus/id2word must be specified, to establish input space dimensionality' + ) if self.id2word is None: logger.warning("no word id mapping provided; initializing from corpus, assuming identity") @@ -387,7 +395,8 @@ def add_documents(self, corpus, chunksize=None, decay=None): if self.dispatcher: # distributed version: add this job to the job queue, so workers can work on it logger.debug("creating job #%i", chunk_no) - self.dispatcher.putjob(job) # put job into queue; this will eventually block, because the queue has a small finite size + # put job into queue; this will eventually block, because the queue has a small finite size + self.dispatcher.putjob(job) del job logger.info("dispatched documents up to #%s", doc_no) else: @@ -450,7 +459,8 @@ def __getitem__(self, bow, scaled=False, chunksize=512): topic_dist = (vec.T * self.projection.u[:, :self.num_topics]).T # (x^T * u).T = u^-1 * x # # convert input to dense, then do dense * dense multiplication - # # ± same performance as above (BLAS dense * dense is better optimized than scipy.sparse), but consumes more memory + # # ± same performance as above (BLAS dense * dense is better optimized than scipy.sparse), + # but consumes more memory # vec = matutils.corpus2dense(bow, num_terms=self.num_terms, num_docs=len(bow)) # topic_dist = np.dot(self.projection.u[:, :self.num_topics].T, vec) @@ -721,7 +731,8 @@ def stochastic_svd(corpus, rank, num_terms, chunksize=20000, extra_dims=None, q[:] = 0.0 for chunk_no, chunk in enumerate(utils.grouper(corpus, chunksize)): logger.info('PROGRESS: at document #%i/%i', chunk_no * chunksize, num_docs) - chunk = matutils.corpus2csc(chunk, num_terms=num_terms, dtype=dtype) # documents = columns of sparse CSC + # documents = columns of sparse CSC + chunk = matutils.corpus2csc(chunk, num_terms=num_terms, dtype=dtype) tmp = chunk.T * yold tmp = chunk * tmp del chunk @@ -754,8 +765,10 @@ def stochastic_svd(corpus, rank, num_terms, chunksize=20000, extra_dims=None, # now we're ready to compute decomposition of the small matrix X logger.info("running dense decomposition on %s covariance matrix", str(x.shape)) - u, s, vt = scipy.linalg.svd(x) # could use linalg.eigh, but who cares... and svd returns the factors already sorted :) - s = np.sqrt(s) # sqrt to go back from singular values of X to singular values of B = singular values of the corpus + # could use linalg.eigh, but who cares... 
and svd returns the factors already sorted :) + u, s, vt = scipy.linalg.svd(x) + # sqrt to go back from singular values of X to singular values of B = singular values of the corpus + s = np.sqrt(s) q = qt.T.copy() del qt diff --git a/gensim/models/phrases.py b/gensim/models/phrases.py index 0c0ad58dd7..2f2b2c4b9a 100644 --- a/gensim/models/phrases.py +++ b/gensim/models/phrases.py @@ -20,7 +20,8 @@ >>> phrases = Phrases(sentence_stream) -and then create a performant Phraser object to transform any sentence (list of token strings) using the standard gensim syntax: +and then create a performant Phraser object to transform any sentence (list of token strings) +using the standard gensim syntax: >>> bigram = Phraser(phrases) >>> sent = [u'the', u'mayor', u'of', u'new', u'york', u'was', u'there'] @@ -270,7 +271,9 @@ def __init__(self, sentences=None, min_count=5, threshold=10.0, else: raise ValueError('unknown scoring method string %s specified' % (scoring)) - scoring_parameters = ['worda_count', 'wordb_count', 'bigram_count', 'len_vocab', 'min_count', 'corpus_word_count'] + scoring_parameters = [ + 'worda_count', 'wordb_count', 'bigram_count', 'len_vocab', 'min_count', 'corpus_word_count' + ] if callable(scoring): if all(parameter in getargspec(scoring)[0] for parameter in scoring_parameters): self.scoring = scoring @@ -461,8 +464,8 @@ def __getitem__(self, sentence): @classmethod def load(cls, *args, **kwargs): """ - Load a previously saved Phrases class. Handles backwards compatibility from older Phrases versions which did not support - pluggable scoring functions. Otherwise, relies on utils.load + Load a previously saved Phrases class. Handles backwards compatibility from + older Phrases versions which did not support pluggable scoring functions. Otherwise, relies on utils.load """ # for python 2 and 3 compatibility. basestring is used to check if model.scoring is a string diff --git a/gensim/models/rpmodel.py b/gensim/models/rpmodel.py index f5753c75c5..0c8f7c8b26 100644 --- a/gensim/models/rpmodel.py +++ b/gensim/models/rpmodel.py @@ -66,7 +66,8 @@ def initialize(self, corpus): # Here i use a particular form, derived in "Achlioptas: Database-friendly random projection", # and his (1) scenario of Theorem 1.1 in particular (all entries are +1/-1). randmat = 1 - 2 * np.random.binomial(1, 0.5, shape) # convert from 0/1 to +1/-1 - self.projection = np.asfortranarray(randmat, dtype=np.float32) # convert from int32 to floats, for faster multiplications + # convert from int32 to floats, for faster multiplications + self.projection = np.asfortranarray(randmat, dtype=np.float32) # TODO: check whether the Fortran-order shenanigans still make sense. In the original # code (~2010), this made a BIG difference for np BLAS implementations; perhaps now the wrappers # are smarter and this is no longer needed? diff --git a/gensim/models/translation_matrix.py b/gensim/models/translation_matrix.py index 4999616597..78233136d2 100644 --- a/gensim/models/translation_matrix.py +++ b/gensim/models/translation_matrix.py @@ -35,7 +35,8 @@ .. [1] Dinu, Georgiana, Angeliki Lazaridou, and Marco Baroni. "Improving zero-shot learning by mitigating the hubness problem." arXiv preprint arXiv:1412.6568 (2014). -.. [2] Tomas Mikolov, Ilya Sutskever, Kai Chen, Greg Corrado, and Jeffrey Dean. Distributed Representations of Words and Phrases and their Compositionality. +.. [2] Tomas Mikolov, Ilya Sutskever, Kai Chen, Greg Corrado, and Jeffrey Dean. + Distributed Representations of Words and Phrases and their Compositionality. 
In Proceedings of NIPS, 2013. """ @@ -69,7 +70,8 @@ def build(cls, lang_vec, lexicon=None): Construct a space class for the lexicon, if it's provided. Args: `lang_vec`: word2vec model that extract word vector for lexicon - `lexicon`: the default is None, if it is not provided, the lexicon is all the lang_vec's word, i.e. lang_vec.vocab.keys() + `lexicon`: the default is None, if it is not provided, the lexicon is all the lang_vec's word, + i.e. lang_vec.vocab.keys() Returns: `Space` object for the lexicon """ @@ -220,17 +222,26 @@ def translate(self, source_words, topn=5, gc=0, sample_num=None, source_lang_vec # If the language word vector not provided by user, use the model's # language word vector as default if source_lang_vec is None: - warnings.warn("The parameter source_lang_vec isn't specified, use the model's source language word vector as default.") + warnings.warn( + "The parameter source_lang_vec isn't specified, " + "use the model's source language word vector as default." + ) source_lang_vec = self.source_lang_vec if target_lang_vec is None: - warnings.warn("The parameter target_lang_vec isn't specified, use the model's target language word vector as default.") + warnings.warn( + "The parameter target_lang_vec isn't specified, " + "use the model's target language word vector as default." + ) target_lang_vec = self.target_lang_vec # If additional is provided, bootstrapping vocabulary from the source language word vector model. if gc: if sample_num is None: - raise RuntimeError("When using the globally corrected neighbour retrieval method, the `sample_num` parameter(i.e. the number of words sampled from source space) must be provided.") + raise RuntimeError( + "When using the globally corrected neighbour retrieval method, " + "the `sample_num` parameter(i.e. the number of words sampled from source space) must be provided." + ) lexicon = set(source_lang_vec.index2word) addition = min(sample_num, len(lexicon) - len(source_words)) lexicon = self.random_state.choice(list(lexicon.difference(source_words)), addition) diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py index 4ca0974a17..4c9b7f557f 100644 --- a/gensim/models/word2vec.py +++ b/gensim/models/word2vec.py @@ -9,12 +9,14 @@ Produce word vectors with deep learning via word2vec's "skip-gram and CBOW models", using either hierarchical softmax or negative sampling [1]_ [2]_. -NOTE: There are more ways to get word vectors in Gensim than just Word2Vec. See wrappers for FastText, VarEmbed and WordRank. +NOTE: There are more ways to get word vectors in Gensim than just Word2Vec. +See wrappers for FastText, VarEmbed and WordRank. The training algorithms were originally ported from the C package https://code.google.com/p/word2vec/ and extended with additional functionality. -For a blog tutorial on gensim word2vec, with an interactive web app trained on GoogleNews, visit http://radimrehurek.com/2014/02/word2vec-tutorial/ +For a blog tutorial on gensim word2vec, with an interactive web app trained on GoogleNews, +visit http://radimrehurek.com/2014/02/word2vec-tutorial/ **Make sure you have a C compiler before installing gensim, to use optimized (compiled) word2vec training** (70x speedup compared to plain NumPy implementation [3]_). @@ -28,14 +30,17 @@ >>> model.save(fname) >>> model = Word2Vec.load(fname) # you can continue training with the loaded model! -The word vectors are stored in a KeyedVectors instance in model.wv. 
This separates the read-only word vector lookup operations in KeyedVectors from the training code in Word2Vec:: +The word vectors are stored in a KeyedVectors instance in model.wv. +This separates the read-only word vector lookup operations in KeyedVectors from the training code in Word2Vec:: >>> model.wv['computer'] # numpy vector of a word array([-0.00449447, -0.00310097, 0.02421786, ...], dtype=float32) -The word vectors can also be instantiated from an existing file on disk in the word2vec C format as a KeyedVectors instance:: +The word vectors can also be instantiated from an existing file on disk in the word2vec C format +as a KeyedVectors instance:: - NOTE: It is impossible to continue training the vectors loaded from the C format because hidden weights, vocabulary frequency and the binary tree is missing:: + NOTE: It is impossible to continue training the vectors loaded from the C format because hidden weights, + vocabulary frequency and the binary tree is missing:: >>> from gensim.models.keyedvectors import KeyedVectors >>> word_vectors = KeyedVectors.load_word2vec_format('/tmp/vectors.txt', binary=False) # C text format @@ -74,7 +79,8 @@ and so on. -If you're finished training a model (i.e. no more updates, only querying), then switch to the :mod:`gensim.models.KeyedVectors` instance in wv +If you're finished training a model (i.e. no more updates, only querying), +then switch to the :mod:`gensim.models.KeyedVectors` instance in wv >>> word_vectors = model.wv >>> del model @@ -88,9 +94,10 @@ >>> bigram_transformer = gensim.models.Phrases(sentences) >>> model = Word2Vec(bigram_transformer[sentences], size=100, ...) -.. [1] Tomas Mikolov, Kai Chen, Greg Corrado, and Jeffrey Dean. Efficient Estimation of Word Representations in Vector Space. In Proceedings of Workshop at ICLR, 2013. -.. [2] Tomas Mikolov, Ilya Sutskever, Kai Chen, Greg Corrado, and Jeffrey Dean. Distributed Representations of Words and Phrases and their Compositionality. - In Proceedings of NIPS, 2013. +.. [1] Tomas Mikolov, Kai Chen, Greg Corrado, and Jeffrey Dean. + Efficient Estimation of Word Representations in Vector Space. In Proceedings of Workshop at ICLR, 2013. +.. [2] Tomas Mikolov, Ilya Sutskever, Kai Chen, Greg Corrado, and Jeffrey Dean. + Distributed Representations of Words and Phrases and their Compositionality. In Proceedings of NIPS, 2013. .. [3] Optimizing word2vec in gensim, http://radimrehurek.com/2013/09/word2vec-in-python-part-two-optimizing/ """ from __future__ import division # py3 "true division" @@ -329,8 +336,8 @@ def train_sg_pair(model, word, context_index, alpha, learn_vectors=True, learn_h return neu1e -def train_cbow_pair(model, word, input_word_indices, l1, alpha, learn_vectors=True, learn_hidden=True, compute_loss=False, - context_vectors=None, context_locks=None, is_ft=False): +def train_cbow_pair(model, word, input_word_indices, l1, alpha, learn_vectors=True, learn_hidden=True, + compute_loss=False, context_vectors=None, context_locks=None, is_ft=False): if context_vectors is None: if is_ft: context_vectors_vocab = model.wv.syn0_vocab @@ -421,7 +428,8 @@ class Word2Vec(utils.SaveLoad): then switch to the :mod:`gensim.models.KeyedVectors` instance in wv The model can be stored/loaded via its `save()` and `load()` methods, or stored/loaded in a format - compatible with the original word2vec implementation via `wv.save_word2vec_format()` and `KeyedVectors.load_word2vec_format()`. 
+ compatible with the original word2vec implementation via `wv.save_word2vec_format()` + and `KeyedVectors.load_word2vec_format()`. """ @@ -540,7 +548,10 @@ def __init__(self, sentences=None, size=100, alpha=0.025, window=5, min_count=5, if isinstance(sentences, GeneratorType): raise TypeError("You can't pass a generator as the sentences argument. Try an iterator.") self.build_vocab(sentences, trim_rule=trim_rule) - self.train(sentences, total_examples=self.corpus_count, epochs=self.iter, start_alpha=self.alpha, end_alpha=self.min_alpha) + self.train( + sentences, total_examples=self.corpus_count, epochs=self.iter, + start_alpha=self.alpha, end_alpha=self.min_alpha + ) else: if trim_rule is not None: logger.warning( @@ -617,7 +628,8 @@ def build_vocab(self, sentences, keep_raw_vocab=False, trim_rule=None, progress_ Each sentence must be a list of unicode strings. """ self.scan_vocab(sentences, progress_per=progress_per, trim_rule=trim_rule) # initial survey - self.scale_vocab(keep_raw_vocab=keep_raw_vocab, trim_rule=trim_rule, update=update) # trim by min_count & precalculate downsampling + # trim by min_count & precalculate downsampling + self.scale_vocab(keep_raw_vocab=keep_raw_vocab, trim_rule=trim_rule, update=update) self.finalize_vocab(update=update) # build tables & arrays def build_vocab_from_freq(self, word_freq, keep_raw_vocab=False, corpus_count=None, trim_rule=None, update=False): @@ -652,16 +664,20 @@ def build_vocab_from_freq(self, word_freq, keep_raw_vocab=False, corpus_count=No >>> model.build_vocab_from_freq({"Word1": 15, "Word2": 20}) """ logger.info("Processing provided word frequencies") - raw_vocab = word_freq # Instead of scanning text, this will assign provided word frequencies dictionary(word_freq) to be directly the raw vocab + # Instead of scanning text, this will assign provided word frequencies dictionary(word_freq) + # to be directly the raw vocab + raw_vocab = word_freq logger.info( "collected %i different raw word, with total frequency of %i", len(raw_vocab), sum(itervalues(raw_vocab)) ) - self.corpus_count = corpus_count if corpus_count else 0 # Since no sentences are provided, this is to control the corpus_count + # Since no sentences are provided, this is to control the corpus_count + self.corpus_count = corpus_count if corpus_count else 0 self.raw_vocab = raw_vocab - self.scale_vocab(keep_raw_vocab=keep_raw_vocab, trim_rule=trim_rule, update=update) # trim by min_count & precalculate downsampling + # trim by min_count & precalculate downsampling + self.scale_vocab(keep_raw_vocab=keep_raw_vocab, trim_rule=trim_rule, update=update) self.finalize_vocab(update=update) # build tables & arrays def scan_vocab(self, sentences, progress_per=10000, trim_rule=None): @@ -926,7 +942,8 @@ def train(self, sentences, total_examples=None, total_words=None, logger.info( "training model with %i workers on %i vocabulary and %i features, " "using sg=%s hs=%s sample=%s negative=%s window=%s", - self.workers, len(self.wv.vocab), self.layer1_size, self.sg, self.hs, self.sample, self.negative, self.window + self.workers, len(self.wv.vocab), self.layer1_size, self.sg, + self.hs, self.sample, self.negative, self.window ) if not self.wv.vocab: @@ -936,7 +953,8 @@ def train(self, sentences, total_examples=None, total_words=None, if not hasattr(self, 'corpus_count'): raise ValueError( - "The number of sentences in the training corpus is missing. Did you load the model via KeyedVectors.load_word2vec_format?" + "The number of sentences in the training corpus is missing. 
" + "Did you load the model via KeyedVectors.load_word2vec_format?" "Models loaded via load_word2vec_format don't support further training. " "Instead start with a blank model, scan_vocab on the new corpus, " "intersect_word2vec_format with the old model, then train." @@ -1093,13 +1111,19 @@ def job_producer(): raw_word_count, trained_word_count, elapsed, trained_word_count / elapsed ) if job_tally < 10 * self.workers: - logger.warning("under 10 jobs per worker: consider setting a smaller `batch_words' for smoother alpha decay") + logger.warning( + "under 10 jobs per worker: consider setting a smaller `batch_words' for smoother alpha decay" + ) # check that the input corpus hasn't changed during iteration if total_examples and total_examples != example_count: - logger.warning("supplied example count (%i) did not equal expected count (%i)", example_count, total_examples) + logger.warning( + "supplied example count (%i) did not equal expected count (%i)", example_count, total_examples + ) if total_words and total_words != raw_word_count: - logger.warning("supplied raw word count (%i) did not equal expected count (%i)", raw_word_count, total_words) + logger.warning( + "supplied raw word count (%i) did not equal expected count (%i)", raw_word_count, total_words + ) self.train_count += 1 # number of times train() has been called self.total_train_time += elapsed @@ -1119,9 +1143,11 @@ def score(self, sentences, total_sentences=int(1e6), chunksize=100, queue_factor Note that you should specify total_sentences; we'll run into problems if you ask to score more than this number of sentences but it is inefficient to set the value too high. - See the article by [#taddy]_ and the gensim demo at [#deepir]_ for examples of how to use such scores in document classification. + See the article by [#taddy]_ and the gensim demo at [#deepir]_ for examples of + how to use such scores in document classification. - .. [#taddy] Taddy, Matt. Document Classification by Inversion of Distributed Language Representations, in Proceedings of the 2015 Conference of the Association of Computational Linguistics. + .. [#taddy] Taddy, Matt. Document Classification by Inversion of Distributed Language Representations, + in Proceedings of the 2015 Conference of the Association of Computational Linguistics. .. 
[#deepir] https://github.com/piskvorky/gensim/blob/develop/docs/notebooks/deepir.ipynb """ @@ -1410,7 +1436,8 @@ def n_similarity(self, ws1, ws2): return self.wv.n_similarity(ws1, ws2) def predict_output_word(self, context_words_list, topn=10): - """Report the probability distribution of the center word given the context words as input to the trained model.""" + """Report the probability distribution of the center word given the context words + as input to the trained model.""" if not self.negative: raise RuntimeError( "We have currently only implemented predict_output_word for the negative sampling scheme, " @@ -1434,7 +1461,8 @@ def predict_output_word(self, context_words_list, topn=10): prob_values = exp(dot(l1, self.syn1neg.T)) # propagate hidden -> output and take softmax to get probabilities prob_values /= sum(prob_values) top_indices = matutils.argsort(prob_values, topn=topn, reverse=True) - return [(self.wv.index2word[index1], prob_values[index1]) for index1 in top_indices] # returning the most probable output words with their probabilities + # returning the most probable output words with their probabilities + return [(self.wv.index2word[index1], prob_values[index1]) for index1 in top_indices] def init_sims(self, replace=False): """ @@ -1478,7 +1506,8 @@ def log_evaluate_word_pairs(pearson, spearman, oov, pairs): """ return KeyedVectors.log_evaluate_word_pairs(pearson, spearman, oov, pairs) - def evaluate_word_pairs(self, pairs, delimiter='\t', restrict_vocab=300000, case_insensitive=True, dummy4unknown=False): + def evaluate_word_pairs(self, pairs, delimiter='\t', restrict_vocab=300000, + case_insensitive=True, dummy4unknown=False): """ Deprecated. Use self.wv.evaluate_word_pairs() instead. Refer to the documentation for `gensim.models.KeyedVectors.evaluate_word_pairs` @@ -1486,7 +1515,9 @@ def evaluate_word_pairs(self, pairs, delimiter='\t', restrict_vocab=300000, case return self.wv.evaluate_word_pairs(pairs, delimiter, restrict_vocab, case_insensitive, dummy4unknown) def __str__(self): - return "%s(vocab=%s, size=%s, alpha=%s)" % (self.__class__.__name__, len(self.wv.index2word), self.vector_size, self.alpha) + return "%s(vocab=%s, size=%s, alpha=%s)" % ( + self.__class__.__name__, len(self.wv.index2word), self.vector_size, self.alpha + ) def _minimize_model(self, save_syn1=False, save_syn1neg=False, save_syn0_lockf=False): warnings.warn( @@ -1675,11 +1706,11 @@ class PathLineSentences(object): """ Works like word2vec.LineSentence, but will process all files in a directory in alphabetical order by filename. - The directory can only contain files that can be read by LineSentence: .bz2, .gz, and text files. Any file not ending - with .bz2 or .gz is assumed to be a text file. Does not work with subdirectories. + The directory can only contain files that can be read by LineSentence: .bz2, .gz, and text files. + Any file not ending with .bz2 or .gz is assumed to be a text file. Does not work with subdirectories. - The format of files (either text, or compressed text files) in the path is one sentence = one line, with words already - preprocessed and separated by whitespace. + The format of files (either text, or compressed text files) in the path is one sentence = one line, + with words already preprocessed and separated by whitespace. 
""" @@ -1726,7 +1757,8 @@ def __iter__(self): i += self.max_sentence_length -# Example: ./word2vec.py -train data.txt -output vec.txt -size 200 -window 5 -sample 1e-4 -negative 5 -hs 0 -binary 0 -cbow 1 -iter 3 +# Example: ./word2vec.py -train data.txt -output vec.txt -size 200 -window 5 -sample 1e-4 \ +# -negative 5 -hs 0 -binary 0 -cbow 1 -iter 3 if __name__ == "__main__": import argparse logging.basicConfig( @@ -1751,14 +1783,35 @@ def __iter__(self): parser.add_argument("-output", help="Use file OUTPUT to save the resulting word vectors") parser.add_argument("-window", help="Set max skip length WINDOW between words; default is 5", type=int, default=5) parser.add_argument("-size", help="Set size of word vectors; default is 100", type=int, default=100) - parser.add_argument("-sample", help="Set threshold for occurrence of words. Those that appear with higher frequency in the training data will be randomly down-sampled; default is 1e-3, useful range is (0, 1e-5)", type=float, default=1e-3) - parser.add_argument("-hs", help="Use Hierarchical Softmax; default is 0 (not used)", type=int, default=0, choices=[0, 1]) - parser.add_argument("-negative", help="Number of negative examples; default is 5, common values are 3 - 10 (0 = not used)", type=int, default=5) + parser.add_argument( + "-sample", + help="Set threshold for occurrence of words. " + "Those that appear with higher frequency in the training data will be randomly down-sampled;" + " default is 1e-3, useful range is (0, 1e-5)", + type=float, default=1e-3 + ) + parser.add_argument( + "-hs", help="Use Hierarchical Softmax; default is 0 (not used)", + type=int, default=0, choices=[0, 1] + ) + parser.add_argument( + "-negative", help="Number of negative examples; default is 5, common values are 3 - 10 (0 = not used)", + type=int, default=5 + ) parser.add_argument("-threads", help="Use THREADS threads (default 12)", type=int, default=12) parser.add_argument("-iter", help="Run more training iterations (default 5)", type=int, default=5) - parser.add_argument("-min_count", help="This will discard words that appear less than MIN_COUNT times; default is 5", type=int, default=5) - parser.add_argument("-cbow", help="Use the continuous bag of words model; default is 1 (use 0 for skip-gram model)", type=int, default=1, choices=[0, 1]) - parser.add_argument("-binary", help="Save the resulting vectors in binary mode; default is 0 (off)", type=int, default=0, choices=[0, 1]) + parser.add_argument( + "-min_count", help="This will discard words that appear less than MIN_COUNT times; default is 5", + type=int, default=5 + ) + parser.add_argument( + "-cbow", help="Use the continuous bag of words model; default is 1 (use 0 for skip-gram model)", + type=int, default=1, choices=[0, 1] + ) + parser.add_argument( + "-binary", help="Save the resulting vectors in binary mode; default is 0 (off)", + type=int, default=0, choices=[0, 1] + ) parser.add_argument("-accuracy", help="Use questions from file ACCURACY to evaluate the model") args = parser.parse_args() diff --git a/gensim/models/wrappers/dtmmodel.py b/gensim/models/wrappers/dtmmodel.py index 1f450a457a..8bbadfc663 100644 --- a/gensim/models/wrappers/dtmmodel.py +++ b/gensim/models/wrappers/dtmmodel.py @@ -13,7 +13,8 @@ Example: ->>> model = gensim.models.wrappers.DtmModel('dtm-win64.exe', my_corpus, my_timeslices, num_topics=20, id2word=dictionary) +>>> model = gensim.models.wrappers.DtmModel('dtm-win64.exe', my_corpus, my_timeslices, +... num_topics=20, id2word=dictionary) .. 
[1] https://github.com/magsilva/dtm/tree/master/bin @@ -320,7 +321,8 @@ def print_topic(self, topicid, time, topn=10, num_words=None): def dtm_vis(self, corpus, time): """ - returns term_frequency, vocab, doc_lengths, topic-term distributions and doc_topic distributions, specified by pyLDAvis format. + returns term_frequency, vocab, doc_lengths, topic-term distributions and doc_topic distributions, + specified by pyLDAvis format. all of these are needed to visualise topics for DTM for a particular time-slice via pyLDAvis. input parameter is the year to do the visualisation. """ diff --git a/gensim/models/wrappers/fasttext.py b/gensim/models/wrappers/fasttext.py index 39d512d6b7..29927b7051 100644 --- a/gensim/models/wrappers/fasttext.py +++ b/gensim/models/wrappers/fasttext.py @@ -123,7 +123,8 @@ def init_sims(self, replace=False): self.syn0_ngrams[i, :] /= sqrt((self.syn0_ngrams[i, :] ** 2).sum(-1)) self.syn0_ngrams_norm = self.syn0_ngrams else: - self.syn0_ngrams_norm = (self.syn0_ngrams / sqrt((self.syn0_ngrams ** 2).sum(-1))[..., newaxis]).astype(REAL) + self.syn0_ngrams_norm = \ + (self.syn0_ngrams / sqrt((self.syn0_ngrams ** 2).sum(-1))[..., newaxis]).astype(REAL) def __contains__(self, word): """ @@ -280,7 +281,8 @@ def load_model_params(self, file_handle): magic, version = self.struct_unpack(file_handle, '@2i') if magic == FASTTEXT_FILEFORMAT_MAGIC: # newer format self.new_format = True - dim, ws, epoch, min_count, neg, _, loss, model, bucket, minn, maxn, _, t = self.struct_unpack(file_handle, '@12i1d') + dim, ws, epoch, min_count, neg, _, loss, model, bucket, minn, maxn, _, t = \ + self.struct_unpack(file_handle, '@12i1d') else: # older format self.new_format = False dim = magic diff --git a/gensim/models/wrappers/ldamallet.py b/gensim/models/wrappers/ldamallet.py index e58c85f17a..7f39e12c5a 100644 --- a/gensim/models/wrappers/ldamallet.py +++ b/gensim/models/wrappers/ldamallet.py @@ -21,7 +21,8 @@ Example: - >>> model = gensim.models.wrappers.LdaMallet('/Users/kofola/mallet-2.0.7/bin/mallet', corpus=my_corpus, num_topics=20, id2word=dictionary) + >>> model = gensim.models.wrappers.LdaMallet('/Users/kofola/mallet-2.0.7/bin/mallet', + ... corpus=my_corpus, num_topics=20, id2word=dictionary) >>> print model[my_vector] # print LDA topics of a document .. [1] http://mallet.cs.umass.edu/ @@ -65,13 +66,16 @@ def __init__(self, mallet_path, corpus=None, num_topics=100, alpha=50, id2word=N `workers` is the number of threads, for parallel training. - `prefix` is the string prefix under which all data files will be stored; default: system temp + random filename prefix. + `prefix` is the string prefix under which all data files will be stored; + default: system temp + random filename prefix. - `optimize_interval` optimize hyperparameters every N iterations (sometimes leads to Java exception; 0 to switch off hyperparameter optimization). + `optimize_interval` optimize hyperparameters every N iterations (sometimes leads to Java exception; + 0 to switch off hyperparameter optimization). `iterations` is the number of sampling iterations. - `topic_threshold` is the threshold of the probability above which we consider a topic. This is basically for sparse topic distribution. + `topic_threshold` is the threshold of the probability above which we consider a topic. + This is basically for sparse topic distribution. 
""" self.mallet_path = mallet_path @@ -143,7 +147,10 @@ def convert_input(self, corpus, infer=False, serialize_corpus=True): self.corpus2mallet(corpus, fout) # convert the text file above into MALLET's internal format - cmd = self.mallet_path + " import-file --preserve-case --keep-sequence --remove-stopwords --token-regex \"\S+\" --input %s --output %s" + cmd = \ + self.mallet_path + \ + " import-file --preserve-case --keep-sequence " \ + "--remove-stopwords --token-regex \"\S+\" --input %s --output %s" if infer: cmd += ' --use-pipe-from ' + self.fcorpusmallet() cmd = cmd % (self.fcorpustxt(), self.fcorpusmallet() + '.infer') @@ -167,7 +174,8 @@ def train(self, corpus): check_output(args=cmd, shell=True) self.word_topics = self.load_word_topics() # NOTE - we are still keeping the wordtopics variable to not break backward compatibility. - # word_topics has replaced wordtopics throughout the code; wordtopics just stores the values of word_topics when train is called. + # word_topics has replaced wordtopics throughout the code; + # wordtopics just stores the values of word_topics when train is called. self.wordtopics = self.word_topics def __getitem__(self, bow, iterations=100): @@ -177,7 +185,9 @@ def __getitem__(self, bow, iterations=100): bow = [bow] self.convert_input(bow, infer=True) - cmd = self.mallet_path + ' infer-topics --input %s --inferencer %s --output-doc-topics %s --num-iterations %s --doc-topics-threshold %s' + cmd = \ + self.mallet_path + ' infer-topics --input %s --inferencer %s ' \ + '--output-doc-topics %s --num-iterations %s --doc-topics-threshold %s' cmd = cmd % ( self.fcorpusmallet() + '.infer', self.finferencer(), self.fdoctopics() + '.infer', iterations, self.topic_threshold @@ -239,7 +249,8 @@ def show_topics(self, num_topics=10, num_words=10, log=False, formatted=True): chosen_topics = range(num_topics) else: num_topics = min(num_topics, self.num_topics) - sort_alpha = self.alpha + 0.0001 * numpy.random.rand(len(self.alpha)) # add a little random jitter, to randomize results around the same alpha + # add a little random jitter, to randomize results around the same alpha + sort_alpha = self.alpha + 0.0001 * numpy.random.rand(len(self.alpha)) sorted_topics = list(matutils.argsort(sort_alpha)) chosen_topics = sorted_topics[: num_topics // 2] + sorted_topics[-num_topics // 2:] shown = [] @@ -308,19 +319,25 @@ def read_doctopics(self, fname, eps=1e-6, renorm=True): # the MALLET doctopic format changed in 2.0.8 to exclude the id, # this handles the file differently dependent on the pattern if len(parts) == 2 * self.num_topics: - doc = [(int(id_), float(weight)) for id_, weight in zip(*[iter(parts)] * 2) if abs(float(weight)) > eps] + doc = [ + (int(id_), float(weight)) for id_, weight in zip(*[iter(parts)] * 2) + if abs(float(weight)) > eps + ] elif len(parts) == self.num_topics and mallet_version != '2.0.7': doc = [(id_, float(weight)) for id_, weight in enumerate(parts) if abs(float(weight)) > eps] else: if mallet_version == "2.0.7": """ - 1 1 0 1.0780612802674239 30.005575655428533364 2 0.005575655428533364 1 0.005575655428533364 - 2 2 0 0.9184413079632608 40.009062076892971008 3 0.009062076892971008 2 0.009062076892971008 1 0.009062076892971008 - In the above example there is a mix of the above if and elif statement. There are neither `2*num_topics` nor `num_topics` elements. - It has 2 formats 40.009062076892971008 and 0 1.0780612802674239 which cannot be handled by above if elif. 
-                        Also, there are some topics are missing(meaning that the topic is not there) which is another reason why the above if elif
-                        fails even when the `mallet` produces the right results
+                        1 1 0 1.0780612802674239 30.005575655428533364 2 0.005575655428533364
+                        2 2 0 0.9184413079632608 40.009062076892971008 3 0.009062076892971008
+                        In the above example there is a mix of the above if and elif statements.
+                        There are neither `2*num_topics` nor `num_topics` elements.
+                        It has two formats, 40.009062076892971008 and 0 1.0780612802674239,
+                        which cannot be handled by the above if/elif.
+                        Also, some topics are missing (meaning that the topic is not there),
+                        which is another reason why the above if/elif fails even when `mallet`
+                        produces the right results.
                         """
                         count = 0
diff --git a/gensim/models/wrappers/varembed.py b/gensim/models/wrappers/varembed.py
index 30bf859ec7..1dae20271d 100644
--- a/gensim/models/wrappers/varembed.py
+++ b/gensim/models/wrappers/varembed.py
@@ -5,7 +5,8 @@
 # Copyright (C) 2017 Radim Rehurek

 """
-Python wrapper around word representation learning from Varembed models, a library for efficient learning of word representations
+Python wrapper around word representation learning from Varembed models,
+a library for efficient learning of word representations
 and sentence classification [1].

 This module allows ability to obtain word vectors for out-of-vocabulary words, for the Varembed model[2].
diff --git a/gensim/models/wrappers/wordrank.py b/gensim/models/wrappers/wordrank.py
index c31cd28adc..077e5a29ec 100644
--- a/gensim/models/wrappers/wordrank.py
+++ b/gensim/models/wrappers/wordrank.py
@@ -54,18 +54,21 @@ def train(cls, wr_path, corpus_file, out_name, size=100, window=15, symmetric=1,
         `wr_path` is the absolute path to the Wordrank directory.
         `corpus_file` is the filename of the text file to be used for training the Wordrank model.
             Expects file to contain space-separated tokens in a single line
-        `out_name` is name of the directory which will be created (in wordrank folder) to save embeddings and training data.
+        `out_name` is the name of the directory which will be created (in the wordrank folder)
+            to save embeddings and training data.
             It will contain following contents:

                 Word Embeddings saved after every dump_period and stored in a file model_word_current\ iter.txt
                 Context Embeddings saved after every dump_period and stored in a file model_context_current\ iter.txt
-                A meta directory which contain: 'vocab.txt' - vocab words, 'wiki.toy' - word-word coccurence values, 'meta' - vocab and coccurence lengths
+                A meta directory which contains: 'vocab.txt' - vocab words,
+                    'wiki.toy' - word-word cooccurrence values, 'meta' - vocab and cooccurrence lengths

         `size` is the dimensionality of the feature vectors.
         `window` is the number of context words to the left (and to the right, if symmetric = 1).
         `symmetric` if 0, only use left context words, else use left and right both.
         `min_count` = ignore all words with total frequency lower than this.
-        `max_vocab_size` upper bound on vocabulary size, i.e. keep the most frequent words. Default is 0 for no limit.
+        `max_vocab_size` upper bound on vocabulary size, i.e. keep the most frequent words.
+            Default is 0 for no limit.
         `sgd_num` number of SGD taken for each data point.
         `lrate` is the learning rate (too high diverges, give Nan).
         `period` is the period of xi variable updates
@@ -78,7 +81,8 @@ def train(cls, wr_path, corpus_file, out_name, size=100, window=15, symmetric=1,
         `loss` = name of the loss (logistic, hinge).
         `memory` = soft limit for memory consumption, in GB.
         `np` number of copies to execute. (mpirun option)
-        `cleanup_files` if True, delete directory and files used by this wrapper, setting to False can be useful for debugging
+        `cleanup_files` if True, delete the directory and files used by this wrapper;
+            setting it to False can be useful for debugging
         `sorted_vocab` = if 1 (default), sort the vocabulary by descending frequency before assigning word indexes.
         `ensemble` = 0 (default), use ensemble of word and context vectors
         """
@@ -140,8 +144,10 @@ def train(cls, wr_path, corpus_file, out_name, size=100, window=15, symmetric=1,
                 iter += 1
             else:
                 logger.warning(
-                    "Resultant embedding will be from %d iterations rather than the input %d iterations, as wordrank dumps the embedding only at dump_period intervals. "
-                    "Input an appropriate combination of parameters (iter, dump_period) such that \"iter mod dump_period\" is zero.",
+                    "Resultant embedding will be from %d iterations rather than the input %d iterations, "
+                    "as wordrank dumps the embedding only at dump_period intervals. "
+                    "Input an appropriate combination of parameters (iter, dump_period) "
+                    "such that \"iter mod dump_period\" is zero.",
                     iter - (iter % dump_period), iter
                 )
diff --git a/gensim/scripts/glove2word2vec.py b/gensim/scripts/glove2word2vec.py
index 88574acee0..30f62c9b11 100644
--- a/gensim/scripts/glove2word2vec.py
+++ b/gensim/scripts/glove2word2vec.py
@@ -56,7 +56,9 @@ def glove2word2vec(glove_input_file, word2vec_output_file):

     parser = argparse.ArgumentParser()
     parser.add_argument("-i", "--input", required=True, help="Input file, in gloVe format (read-only).")
-    parser.add_argument("-o", "--output", required=True, help="Output file, in word2vec text format (will be overwritten).")
+    parser.add_argument(
+        "-o", "--output", required=True, help="Output file, in word2vec text format (will be overwritten)."
+    )
     args = parser.parse_args()

     # do the actual conversion
diff --git a/gensim/scripts/make_wiki_online.py b/gensim/scripts/make_wiki_online.py
index 37c437f3e1..0ec9704724 100755
--- a/gensim/scripts/make_wiki_online.py
+++ b/gensim/scripts/make_wiki_online.py
@@ -31,7 +31,8 @@
 lemmatization to get a lemma of each token (instead of plain alphabetic
 tokenizer). The package is available at https://github.com/clips/pattern .
-Example: python -m gensim.scripts.make_wikicorpus ~/gensim/results/enwiki-latest-pages-articles.xml.bz2 ~/gensim/results/wiki_en +Example: + python -m gensim.scripts.make_wikicorpus ~/gensim/results/enwiki-latest-pages-articles.xml.bz2 ~/gensim/results/wiki """ @@ -78,7 +79,8 @@ dictionary = HashDictionary(id_range=keep_words, debug=debug) dictionary.allow_update = True # start collecting document frequencies wiki = WikiCorpus(inp, lemmatize=lemmatize, dictionary=dictionary) - MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000) # ~4h on my macbook pro without lemmatization, 3.1m articles (august 2012) + # ~4h on my macbook pro without lemmatization, 3.1m articles (august 2012) + MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000) # with HashDictionary, the token->id mapping is only fully instantiated now, after `serialize` dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE) dictionary.save_as_text(outp + '_wordids.txt.bz2') diff --git a/gensim/scripts/make_wiki_online_lemma.py b/gensim/scripts/make_wiki_online_lemma.py index 37c437f3e1..0ec9704724 100755 --- a/gensim/scripts/make_wiki_online_lemma.py +++ b/gensim/scripts/make_wiki_online_lemma.py @@ -31,7 +31,8 @@ lemmatization to get a lemma of each token (instead of plain alphabetic tokenizer). The package is available at https://github.com/clips/pattern . -Example: python -m gensim.scripts.make_wikicorpus ~/gensim/results/enwiki-latest-pages-articles.xml.bz2 ~/gensim/results/wiki_en +Example: + python -m gensim.scripts.make_wikicorpus ~/gensim/results/enwiki-latest-pages-articles.xml.bz2 ~/gensim/results/wiki """ @@ -78,7 +79,8 @@ dictionary = HashDictionary(id_range=keep_words, debug=debug) dictionary.allow_update = True # start collecting document frequencies wiki = WikiCorpus(inp, lemmatize=lemmatize, dictionary=dictionary) - MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000) # ~4h on my macbook pro without lemmatization, 3.1m articles (august 2012) + # ~4h on my macbook pro without lemmatization, 3.1m articles (august 2012) + MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000) # with HashDictionary, the token->id mapping is only fully instantiated now, after `serialize` dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE) dictionary.save_as_text(outp + '_wordids.txt.bz2') diff --git a/gensim/scripts/make_wiki_online_nodebug.py b/gensim/scripts/make_wiki_online_nodebug.py index 37c437f3e1..0ec9704724 100755 --- a/gensim/scripts/make_wiki_online_nodebug.py +++ b/gensim/scripts/make_wiki_online_nodebug.py @@ -31,7 +31,8 @@ lemmatization to get a lemma of each token (instead of plain alphabetic tokenizer). The package is available at https://github.com/clips/pattern . 
-Example: python -m gensim.scripts.make_wikicorpus ~/gensim/results/enwiki-latest-pages-articles.xml.bz2 ~/gensim/results/wiki_en +Example: + python -m gensim.scripts.make_wikicorpus ~/gensim/results/enwiki-latest-pages-articles.xml.bz2 ~/gensim/results/wiki """ @@ -78,7 +79,8 @@ dictionary = HashDictionary(id_range=keep_words, debug=debug) dictionary.allow_update = True # start collecting document frequencies wiki = WikiCorpus(inp, lemmatize=lemmatize, dictionary=dictionary) - MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000) # ~4h on my macbook pro without lemmatization, 3.1m articles (august 2012) + # ~4h on my macbook pro without lemmatization, 3.1m articles (august 2012) + MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000) # with HashDictionary, the token->id mapping is only fully instantiated now, after `serialize` dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE) dictionary.save_as_text(outp + '_wordids.txt.bz2') diff --git a/gensim/scripts/make_wikicorpus.py b/gensim/scripts/make_wikicorpus.py index 37c437f3e1..0ec9704724 100755 --- a/gensim/scripts/make_wikicorpus.py +++ b/gensim/scripts/make_wikicorpus.py @@ -31,7 +31,8 @@ lemmatization to get a lemma of each token (instead of plain alphabetic tokenizer). The package is available at https://github.com/clips/pattern . -Example: python -m gensim.scripts.make_wikicorpus ~/gensim/results/enwiki-latest-pages-articles.xml.bz2 ~/gensim/results/wiki_en +Example: + python -m gensim.scripts.make_wikicorpus ~/gensim/results/enwiki-latest-pages-articles.xml.bz2 ~/gensim/results/wiki """ @@ -78,7 +79,8 @@ dictionary = HashDictionary(id_range=keep_words, debug=debug) dictionary.allow_update = True # start collecting document frequencies wiki = WikiCorpus(inp, lemmatize=lemmatize, dictionary=dictionary) - MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000) # ~4h on my macbook pro without lemmatization, 3.1m articles (august 2012) + # ~4h on my macbook pro without lemmatization, 3.1m articles (august 2012) + MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000) # with HashDictionary, the token->id mapping is only fully instantiated now, after `serialize` dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE) dictionary.save_as_text(outp + '_wordids.txt.bz2') diff --git a/gensim/scripts/word2vec2tensor.py b/gensim/scripts/word2vec2tensor.py index f8878aed79..c445253b26 100644 --- a/gensim/scripts/word2vec2tensor.py +++ b/gensim/scripts/word2vec2tensor.py @@ -6,7 +6,8 @@ # Copyright (C) 2016 Radim Rehurek """ -USAGE: $ python -m gensim.scripts.word2vec2tensor --input --output [--binary] +USAGE: $ python -m gensim.scripts.word2vec2tensor --input --output \ + [--binary] Where: @@ -18,7 +19,8 @@ The script will create two TSV files. A 2d tensor format file, and a Word Embedding metadata file. Both files will use the --output file name as prefix. -This script is used to convert the word2vec format to Tensorflow 2D tensor and metadata formats for Embedding Visualization +This script is used to convert the word2vec format to Tensorflow 2D tensor +and metadata formats for Embedding Visualization. To use the generated TSV 2D tensor and metadata file in the Projector Visualizer, please 1) Open http://projector.tensorflow.org/. 
@@ -73,7 +75,9 @@ def word2vec2tensor(word2vec_model_path, tensor_filename, binary=False): parser = argparse.ArgumentParser() parser.add_argument("-i", "--input", required=True, help="Input word2vec model") parser.add_argument("-o", "--output", required=True, help="Output tensor file name prefix") - parser.add_argument("-b", "--binary", required=False, help="If word2vec model in binary format, set True, else False") + parser.add_argument( + "-b", "--binary", required=False, help="If word2vec model in binary format, set True, else False" + ) args = parser.parse_args() word2vec2tensor(args.input, args.output, args.binary) diff --git a/gensim/scripts/word2vec_standalone.py b/gensim/scripts/word2vec_standalone.py index 878e588613..57f4d907ba 100644 --- a/gensim/scripts/word2vec_standalone.py +++ b/gensim/scripts/word2vec_standalone.py @@ -43,9 +43,11 @@ Use the continuous bag of words model; default is 1 (use 0 for skip-gram model) -accuracy Compute accuracy of the resulting model analogical inference power on questions file - See an example of questions file at https://code.google.com/p/word2vec/source/browse/trunk/questions-words.txt + See an example of questions file + at https://code.google.com/p/word2vec/source/browse/trunk/questions-words.txt -Example: python -m gensim.scripts.word2vec_standalone -train data.txt -output vec.txt -size 200 -sample 1e-4 -binary 0 -iter 3 +Example: python -m gensim.scripts.word2vec_standalone -train data.txt \ + -output vec.txt -size 200 -sample 1e-4 -binary 0 -iter 3 """ @@ -70,16 +72,38 @@ parser.add_argument("-output", help="Use file OUTPUT to save the resulting word vectors") parser.add_argument("-window", help="Set max skip length WINDOW between words; default is 5", type=int, default=5) parser.add_argument("-size", help="Set size of word vectors; default is 100", type=int, default=100) - parser.add_argument("-sample", help="Set threshold for occurrence of words. Those that appear with higher frequency in the training data will be randomly down-sampled; " - "default is 1e-3, useful range is (0, 1e-5)", type=float, default=1e-3) - parser.add_argument("-hs", help="Use Hierarchical Softmax; default is 0 (not used)", type=int, default=0, choices=[0, 1]) - parser.add_argument("-negative", help="Number of negative examples; default is 5, common values are 3 - 10 (0 = not used)", type=int, default=5) + parser.add_argument( + "-sample", + help="Set threshold for occurrence of words. 
" + "Those that appear with higher frequency in the training data will be randomly down-sampled; " + "default is 1e-3, useful range is (0, 1e-5)", + type=float, default=1e-3) + parser.add_argument( + "-hs", help="Use Hierarchical Softmax; default is 0 (not used)", + type=int, default=0, choices=[0, 1] + ) + parser.add_argument( + "-negative", help="Number of negative examples; default is 5, common values are 3 - 10 (0 = not used)", + type=int, default=5 + ) parser.add_argument("-threads", help="Use THREADS threads (default 3)", type=int, default=3) parser.add_argument("-iter", help="Run more training iterations (default 5)", type=int, default=5) - parser.add_argument("-min_count", help="This will discard words that appear less than MIN_COUNT times; default is 5", type=int, default=5) - parser.add_argument("-alpha", help="Set the starting learning rate; default is 0.025 for skip-gram and 0.05 for CBOW", type=float) - parser.add_argument("-cbow", help="Use the continuous bag of words model; default is 1 (use 0 for skip-gram model)", type=int, default=1, choices=[0, 1]) - parser.add_argument("-binary", help="Save the resulting vectors in binary mode; default is 0 (off)", type=int, default=0, choices=[0, 1]) + parser.add_argument( + "-min_count", help="This will discard words that appear less than MIN_COUNT times; default is 5", + type=int, default=5 + ) + parser.add_argument( + "-alpha", help="Set the starting learning rate; default is 0.025 for skip-gram and 0.05 for CBOW", + type=float + ) + parser.add_argument( + "-cbow", help="Use the continuous bag of words model; default is 1 (use 0 for skip-gram model)", + type=int, default=1, choices=[0, 1] + ) + parser.add_argument( + "-binary", help="Save the resulting vectors in binary mode; default is 0 (off)", + type=int, default=0, choices=[0, 1] + ) parser.add_argument("-accuracy", help="Use questions from file ACCURACY to evaluate the model") args = parser.parse_args() diff --git a/gensim/similarities/docsim.py b/gensim/similarities/docsim.py index 20c9f8518c..6016e32f49 100755 --- a/gensim/similarities/docsim.py +++ b/gensim/similarities/docsim.py @@ -486,7 +486,9 @@ class for description of the other parameters. """ if num_features is None: - logger.warning("scanning corpus to determine the number of features (consider setting `num_features` explicitly)") + logger.warning( + "scanning corpus to determine the number of features (consider setting `num_features` explicitly)" + ) num_features = 1 + utils.get_max_id(corpus) self.num_features = num_features @@ -577,7 +579,8 @@ class WmdSimilarity(interfaces.SimilarityABC): .. Matt Kusner et al. "From Word Embeddings To Document Distances". Example: - # See Tutorial Notebook for more examples https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/WMD_tutorial.ipynb + # See Tutorial Notebook for more examples + https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/WMD_tutorial.ipynb >>> # Given a document collection "corpus", train word2vec model. 
>>> model = word2vec(corpus) >>> instance = WmdSimilarity(corpus, model, num_best=10) diff --git a/gensim/sklearn_api/atmodel.py b/gensim/sklearn_api/atmodel.py index 8845bdd816..792e950491 100644 --- a/gensim/sklearn_api/atmodel.py +++ b/gensim/sklearn_api/atmodel.py @@ -78,7 +78,8 @@ def transform(self, author_names): if not isinstance(author_names, list): author_names = [author_names] - # returning dense representation for compatibility with sklearn but we should go back to sparse representation in the future + # returning dense representation for compatibility with sklearn + # but we should go back to sparse representation in the future topics = [matutils.sparse2full(self.gensim_model[author_name], self.num_topics) for author_name in author_names] return np.reshape(np.array(topics), (len(author_names), self.num_topics)) diff --git a/gensim/sklearn_api/d2vmodel.py b/gensim/sklearn_api/d2vmodel.py index 245231ad45..f3d0bcbdb4 100644 --- a/gensim/sklearn_api/d2vmodel.py +++ b/gensim/sklearn_api/d2vmodel.py @@ -78,7 +78,9 @@ def fit(self, X, y=None): def transform(self, docs): """ Return the vector representations for the input documents. - The input `docs` should be a list of lists like : [ ['calculus', 'mathematical'], ['geometry', 'operations', 'curves'] ] + The input `docs` should be a list of lists like + [['calculus', 'mathematical'], + ['geometry', 'operations', 'curves']] or a single document like : ['calculus', 'mathematical'] """ if self.gensim_model is None: diff --git a/gensim/sklearn_api/hdp.py b/gensim/sklearn_api/hdp.py index 41b3412972..80bb13e19d 100644 --- a/gensim/sklearn_api/hdp.py +++ b/gensim/sklearn_api/hdp.py @@ -68,7 +68,9 @@ def transform(self, docs): Takes a list of documents as input ('docs'). Returns a matrix of topic distribution for the given document bow, where a_ij indicates (topic_i, topic_probability_j). - The input `docs` should be in BOW format and can be a list of documents like : [ [(4, 1), (7, 1)], [(9, 1), (13, 1)], [(2, 1), (6, 1)] ] + The input `docs` should be in BOW format and can be a list of documents like + [[(4, 1), (7, 1)], + [(9, 1), (13, 1)], [(2, 1), (6, 1)]] or a single document like : [(4, 1), (7, 1)] """ if self.gensim_model is None: @@ -86,7 +88,8 @@ def transform(self, docs): distribution.append(topicd) max_num_topics = max(max_num_topics, max(topic[0] for topic in topicd) + 1) - # returning dense representation for compatibility with sklearn but we should go back to sparse representation in the future + # returning dense representation for compatibility with sklearn + # but we should go back to sparse representation in the future distribution = [matutils.sparse2full(t, max_num_topics) for t in distribution] return np.reshape(np.array(distribution), (len(docs), max_num_topics)) diff --git a/gensim/sklearn_api/ldamodel.py b/gensim/sklearn_api/ldamodel.py index ba2ab62ef5..178a52c571 100644 --- a/gensim/sklearn_api/ldamodel.py +++ b/gensim/sklearn_api/ldamodel.py @@ -77,16 +77,21 @@ def transform(self, docs): Takes a list of documents as input ('docs'). Returns a matrix of topic distribution for the given document bow, where a_ij indicates (topic_i, topic_probability_j). 
- The input `docs` should be in BOW format and can be a list of documents like : [ [(4, 1), (7, 1)], [(9, 1), (13, 1)], [(2, 1), (6, 1)] ] + The input `docs` should be in BOW format and can be a list of documents like + [[(4, 1), (7, 1)], + [(9, 1), (13, 1)], [(2, 1), (6, 1)]] or a single document like : [(4, 1), (7, 1)] """ if self.gensim_model is None: - raise NotFittedError("This model has not been fitted yet. Call 'fit' with appropriate arguments before using this method.") + raise NotFittedError( + "This model has not been fitted yet. Call 'fit' with appropriate arguments before using this method." + ) # The input as array of array if isinstance(docs[0], tuple): docs = [docs] - # returning dense representation for compatibility with sklearn but we should go back to sparse representation in the future + # returning dense representation for compatibility with sklearn + # but we should go back to sparse representation in the future distribution = [matutils.sparse2full(self.gensim_model[doc], self.num_topics) for doc in docs] return np.reshape(np.array(distribution), (len(docs), self.num_topics)) @@ -124,8 +129,9 @@ def score(self, X, y=None): if self.scorer == 'perplexity': corpus_words = sum(cnt for document in X for _, cnt in document) subsample_ratio = 1.0 - perwordbound = self.gensim_model.bound(X, subsample_ratio=subsample_ratio) / (subsample_ratio * corpus_words) - return -1 * np.exp2(-perwordbound) # returning (-1*perplexity) to select model with minimum perplexity value + perwordbound = \ + self.gensim_model.bound(X, subsample_ratio=subsample_ratio) / (subsample_ratio * corpus_words) + return -1 * np.exp2(-perwordbound) # returning (-1*perplexity) to select model with minimum value elif self.scorer == 'u_mass': goodcm = models.CoherenceModel(model=self.gensim_model, corpus=X, coherence=self.scorer, topn=3) return goodcm.get_coherence() diff --git a/gensim/sklearn_api/ldaseqmodel.py b/gensim/sklearn_api/ldaseqmodel.py index 2c5d0879d4..1328e22af1 100644 --- a/gensim/sklearn_api/ldaseqmodel.py +++ b/gensim/sklearn_api/ldaseqmodel.py @@ -62,11 +62,15 @@ def fit(self, X, y=None): def transform(self, docs): """ Return the topic proportions for the documents passed. - The input `docs` should be in BOW format and can be a list of documents like : [ [(4, 1), (7, 1)], [(9, 1), (13, 1)], [(2, 1), (6, 1)] ] + The input `docs` should be in BOW format and can be a list of documents like + [[(4, 1), (7, 1)], + [(9, 1), (13, 1)], [(2, 1), (6, 1)]] or a single document like : [(4, 1), (7, 1)] """ if self.gensim_model is None: - raise NotFittedError("This model has not been fitted yet. Call 'fit' with appropriate arguments before using this method.") + raise NotFittedError( + "This model has not been fitted yet. Call 'fit' with appropriate arguments before using this method." + ) # The input as array of array if isinstance(docs[0], tuple): diff --git a/gensim/sklearn_api/lsimodel.py b/gensim/sklearn_api/lsimodel.py index 87d813d768..7034df7da6 100644 --- a/gensim/sklearn_api/lsimodel.py +++ b/gensim/sklearn_api/lsimodel.py @@ -24,7 +24,8 @@ class LsiTransformer(TransformerMixin, BaseEstimator): Base LSI module """ - def __init__(self, num_topics=200, id2word=None, chunksize=20000, decay=1.0, onepass=True, power_iters=2, extra_samples=100): + def __init__(self, num_topics=200, id2word=None, chunksize=20000, + decay=1.0, onepass=True, power_iters=2, extra_samples=100): """ Sklearn wrapper for LSI model. See gensim.model.LsiModel for parameter details. 
""" @@ -58,7 +59,9 @@ def transform(self, docs): Takes a list of documents as input ('docs'). Returns a matrix of topic distribution for the given document bow, where a_ij indicates (topic_i, topic_probability_j). - The input `docs` should be in BOW format and can be a list of documents like : [ [(4, 1), (7, 1)], [(9, 1), (13, 1)], [(2, 1), (6, 1)] ] + The input `docs` should be in BOW format and can be a list of documents like + [[(4, 1), (7, 1)], + [(9, 1), (13, 1)], [(2, 1), (6, 1)]] or a single document like : [(4, 1), (7, 1)] """ if self.gensim_model is None: @@ -69,7 +72,8 @@ def transform(self, docs): # The input as array of array if isinstance(docs[0], tuple): docs = [docs] - # returning dense representation for compatibility with sklearn but we should go back to sparse representation in the future + # returning dense representation for compatibility with sklearn + # but we should go back to sparse representation in the future distribution = [matutils.sparse2full(self.gensim_model[doc], self.num_topics) for doc in docs] return np.reshape(np.array(distribution), (len(docs), self.num_topics)) diff --git a/gensim/sklearn_api/phrases.py b/gensim/sklearn_api/phrases.py index fcd7d4c5f1..c64b809bb7 100644 --- a/gensim/sklearn_api/phrases.py +++ b/gensim/sklearn_api/phrases.py @@ -22,7 +22,7 @@ class PhrasesTransformer(TransformerMixin, BaseEstimator): """ def __init__(self, min_count=5, threshold=10.0, max_vocab_size=40000000, - delimiter=b'_', progress_per=10000, scoring='default'): + delimiter=b'_', progress_per=10000, scoring='default'): """ Sklearn wrapper for Phrases model. """ @@ -38,8 +38,11 @@ def fit(self, X, y=None): """ Fit the model according to the given training data. """ - self.gensim_model = models.Phrases(sentences=X, min_count=self.min_count, threshold=self.threshold, - max_vocab_size=self.max_vocab_size, delimiter=self.delimiter, progress_per=self.progress_per, scoring=self.scoring) + self.gensim_model = models.Phrases( + sentences=X, min_count=self.min_count, threshold=self.threshold, + max_vocab_size=self.max_vocab_size, delimiter=self.delimiter, + progress_per=self.progress_per, scoring=self.scoring + ) return self def transform(self, docs): @@ -47,7 +50,9 @@ def transform(self, docs): Return the input documents to return phrase tokens. """ if self.gensim_model is None: - raise NotFittedError("This model has not been fitted yet. Call 'fit' with appropriate arguments before using this method.") + raise NotFittedError( + "This model has not been fitted yet. Call 'fit' with appropriate arguments before using this method." + ) # input as python lists if isinstance(docs[0], string_types): @@ -56,8 +61,11 @@ def transform(self, docs): def partial_fit(self, X): if self.gensim_model is None: - self.gensim_model = models.Phrases(sentences=X, min_count=self.min_count, threshold=self.threshold, - max_vocab_size=self.max_vocab_size, delimiter=self.delimiter, progress_per=self.progress_per, scoring=self.scoring) + self.gensim_model = models.Phrases( + sentences=X, min_count=self.min_count, threshold=self.threshold, + max_vocab_size=self.max_vocab_size, delimiter=self.delimiter, + progress_per=self.progress_per, scoring=self.scoring + ) self.gensim_model.add_vocab(X) return self diff --git a/gensim/sklearn_api/rpmodel.py b/gensim/sklearn_api/rpmodel.py index 59d4c87a45..c2f50f5d0f 100644 --- a/gensim/sklearn_api/rpmodel.py +++ b/gensim/sklearn_api/rpmodel.py @@ -43,7 +43,9 @@ def transform(self, docs): """ Take documents/corpus as input. 
Return RP representation of the input documents/corpus. - The input `docs` can correspond to multiple documents like : [ [(0, 1.0), (1, 1.0), (2, 1.0)], [(0, 1.0), (3, 1.0), (4, 1.0), (5, 1.0), (6, 1.0), (7, 1.0)] ] + The input `docs` can correspond to multiple documents like + [[(0, 1.0), (1, 1.0), (2, 1.0)], + [(0, 1.0), (3, 1.0), (4, 1.0), (5, 1.0), (6, 1.0), (7, 1.0)]] or a single document like : [(0, 1.0), (1, 1.0), (2, 1.0)] """ if self.gensim_model is None: @@ -54,6 +56,7 @@ def transform(self, docs): # The input as array of array if isinstance(docs[0], tuple): docs = [docs] - # returning dense representation for compatibility with sklearn but we should go back to sparse representation in the future + # returning dense representation for compatibility with sklearn + # but we should go back to sparse representation in the future presentation = [matutils.sparse2full(self.gensim_model[doc], self.num_topics) for doc in docs] return np.reshape(np.array(presentation), (len(docs), self.num_topics)) diff --git a/gensim/sklearn_api/tfidf.py b/gensim/sklearn_api/tfidf.py index 7952d11e75..c0a45f1823 100644 --- a/gensim/sklearn_api/tfidf.py +++ b/gensim/sklearn_api/tfidf.py @@ -37,8 +37,10 @@ def fit(self, X, y=None): """ Fit the model according to the given training data. """ - self.gensim_model = TfidfModel(corpus=X, id2word=self.id2word, dictionary=self.dictionary, - wlocal=self.wlocal, wglobal=self.wglobal, normalize=self.normalize) + self.gensim_model = TfidfModel( + corpus=X, id2word=self.id2word, dictionary=self.dictionary, + wlocal=self.wlocal, wglobal=self.wglobal, normalize=self.normalize + ) return self def transform(self, docs): diff --git a/gensim/summarization/summarizer.py b/gensim/summarization/summarizer.py index 2e2d4ed45e..1b1251b1d7 100644 --- a/gensim/summarization/summarizer.py +++ b/gensim/summarization/summarizer.py @@ -115,7 +115,9 @@ def _extract_important_sentences(sentences, corpus, important_docs, word_count): # If no "word_count" option is provided, the number of sentences is # reduced by the provided ratio. Else, the ratio is ignored. - return important_sentences if word_count is None else _get_sentences_with_word_count(important_sentences, word_count) + return important_sentences \ + if word_count is None \ + else _get_sentences_with_word_count(important_sentences, word_count) def _format_results(extracted_sentences, split): diff --git a/gensim/test/test_atmodel.py b/gensim/test/test_atmodel.py index f1d68ae7a7..cd1293e6fa 100644 --- a/gensim/test/test_atmodel.py +++ b/gensim/test/test_atmodel.py @@ -90,7 +90,8 @@ def testTransform(self): # output of the model slightly. vec = matutils.sparse2full(jill_topics, 2) # convert to dense vector, for easier equality tests expected = [0.91, 0.08] - passed = np.allclose(sorted(vec), sorted(expected), atol=1e-1) # must contain the same values, up to re-ordering + # must contain the same values, up to re-ordering + passed = np.allclose(sorted(vec), sorted(expected), atol=1e-1) if passed: break logging.warning( @@ -240,7 +241,8 @@ def testTransformSerialized(self): # output of the model slightly. vec = matutils.sparse2full(jill_topics, 2) # convert to dense vector, for easier equality tests expected = [0.91, 0.08] - passed = np.allclose(sorted(vec), sorted(expected), atol=1e-1) # must contain the same values, up to re-ordering + # must contain the same values, up to re-ordering + passed = np.allclose(sorted(vec), sorted(expected), atol=1e-1) # Delete the MmCorpus used for serialization inside the author-topic model. 
remove(datapath('testcorpus_serialization.mm')) diff --git a/gensim/test/test_doc2vec.py b/gensim/test/test_doc2vec.py index 0d49c9e2e3..0499dd2210 100644 --- a/gensim/test/test_doc2vec.py +++ b/gensim/test/test_doc2vec.py @@ -142,7 +142,10 @@ def test_string_doctags(self): self.assertEqual(model.docvecs['_*0'].shape, (100,)) self.assertTrue(all(model.docvecs['_*0'] == model.docvecs[0])) self.assertTrue(max(d.offset for d in model.docvecs.doctags.values()) < len(model.docvecs.doctags)) - self.assertTrue(max(model.docvecs._int_index(str_key) for str_key in model.docvecs.doctags.keys()) < len(model.docvecs.doctag_syn0)) + self.assertTrue( + max(model.docvecs._int_index(str_key) for str_key in model.docvecs.doctags.keys()) + < len(model.docvecs.doctag_syn0) + ) # verify docvecs.most_similar() returns string doctags rather than indexes self.assertEqual(model.docvecs.offset2doctag[0], model.docvecs.most_similar([model.docvecs[0]])[0][0]) @@ -161,7 +164,10 @@ def test_similarity_unseen_docs(self): model = doc2vec.Doc2Vec(min_count=1) model.build_vocab(corpus) - self.assertTrue(model.docvecs.similarity_unseen_docs(model, rome_str, rome_str) > model.docvecs.similarity_unseen_docs(model, rome_str, car_str)) + self.assertTrue( + model.docvecs.similarity_unseen_docs(model, rome_str, rome_str) > + model.docvecs.similarity_unseen_docs(model, rome_str, car_str) + ) def model_sanity(self, model, keep_training=True): """Any non-trivial model on DocsLeeCorpus can pass these sanity checks""" @@ -189,7 +195,8 @@ def model_sanity(self, model, keep_training=True): self.assertTrue(np.allclose(list(zip(*sims))[1], list(zip(*sims2))[1])) # close-enough dists # sim results should be in clip range if given - clip_sims = model.docvecs.most_similar(fire1, clip_start=len(model.docvecs) // 2, clip_end=len(model.docvecs) * 2 // 3) + clip_sims = \ + model.docvecs.most_similar(fire1, clip_start=len(model.docvecs) // 2, clip_end=len(model.docvecs) * 2 // 3) sims_doc_id = [docid for docid, sim in clip_sims] for s_id in sims_doc_id: self.assertTrue(len(model.docvecs) // 2 <= s_id <= len(model.docvecs) * 2 // 3) diff --git a/gensim/test/test_dtm.py b/gensim/test/test_dtm.py index a52766b8cd..efc84f4e75 100644 --- a/gensim/test/test_dtm.py +++ b/gensim/test/test_dtm.py @@ -10,7 +10,6 @@ from subprocess import CalledProcessError import gensim import os -import sys import unittest from gensim import corpora from gensim.test.utils import datapath @@ -24,11 +23,8 @@ def setUp(self): self.id2word = corpora.Dictionary.load(datapath('dtm_test.dict')) # first you need to setup the environment variable $DTM_PATH for the dtm executable file self.dtm_path = os.environ.get('DTM_PATH', None) - if self.dtm_path is None: - if sys.version_info >= (2, 7, 0): - self.skipTest("$DTM_PATH is not properly set up.") - else: - logging.warning("$DTM_PATH is not properly set up.") + if not self.dtm_path: + self.skipTest("$DTM_PATH is not properly set up.") def testDtm(self): if self.dtm_path is not None: diff --git a/gensim/test/test_fasttext.py b/gensim/test/test_fasttext.py index d56272b4e1..69aa9d074a 100644 --- a/gensim/test/test_fasttext.py +++ b/gensim/test/test_fasttext.py @@ -255,7 +255,9 @@ def test_n_similarity(self): self.assertEqual(self.test_model.n_similarity(['the'], ['and']), self.test_model.n_similarity(['and'], ['the'])) # Out of vocab check self.assertTrue(np.allclose(self.test_model.n_similarity(['night', 'nights'], ['nights', 'night']), 1.0)) - self.assertEqual(self.test_model.n_similarity(['night'], ['nights']), 
self.test_model.n_similarity(['nights'], ['night'])) + self.assertEqual( + self.test_model.n_similarity(['night'], ['nights']), self.test_model.n_similarity(['nights'], ['night']) + ) def test_similarity(self): # In vocab, sanity check @@ -440,12 +442,17 @@ def test_sg_neg_online(self): @unittest.skipIf(IS_WIN32, "avoid memory error with Appveyor x32") def test_cbow_hs_online(self): - model = FT_gensim(sg=0, cbow_mean=1, alpha=0.05, window=2, hs=1, negative=0, min_count=3, iter=1, seed=42, workers=1) + model = FT_gensim( + sg=0, cbow_mean=1, alpha=0.05, window=2, hs=1, negative=0, min_count=3, iter=1, seed=42, workers=1 + ) self.online_sanity(model) @unittest.skipIf(IS_WIN32, "avoid memory error with Appveyor x32") def test_cbow_neg_online(self): - model = FT_gensim(sg=0, cbow_mean=1, alpha=0.05, window=2, hs=0, negative=5, min_count=5, iter=1, seed=42, workers=1, sample=0) + model = FT_gensim( + sg=0, cbow_mean=1, alpha=0.05, window=2, hs=0, negative=5, + min_count=5, iter=1, seed=42, workers=1, sample=0 + ) self.online_sanity(model) diff --git a/gensim/test/test_keras_integration.py b/gensim/test/test_keras_integration.py index 3d3abd6f3b..85c2c2c4bb 100644 --- a/gensim/test/test_keras_integration.py +++ b/gensim/test/test_keras_integration.py @@ -26,7 +26,6 @@ class TestKerasWord2VecWrapper(unittest.TestCase): def setUp(self): self.model_cos_sim = word2vec.Word2Vec(common_texts, size=100, min_count=1, hs=1) - # self.model_twenty_ng = word2vec.Word2Vec(word2vec.LineSentence(datapath('20_newsgroup_keras_w2v_data.txt')), min_count=1) self.model_twenty_ng = word2vec.Word2Vec(min_count=1) def testWord2VecTraining(self): @@ -75,7 +74,8 @@ def testEmbeddingLayerCosineSim(self): def testEmbeddingLayer20NewsGroup(self): """ - Test Keras 'Embedding' layer returned by 'get_embedding_layer' function for a smaller version of the 20NewsGroup classification problem. + Test Keras 'Embedding' layer returned by 'get_embedding_layer' function + for a smaller version of the 20NewsGroup classification problem. """ MAX_SEQUENCE_LENGTH = 1000 @@ -141,7 +141,9 @@ def testEmbeddingLayer20NewsGroup(self): fit_ret_val = model.fit(x_train, y_train, epochs=1) # verify the type of the object returned after training - self.assertTrue(type(fit_ret_val) == keras.callbacks.History) # value returned is a `History` instance. Its `history` attribute contains all information collected during training. + # value returned is a `History` instance. + # Its `history` attribute contains all information collected during training. 
+ self.assertTrue(type(fit_ret_val) == keras.callbacks.History) if __name__ == '__main__': diff --git a/gensim/test/test_ldamallet_wrapper.py b/gensim/test/test_ldamallet_wrapper.py index b780ad42f6..42ed6890d4 100644 --- a/gensim/test/test_ldamallet_wrapper.py +++ b/gensim/test/test_ldamallet_wrapper.py @@ -50,7 +50,8 @@ def testTransform(self): transformed = model[doc] vec = matutils.sparse2full(transformed, 2) # convert to dense vector, for easier equality tests expected = [0.49, 0.51] - passed = np.allclose(sorted(vec), sorted(expected), atol=1e-1) # must contain the same values, up to re-ordering + # must contain the same values, up to re-ordering + passed = np.allclose(sorted(vec), sorted(expected), atol=1e-1) if passed: break logging.warning( @@ -73,7 +74,8 @@ def testSparseTransform(self): transformed = model[doc] vec = matutils.sparse2full(transformed, 2) # convert to dense vector, for easier equality tests expected = [1.0, 0.0] - passed = np.allclose(sorted(vec), sorted(expected), atol=1e-2) # must contain the same values, up to re-ordering + # must contain the same values, up to re-ordering + passed = np.allclose(sorted(vec), sorted(expected), atol=1e-2) if passed: break logging.warning( @@ -153,7 +155,6 @@ def testLargeMmapCompressed(self): # test loading the large model arrays with mmap self.assertRaises(IOError, ldamodel.LdaModel.load, fname, mmap='r') -# endclass TestLdaMallet if __name__ == '__main__': diff --git a/gensim/test/test_ldamodel.py b/gensim/test/test_ldamodel.py index 98c4a38b74..15cf1bb260 100644 --- a/gensim/test/test_ldamodel.py +++ b/gensim/test/test_ldamodel.py @@ -55,10 +55,13 @@ def testTransform(self): vec = matutils.sparse2full(transformed, 2) # convert to dense vector, for easier equality tests expected = [0.13, 0.87] - passed = np.allclose(sorted(vec), sorted(expected), atol=1e-1) # must contain the same values, up to re-ordering + # must contain the same values, up to re-ordering + passed = np.allclose(sorted(vec), sorted(expected), atol=1e-1) if passed: break - logging.warning("LDA failed to converge on attempt %i (got %s, expected %s)", i, sorted(vec), sorted(expected)) + logging.warning( + "LDA failed to converge on attempt %i (got %s, expected %s)", i, sorted(vec), sorted(expected) + ) self.assertTrue(passed) def testAlphaAuto(self): diff --git a/gensim/test/test_lsimodel.py b/gensim/test/test_lsimodel.py index 3ff59c1e43..eb4be02fd5 100644 --- a/gensim/test/test_lsimodel.py +++ b/gensim/test/test_lsimodel.py @@ -60,7 +60,8 @@ def testTransformFloat32(self): transformed = model[doc] vec = matutils.sparse2full(transformed, 2) # convert to dense vector, for easier equality tests expected = np.array([-0.6594664, 0.142115444]) # scaled LSI version - self.assertTrue(np.allclose(abs(vec), abs(expected), atol=1.e-5)) # transformed entries must be equal up to sign + # transformed entries must be equal up to sign + self.assertTrue(np.allclose(abs(vec), abs(expected), atol=1.e-5)) def testCorpusTransform(self): """Test lsi[corpus] transformation.""" @@ -85,7 +86,8 @@ def testOnlineTransform(self): # create the transformation model model2 = lsimodel.LsiModel(corpus=corpus, num_topics=5) # compute everything at once - model = lsimodel.LsiModel(corpus=None, id2word=model2.id2word, num_topics=5) # start with no documents, we will add them later + # start with no documents, we will add them later + model = lsimodel.LsiModel(corpus=None, id2word=model2.id2word, num_topics=5) # train model on a single document model.add_documents([corpus[0]]) @@ -111,7 +113,8 @@ 
def testOnlineTransform(self): # make sure the final transformation is the same as if we had decomposed the whole corpus at once vec1 = matutils.sparse2full(model[doc], model.num_topics) vec2 = matutils.sparse2full(model2[doc], model2.num_topics) - self.assertTrue(np.allclose(abs(vec1), abs(vec2), atol=1e-5)) # the two LSI representations must equal up to sign + # the two LSI representations must equal up to sign + self.assertTrue(np.allclose(abs(vec1), abs(vec2), atol=1e-5)) def testPersistence(self): fname = get_tmpfile('gensim_models_lsi.tst') diff --git a/gensim/test/test_normmodel.py b/gensim/test/test_normmodel.py index fa7a4096fd..daa62e72c7 100644 --- a/gensim/test/test_normmodel.py +++ b/gensim/test/test_normmodel.py @@ -136,7 +136,8 @@ def testPersistence(self): model2 = normmodel.NormModel.load(fname) self.assertTrue(model.norms == model2.norms) tstvec = [] - self.assertTrue(np.allclose(model.normalize(tstvec), model2.normalize(tstvec))) # try projecting an empty vector + # try projecting an empty vector + self.assertTrue(np.allclose(model.normalize(tstvec), model2.normalize(tstvec))) def testPersistenceCompressed(self): fname = get_tmpfile('gensim_models.tst.gz') @@ -145,7 +146,8 @@ def testPersistenceCompressed(self): model2 = normmodel.NormModel.load(fname, mmap=None) self.assertTrue(model.norms == model2.norms) tstvec = [] - self.assertTrue(np.allclose(model.normalize(tstvec), model2.normalize(tstvec))) # try projecting an empty vector + # try projecting an empty vector + self.assertTrue(np.allclose(model.normalize(tstvec), model2.normalize(tstvec))) if __name__ == '__main__': diff --git a/gensim/test/test_parsing.py b/gensim/test/test_parsing.py index 02ca13fb6b..c356de7e81 100644 --- a/gensim/test/test_parsing.py +++ b/gensim/test/test_parsing.py @@ -8,7 +8,9 @@ import logging import unittest import numpy as np -from gensim.parsing.preprocessing import remove_stopwords, strip_punctuation2, strip_tags, strip_short, strip_numeric, strip_non_alphanum, strip_multiple_whitespaces, split_alphanum, stem_text +from gensim.parsing.preprocessing import \ + remove_stopwords, strip_punctuation2, strip_tags, strip_short, strip_numeric, strip_non_alphanum, \ + strip_multiple_whitespaces, split_alphanum, stem_text # several documents diff --git a/gensim/test/test_rpmodel.py b/gensim/test/test_rpmodel.py index 6d09dbcb84..c1438a4c5c 100644 --- a/gensim/test/test_rpmodel.py +++ b/gensim/test/test_rpmodel.py @@ -26,7 +26,8 @@ def setUp(self): def testTransform(self): # create the transformation model - np.random.seed(13) # HACK; set fixed seed so that we always get the same random matrix (and can compare against expected results) + # HACK; set fixed seed so that we always get the same random matrix (and can compare against expected results) + np.random.seed(13) model = rpmodel.RpModel(self.corpus, num_topics=2) # transform one document diff --git a/gensim/test/test_similarities.py b/gensim/test/test_similarities.py index 5c54685c8e..58cbf2f734 100644 --- a/gensim/test/test_similarities.py +++ b/gensim/test/test_similarities.py @@ -324,7 +324,8 @@ def testFull(self, num_best=None): if num_best is not None: # Sparse array. for i, sim in sims: - self.assertTrue(numpy.alltrue(sim > 0.0)) # Note that similarities are bigger than zero, as they are the 1/ 1 + distances. + # Note that similarities are bigger than zero, as they are the 1/ 1 + distances. + self.assertTrue(numpy.alltrue(sim > 0.0)) else: self.assertTrue(sims[0] == 1.0) # Similarity of a document with itself is 0.0. 
self.assertTrue(numpy.alltrue(sims[1:] > 0.0)) diff --git a/gensim/test/test_sklearn_api.py b/gensim/test/test_sklearn_api.py index 5f8a0c59f7..3793c79948 100644 --- a/gensim/test/test_sklearn_api.py +++ b/gensim/test/test_sklearn_api.py @@ -56,25 +56,195 @@ corpus_new = [dictionary_new.doc2bow(text) for text in texts_new] texts_ldaseq = [ - [u'senior', u'studios', u'studios', u'studios', u'creators', u'award', u'mobile', u'currently', u'challenges', u'senior', u'summary', u'senior', u'motivated', u'creative', u'senior'], - [u'performs', u'engineering', u'tasks', u'infrastructure', u'focusing', u'primarily', u'programming', u'interaction', u'designers', u'engineers', u'leadership', u'teams', u'teams', u'crews', u'responsibilities', u'engineering', u'quality', u'functional', u'functional', u'teams', u'organizing', u'prioritizing', u'technical', u'decisions', u'engineering', u'participates', u'participates', u'reviews', u'participates', u'hiring', u'conducting', u'interviews'], - [u'feedback', u'departments', u'define', u'focusing', u'engineering', u'teams', u'crews', u'facilitate', u'engineering', u'departments', u'deadlines', u'milestones', u'typically', u'spends', u'designing', u'developing', u'updating', u'bugs', u'mentoring', u'engineers', u'define', u'schedules', u'milestones', u'participating'], - [u'reviews', u'interviews', u'sized', u'teams', u'interacts', u'disciplines', u'knowledge', u'skills', u'knowledge', u'knowledge', u'xcode', u'scripting', u'debugging', u'skills', u'skills', u'knowledge', u'disciplines', u'animation', u'networking', u'expertise', u'competencies', u'oral', u'skills', u'management', u'skills', u'proven', u'effectively', u'teams', u'deadline', u'environment', u'bachelor', u'minimum', u'shipped', u'leadership', u'teams', u'location', u'resumes', u'jobs', u'candidates', u'openings', u'jobs'], - [u'maryland', u'client', u'producers', u'electricity', u'operates', u'storage', u'utility', u'retail', u'customers', u'engineering', u'consultant', u'maryland', u'summary', u'technical', u'technology', u'departments', u'expertise', u'maximizing', u'output', u'reduces', u'operating', u'participates', u'areas', u'engineering', u'conducts', u'testing', u'solve', u'supports', u'environmental', u'understands', u'objectives', u'operates', u'responsibilities', u'handles', u'complex', u'engineering', u'aspects', u'monitors', u'quality', u'proficiency', u'optimization', u'recommendations', u'supports', u'personnel', u'troubleshooting', u'commissioning', u'startup', u'shutdown', u'supports', u'procedure', u'operating', u'units', u'develops', u'simulations', u'troubleshooting', u'tests', u'enhancing', u'solving', u'develops', u'estimates', u'schedules', u'scopes', u'understands', u'technical', u'management', u'utilize', u'routine', u'conducts', u'hazards', u'utilizing', u'hazard', u'operability', u'methodologies', u'participates', u'startup', u'reviews', u'pssr', u'participate', u'teams', u'participate', u'regulatory', u'audits', u'define', u'scopes', u'budgets', u'schedules', u'technical', u'management', u'environmental', u'awareness', u'interfacing', u'personnel', u'interacts', u'regulatory', u'departments', u'input', u'objectives', u'identifying', u'introducing', u'concepts', u'solutions', u'peers', u'customers', u'coworkers', u'knowledge', u'skills', u'engineering', u'quality', u'engineering'], - [u'commissioning', u'startup', u'knowledge', u'simulators', u'technologies', u'knowledge', u'engineering', u'techniques', u'disciplines', u'leadership', u'skills', u'proven', 
u'engineers', u'oral', u'skills', u'technical', u'skills', u'analytically', u'solve', u'complex', u'interpret', u'proficiency', u'simulation', u'knowledge', u'applications', u'manipulate', u'applications', u'engineering'], - [u'calculations', u'programs', u'matlab', u'excel', u'independently', u'environment', u'proven', u'skills', u'effectively', u'multiple', u'tasks', u'planning', u'organizational', u'management', u'skills', u'rigzone', u'jobs', u'developer', u'exceptional', u'strategies', u'junction', u'exceptional', u'strategies', u'solutions', u'solutions', u'biggest', u'insurers', u'operates', u'investment'], - [u'vegas', u'tasks', u'electrical', u'contracting', u'expertise', u'virtually', u'electrical', u'developments', u'institutional', u'utilities', u'technical', u'experts', u'relationships', u'credibility', u'contractors', u'utility', u'customers', u'customer', u'relationships', u'consistently', u'innovations', u'profile', u'construct', u'envision', u'dynamic', u'complex', u'electrical', u'management', u'grad', u'internship', u'electrical', u'engineering', u'infrastructures', u'engineers', u'documented', u'management', u'engineering', u'quality', u'engineering', u'electrical', u'engineers', u'complex', u'distribution', u'grounding', u'estimation', u'testing', u'procedures', u'voltage', u'engineering'], - [u'troubleshooting', u'installation', u'documentation', u'bsee', u'certification', u'electrical', u'voltage', u'cabling', u'electrical', u'engineering', u'candidates', u'electrical', u'internships', u'oral', u'skills', u'organizational', u'prioritization', u'skills', u'skills', u'excel', u'cadd', u'calculation', u'autocad', u'mathcad', u'skills', u'skills', u'customer', u'relationships', u'solving', u'ethic', u'motivation', u'tasks', u'budget', u'affirmative', u'diversity', u'workforce', u'gender', u'orientation', u'disability', u'disabled', u'veteran', u'vietnam', u'veteran', u'qualifying', u'veteran', u'diverse', u'candidates', u'respond', u'developing', u'workplace', u'reflects', u'diversity', u'communities', u'reviews', u'electrical', u'contracting', u'southwest', u'electrical', u'contractors'], - [u'intern', u'electrical', u'engineering', u'idexx', u'laboratories', u'validating', u'idexx', u'integrated', u'hardware', u'entails', u'planning', u'debug', u'validation', u'engineers', u'validation', u'methodologies', u'healthcare', u'platforms', u'brightest', u'solve', u'challenges', u'innovation', u'technology', u'idexx', u'intern', u'idexx', u'interns', u'supplement', u'interns', u'teams', u'roles', u'competitive', u'interns', u'idexx', u'interns', u'participate', u'internships', u'mentors', u'seminars', u'topics', u'leadership', u'workshops', u'relevant', u'planning', u'topics', u'intern', u'presentations', u'mixers', u'applicants', u'ineligible', u'laboratory', u'compliant', u'idexx', u'laboratories', u'healthcare', u'innovation', u'practicing', u'veterinarians', u'diagnostic', u'technology', u'idexx', u'enhance', u'veterinarians', u'efficiency', u'economically', u'idexx', u'worldwide', u'diagnostic', u'tests', u'tests', u'quality', u'headquartered', u'idexx', u'laboratories', u'employs', u'customers', u'qualifications', u'applicants', u'idexx', u'interns', u'potential', u'demonstrated', u'portfolio', u'recommendation', u'resumes', u'marketing', u'location', u'americas', u'verification', u'validation', u'schedule', u'overtime', u'idexx', u'laboratories', u'reviews', u'idexx', u'laboratories', u'nasdaq', u'healthcare', u'innovation', u'practicing', u'veterinarians'], - 
[u'location', u'duration', u'temp', u'verification', u'validation', u'tester', u'verification', u'validation', u'middleware', u'specifically', u'testing', u'applications', u'clinical', u'laboratory', u'regulated', u'environment', u'responsibilities', u'complex', u'hardware', u'testing', u'clinical', u'analyzers', u'laboratory', u'graphical', u'interfaces', u'complex', u'sample', u'sequencing', u'protocols', u'developers', u'correction', u'tracking', u'tool', u'timely', u'troubleshoot', u'testing', u'functional', u'manual', u'automated', u'participate', u'ongoing'], - [u'testing', u'coverage', u'planning', u'documentation', u'testing', u'validation', u'corrections', u'monitor', u'implementation', u'recurrence', u'operating', u'statistical', u'quality', u'testing', u'global', u'multi', u'teams', u'travel', u'skills', u'concepts', u'waterfall', u'agile', u'methodologies', u'debugging', u'skills', u'complex', u'automated', u'instrumentation', u'environment', u'hardware', u'mechanical', u'components', u'tracking', u'lifecycle', u'management', u'quality', u'organize', u'define', u'priorities', u'organize', u'supervision', u'aggressive', u'deadlines', u'ambiguity', u'analyze', u'complex', u'situations', u'concepts', u'technologies', u'verbal', u'skills', u'effectively', u'technical', u'clinical', u'diverse', u'strategy', u'clinical', u'chemistry', u'analyzer', u'laboratory', u'middleware', u'basic', u'automated', u'testing', u'biomedical', u'engineering', u'technologists', u'laboratory', u'technology', u'availability', u'click', u'attach'], - [u'scientist', u'linux', u'asrc', u'scientist', u'linux', u'asrc', u'technology', u'solutions', u'subsidiary', u'asrc', u'engineering', u'technology', u'contracts'], - [u'multiple', u'agencies', u'scientists', u'engineers', u'management', u'personnel', u'allows', u'solutions', u'complex', u'aeronautics', u'aviation', u'management', u'aviation', u'engineering', u'hughes', u'technical', u'technical', u'aviation', u'evaluation', u'engineering', u'management', u'technical', u'terminal', u'surveillance', u'programs', u'currently', u'scientist', u'travel', u'responsibilities', u'develops', u'technology', u'modifies', u'technical', u'complex', u'reviews', u'draft', u'conformity', u'completeness', u'testing', u'interface', u'hardware', u'regression', u'impact', u'reliability', u'maintainability', u'factors', u'standardization', u'skills', u'travel', u'programming', u'linux', u'environment', u'cisco', u'knowledge', u'terminal', u'environment', u'clearance', u'clearance', u'input', u'output', u'digital', u'automatic', u'terminal', u'management', u'controller', u'termination', u'testing', u'evaluating', u'policies', u'procedure', u'interface', u'installation', u'verification', u'certification', u'core', u'avionic', u'programs', u'knowledge', u'procedural', u'testing', u'interfacing', u'hardware', u'regression', u'impact', u'reliability', u'maintainability', u'factors', u'standardization', u'missions', u'asrc', u'subsidiaries', u'affirmative', u'employers', u'applicants', u'disability', u'veteran', u'technology', u'location', u'airport', u'bachelor', u'schedule', u'travel', u'contributor', u'management', u'asrc', u'reviews'], - [u'technical', u'solarcity', u'niche', u'vegas', u'overview', u'resolving', u'customer', u'clients', u'expanding', u'engineers', u'developers', u'responsibilities', u'knowledge', u'planning', u'adapt', u'dynamic', u'environment', u'inventive', u'creative', u'solarcity', u'lifecycle', u'responsibilities', u'technical', u'analyzing', u'diagnosing', 
u'troubleshooting', u'customers', u'ticketing', u'console', u'escalate', u'knowledge', u'engineering', u'timely', u'basic', u'phone', u'functionality', u'customer', u'tracking', u'knowledgebase', u'rotation', u'configure', u'deployment', u'sccm', u'technical', u'deployment', u'deploy', u'hardware', u'solarcity', u'bachelor', u'knowledge', u'dell', u'laptops', u'analytical', u'troubleshooting', u'solving', u'skills', u'knowledge', u'databases', u'preferably', u'server', u'preferably', u'monitoring', u'suites', u'documentation', u'procedures', u'knowledge', u'entries', u'verbal', u'skills', u'customer', u'skills', u'competitive', u'solar', u'package', u'insurance', u'vacation', u'savings', u'referral', u'eligibility', u'equity', u'performers', u'solarcity', u'affirmative', u'diversity', u'workplace', u'applicants', u'orientation', u'disability', u'veteran', u'careerrookie'], - [u'embedded', u'exelis', u'junction', u'exelis', u'embedded', u'acquisition', u'networking', u'capabilities', u'classified', u'customer', u'motivated', u'develops', u'tests', u'innovative', u'solutions', u'minimal', u'supervision', u'paced', u'environment', u'enjoys', u'assignments', u'interact', u'multi', u'disciplined', u'challenging', u'focused', u'embedded', u'developments', u'spanning', u'engineering', u'lifecycle', u'specification', u'enhancement', u'applications', u'embedded', u'freescale', u'applications', u'android', u'platforms', u'interface', u'customers', u'developers', u'refine', u'specifications', u'architectures'], - [u'java', u'programming', u'scripts', u'python', u'debug', u'debugging', u'emulators', u'regression', u'revisions', u'specialized', u'setups', u'capabilities', u'subversion', u'technical', u'documentation', u'multiple', u'engineering', u'techexpousa', u'reviews'], - [u'modeler', u'semantic', u'modeling', u'models', u'skills', u'ontology', u'resource', u'framework', u'schema', u'technologies', u'hadoop', u'warehouse', u'oracle', u'relational', u'artifacts', u'models', u'dictionaries', u'models', u'interface', u'specifications', u'documentation', u'harmonization', u'mappings', u'aligned', u'coordinate', u'technical', u'peer', u'reviews', u'stakeholder', u'communities', u'impact', u'domains', u'relationships', u'interdependencies', u'models', u'define', u'analyze', u'legacy', u'models', u'corporate', u'databases', u'architectural', u'alignment', u'customer', u'expertise', u'harmonization', u'modeling', u'modeling', u'consulting', u'stakeholders', u'quality', u'models', u'storage', u'agile', u'specifically', u'focus', u'modeling', u'qualifications', u'bachelors', u'accredited', u'modeler', u'encompass', u'evaluation', u'skills', u'knowledge', u'modeling', u'techniques', u'resource', u'framework', u'schema', u'technologies', u'unified', u'modeling', u'technologies', u'schemas', u'ontologies', u'sybase', u'knowledge', u'skills', u'interpersonal', u'skills', u'customers', u'clearance', u'applicants', u'eligibility', u'classified', u'clearance', u'polygraph', u'techexpousa', u'solutions', u'partnership', u'solutions', u'integration'], - [u'technologies', u'junction', u'develops', u'maintains', u'enhances', u'complex', u'diverse', u'intensive', u'analytics', u'algorithm', u'manipulation', u'management', u'documented', u'individually', u'reviews', u'tests', u'components', u'adherence', u'resolves', u'utilizes', u'methodologies', u'environment', u'input', u'components', u'hardware', u'offs', u'reuse', u'cots', u'gots', u'synthesis', u'components', u'tasks', u'individually', u'analyzes', u'modifies', 
u'debugs', u'corrects', u'integrates', u'operating', u'environments', u'develops', u'queries', u'databases', u'repositories', u'recommendations', u'improving', u'documentation', u'develops', u'implements', u'algorithms', u'functional', u'assists', u'developing', u'executing', u'procedures', u'components', u'reviews', u'documentation', u'solutions', u'analyzing', u'conferring', u'users', u'engineers', u'analyzing', u'investigating', u'areas', u'adapt', u'hardware', u'mathematical', u'models', u'predict', u'outcome', u'implement', u'complex', u'database', u'repository', u'interfaces', u'queries', u'bachelors', u'accredited', u'substituted', u'bachelors', u'firewalls', u'ipsec', u'vpns', u'technology', u'administering', u'servers', u'apache', u'jboss', u'tomcat', u'developing', u'interfaces', u'firefox', u'internet', u'explorer', u'operating', u'mainframe', u'linux', u'solaris', u'virtual', u'scripting', u'programming', u'oriented', u'programming', u'ajax', u'script', u'procedures', u'cobol', u'cognos', u'fusion', u'focus', u'html', u'java', u'java', u'script', u'jquery', u'perl', u'visual', u'basic', u'powershell', u'cots', u'cots', u'oracle', u'apex', u'integration', u'competitive', u'package', u'bonus', u'corporate', u'equity', u'tuition', u'reimbursement', u'referral', u'bonus', u'holidays', u'insurance', u'flexible', u'disability', u'insurance'], + [ + u'senior', u'studios', u'studios', u'studios', u'creators', u'award', u'mobile', u'currently', + u'challenges', u'senior', u'summary', u'senior', u'motivated', u'creative', u'senior' + ], + [ + u'performs', u'engineering', u'tasks', u'infrastructure', u'focusing', u'primarily', u'programming', + u'interaction', u'designers', u'engineers', u'leadership', u'teams', u'teams', u'crews', u'responsibilities', + u'engineering', u'quality', u'functional', u'functional', u'teams', u'organizing', u'prioritizing', + u'technical', u'decisions', u'engineering', u'participates', u'participates', u'reviews', u'participates', + u'hiring', u'conducting', u'interviews' + ], + [ + u'feedback', u'departments', u'define', u'focusing', u'engineering', u'teams', u'crews', u'facilitate', + u'engineering', u'departments', u'deadlines', u'milestones', u'typically', u'spends', u'designing', + u'developing', u'updating', u'bugs', u'mentoring', u'engineers', u'define', u'schedules', u'milestones', + u'participating' + ], + [ + u'reviews', u'interviews', u'sized', u'teams', u'interacts', u'disciplines', u'knowledge', u'skills', + u'knowledge', u'knowledge', u'xcode', u'scripting', u'debugging', u'skills', u'skills', u'knowledge', + u'disciplines', u'animation', u'networking', u'expertise', u'competencies', u'oral', u'skills', + u'management', u'skills', u'proven', u'effectively', u'teams', u'deadline', u'environment', u'bachelor', + u'minimum', u'shipped', u'leadership', u'teams', u'location', u'resumes', u'jobs', u'candidates', + u'openings', u'jobs' + ], + [ + u'maryland', u'client', u'producers', u'electricity', u'operates', u'storage', u'utility', u'retail', + u'customers', u'engineering', u'consultant', u'maryland', u'summary', u'technical', u'technology', + u'departments', u'expertise', u'maximizing', u'output', u'reduces', u'operating', u'participates', + u'areas', u'engineering', u'conducts', u'testing', u'solve', u'supports', u'environmental', u'understands', + u'objectives', u'operates', u'responsibilities', u'handles', u'complex', u'engineering', u'aspects', + u'monitors', u'quality', u'proficiency', u'optimization', u'recommendations', u'supports', 
u'personnel', + u'troubleshooting', u'commissioning', u'startup', u'shutdown', u'supports', u'procedure', u'operating', + u'units', u'develops', u'simulations', u'troubleshooting', u'tests', u'enhancing', u'solving', u'develops', + u'estimates', u'schedules', u'scopes', u'understands', u'technical', u'management', u'utilize', u'routine', + u'conducts', u'hazards', u'utilizing', u'hazard', u'operability', u'methodologies', u'participates', + u'startup', u'reviews', u'pssr', u'participate', u'teams', u'participate', u'regulatory', u'audits', + u'define', u'scopes', u'budgets', u'schedules', u'technical', u'management', u'environmental', u'awareness', + u'interfacing', u'personnel', u'interacts', u'regulatory', u'departments', u'input', u'objectives', + u'identifying', u'introducing', u'concepts', u'solutions', u'peers', u'customers', u'coworkers', u'knowledge', + u'skills', u'engineering', u'quality', u'engineering' + ], + [ + u'commissioning', u'startup', u'knowledge', u'simulators', u'technologies', u'knowledge', u'engineering', + u'techniques', u'disciplines', u'leadership', u'skills', u'proven', u'engineers', u'oral', u'skills', + u'technical', u'skills', u'analytically', u'solve', u'complex', u'interpret', u'proficiency', u'simulation', + u'knowledge', u'applications', u'manipulate', u'applications', u'engineering' + ], + [ + u'calculations', u'programs', u'matlab', u'excel', u'independently', u'environment', u'proven', u'skills', + u'effectively', u'multiple', u'tasks', u'planning', u'organizational', u'management', u'skills', u'rigzone', + u'jobs', u'developer', u'exceptional', u'strategies', u'junction', u'exceptional', u'strategies', u'solutions', + u'solutions', u'biggest', u'insurers', u'operates', u'investment' + ], + [ + u'vegas', u'tasks', u'electrical', u'contracting', u'expertise', u'virtually', u'electrical', u'developments', + u'institutional', u'utilities', u'technical', u'experts', u'relationships', u'credibility', u'contractors', + u'utility', u'customers', u'customer', u'relationships', u'consistently', u'innovations', u'profile', + u'construct', u'envision', u'dynamic', u'complex', u'electrical', u'management', u'grad', u'internship', + u'electrical', u'engineering', u'infrastructures', u'engineers', u'documented', u'management', u'engineering', + u'quality', u'engineering', u'electrical', u'engineers', u'complex', u'distribution', u'grounding', + u'estimation', u'testing', u'procedures', u'voltage', u'engineering' + ], + [ + u'troubleshooting', u'installation', u'documentation', u'bsee', u'certification', u'electrical', u'voltage', + u'cabling', u'electrical', u'engineering', u'candidates', u'electrical', u'internships', u'oral', u'skills', + u'organizational', u'prioritization', u'skills', u'skills', u'excel', u'cadd', u'calculation', u'autocad', + u'mathcad', u'skills', u'skills', u'customer', u'relationships', u'solving', u'ethic', u'motivation', u'tasks', + u'budget', u'affirmative', u'diversity', u'workforce', u'gender', u'orientation', u'disability', u'disabled', + u'veteran', u'vietnam', u'veteran', u'qualifying', u'veteran', u'diverse', u'candidates', u'respond', + u'developing', u'workplace', u'reflects', u'diversity', u'communities', u'reviews', u'electrical', + u'contracting', u'southwest', u'electrical', u'contractors' + ], + [ + u'intern', u'electrical', u'engineering', u'idexx', u'laboratories', u'validating', u'idexx', u'integrated', + u'hardware', u'entails', u'planning', u'debug', u'validation', u'engineers', u'validation', u'methodologies', + 
u'healthcare', u'platforms', u'brightest', u'solve', u'challenges', u'innovation', u'technology', u'idexx', + u'intern', u'idexx', u'interns', u'supplement', u'interns', u'teams', u'roles', u'competitive', u'interns', + u'idexx', u'interns', u'participate', u'internships', u'mentors', u'seminars', u'topics', u'leadership', + u'workshops', u'relevant', u'planning', u'topics', u'intern', u'presentations', u'mixers', u'applicants', + u'ineligible', u'laboratory', u'compliant', u'idexx', u'laboratories', u'healthcare', u'innovation', + u'practicing', u'veterinarians', u'diagnostic', u'technology', u'idexx', u'enhance', u'veterinarians', + u'efficiency', u'economically', u'idexx', u'worldwide', u'diagnostic', u'tests', u'tests', u'quality', + u'headquartered', u'idexx', u'laboratories', u'employs', u'customers', u'qualifications', u'applicants', + u'idexx', u'interns', u'potential', u'demonstrated', u'portfolio', u'recommendation', u'resumes', u'marketing', + u'location', u'americas', u'verification', u'validation', u'schedule', u'overtime', u'idexx', u'laboratories', + u'reviews', u'idexx', u'laboratories', u'nasdaq', u'healthcare', u'innovation', u'practicing', u'veterinarians' + ], + [ + u'location', u'duration', u'temp', u'verification', u'validation', u'tester', u'verification', u'validation', + u'middleware', u'specifically', u'testing', u'applications', u'clinical', u'laboratory', u'regulated', + u'environment', u'responsibilities', u'complex', u'hardware', u'testing', u'clinical', u'analyzers', + u'laboratory', u'graphical', u'interfaces', u'complex', u'sample', u'sequencing', u'protocols', u'developers', + u'correction', u'tracking', u'tool', u'timely', u'troubleshoot', u'testing', u'functional', u'manual', + u'automated', u'participate', u'ongoing' + ], + [ + u'testing', u'coverage', u'planning', u'documentation', u'testing', u'validation', u'corrections', u'monitor', + u'implementation', u'recurrence', u'operating', u'statistical', u'quality', u'testing', u'global', u'multi', + u'teams', u'travel', u'skills', u'concepts', u'waterfall', u'agile', u'methodologies', u'debugging', u'skills', + u'complex', u'automated', u'instrumentation', u'environment', u'hardware', u'mechanical', u'components', + u'tracking', u'lifecycle', u'management', u'quality', u'organize', u'define', u'priorities', u'organize', + u'supervision', u'aggressive', u'deadlines', u'ambiguity', u'analyze', u'complex', u'situations', u'concepts', + u'technologies', u'verbal', u'skills', u'effectively', u'technical', u'clinical', u'diverse', u'strategy', + u'clinical', u'chemistry', u'analyzer', u'laboratory', u'middleware', u'basic', u'automated', u'testing', + u'biomedical', u'engineering', u'technologists', u'laboratory', u'technology', u'availability', u'click', + u'attach' + ], + [ + u'scientist', u'linux', u'asrc', u'scientist', u'linux', u'asrc', u'technology', u'solutions', u'subsidiary', + u'asrc', u'engineering', u'technology', u'contracts' + ], + [ + u'multiple', u'agencies', u'scientists', u'engineers', u'management', u'personnel', u'allows', u'solutions', + u'complex', u'aeronautics', u'aviation', u'management', u'aviation', u'engineering', u'hughes', u'technical', + u'technical', u'aviation', u'evaluation', u'engineering', u'management', u'technical', u'terminal', + u'surveillance', u'programs', u'currently', u'scientist', u'travel', u'responsibilities', u'develops', + u'technology', u'modifies', u'technical', u'complex', u'reviews', u'draft', u'conformity', u'completeness', + u'testing', u'interface', 
u'hardware', u'regression', u'impact', u'reliability', u'maintainability', + u'factors', u'standardization', u'skills', u'travel', u'programming', u'linux', u'environment', u'cisco', + u'knowledge', u'terminal', u'environment', u'clearance', u'clearance', u'input', u'output', u'digital', + u'automatic', u'terminal', u'management', u'controller', u'termination', u'testing', u'evaluating', u'policies', + u'procedure', u'interface', u'installation', u'verification', u'certification', u'core', u'avionic', + u'programs', u'knowledge', u'procedural', u'testing', u'interfacing', u'hardware', u'regression', u'impact', + u'reliability', u'maintainability', u'factors', u'standardization', u'missions', u'asrc', u'subsidiaries', + u'affirmative', u'employers', u'applicants', u'disability', u'veteran', u'technology', u'location', u'airport', + u'bachelor', u'schedule', u'travel', u'contributor', u'management', u'asrc', u'reviews' + ], + [ + u'technical', u'solarcity', u'niche', u'vegas', u'overview', u'resolving', u'customer', u'clients', + u'expanding', u'engineers', u'developers', u'responsibilities', u'knowledge', u'planning', u'adapt', + u'dynamic', u'environment', u'inventive', u'creative', u'solarcity', u'lifecycle', u'responsibilities', + u'technical', u'analyzing', u'diagnosing', u'troubleshooting', u'customers', u'ticketing', u'console', + u'escalate', u'knowledge', u'engineering', u'timely', u'basic', u'phone', u'functionality', u'customer', + u'tracking', u'knowledgebase', u'rotation', u'configure', u'deployment', u'sccm', u'technical', u'deployment', + u'deploy', u'hardware', u'solarcity', u'bachelor', u'knowledge', u'dell', u'laptops', u'analytical', + u'troubleshooting', u'solving', u'skills', u'knowledge', u'databases', u'preferably', u'server', u'preferably', + u'monitoring', u'suites', u'documentation', u'procedures', u'knowledge', u'entries', u'verbal', u'skills', + u'customer', u'skills', u'competitive', u'solar', u'package', u'insurance', u'vacation', u'savings', + u'referral', u'eligibility', u'equity', u'performers', u'solarcity', u'affirmative', u'diversity', u'workplace', + u'applicants', u'orientation', u'disability', u'veteran', u'careerrookie' + ], + [ + u'embedded', u'exelis', u'junction', u'exelis', u'embedded', u'acquisition', u'networking', u'capabilities', + u'classified', u'customer', u'motivated', u'develops', u'tests', u'innovative', u'solutions', u'minimal', + u'supervision', u'paced', u'environment', u'enjoys', u'assignments', u'interact', u'multi', u'disciplined', + u'challenging', u'focused', u'embedded', u'developments', u'spanning', u'engineering', u'lifecycle', + u'specification', u'enhancement', u'applications', u'embedded', u'freescale', u'applications', u'android', + u'platforms', u'interface', u'customers', u'developers', u'refine', u'specifications', u'architectures' + ], + [ + u'java', u'programming', u'scripts', u'python', u'debug', u'debugging', u'emulators', u'regression', + u'revisions', u'specialized', u'setups', u'capabilities', u'subversion', u'technical', u'documentation', + u'multiple', u'engineering', u'techexpousa', u'reviews' + ], + [ + u'modeler', u'semantic', u'modeling', u'models', u'skills', u'ontology', u'resource', u'framework', u'schema', + u'technologies', u'hadoop', u'warehouse', u'oracle', u'relational', u'artifacts', u'models', u'dictionaries', + u'models', u'interface', u'specifications', u'documentation', u'harmonization', u'mappings', u'aligned', + u'coordinate', u'technical', u'peer', u'reviews', u'stakeholder', u'communities', 
u'impact', u'domains', + u'relationships', u'interdependencies', u'models', u'define', u'analyze', u'legacy', u'models', u'corporate', + u'databases', u'architectural', u'alignment', u'customer', u'expertise', u'harmonization', u'modeling', + u'modeling', u'consulting', u'stakeholders', u'quality', u'models', u'storage', u'agile', u'specifically', + u'focus', u'modeling', u'qualifications', u'bachelors', u'accredited', u'modeler', u'encompass', u'evaluation', + u'skills', u'knowledge', u'modeling', u'techniques', u'resource', u'framework', u'schema', u'technologies', + u'unified', u'modeling', u'technologies', u'schemas', u'ontologies', u'sybase', u'knowledge', u'skills', + u'interpersonal', u'skills', u'customers', u'clearance', u'applicants', u'eligibility', u'classified', + u'clearance', u'polygraph', u'techexpousa', u'solutions', u'partnership', u'solutions', u'integration' + ], + [ + u'technologies', u'junction', u'develops', u'maintains', u'enhances', u'complex', u'diverse', u'intensive', + u'analytics', u'algorithm', u'manipulation', u'management', u'documented', u'individually', u'reviews', + u'tests', u'components', u'adherence', u'resolves', u'utilizes', u'methodologies', u'environment', u'input', + u'components', u'hardware', u'offs', u'reuse', u'cots', u'gots', u'synthesis', u'components', u'tasks', + u'individually', u'analyzes', u'modifies', u'debugs', u'corrects', u'integrates', u'operating', + u'environments', u'develops', u'queries', u'databases', u'repositories', u'recommendations', u'improving', + u'documentation', u'develops', u'implements', u'algorithms', u'functional', u'assists', u'developing', + u'executing', u'procedures', u'components', u'reviews', u'documentation', u'solutions', u'analyzing', + u'conferring', u'users', u'engineers', u'analyzing', u'investigating', u'areas', u'adapt', u'hardware', + u'mathematical', u'models', u'predict', u'outcome', u'implement', u'complex', u'database', u'repository', + u'interfaces', u'queries', u'bachelors', u'accredited', u'substituted', u'bachelors', u'firewalls', + u'ipsec', u'vpns', u'technology', u'administering', u'servers', u'apache', u'jboss', u'tomcat', + u'developing', u'interfaces', u'firefox', u'internet', u'explorer', u'operating', u'mainframe', + u'linux', u'solaris', u'virtual', u'scripting', u'programming', u'oriented', u'programming', u'ajax', + u'script', u'procedures', u'cobol', u'cognos', u'fusion', u'focus', u'html', u'java', u'java', u'script', + u'jquery', u'perl', u'visual', u'basic', u'powershell', u'cots', u'cots', u'oracle', u'apex', u'integration', + u'competitive', u'package', u'bonus', u'corporate', u'equity', u'tuition', u'reimbursement', u'referral', + u'bonus', u'holidays', u'insurance', u'flexible', u'disability', u'insurance' + ], [u'technologies', u'disability', u'accommodation', u'recruiter', u'techexpousa'], ['bank', 'river', 'shore', 'water'], ['river', 'water', 'flow', 'fast', 'tree'], @@ -165,7 +335,8 @@ def testConsistencyWithGensimModel(self): bow = self.model.id2word.doc2bow(texts_new) matrix_transformer_api = self.model.transform(bow) matrix_gensim_model = gensim_ldamodel[bow] - matrix_gensim_model_dense = matutils.sparse2full(matrix_gensim_model, 10) # convert into dense representation to be able to compare with transformer output + # convert into dense representation to be able to compare with transformer output + matrix_gensim_model_dense = matutils.sparse2full(matrix_gensim_model, 10) passed = numpy.allclose(matrix_transformer_api, matrix_gensim_model_dense, atol=1e-1) 
self.assertTrue(passed) diff --git a/gensim/test/test_tmdiff.py b/gensim/test/test_tmdiff.py index f49c930a63..e8dae8870e 100644 --- a/gensim/test/test_tmdiff.py +++ b/gensim/test/test_tmdiff.py @@ -50,7 +50,8 @@ def testIdentity(self): self.assertTrue(np.allclose(mdiff, np.zeros(mdiff.shape, dtype=mdiff.dtype))) # test for diagonal case - mdiff, annotation = self.model.diff(self.model, n_ann_terms=self.n_ann_terms, distance=dist_name, diagonal=True) + mdiff, annotation = \ + self.model.diff(self.model, n_ann_terms=self.n_ann_terms, distance=dist_name, diagonal=True) for (int_tokens, diff_tokens) in annotation: self.assertEqual(diff_tokens, []) diff --git a/gensim/test/test_translation_matrix.py b/gensim/test/test_translation_matrix.py index 2c68f2c5c1..6767b9150a 100644 --- a/gensim/test/test_translation_matrix.py +++ b/gensim/test/test_translation_matrix.py @@ -52,7 +52,9 @@ def test_translate_nn(self): model.train(self.word_pairs) test_source_word, test_target_word = zip(*self.test_word_pairs) - translated_words = model.translate(test_source_word, topn=5, source_lang_vec=self.source_word_vec, target_lang_vec=self.target_word_vec) + translated_words = model.translate( + test_source_word, topn=5, source_lang_vec=self.source_word_vec, target_lang_vec=self.target_word_vec + ) for idx, item in enumerate(self.test_word_pairs): self.assertTrue(item[1] in translated_words[item[0]]) @@ -63,7 +65,10 @@ def test_translate_gc(self): model.train(self.word_pairs) test_source_word, test_target_word = zip(*self.test_word_pairs) - translated_words = model.translate(test_source_word, topn=5, gc=1, sample_num=3, source_lang_vec=self.source_word_vec, target_lang_vec=self.target_word_vec) + translated_words = model.translate( + test_source_word, topn=5, gc=1, sample_num=3, + source_lang_vec=self.source_word_vec, target_lang_vec=self.target_word_vec + ) for idx, item in enumerate(self.test_word_pairs): self.assertTrue(item[1] in translated_words[item[0]]) @@ -93,12 +98,16 @@ def setUp(self): self.target_doc_vec = Doc2Vec.load(self.target_doc_vec_file) def test_translation_matrix(self): - model = translation_matrix.BackMappingTranslationMatrix(self.train_docs[:5], self.source_doc_vec, self.target_doc_vec) + model = translation_matrix.BackMappingTranslationMatrix( + self.train_docs[:5], self.source_doc_vec, self.target_doc_vec + ) transmat = model.train(self.train_docs[:5]) self.assertEqual(transmat.shape, (100, 100)) def test_infer_vector(self): - model = translation_matrix.BackMappingTranslationMatrix(self.train_docs[:5], self.source_doc_vec, self.target_doc_vec) + model = translation_matrix.BackMappingTranslationMatrix( + self.train_docs[:5], self.source_doc_vec, self.target_doc_vec + ) model.train(self.train_docs[:5]) infered_vec = model.infer_vector(self.target_doc_vec.docvecs[self.train_docs[5].tags]) self.assertEqual(infered_vec.shape, (100, )) diff --git a/gensim/test/test_word2vec.py b/gensim/test/test_word2vec.py index da535850ec..9a46d321da 100644 --- a/gensim/test/test_word2vec.py +++ b/gensim/test/test_word2vec.py @@ -104,7 +104,9 @@ def testBuildVocabFromFreq(self): self.assertEqual(model_neg.wv.vocab['time'].count, 2) self.assertEqual(model_neg.wv.vocab['interface'].count, 2) self.assertEqual(model_neg.wv.vocab['response'].count, 2) - new_freq_dict = {'computer': 1, 'artificial': 4, 'human': 1, 'graph': 1, 'intelligence': 4, 'system': 1, 'trees': 1} + new_freq_dict = { + 'computer': 1, 'artificial': 4, 'human': 1, 'graph': 1, 'intelligence': 4, 'system': 1, 'trees': 1 + } 
model_hs.build_vocab_from_freq(new_freq_dict, update=True) model_neg.build_vocab_from_freq(new_freq_dict, update=True) self.assertEqual(model_hs.wv.vocab['graph'].count, 4) @@ -416,7 +418,8 @@ def testVocab(self): model = word2vec.Word2Vec(min_count=1, hs=1, negative=0) model.build_vocab(corpus) self.assertTrue(len(model.wv.vocab) == 6981) - # with min_count=1, we're not throwing away anything, so make sure the word counts add up to be the entire corpus + # with min_count=1, we're not throwing away anything, + # so make sure the word counts add up to be the entire corpus self.assertEqual(sum(v.count for v in model.wv.vocab.values()), total_words) # make sure the binary codes are correct np.allclose(model.wv.vocab['the'].code, [1, 1, 0, 0]) @@ -885,7 +888,8 @@ def testPathLineSentencesOneFile(self): # class TestWord2VecScripts(unittest.TestCase): # def testWord2VecStandAloneScript(self): # """Does Word2Vec script launch standalone?""" -# cmd = 'python -m gensim.scripts.word2vec_standalone -train ' + datapath('testcorpus.txt') + ' -output vec.txt -size 200 -sample 1e-4 -binary 0 -iter 3 -min_count 1' +# cmd = 'python -m gensim.scripts.word2vec_standalone -train ' + datapath('testcorpus.txt') + \ +# ' -output vec.txt -size 200 -sample 1e-4 -binary 0 -iter 3 -min_count 1' # output = check_output(cmd, stderr=PIPE) # self.assertEqual(output, '0') # #endclass TestWord2VecScripts diff --git a/gensim/test/test_wordrank_wrapper.py b/gensim/test/test_wordrank_wrapper.py index 634afa71cc..10f335cae2 100644 --- a/gensim/test/test_wordrank_wrapper.py +++ b/gensim/test/test_wordrank_wrapper.py @@ -30,7 +30,7 @@ def setUp(self): return self.test_model = wordrank.Wordrank.train( self.wr_path, self.corpus_file, self.out_name, iter=6, - dump_period=5, period=5, np=2, cleanup_files=True + dump_period=5, period=5, np=4, cleanup_files=True ) def testLoadWordrankFormat(self): diff --git a/gensim/topic_coherence/segmentation.py b/gensim/topic_coherence/segmentation.py index 2db0d695d2..8d3185dbbb 100644 --- a/gensim/topic_coherence/segmentation.py +++ b/gensim/topic_coherence/segmentation.py @@ -24,7 +24,8 @@ def s_one_pre(topics): [[(2, 1), (3, 1), (3, 2)], [(5, 4), (6, 4), (6, 5)]] Args: - topics : list of topics obtained from an algorithm such as LDA. Is a list such as [array([ 9, 10, 11]), array([ 9, 10, 7]), ...] + topics : list of topics obtained from an algorithm such as LDA. + Is a list such as [array([ 9, 10, 11]), array([ 9, 10, 7]), ...] Returns: s_one_pre_res : list of list of (W', W*) tuples for all unique topic ids @@ -52,7 +53,8 @@ def s_one_one(topics): [[(1, 2), (1, 3), (2, 1), (2, 3), (3, 1), (3, 2)], [(4, 5), (4, 6), (5, 4), (5, 6), (6, 4), (6, 5)]] Args: - topics : list of topics obtained from an algorithm such as LDA. Is a list such as [array([ 9, 10, 11]), array([ 9, 10, 7]), ...] + topics : list of topics obtained from an algorithm such as LDA. + Is a list such as [array([ 9, 10, 11]), array([ 9, 10, 7]), ...] Returns: s_one_one_res : list of list of (W', W*) tuples for all unique topic ids @@ -84,7 +86,8 @@ def s_one_set(topics): (7, array([ 9, 10, 7]))]] Args: - topics : list of topics obtained from an algorithm such as LDA. Is a list such as [array([ 9, 10, 11]), array([ 9, 10, 7]), ...] + topics : list of topics obtained from an algorithm such as LDA. + Is a list such as [array([ 9, 10, 11]), array([ 9, 10, 7]), ...] Returns: s_one_set_res : list of list of (W', W*) tuples for all unique topic ids. 
diff --git a/gensim/utils.py b/gensim/utils.py index 0627f4703c..39bc87df9a 100644 --- a/gensim/utils.py +++ b/gensim/utils.py @@ -602,7 +602,9 @@ def is_corpus(obj): doc1 = next(iter(obj)) # empty corpus is resolved to False here if len(doc1) == 0: # sparse documents must have a __len__ function (list, tuple...) return True, obj # the first document is empty=>assume this is a corpus - id1, val1 = next(iter(doc1)) # if obj is a 1D numpy array(scalars) instead of 2-tuples, it resolves to False here + + # if obj is a 1D numpy array(scalars) instead of 2-tuples, it resolves to False here + id1, val1 = next(iter(doc1)) id1, val1 = int(id1), float(val1) # must be a 2-tuple (integer, float) except Exception: return False, obj @@ -757,7 +759,8 @@ def decode_htmlentities(text): """ Decode HTML entities in text, coded as hex, decimal or named. - Adapted from http://github.com/sku/python-twitter-ircbot/blob/321d94e0e40d0acc92f5bf57d126b57369da70de/html_decode.py + Adapted + from http://github.com/sku/python-twitter-ircbot/blob/321d94e0e40d0acc92f5bf57d126b57369da70de/html_decode.py >>> u = u'E tu vivrai nel terrore - L'aldilà (1981)' >>> print(decode_htmlentities(u).encode('UTF-8')) @@ -1055,7 +1058,9 @@ def lemmatize(content, allowed_tags=re.compile(r'(NN|VB|JJ|RB)'), light=False, """ if not has_pattern(): - raise ImportError("Pattern library is not installed. Pattern library is needed in order to use lemmatize function") + raise ImportError( + "Pattern library is not installed. Pattern library is needed in order to use lemmatize function" + ) from pattern.en import parse if light: @@ -1112,7 +1117,10 @@ def prune_vocab(vocab, min_reduce, trim_rule=None): if not keep_vocab_item(w, vocab[w], min_reduce, trim_rule): # vocab[w] <= min_reduce: result += vocab[w] del vocab[w] - logger.info("pruned out %i tokens with count <=%i (before %i, after %i)", old_len - len(vocab), min_reduce, old_len, len(vocab)) + logger.info( + "pruned out %i tokens with count <=%i (before %i, after %i)", + old_len - len(vocab), min_reduce, old_len, len(vocab) + ) return result diff --git a/setup.cfg b/setup.cfg index 26a4aa0132..b5f8a99afa 100644 --- a/setup.cfg +++ b/setup.cfg @@ -6,6 +6,3 @@ artifact_indexes= # Windows wheels buit by: # https://ci.appveyor.com/project/piskvorky/gensim http://17a25141cb7f75c18ee4-676a79255544e7711e0dd8bccdcdd1cb.r23.cf2.rackcdn.com - -[flake8] -ignore = E501,E12,W503 diff --git a/setup.py b/setup.py index 405174093a..0f9a518dbe 100644 --- a/setup.py +++ b/setup.py @@ -224,12 +224,20 @@ def finalize_options(self): """ +distributed_env = ['Pyro4 >= 4.27'] -test_env = [ +win_testenv = [ + 'pytest', + 'pytest-rerunfailures', + 'pytest-cov', + 'cython', + 'pyemd', 'testfixtures', - 'Morfessor == 2.0.2a4', 'scikit-learn', - 'pyemd', + 'Morfessor==2.0.2a4', +] + +linux_testenv = win_testenv + [ 'annoy', 'tensorflow <= 1.3.0', 'keras >= 2.0.4', @@ -291,11 +299,12 @@ def finalize_options(self): 'six >= 1.5.0', 'smart_open >= 1.2.1', ], - tests_require=test_env, + tests_require=linux_testenv, extras_require={ - 'distributed': ['Pyro4 >= 4.27'], - 'test': test_env, - 'docs': test_env + ['Pyro4 >= 4.27', 'sphinx', 'sphinxcontrib-napoleon', 'annoy'], + 'distributed': distributed_env, + 'test-win': win_testenv, + 'test': linux_testenv, + 'docs': linux_testenv + distributed_env + ['sphinx', 'sphinxcontrib-napoleon'], }, include_package_data=True, diff --git a/tox.ini b/tox.ini new file mode 100644 index 0000000000..a6e6044708 --- /dev/null +++ b/tox.ini @@ -0,0 +1,78 @@ +[tox] +minversion = 2.0 +envlist = 
{py27,py35,py36}-{win,linux}, flake8, docs, docs-upload, download-wheels, upload-wheels
+skipsdist = True
+platform = linux: linux
+           win: win64
+
+[flake8]
+ignore = E12, W503
+max-line-length = 120
+show-source = True
+builtins = get_ipython
+
+
+[pytest]
+addopts = -rfxEXs --durations=20 --showlocals --reruns 3 --cov=gensim --cov-report term
+
+
+[testenv]
+recreate = True
+install_command = pip install --timeout=60 --trusted-host 28daf2247a33ed269873-7b1aad3fab3cc330e1fd9d109892382a.r6.cf2.rackcdn.com --find-links http://28daf2247a33ed269873-7b1aad3fab3cc330e1fd9d109892382a.r6.cf2.rackcdn.com/ {opts} numpy==1.11.3 scipy==0.18.1 {packages}
+deps =
+    linux: .[test]
+    win: .[test-win]
+
+setenv =
+    FT_HOME={env:FT_HOME:}
+    WR_HOME={env:WR_HOME:}
+    VOWPAL_WABBIT_PATH={env:VOWPAL_WABBIT_PATH:}
+    DTM_PATH={env:DTM_PATH:}
+    MALLET_HOME={env:MALLET_HOME:}
+
+commands =
+    python -c "from gensim.models.word2vec import FAST_VERSION; print(FAST_VERSION)"
+    python setup.py build_ext --inplace
+    python -c "from gensim.models.word2vec import FAST_VERSION; print(FAST_VERSION)"
+    pytest {posargs:gensim/test}
+
+
+[testenv:flake8]
+recreate = True
+deps = flake8
+
+commands = flake8 gensim/
+
+
+[testenv:docs]
+recreate = True
+whitelist_externals = make
+deps = .[docs]
+changedir = docs/src
+
+commands = make clean html
+
+
+[testenv:docs-upload]
+recreate = True
+whitelist_externals = make
+deps = .[docs]
+changedir = docs/src
+
+commands = make clean html upload
+
+
+[testenv:download-wheels]
+deps = wheelhouse_uploader
+whitelist_externals = rm
+recreate = True
+
+commands =
+    rm -rf dist/
+    python setup.py sdist fetch_artifacts
+
+
+[testenv:upload-wheels]
+deps = wheelhouse_uploader
+
+commands = python setup.py register sdist upload
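
As a rough local-usage sketch (assuming tox is installed and using the environment names defined in the envlist above), the new setup can be exercised outside CI like so:

    pip install tox
    tox -e flake8,docs      # code style check plus documentation build
    tox -e py27-linux       # full test suite for one interpreter; py35-linux, py36-linux and the *-win variants work the same way

Running tox with no arguments would also attempt the docs-upload and wheel environments, which require the credentials used by the CI services.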