Api refactor #494
Merged (10 commits, Jan 3, 2022)
17 changes: 11 additions & 6 deletions .github/workflows/build.yml
@@ -3,11 +3,7 @@

name: Build

on:
push:
branches: [ main ]
pull_request:
branches: [ main ]
on: [push, pull_request]

jobs:
build:
@@ -27,8 +23,11 @@ jobs:
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install flake8 isort cpplint black pytest
pip install flake8 isort cpplint black pytest codespell h5py pylint
pip install -r requirements.txt
- name: Install ANN Libraries
run: pip install annoy nmslib
if: runner.os == 'Linux'
- name: Lint with flake8
run: |
# stop the build if there are Python syntax errors or undefined names
@@ -42,6 +41,12 @@
- name: Lint with isort
run: |
isort -c .
- name: Lint with codespell
run: |
codespell
- name: Lint with pylint
run: |
pylint implicit
- name: Build
run: |
python setup.py develop
22 changes: 22 additions & 0 deletions .pre-commit-config.yaml
@@ -0,0 +1,22 @@
repos:
- repo: https://github.com/timothycrosley/isort
rev: 5.10.1
hooks:
- id: isort
additional_dependencies: [toml]
- repo: https://github.com/python/black
rev: 21.12b0
hooks:
- id: black
- repo: https://github.com/pycqa/flake8
rev: 4.0.1
hooks:
- id: flake8
- repo: https://github.com/pycqa/pylint
rev: v2.12.1
hooks:
- id: pylint
- repo: https://github.com/codespell-project/codespell
rev: v2.1.0
hooks:
- id: codespell
49 changes: 49 additions & 0 deletions .pylintrc
@@ -0,0 +1,49 @@
[MASTER]

ignore-patterns=setup.py

extension-pkg-whitelist=implicit.cpu._als,implicit._nearest_neighbours,implicit.gpu._cuda,implicit.cpu.bpr,implicit.cpu.topk,numpy.random.mtrand,nmslib,faiss

[MESSAGES CONTROL]
disable=fixme,
missing-function-docstring,
missing-module-docstring,
missing-class-docstring,
wrong-import-order,
wrong-import-position,
ungrouped-imports,
line-too-long,
superfluous-parens,
trailing-whitespace,
invalid-name,
import-error,
no-self-use,

# disable code-complexity check
too-many-function-args,
too-many-instance-attributes,
too-many-locals,
too-many-branches,
too-many-nested-blocks,
too-many-statements,
too-many-arguments,
too-many-return-statements,
too-many-lines,
too-few-public-methods,

# TODO: fix underlying errors for these
import-outside-toplevel,
not-callable,
unused-argument,
abstract-method,
arguments-differ,
no-member,
no-name-in-module,
arguments-renamed,
import-self,
protected-access,

[SIMILARITIES]
min-similarity-lines=50
ignore-docstrings=yes
ignore-imports=yes
14 changes: 6 additions & 8 deletions README.md
@@ -53,12 +53,11 @@ import implicit
# initialize a model
model = implicit.als.AlternatingLeastSquares(factors=50)

# train the model on a sparse matrix of item/user/confidence weights
model.fit(item_user_data)
# train the model on a sparse matrix of user/item/confidence weights
model.fit(user_item_data)

# recommend items for a user
user_items = item_user_data.T.tocsr()
recommendations = model.recommend(userid, user_items)
recommendations = model.recommend(userid, user_item_data)

# find related items
related = model.similar_items(itemid)
@@ -88,12 +87,11 @@ There are also several other blog posts about using Implicit to build recommenda

#### Requirements

This library requires SciPy version 0.16 or later. Running on OSX requires an OpenMP compiler,
which can be installed with homebrew: ```brew install gcc```. Running on Windows requires Python
3.5+.
This library requires SciPy version 0.16 or later and Python version 3.6 or later.
Running on OSX requires an OpenMP compiler, which can be installed with homebrew: ```brew install gcc```.

GPU Support requires at least version 11 of the [NVidia CUDA Toolkit](https://developer.nvidia.com/cuda-downloads). The build will use the ```nvcc``` compiler
that is found on the path, but this can be overriden by setting the CUDAHOME enviroment variable
that is found on the path, but this can be overridden by setting the CUDAHOME environment variable
to point to your cuda installation.

This library has been tested with Python 3.6, 3.7, 3.8 and 3.9 on Ubuntu, OSX and Windows.
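The README change in this diff switches the expected matrix orientation from item/user to user/item: `fit` and `recommend` now both take the user/item matrix directly. A minimal sketch of adapting existing item/user data, assuming SciPy is installed (the matrix values here are illustrative, not from the PR):

```python
import numpy as np
from scipy.sparse import csr_matrix

# An item/user confidence matrix (3 items x 2 users), as the old API expected.
item_user_data = csr_matrix(np.array([
    [1.0, 0.0],
    [0.0, 2.0],
    [3.0, 4.0],
]))

# The refactored API expects user/item orientation: transpose once and
# convert back to CSR, then pass the result to both fit() and recommend().
user_item_data = item_user_data.T.tocsr()

print(user_item_data.shape)  # (2, 3): users x items
```

Doing the transpose once up front also removes the per-call `plays.T.tocsr()` conversions the old examples needed.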
2 changes: 1 addition & 1 deletion benchmarks/README.md
@@ -66,4 +66,4 @@ Note that this dataset was filtered down for all versions to reviews that were p
stars), to simulate a truly implicit dataset.

Implicit on the CPU seems to suffer a bit here relative to the other options. It seems like there might
be a single threaded bottleneck at some point thats worth examining later.
be a single threaded bottleneck at some point that's worth examining later.
2 changes: 0 additions & 2 deletions cuda_setup.py
@@ -61,8 +61,6 @@ def locate_cuda():

post_args = [
"-arch=sm_60",
"-gencode=arch=compute_50,code=sm_50",
"-gencode=arch=compute_52,code=sm_52",
"-gencode=arch=compute_60,code=sm_60",
"-gencode=arch=compute_61,code=sm_61",
"-gencode=arch=compute_70,code=sm_70",
18 changes: 9 additions & 9 deletions docs/ann.rst
@@ -13,20 +13,20 @@ See `this post comparing the different ANN libraries
<http://www.benfrederickson.com/approximate-nearest-neighbours-for-recommender-systems/>`_ for
more details.

NMSLibAlternatingLeastSquares
-----------------------------
.. autoclass:: implicit.approximate_als.NMSLibAlternatingLeastSquares
NMSLibModel
-----------
.. autoclass:: implicit.ann.nmslib.NMSLibModel
:members:
:show-inheritance:

AnnoyAlternatingLeastSquares
----------------------------
.. autoclass:: implicit.approximate_als.AnnoyAlternatingLeastSquares
AnnoyModel
----------
.. autoclass:: implicit.ann.annoy.AnnoyModel
:members:
:show-inheritance:

FaissAlternatingLeastSquares
-----------------------------
.. autoclass:: implicit.approximate_als.FaissAlternatingLeastSquares
FaissModel
----------
.. autoclass:: implicit.ann.faiss.FaissModel
:members:
:show-inheritance:
35 changes: 23 additions & 12 deletions examples/lastfm.py
@@ -85,10 +85,11 @@ def calculate_similar_artists(output_filename, model_name="als"):

# this is actually disturbingly expensive:
plays = plays.tocsr()
user_plays = plays.T.tocsr()

logging.debug("training model %s", model_name)
start = time.time()
model.fit(plays)
model.fit(user_plays)
logging.debug("trained model '%s' in %0.2fs", model_name, time.time() - start)

# write out similar artists by popularity
@@ -102,11 +103,15 @@ def calculate_similar_artists(output_filename, model_name="als"):
logging.debug("writing similar items")
with tqdm.tqdm(total=len(to_generate)) as progress:
with codecs.open(output_filename, "w", "utf8") as o:
for artistid in to_generate:
artist = artists[artistid]
for other, score in model.similar_items(artistid, 11):
o.write("%s\t%s\t%s\n" % (artist, artists[other], score))
progress.update(1)
batch_size = 1000
for startidx in range(0, len(to_generate), batch_size):
batch = to_generate[startidx : startidx + batch_size]
ids, scores = model.similar_items(batch, 11)
for i, artistid in enumerate(batch):
artist = artists[artistid]
for other, score in zip(ids[i], scores[i]):
o.write("%s\t%s\t%s\n" % (artist, artists[other], score))
progress.update(batch_size)

logging.debug("generated similar artists in %0.2fs", time.time() - start)

@@ -131,21 +136,27 @@ def calculate_recommendations(output_filename, model_name="als"):

# this is actually disturbingly expensive:
plays = plays.tocsr()
user_plays = plays.T.tocsr()

logging.debug("training model %s", model_name)
start = time.time()
model.fit(plays)
model.fit(user_plays)
logging.debug("trained model '%s' in %0.2fs", model_name, time.time() - start)

# generate recommendations for each user and write out to a file
start = time.time()
user_plays = plays.T.tocsr()
with tqdm.tqdm(total=len(users)) as progress:
with codecs.open(output_filename, "w", "utf8") as o:
for userid, username in enumerate(users):
for artistid, score in model.recommend(userid, user_plays):
o.write("%s\t%s\t%s\n" % (username, artists[artistid], score))
progress.update(1)
batch_size = 1000
to_generate = np.arange(len(users))
for startidx in range(0, len(to_generate), batch_size):
batch = to_generate[startidx : startidx + batch_size]
ids, scores = model.recommend(batch, user_plays, filter_already_liked_items=True)
for i, userid in enumerate(batch):
username = users[userid]
for other, score in zip(ids[i], scores[i]):
o.write("%s\t%s\t%s\n" % (username, artists[other], score))
progress.update(batch_size)
logging.debug("generated recommendations in %0.2fs", time.time() - start)


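The lastfm.py refactor above replaces one `similar_items`/`recommend` call per id with batched calls over slices of ids, each returning parallel arrays of ids and scores. The batching pattern can be sketched on its own, with a stand-in scoring function in place of the real model (the function and its values are illustrative, not the library's API):

```python
import numpy as np

def similar_items_batch(batch, k):
    # Stand-in for model.similar_items(batch, k): returns (ids, scores)
    # arrays of shape (len(batch), k), mirroring the batched return shape
    # used in the refactored example.
    ids = np.tile(np.arange(k), (len(batch), 1))
    scores = np.ones((len(batch), k))
    return ids, scores

to_generate = np.arange(2500)
batch_size = 1000
rows = []
for startidx in range(0, len(to_generate), batch_size):
    batch = to_generate[startidx : startidx + batch_size]
    ids, scores = similar_items_batch(batch, 11)
    for i, itemid in enumerate(batch):
        # ids[i] / scores[i] hold the k results for the i-th id in the batch
        for other, score in zip(ids[i], scores[i]):
            rows.append((itemid, other, score))

print(len(rows))  # 27500: 2500 ids x 11 results each
```

Batching amortizes the per-call overhead, which is why the example also updates the progress bar by `batch_size` instead of 1.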
6 changes: 4 additions & 2 deletions examples/movielens.py
@@ -73,10 +73,12 @@ def calculate_similar_movies(output_filename, model_name="als", min_rating=4.0,
else:
raise NotImplementedError("TODO: model %s" % model_name)

user_ratings = ratings.T.tocsr()

# train the model
log.debug("training model %s", model_name)
start = time.time()
model.fit(ratings)
model.fit(user_ratings)
log.debug("trained model '%s' in %s", model_name, time.time() - start)
log.debug("calculating top movies")

@@ -91,7 +93,7 @@ def calculate_similar_movies(output_filename, model_name="als", min_rating=4.0,
# no ratings > 4 meaning we've filtered out all data for it.
if ratings.indptr[movieid] != ratings.indptr[movieid + 1]:
title = titles[movieid]
for other, score in model.similar_items(movieid, 11):
for other, score in zip(*model.similar_items(movieid, 11)):
o.write("%s\t%s\t%s\n" % (title, titles[other], score))
progress.update(1)

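The movielens.py change above reflects that `similar_items` now returns a tuple of parallel arrays rather than a list of `(id, score)` pairs, hence the `zip(*...)` unpacking. In isolation (the values below are made up for illustration):

```python
import numpy as np

# similar_items-style return value: a tuple of parallel ids/scores arrays.
ids = np.array([10, 42, 7])
scores = np.array([0.99, 0.87, 0.63])
result = (ids, scores)

# zip(*result) turns the pair of arrays back into (id, score) pairs,
# which is why the example writes zip(*model.similar_items(movieid, 11)).
pairs = [(other, score) for other, score in zip(*result)]
print(len(pairs))  # 3
```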
2 changes: 1 addition & 1 deletion implicit/__init__.py
@@ -2,4 +2,4 @@

__version__ = "0.4.8"

__all__ = [als, approximate_als, bpr, nearest_neighbours, lmf, __version__]
__all__ = ["als", "approximate_als", "bpr", "nearest_neighbours", "lmf", "__version__"]
8 changes: 4 additions & 4 deletions implicit/_nearest_neighbours.pyx
@@ -30,7 +30,7 @@ cdef extern from "nearest_neighbours.h" namespace "implicit" nogil:

cdef class NearestNeighboursScorer(object):
""" Class to return the top K items from multipying a users likes
by a precomputed similarity vector. """
by a precomputed sparse similarity matrix. """
cdef SparseMatrixMultiplier[int, double] * neighbours

cdef int[:] similarity_indptr
@@ -97,11 +97,11 @@ cdef class NearestNeighboursScorer(object):


@cython.boundscheck(False)
def all_pairs_knn(items, unsigned int K=100, int num_threads=0, show_progress=True):
def all_pairs_knn(users, unsigned int K=100, int num_threads=0, show_progress=True):
""" Returns the top K nearest neighbours for each row in the matrix.
"""
items = items.tocsr()
users = items.T.tocsr()
users = users.tocsr()
items = users.T.tocsr()

cdef int item_count = items.shape[0]
cdef int i, u, index1, index2, j
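The signature change above means `all_pairs_knn` now receives the user/item matrix and derives the item/user view itself, consistent with the rest of the refactor. As a rough illustration of its contract only (a dense brute-force stand-in, not the actual Cython implementation), finding the top K most similar items might look like:

```python
import numpy as np

def all_pairs_knn_dense(users, K):
    # users: dense user x item matrix; transpose to item x user, as the
    # refactored function does internally.
    items = users.T
    # cosine similarity between every pair of item rows
    norms = np.linalg.norm(items, axis=1, keepdims=True)
    norms[norms == 0] = 1.0
    normalized = items / norms
    sims = normalized @ normalized.T
    # top K most similar items per row (the item itself ranks first)
    return np.argsort(-sims, axis=1)[:, :K]

users = np.array([[1.0, 1.0, 0.0],
                  [0.0, 1.0, 1.0]])
neighbours = all_pairs_knn_dense(users, K=2)
print(neighbours.shape)  # (3, 2): one row of K neighbours per item
```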
23 changes: 11 additions & 12 deletions implicit/als.py
@@ -61,15 +61,14 @@ def AlternatingLeastSquares(
calculate_training_loss=calculate_training_loss,
random_state=random_state,
)
else:
return implicit.cpu.als.AlternatingLeastSquares(
factors,
regularization,
dtype,
use_native,
use_cg,
iterations,
calculate_training_loss,
num_threads,
random_state,
)
return implicit.cpu.als.AlternatingLeastSquares(
factors,
regularization,
dtype,
use_native,
use_cg,
iterations,
calculate_training_loss,
num_threads,
random_state,
)
Empty file added implicit/ann/__init__.py
Empty file.