Skip to content

Commit

Permalink
Add filtering options for similar_users and similar_items (#496)
Browse files Browse the repository at this point in the history
  • Loading branch information
benfred authored Dec 2, 2021
1 parent b8d9feb commit f38b10c
Show file tree
Hide file tree
Showing 6 changed files with 190 additions and 29 deletions.
59 changes: 50 additions & 9 deletions implicit/cpu/matrix_factorization_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ def recommend(

# check selected items are in the model
if items.max() >= self.item_factors.shape[0] or items.min() < 0:
raise IndexError("Some itemids are not in the model")
raise IndexError("Some itemids in the items parameter in are not in the model")

# get a CSR matrix of items to filter per-user
filter_query_items = None
Expand Down Expand Up @@ -136,19 +136,42 @@ def recalculate_user(self, userid, user_items):
def recalculate_item(self, itemid, react_users):
    """Reject on-the-fly item recalculation.

    Both arguments are accepted but never read: the method unconditionally
    raises ``NotImplementedError``.
    """
    message = "recalculate_item is not supported with this model"
    raise NotImplementedError(message)

def similar_users(self, userid, N=10, filter_users=None, users=None):
    """Find the users most similar to ``userid``.

    Parameters
    ----------
    userid : int or array_like
        Used to index ``self.user_factors`` and ``self.user_norms`` to obtain
        the query factor(s) and norm(s).
    N : int, optional
        Number of results requested; forwarded to ``_get_similarity_score``.
    filter_users : array_like, optional
        User ids to exclude; forwarded to ``_get_similarity_score`` as its
        ``filter_items`` argument. Cannot be combined with ``users``.
    users : array_like, optional
        If given, only these user ids are considered as candidates.

    Returns
    -------
    tuple
        ``(ids, scores)`` as returned by ``_get_similarity_score``; when
        ``users`` is given the ids are translated back to the original user ids.

    Raises
    ------
    ValueError
        If both ``users`` and ``filter_users`` are passed.
    IndexError
        If any id in ``users`` lies outside ``[0, number of users)``.
    """
    user_factors = self.user_factors
    norms = self.user_norms
    norm = norms[userid]

    # if we have a user list to restrict down to, we need to filter the user_factors
    if users is not None:
        # `is not None` instead of truthiness: truth-testing an array is
        # ambiguous, and an empty list would slip through silently
        if filter_users is not None:
            raise ValueError("Can't set both users and filter_users in similar_users call")

        users = np.asarray(users)

        # check the selected users are in the model *before* indexing with them,
        # otherwise NumPy raises its own generic out-of-bounds error first
        if users.max() >= self.user_factors.shape[0] or users.min() < 0:
            raise IndexError("Some userids in the users parameter are not in the model")

        user_factors = user_factors[users]
        norms = norms[users]

    factor = self.user_factors[userid]
    ids, scores = self._get_similarity_score(
        factor, norm, user_factors, norms, N, filter_items=filter_users
    )

    if users is not None:
        # ids are positions inside the restricted candidate set; map them
        # back to the caller's user ids
        ids = users[ids]

    return ids, scores

similar_users.__doc__ = RecommenderBase.similar_users.__doc__

def similar_items(self, itemid, N=10, react_users=None, recalculate_item=False):
def similar_items(
self, itemid, N=10, react_users=None, recalculate_item=False, filter_items=None, items=None
):
factor = self._item_factor(itemid, react_users, recalculate_item)
factors = self.item_factors
norms = self.item_norms

if recalculate_item:
if np.isscalar(itemid):
norm = np.linalg.norm(factor)
Expand All @@ -159,12 +182,30 @@ def similar_items(self, itemid, N=10, react_users=None, recalculate_item=False):
else:
norm = norms[itemid]

return self._get_similarity_score(factor, norm, factors, norms, N)
# if we have an item list to restrict down to, we need to filter the item_factors
if items is not None:
if filter_items:
raise ValueError("Can't set both items and filter_items in similar_items call")

items = np.array(items)
factors = factors[items]
norms = norms[items]

# check selected items are in the model
if items.max() >= self.item_factors.shape[0] or items.min() < 0:
raise IndexError("Some itemids in the items parameter are not in the model")

ids, scores = self._get_similarity_score(
factor, norm, factors, norms, N, filter_items=filter_items
)
if items is not None:
ids = items[ids]
return ids, scores

similar_items.__doc__ = RecommenderBase.similar_items.__doc__

def _get_similarity_score(self, factor, norm, factors, norms, N):
ids, scores = topk(factors, factor, N, item_norms=norms)
def _get_similarity_score(self, factor, norm, factors, norms, N, filter_items=None):
ids, scores = topk(factors, factor, N, item_norms=norms, filter_items=filter_items)
if np.isscalar(norm):
ids, scores = ids[0], scores[0]
scores /= norm
Expand Down
8 changes: 6 additions & 2 deletions implicit/gpu/knn.cu
Original file line number Diff line number Diff line change
Expand Up @@ -162,9 +162,13 @@ void KnnQuery::topk(const Matrix & items, const Matrix & query, int k,
auto count = thrust::make_counting_iterator<int>(0);
float * data = temp_distances.data;
int * items = item_filter->data;
thrust::for_each(count, count + item_filter->size,
int items_size = item_filter->size;
int cols = temp_distances.cols;
thrust::for_each(count, count + items_size * temp_distances.rows,
[=] __device__(int i) {
data[items[i]] = -FLT_MAX;
int col = items[i % items_size];
int row = i / items_size;
data[row * cols + col] = -FLT_MAX;
});
}

Expand Down
57 changes: 52 additions & 5 deletions implicit/gpu/matrix_factorization_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ def recommend(
if items.max() >= self.item_factors.shape[0] or items.min() < 0:
raise IndexError("Some itemids are not in the model")

if filter_items:
if filter_items is not None:
filter_items = implicit.gpu.IntVector(np.array(filter_items, dtype="int32"))

query_filter = None
Expand Down Expand Up @@ -105,11 +105,33 @@ def item_norms(self):
self._item_norms_host = self._item_norms.to_numpy().reshape(self._item_norms.shape[1])
return self._item_norms

def similar_users(self, userid, N=10):
def similar_users(self, userid, N=10, filter_users=None, users=None):
norms = self.user_norms
user_factors = self.user_factors
if users is not None:
if filter_users:
raise ValueError("Can't set both users and filter_users in similar_users call")

users = np.array(users)
user_factors = user_factors[users]

# TODO: we should be able to do this all on the GPU
norms = implicit.gpu.Matrix(self._user_norms_host[users].reshape(1, len(users)))

# check selected items are in the model
if users.max() >= self.user_factors.shape[0] or users.min() < 0:
raise IndexError("Some userids in the users parameter are not in the model")

if filter_users is not None:
filter_users = implicit.gpu.IntVector(np.array(filter_users, dtype="int32"))

ids, scores = self._knn.topk(
self.user_factors, self.user_factors[userid], N, self.user_norms
user_factors, self.user_factors[userid], N, norms, item_filter=filter_users
)

if users is not None:
ids = users[ids]

user_norms = self._user_norms_host[userid]
if np.isscalar(userid):
ids, scores = ids[0], scores[0]
Expand All @@ -120,13 +142,38 @@ def similar_users(self, userid, N=10):

similar_users.__doc__ = RecommenderBase.similar_users.__doc__

def similar_items(self, itemid, N=10, react_users=None, recalculate_item=False):
def similar_items(
self, itemid, N=10, react_users=None, recalculate_item=False, filter_items=None, items=None
):
if recalculate_item:
raise NotImplementedError("recalculate_item isn't support on GPU yet")

item_factors = self.item_factors
norms = self.item_norms
if items is not None:
if filter_items:
raise ValueError("Can't set both items and filter_items in similar_items call")

items = np.array(items)

# TODO: we should be able to do this all on the GPU
norms = implicit.gpu.Matrix(self._item_norms_host[items].reshape(1, len(items)))
item_factors = item_factors[items]

# check selected items are in the model
if items.max() >= self.item_factors.shape[0] or items.min() < 0:
raise IndexError("Some itemids are not in the model")

if filter_items is not None:
filter_items = implicit.gpu.IntVector(np.array(filter_items, dtype="int32"))

ids, scores = self._knn.topk(
self.item_factors, self.item_factors[itemid], N, self.item_norms
item_factors, self.item_factors[itemid], N, norms, item_filter=filter_items
)

if items is not None:
ids = items[ids]

item_norms = self._item_norms_host[itemid]
if np.isscalar(itemid):
ids, scores = ids[0], scores[0]
Expand Down
36 changes: 29 additions & 7 deletions implicit/nearest_neighbours.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ def recommend(
if userid >= user_items.shape[0]:
raise ValueError("userid is out of bounds of the user_items matrix")

if filter_items and items:
if filter_items is not None and items is not None:
raise ValueError("Can't specify both filter_items and items")

if filter_items is not None:
Expand Down Expand Up @@ -96,22 +96,43 @@ def recommend(

return ids, scores

def similar_users(self, userid, N=10, filter_users=None, users=None):
    """Unsupported for this recommender: always raises ``NotImplementedError``.

    The parameters mirror the ``similar_users`` signature used elsewhere in
    the code base but are never read.
    """
    raise NotImplementedError("similar_users isn't implemented for item-item recommenders")

def similar_items(
    self, itemid, N=10, react_users=None, recalculate_item=False, filter_items=None, items=None
):
    """Return the items most similar to ``itemid`` from ``self.similarity``.

    Parameters
    ----------
    itemid : int or array_like
        A scalar id selects one row of ``self.similarity``; anything
        non-scalar is delegated to ``_batch``.
    N : int, optional
        Maximum number of results to return.
    react_users : optional
        Accepted for signature compatibility; not read by this method.
    recalculate_item : bool, optional
        Must be falsy; recalculation is not implemented here.
    filter_items : array_like, optional
        Item ids to drop from the results. Cannot be combined with ``items``.
    items : array_like, optional
        Restrict the results to these item ids. Requested ids that have no
        stored similarity are still returned, with the most negative finite
        score of the score dtype.

    Returns
    -------
    tuple
        ``(ids, scores)`` arrays ordered by descending score. Both are empty
        when ``itemid`` is beyond the rows of ``self.similarity``.

    Raises
    ------
    NotImplementedError
        If ``recalculate_item`` is truthy.
    ValueError
        If both ``filter_items`` and ``items`` are given.
    """
    if recalculate_item:
        raise NotImplementedError("Recalculate_item isn't implemented")

    if not np.isscalar(itemid):
        return _batch(self.similar_items, itemid, N=N, filter_items=filter_items, items=items)

    if filter_items is not None and items is not None:
        raise ValueError("Can't specify both filter_items and items")

    if itemid >= self.similarity.shape[0]:
        return np.array([]), np.array([])

    # slice the sparse row once and reuse it for both ids and scores
    row = self.similarity[itemid]
    ids, scores = row.indices, row.data

    if filter_items is not None:
        keep = np.isin(ids, filter_items, invert=True)
        ids, scores = ids[keep], scores[keep]

    elif items is not None:
        # a plain list can't be indexed with a boolean mask below
        items = np.asarray(items)

        keep = np.isin(ids, items)
        ids, scores = ids[keep], scores[keep]

        # returned items should be equal to input selected items: pad with the
        # requested ids that have no stored similarity, ranked last
        missing = items[np.isin(items, ids, invert=True)]
        if missing.size:
            ids = np.append(ids, missing)
            scores = np.append(scores, np.full(missing.size, -np.finfo(scores.dtype).max))

    best = np.argsort(scores)[::-1][:N]
    return ids[best], scores[best]

Expand Down Expand Up @@ -226,8 +247,9 @@ def _batch(func, ids, *args, N=10, **kwargs):
batch_ids, batch_scores = func(idx, *args, N=N, **kwargs)

# pad out to N items if we're returned fewer
missing_items = len(batch_ids) - N
if missing_items:
missing_items = N - len(batch_ids)
print("i", i, "idx", idx, " missing ", missing_items)
if missing_items > 0:
batch_ids = np.append(batch_ids, np.full(missing_items, -1))
batch_scores = np.append(
batch_scores, np.full(missing_items, -np.finfo(np.float32).max)
Expand Down
22 changes: 17 additions & 5 deletions implicit/recommender_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,25 +74,32 @@ def recommend(
"""

@abstractmethod
def similar_users(self, userid, N=10, filter_users=None, users=None):
    """
    Calculates the most similar users for a userid or array of userids

    Parameters
    ----------
    userid : Union[int, array_like]
        The userid or an array of userids to retrieve similar users for.
    N : int, optional
        The number of similar users to return
    filter_users: array_like, optional
        An array of user ids to filter out from the results being returned
    users: array_like, optional
        An array of user ids to include in the output. If not set all users in the training
        set will be included. Cannot be used with the filter_users option

    Returns
    -------
    tuple
        Tuple of (userids, scores) arrays
    """

@abstractmethod
def similar_items(self, itemid, N=10, react_users=None, recalculate_item=False):
def similar_items(
self, itemid, N=10, react_users=None, recalculate_item=False, filter_items=None, items=None
):
"""
Calculates a list of similar items
Expand All @@ -108,6 +115,11 @@ def similar_items(self, itemid, N=10, react_users=None, recalculate_item=False):
recalculate_item : bool, optional
When true, don't rely on stored item state and instead recalculate from the
passed in react_users
filter_items: array_like, optional
An array of item ids to filter out from the results being returned
items: array_like, optional
An array of item ids to include in the output. If not set all items in the training
set will be included. Cannot be used with the filter_items options
Returns
-------
Expand Down
37 changes: 36 additions & 1 deletion tests/recommender_base_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ def get_checker_board(X):
return csr_matrix(ret - np.eye(X))


class RecommenderBaseTestMixin(object):
class RecommenderBaseTestMixin:
"""Mixin to test a bunch of common functionality in models
deriving from RecommenderBase"""

Expand Down Expand Up @@ -163,6 +163,25 @@ def test_similar_users_batch(self):
for r in ids[userid]:
self.assertEqual(r % 2, userid % 2)

def test_similar_users_filter(self):
    """similar_users must honour both the ``filter_users`` and ``users`` options."""
    model = self._get_model()
    if isinstance(model, ItemItemRecommender):
        # calculating similar users in nearest-neighbours is not implemented yet
        return

    model.fit(get_checker_board(256), show_progress=False)
    query_users = np.arange(50)

    # filter out every multiple of 5: none of them may show up in any result
    excluded = np.arange(52) * 5
    found, _ = model.similar_users(query_users, N=10, filter_users=excluded)
    for uid in query_users:
        self.assertTrue(all(r % 5 != 0 for r in found[uid]))

    # restrict to 10 candidates and ask for 10 results: every query must
    # return exactly the selected ids
    allowed = np.arange(10)
    expected = set(allowed)
    found, _ = model.similar_users(query_users, N=10, users=allowed)
    for uid in query_users:
        self.assertEqual(set(found[uid]), expected)

def test_similar_items(self):
model = self._get_model()
model.fit(get_checker_board(256), show_progress=False)
Expand Down Expand Up @@ -198,6 +217,22 @@ def check_results(ids):
# some models don't support recalculating user on the fly, and that's ok
pass

def test_similar_items_filter(self):
    """similar_items must honour both the ``filter_items`` and ``items`` options."""
    model = self._get_model()
    model.fit(get_checker_board(256), show_progress=False)
    query_items = np.arange(50)

    # filter out every multiple of 5: none of them may show up in any result
    excluded = np.arange(52) * 5
    found, _ = model.similar_items(query_items, N=10, filter_items=excluded)
    for iid in query_items:
        self.assertTrue(all(r % 5 != 0 for r in found[iid]))

    # restrict to 10 candidates and ask for 10 results: every query must
    # return exactly the selected ids
    allowed = np.arange(10)
    expected = set(allowed)
    found, _ = model.similar_items(query_items, N=10, items=allowed)
    for iid in query_items:
        self.assertEqual(set(found[iid]), expected)

def test_zero_length_row(self):
# get a matrix where a row/column is 0
item_users = get_checker_board(50).todense()
Expand Down

0 comments on commit f38b10c

Please sign in to comment.