Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix save/load for models that haven't been fit #577

Merged
merged 3 commits into from
Jun 3, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 12 additions & 18 deletions implicit/cpu/als.py
Original file line number Diff line number Diff line change
Expand Up @@ -410,14 +410,16 @@ def to_gpu(self):
regularization=self.regularization,
iterations=self.iterations,
calculate_training_loss=self.calculate_training_loss,
random_state=self.random_state,
)
ret.user_factors = implicit.gpu.Matrix(self.user_factors)
ret.item_factors = implicit.gpu.Matrix(self.item_factors)
if self.user_factors is not None:
ret.user_factors = implicit.gpu.Matrix(self.user_factors)
if self.item_factors is not None:
ret.item_factors = implicit.gpu.Matrix(self.item_factors)
return ret

def save(self, file):
np.savez(
file,
def save(self, fileobj_or_path):
args = dict(
user_factors=self.user_factors,
item_factors=self.item_factors,
regularization=self.regularization,
Expand All @@ -429,20 +431,12 @@ def save(self, file):
cg_steps=self.cg_steps,
calculate_training_loss=self.calculate_training_loss,
dtype=self.dtype.name,
random_state=self.random_state,
)

@classmethod
def load(cls, file):
if isinstance(file, str) and not file.endswith(".npz"):
file = file + ".npz"
with np.load(file, allow_pickle=False) as data:
ret = cls()
for k, v in data.items():
if k == "dtype":
ret.dtype = np.dtype(str(v))
else:
setattr(ret, k, v)
return ret
# filter out None-valued args: np.savez would store them as object arrays,
# which np.load can't read back without allow_pickle=True
args = {k: v for k, v in args.items() if v is not None}
np.savez(fileobj_or_path, **args)


def least_squares(Cui, X, Y, regularization, num_threads=0):
Expand Down
31 changes: 13 additions & 18 deletions implicit/cpu/bpr.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -213,15 +213,17 @@ class BayesianPersonalizedRanking(MatrixFactorizationBase):
regularization=self.regularization,
iterations=self.iterations,
verify_negative_samples=self.verify_negative_samples,
random_state=self.random_state,
)
ret.user_factors = implicit.gpu.Matrix(self.user_factors)
ret.item_factors = implicit.gpu.Matrix(self.item_factors)

if self.user_factors is not None:
ret.user_factors = implicit.gpu.Matrix(self.user_factors)
if self.item_factors is not None:
ret.item_factors = implicit.gpu.Matrix(self.item_factors)
return ret

def save(self, file):
np.savez(
file,
user_factors=self.user_factors,
def save(self, fileobj_or_path):
args = dict(user_factors=self.user_factors,
item_factors=self.item_factors,
regularization=self.regularization,
factors=self.factors,
Expand All @@ -230,20 +232,13 @@ class BayesianPersonalizedRanking(MatrixFactorizationBase):
num_threads=self.num_threads,
iterations=self.iterations,
dtype=self.dtype.name,
random_state=self.random_state
)

@classmethod
def load(cls, file):
if isinstance(file, str) and not file.endswith(".npz"):
file = file + ".npz"
with np.load(file, allow_pickle=False) as data:
ret = cls()
for k, v in data.items():
if k == "dtype":
ret.dtype = np.dtype(str(v))
else:
setattr(ret, k, v)
return ret
# filter out None-valued args: np.savez would store them as object arrays,
# which np.load can't read back without allow_pickle=True
args = {k:v for k,v in args.items() if v is not None}
np.savez(fileobj_or_path, **args)


@cython.cdivision(True)
Expand Down
25 changes: 8 additions & 17 deletions implicit/cpu/lmf.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -196,9 +196,8 @@ class LogisticMatrixFactorization(MatrixFactorizationBase):

self._check_fit_errors()

def save(self, file):
np.savez(
file,
def save(self, fileobj_or_path):
args = dict(
user_factors=self.user_factors,
item_factors=self.item_factors,
regularization=self.regularization,
Expand All @@ -208,20 +207,12 @@ class LogisticMatrixFactorization(MatrixFactorizationBase):
num_threads=self.num_threads,
iterations=self.iterations,
dtype=self.dtype.name,
)

@classmethod
def load(cls, file):
if isinstance(file, str) and not file.endswith(".npz"):
file = file + ".npz"
with np.load(file, allow_pickle=False) as data:
ret = cls()
for k, v in data.items():
if k == "dtype":
ret.dtype = np.dtype(str(v))
else:
setattr(ret, k, v)
return ret
random_state=self.random_state)

# filter out None-valued args: np.savez would store them as object arrays,
# which np.load can't read back without allow_pickle=True
args = {k:v for k,v in args.items() if v is not None}
np.savez(fileobj_or_path, **args)


@cython.cdivision(True)
Expand Down
1 change: 1 addition & 0 deletions implicit/gpu/als.py
Original file line number Diff line number Diff line change
Expand Up @@ -269,6 +269,7 @@ def to_cpu(self) -> implicit.cpu.als.AlternatingLeastSquares:
regularization=self.regularization,
iterations=self.iterations,
calculate_training_loss=self.calculate_training_loss,
random_state=self.random_state,
)
ret.user_factors = self.user_factors.to_numpy() if self.user_factors is not None else None
ret.item_factors = self.item_factors.to_numpy() if self.item_factors is not None else None
Expand Down
1 change: 1 addition & 0 deletions implicit/gpu/bpr.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,7 @@ def to_cpu(self) -> implicit.cpu.bpr.BayesianPersonalizedRanking:
regularization=self.regularization,
iterations=self.iterations,
verify_negative_samples=self.verify_negative_samples,
random_state=self.random_state,
)
ret.user_factors = self.user_factors.to_numpy() if self.user_factors is not None else None
ret.item_factors = self.item_factors.to_numpy() if self.item_factors is not None else None
Expand Down
27 changes: 15 additions & 12 deletions implicit/nearest_neighbours.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,24 +152,27 @@ def __setstate__(self, state):
else:
self.scorer = None

def save(self, file):
def save(self, fileobj_or_path):
args = dict(K=self.K)
m = self.similarity
np.savez(file, data=m.data, indptr=m.indptr, indices=m.indices, shape=m.shape, K=self.K)
if m is not None:
args.update(dict(shape=m.shape, data=m.data, indptr=m.indptr, indices=m.indices))
np.savez(fileobj_or_path, **args)

@classmethod
def load(cls, file):
def load(cls, fileobj_or_path):
# numpy.save automatically appends an ".npz" suffix, but numpy.load apparently doesn't
if isinstance(file, str) and not file.endswith(".npz"):
file = file + ".npz"

with np.load(file, allow_pickle=False) as data:
similarity = csr_matrix(
(data["data"], data["indices"], data["indptr"]), shape=data["shape"]
)
if isinstance(fileobj_or_path, str) and not fileobj_or_path.endswith(".npz"):
fileobj_or_path = fileobj_or_path + ".npz"

with np.load(fileobj_or_path, allow_pickle=False) as data:
ret = cls()
ret.similarity = similarity
ret.scorer = NearestNeighboursScorer(similarity)
if data.get("data") is not None:
similarity = csr_matrix(
(data["data"], data["indices"], data["indptr"]), shape=data["shape"]
)
ret.similarity = similarity
ret.scorer = NearestNeighboursScorer(similarity)
ret.K = data["K"]
return ret

Expand Down
18 changes: 15 additions & 3 deletions implicit/recommender_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
import warnings
from abc import ABCMeta, abstractmethod

import numpy as np


class ModelFitError(Exception):
pass
Expand Down Expand Up @@ -166,13 +168,12 @@ def save(self, file):
"""

@classmethod
@abstractmethod
def load(cls, file) -> "RecommenderBase":
def load(cls, fileobj_or_path) -> "RecommenderBase":
"""Loads the model from a file

Parameters
----------
file : str or io.IOBase
fileobj_or_path : str or io.IOBase
Either the filename or an open file-like object to load the model from

Returns
Expand All @@ -185,6 +186,17 @@ def load(cls, file) -> "RecommenderBase":
save
numpy.load
"""
if isinstance(fileobj_or_path, str) and not fileobj_or_path.endswith(".npz"):
fileobj_or_path = fileobj_or_path + ".npz"
with np.load(fileobj_or_path, allow_pickle=False) as data:
ret = cls()
for k, v in data.items():
if k == "dtype":
v = np.dtype(str(v))
elif v.shape == ():
v = v.item()
setattr(ret, k, v)
return ret

def rank_items(self, userid, user_items, selected_items, recalculate_user=False):
warnings.warn(
Expand Down
18 changes: 18 additions & 0 deletions tests/approximate_als_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,9 @@ def test_pickle(self):
def test_serialization(self):
pass

def test_serialization_without_fit(self):
pass

if HAS_CUDA:

class AnnoyALSGPUTest(unittest.TestCase, RecommenderBaseTestMixin):
Expand All @@ -41,6 +44,9 @@ def test_pickle(self):
def test_serialization(self):
pass

def test_serialization_without_fit(self):
pass

except ImportError:
pass

Expand All @@ -64,6 +70,9 @@ def test_pickle(self):
def test_serialization(self):
pass

def test_serialization_without_fit(self):
pass

if HAS_CUDA:
# nmslib doesn't support querying on the gpu, but we should be able to still use a GPU als
# model with the nmslib index
Expand All @@ -84,6 +93,9 @@ def test_pickle(self):
def test_serialization(self):
pass

def test_serialization_without_fit(self):
pass

except ImportError:
pass

Expand All @@ -103,6 +115,9 @@ def test_pickle(self):
def test_serialization(self):
pass

def test_serialization_without_fit(self):
pass

if HAS_CUDA:

class FaissALSGPUTest(unittest.TestCase, RecommenderBaseTestMixin):
Expand Down Expand Up @@ -138,6 +153,9 @@ def test_pickle(self):
def test_serialization(self):
pass

def test_serialization_without_fit(self):
pass

except ImportError:
pass

Expand Down
8 changes: 8 additions & 0 deletions tests/recommender_base_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -423,3 +423,11 @@ def test_serialization(self):
reloaded = model.load(f)
assert_array_equal(model.similar_items(1)[0], reloaded.similar_items(1)[0])
assert_array_equal(model.similar_items(1)[1], reloaded.similar_items(1)[1])

def test_serialization_without_fit(self):
    """Round-trip an *unfitted* model through save/load.

    Saving a model whose factors are still None used to fail; this
    verifies the full attribute dict survives the round trip intact.
    """
    unfitted = self._get_model()
    with tempfile.TemporaryDirectory() as tmpdir:
        target = os.path.join(tmpdir, "model.npz")
        unfitted.save(target)
        restored = unfitted.load(target)
        assert unfitted.__dict__ == restored.__dict__