Skip to content

Commit

Permalink
Merge pull request #81 from iomega/model_serialization
Browse files Browse the repository at this point in the history
Model serialization
  • Loading branch information
florian-huber authored Oct 1, 2022
2 parents 56d8a79 + 13fa916 commit bc49564
Show file tree
Hide file tree
Showing 19 changed files with 350 additions and 26 deletions.
16 changes: 0 additions & 16 deletions .prospector.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,19 +11,3 @@ member-warnings: false

ignore-paths:
- readthedocs

pyroma:
run: true

pep8:
full: true

pep257:
disable: [
# Disable because not part of PEP257 official convention:
# see http://pep257.readthedocs.io/en/latest/error_codes.html
D203, # 1 blank line required before class docstring
D212, # Multi-line docstring summary should start at the first line
D213, # Multi-line docstring summary should start at the second line
D404, # First word of the docstring should not be This
]
14 changes: 13 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,17 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [Unreleased]

## [0.7.0] - 2022-10-01

### Added

- added `spec2vec.serialization` subpackage to import and export `Word2Vec` models to/from disk without Pickle
(via `import_model` and `export_model` respectively) [#80](https://github.com/iomega/spec2vec/pull/80)

### Fixed

- updated code examples in the documentation to reflect recent changes in matchms.

## [0.6.0] - 2022-01-03

### Added
Expand Down Expand Up @@ -120,7 +131,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Fossa configuration
- Flowchart

[Unreleased]: https://github.com/iomega/spec2vec/compare/0.6.0...HEAD
[Unreleased]: https://github.com/iomega/spec2vec/compare/0.7.0...HEAD
[0.7.0]: https://github.com/iomega/spec2vec/compare/0.6.0...0.7.0
[0.6.0]: https://github.com/iomega/spec2vec/compare/0.5.0...0.6.0
[0.5.0]: https://github.com/iomega/spec2vec/compare/0.4.0...0.5.0
[0.4.0]: https://github.com/iomega/spec2vec/compare/0.3.4...0.4.0
Expand Down
1 change: 1 addition & 0 deletions conda/environment-dev.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ dependencies:
- numpy
- pip
- python >=3.7
- scipy
- tqdm
- pip:
- -e ..[dev]
1 change: 1 addition & 0 deletions conda/environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,5 @@ dependencies:
- numba >=0.51
- numpy
- python >=3.7
- scipy
- tqdm
3 changes: 2 additions & 1 deletion conda/meta.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
{% set name = "spec2vec" %}
{% set version = "0.6.0" %}
{% set version = "0.7.0" %}

package:
name: {{ name|lower }}
Expand Down Expand Up @@ -42,6 +42,7 @@ requirements:
- numpy
- pip
- python >=3.7
- scipy
- tqdm

test:
Expand Down
2 changes: 1 addition & 1 deletion integration-tests/test_user_workflow_spec2vec.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ def apply_my_filters(s):
return s

repository_root = os.path.join(os.path.dirname(__file__), "..")
spectrums_file = os.path.join(repository_root, "tests", "pesticides.mgf")
spectrums_file = os.path.join(repository_root, "tests", "data", "pesticides.mgf")

# apply my filters to the data
spectrums = [apply_my_filters(s) for s in load_from_mgf(spectrums_file)]
Expand Down
2 changes: 1 addition & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
[bumpversion]
current_version = 0.6.0
current_version = 0.7.0

[bumpversion:file:conda/meta.yaml]
search = set version = "{current_version}"
Expand Down
3 changes: 2 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@
"matchms >=0.11.0",
"numba >=0.51",
"numpy",
"scipy",
"tqdm",
],
extras_require={"dev": ["bump2version",
Expand All @@ -58,7 +59,7 @@
"prospector[with_pyroma]",
"pytest",
"pytest-cov",
"sphinx>=3.0.0,!=3.2.0,<4.0.0",
"sphinx>=4.0.0",
"sphinx_rtd_theme",
"sphinxcontrib-apidoc",
"yapf",],
Expand Down
9 changes: 5 additions & 4 deletions spec2vec/Spec2Vec.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from matchms import Spectrum
from matchms.similarity.BaseSimilarity import BaseSimilarity
from tqdm import tqdm
from spec2vec.serialization import Word2VecLight
from spec2vec.SpectrumDocument import SpectrumDocument
from spec2vec.vector_operations import (calc_vector, cosine_similarity,
cosine_similarity_matrix)
Expand Down Expand Up @@ -48,7 +49,7 @@ def spectrum_processing(s):
s = require_minimum_number_of_peaks(s, n_required=5)
return s
spectrums_file = os.path.join(os.getcwd(), "..", "tests", "pesticides.mgf")
spectrums_file = os.path.join(os.getcwd(), "..", "tests", "data", "pesticides.mgf")
# Load data and apply the above defined filters to the data
spectrums = [spectrum_processing(s) for s in load_from_mgf(spectrums_file)]
Expand All @@ -69,8 +70,8 @@ def spectrum_processing(s):
# Select top-10 candidates for first query spectrum
spectrum0_top10 = scores.scores_by_query(spectrums[0], sort=True)[:10]
# Display spectrum IDs for top-10 matches
print([s[0].metadata['spectrumid'] for s in spectrum0_top10])
# Display spectrum IDs for top-10 matches (only works if metadata contains "spectrum_id" field)
print([s[0].metadata['spectrum_id'] for s in spectrum0_top10])
Should output
Expand All @@ -79,7 +80,7 @@ def spectrum_processing(s):
['CCMSLIB00001058300', 'CCMSLIB00001058289', 'CCMSLIB00001058303', ...
"""
def __init__(self, model: Word2Vec, intensity_weighting_power: Union[float, int] = 0,
def __init__(self, model: Union[Word2Vec, Word2VecLight], intensity_weighting_power: Union[float, int] = 0,
allowed_missing_percentage: Union[float, int] = 10, progress_bar: bool = False):
"""
Expand Down
2 changes: 2 additions & 0 deletions spec2vec/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from . import serialization
from .__version__ import __version__
from .Document import Document
from .logging_functions import _init_logger
Expand All @@ -13,6 +14,7 @@
"__version__",
"calc_vector",
"Document",
"serialization",
"SpectrumDocument",
"Spec2Vec",
]
2 changes: 1 addition & 1 deletion spec2vec/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = '0.6.0'
__version__ = '0.7.0'
15 changes: 15 additions & 0 deletions spec2vec/serialization/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
"""
Functions for exporting and importing trained :class:`~gensim.models.Word2Vec` model to and from disk.
##########################################
Functions provide the ability to export and import trained :class:`~gensim.models.Word2Vec` model to and from disk
without pickling the model. The model can be stored in two files: `.json` for metadata and `.npy` for weights.
"""
from .model_exporting import export_model
from .model_importing import Word2VecLight, import_model


__all__ = [
"export_model",
"import_model",
"Word2VecLight"
]
94 changes: 94 additions & 0 deletions spec2vec/serialization/model_exporting.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
import json
import os
from copy import deepcopy
from typing import Union
import numpy as np
import scipy.sparse
from gensim.models import Word2Vec


def export_model(model: Word2Vec,
                 output_model_file: Union[str, os.PathLike],
                 output_weights_file: Union[str, os.PathLike]):
    """
    Write a lightweight version of a :class:`~gensim.models.Word2Vec` model to disk.

    The model's metadata and its weights are written to two separate files. Such a model can be read to
    calculate scores but is not capable of further training.

    Parameters
    ----------
    model:
        :class:`~gensim.models.Word2Vec` trained model.
    output_model_file:
        A path of json file to save the model.
    output_weights_file:
        A path of `.npy` file to save the model's weights.
    """
    # Work on a private copy: the extraction step removes entries from the model's state.
    model_copy = deepcopy(model)
    metadata = extract_keyedvectors(model_copy)

    # The weights are stored separately; only their container format is recorded in the metadata.
    vectors = metadata.pop("vectors")
    metadata["__weights_format"] = get_weights_format(vectors)

    save_model(metadata, output_model_file)
    save_weights(vectors, output_weights_file)


def save_weights(weights: Union[np.ndarray, scipy.sparse.csr_matrix, scipy.sparse.csc_matrix],
                 output_weights_file: Union[str, os.PathLike]):
    """
    Write model's weights to disk in `.npy` dense array format.

    If the weights are sparse (in any scipy sparse format, not only CSR/CSC), they are converted to dense
    prior to saving.

    Parameters
    ----------
    weights:
        Model's weights, either a dense numpy array or a scipy sparse matrix.
    output_weights_file:
        A path of `.npy` file to save the model's weights.
    """
    # A sparse container is not a plain ndarray, so it has to be densified before it is handed to
    # np.save with pickling disabled. issparse() covers every scipy sparse format.
    if scipy.sparse.issparse(weights):
        weights = weights.toarray()

    np.save(output_weights_file, weights, allow_pickle=False)


def save_model(keyedvectors: dict, output_model_file: Union[str, os.PathLike]):
    """
    Write model's metadata to disk in json format.

    Parameters
    ----------
    keyedvectors:
        Dictionary with the model's metadata. All values must be JSON serializable.
    output_model_file:
        A path of json file to save the model.

    Raises
    ------
    TypeError
        If the dictionary contains a value that cannot be serialized to JSON. In that case the
        output file is left untouched.
    """
    # Serialize before opening the file: opening with "w" truncates it, so a serialization
    # failure afterwards would leave an empty or partially written file behind.
    serialized = json.dumps(keyedvectors)
    with open(output_model_file, "w", encoding="utf-8") as f:
        f.write(serialized)


def get_weights_format(weights: Union[np.ndarray, scipy.sparse.csr_matrix, scipy.sparse.csc_matrix]) -> str:
    """
    Name the container type that holds the model's weights.

    Parameters
    ----------
    weights:
        Model's weights.

    Returns
    -------
    weights_format:
        One of ``"np.ndarray"``, ``"csr_matrix"`` or ``"csc_matrix"``.

    Raises
    ------
    NotImplementedError
        If the weights are stored in any other container type.
    """
    supported_formats = (
        (np.ndarray, "np.ndarray"),
        (scipy.sparse.csr_matrix, "csr_matrix"),
        (scipy.sparse.csc_matrix, "csc_matrix"),
    )
    for container_type, format_name in supported_formats:
        if isinstance(weights, container_type):
            return format_name

    raise NotImplementedError("The model's weights format is not supported.")


def extract_keyedvectors(model: Word2Vec) -> dict:
    """
    Extract :class:`~gensim.models.KeyedVectors` object from the model, convert it to a dictionary and
    remove redundant keys.

    The returned dictionary is a shallow copy of the attribute dictionary of ``model.wv``: adding or
    removing keys in the result does not alter the model, while the values themselves are shared with it.

    Parameters
    ----------
    model:
        :class:`~gensim.models.Word2Vec` trained model.

    Returns
    -------
    keyedvectors:
        Dictionary representation of :class:`~gensim.models.KeyedVectors` without redundant keys.
    """
    # Copy first: popping straight from ``model.wv.__dict__`` would delete attributes
    # from the caller's model.
    keyedvectors = dict(vars(model.wv))
    keyedvectors.pop("vectors_lockf", None)
    keyedvectors.pop("expandos", None)
    return keyedvectors
85 changes: 85 additions & 0 deletions spec2vec/serialization/model_importing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
import json
import os
from typing import Union
import numpy as np
import scipy.sparse
from gensim.models import KeyedVectors


class Word2VecLight:
    """
    A lightweight version of :class:`~gensim.models.Word2Vec`. The objects of this class follow the interface of the
    original :class:`~gensim.models.Word2Vec` to the point necessary to calculate Spec2Vec scores. The model cannot be
    used for further training.
    """

    def __init__(self, model: dict, weights: Union[np.ndarray, scipy.sparse.csr_matrix, scipy.sparse.csc_matrix]):
        """
        Parameters
        ----------
        model:
            A dictionary containing the model's metadata. The dictionary is not modified.
        weights:
            A numpy array or a scipy sparse matrix containing the model's weights.

        Raises
        ------
        ValueError
            If the keys of ``model`` do not match the expected metadata keys.
        """
        self.wv: KeyedVectors = self._KeyedVectorsBuilder().from_dict(model).with_weights(weights).build()

    class _KeyedVectorsBuilder:
        """Assemble a :class:`~gensim.models.KeyedVectors` object from a metadata dictionary and weights."""

        def __init__(self):
            self.vector_size = None
            self.weights = None

        def build(self) -> KeyedVectors:
            """Create the :class:`~gensim.models.KeyedVectors` object from the collected state."""
            keyed_vectors = KeyedVectors(self.vector_size)
            # Hand over a copy of the collected state so that the result does not share its
            # attribute dictionary with this builder.
            keyed_vectors.__dict__ = dict(self.__dict__)
            keyed_vectors.vectors = self.weights
            return keyed_vectors

        def from_dict(self, dictionary: dict):
            """
            Take over the metadata stored in ``dictionary``.

            Raises
            ------
            ValueError
                If the keys of ``dictionary`` differ from the expected metadata keys.
            """
            expected_keys = {"vector_size", "__numpys", "__scipys", "__ignoreds", "__recursive_saveloads",
                             "index_to_key", "norms", "key_to_index", "next_index", "__weights_format"}
            if dictionary.keys() != expected_keys:
                raise ValueError("The keys of model's dictionary representation do not match the expected keys.")
            # Copy the entries instead of adopting the dictionary as ``__dict__``: aliasing it would write
            # ``weights`` into the caller's dictionary (so it could not be validated a second time) and
            # would discard weights that were set before this call.
            self.__dict__.update(dictionary)
            return self

        def with_weights(self, weights: Union[np.ndarray, scipy.sparse.csr_matrix, scipy.sparse.csc_matrix]):
            """Remember the weights that become the ``vectors`` of the built object."""
            self.weights = weights
            return self


def import_model(model_file: Union[str, os.PathLike], weights_file: Union[str, os.PathLike]) -> Word2VecLight:
    """
    Read a lightweight version of a :class:`~gensim.models.Word2Vec` model from disk.

    Parameters
    ----------
    model_file:
        A path of json file to load the model.
    weights_file:
        A path of `.npy` file to load the model's weights.

    Returns
    -------
    :class:`~spec2vec.serialization.model_importing.Word2VecLight` – a lightweight version of a
    :class:`~gensim.models.Word2Vec`

    Raises
    ------
    KeyError
        If the metadata has no ``"__weights_format"`` entry or names a weights format that
        ``load_weights`` does not know.
    ValueError
        If the keys of the metadata do not match the keys expected by ``Word2VecLight``.
    """
    with open(model_file, "r", encoding="utf-8") as f:
        model: dict = json.load(f)

    # The metadata records which container type the dense weights on disk have to be converted to.
    weights = load_weights(weights_file, model["__weights_format"])
    return Word2VecLight(model, weights)


def load_weights(weights_file: Union[str, os.PathLike],
                 weights_format: str) -> Union[np.ndarray, scipy.sparse.csr_matrix, scipy.sparse.csc_matrix]:
    """
    Read the model's weights from a `.npy` file and return them in the requested container type.

    Parameters
    ----------
    weights_file:
        A path of `.npy` file to load the model's weights.
    weights_format:
        One of ``"np.ndarray"``, ``"csr_matrix"`` or ``"csc_matrix"``.

    Returns
    -------
    weights:
        The dense array as read from disk, or a scipy sparse matrix built from it.

    Raises
    ------
    KeyError
        If ``weights_format`` is not one of the supported names.
    """
    # The file is always read first, so problems with the file surface before format problems.
    dense_weights: np.ndarray = np.load(weights_file, allow_pickle=False)

    if weights_format == "np.ndarray":
        return dense_weights
    if weights_format == "csr_matrix":
        return scipy.sparse.csr_matrix(dense_weights)
    if weights_format == "csc_matrix":
        return scipy.sparse.csc_matrix(dense_weights)
    raise KeyError(weights_format)
8 changes: 8 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
from pathlib import Path
import pytest


@pytest.fixture(scope="module")
def test_dir(request):
    """Provide the directory that contains the test script requesting this fixture."""
    test_script = Path(request.fspath)
    return test_script.parent
1 change: 1 addition & 0 deletions tests/data/model.json

Large diffs are not rendered by default.

File renamed without changes.
Binary file added tests/data/weights.npy
Binary file not shown.
Loading

0 comments on commit bc49564

Please sign in to comment.