Merge pull request #81 from iomega/model_serialization

Model serialization
iomega · Oct 1, 2022 · bc49564 · bc49564
2 parents 56d8a79 + 13fa916
commit bc49564
Show file tree

Hide file tree

Showing 19 changed files with 350 additions and 26 deletions.
diff --git a/.prospector.yml b/.prospector.yml
@@ -11,19 +11,3 @@ member-warnings: false
 
 ignore-paths:
   - readthedocs
-
-pyroma:
-    run: true
-
-pep8:
-    full: true
-
-pep257:
-    disable: [
-        # Disable because not part of PEP257 official convention:
-        # see http://pep257.readthedocs.io/en/latest/error_codes.html
-        D203,  # 1 blank line required before class docstring
-        D212,  # Multi-line docstring summary should start at the first line
-        D213,  # Multi-line docstring summary should start at the second line
-        D404,  # First word of the docstring should not be This
-    ]
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -7,6 +7,17 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+## [0.7.0] - 2022-10-01
+
+### Added
+
+- added `spec2vec.serialization` subpackage to import and export `Word2Vec` models to/from disk without Pickle 
+  (via `import_model` and `export_model` respectively) [#80](https://github.com/iomega/spec2vec/pull/80)
+
+### Fixed
+
+- updated code examples in the documentation to reflect recent changes in matchms.
+
 ## [0.6.0] - 2022-01-03
 
 ### Added
@@ -120,7 +131,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Fossa configuration
 - Flowchart
 
-[Unreleased]: https://github.com/iomega/spec2vec/compare/0.6.0...HEAD
+[Unreleased]: https://github.com/iomega/spec2vec/compare/0.7.0...HEAD
+[0.7.0]: https://github.com/iomega/spec2vec/compare/0.6.0...0.7.0
 [0.6.0]: https://github.com/iomega/spec2vec/compare/0.5.0...0.6.0
 [0.5.0]: https://github.com/iomega/spec2vec/compare/0.4.0...0.5.0
 [0.4.0]: https://github.com/iomega/spec2vec/compare/0.3.4...0.4.0

diff --git a/conda/environment-dev.yml b/conda/environment-dev.yml
@@ -11,6 +11,7 @@ dependencies:
   - numpy
   - pip
   - python >=3.7
+  - scipy
   - tqdm
   - pip:
     - -e ..[dev]
diff --git a/conda/environment.yml b/conda/environment.yml
@@ -9,4 +9,5 @@ dependencies:
   - numba >=0.51
   - numpy
   - python >=3.7
+  - scipy
   - tqdm
diff --git a/conda/meta.yaml b/conda/meta.yaml
@@ -1,5 +1,5 @@
 {% set name = "spec2vec" %}
-{% set version = "0.6.0" %}
+{% set version = "0.7.0" %}
 
 package:
   name: {{ name|lower }}
@@ -42,6 +42,7 @@ requirements:
     - numpy
     - pip
     - python >=3.7
+    - scipy
     - tqdm
 
 test:

diff --git a/integration-tests/test_user_workflow_spec2vec.py b/integration-tests/test_user_workflow_spec2vec.py
@@ -31,7 +31,7 @@ def apply_my_filters(s):
         return s
 
     repository_root = os.path.join(os.path.dirname(__file__), "..")
-    spectrums_file = os.path.join(repository_root, "tests", "pesticides.mgf")
+    spectrums_file = os.path.join(repository_root, "tests", "data", "pesticides.mgf")
 
     # apply my filters to the data
     spectrums = [apply_my_filters(s) for s in load_from_mgf(spectrums_file)]

diff --git a/setup.cfg b/setup.cfg
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 0.6.0
+current_version = 0.7.0
 
 [bumpversion:file:conda/meta.yaml]
 search = set version = "{current_version}"

diff --git a/setup.py b/setup.py
@@ -50,6 +50,7 @@
         "matchms >=0.11.0",
         "numba >=0.51",
         "numpy",
+        "scipy",
         "tqdm",
     ],
     extras_require={"dev": ["bump2version",
@@ -58,7 +59,7 @@
                             "prospector[with_pyroma]",
                             "pytest",
                             "pytest-cov",
-                            "sphinx>=3.0.0,!=3.2.0,<4.0.0",
+                            "sphinx>=4.0.0",
                             "sphinx_rtd_theme",
                             "sphinxcontrib-apidoc",
                             "yapf",],

diff --git a/spec2vec/Spec2Vec.py b/spec2vec/Spec2Vec.py
@@ -5,6 +5,7 @@
 from matchms import Spectrum
 from matchms.similarity.BaseSimilarity import BaseSimilarity
 from tqdm import tqdm
+from spec2vec.serialization import Word2VecLight
 from spec2vec.SpectrumDocument import SpectrumDocument
 from spec2vec.vector_operations import (calc_vector, cosine_similarity,
                                         cosine_similarity_matrix)
@@ -48,7 +49,7 @@ def spectrum_processing(s):
             s = require_minimum_number_of_peaks(s, n_required=5)
             return s
 
-        spectrums_file = os.path.join(os.getcwd(), "..", "tests", "pesticides.mgf")
+        spectrums_file = os.path.join(os.getcwd(), "..", "tests", "data", "pesticides.mgf")
 
         # Load data and apply the above defined filters to the data
         spectrums = [spectrum_processing(s) for s in load_from_mgf(spectrums_file)]
@@ -69,8 +70,8 @@ def spectrum_processing(s):
         # Select top-10 candidates for first query spectrum
         spectrum0_top10 = scores.scores_by_query(spectrums[0], sort=True)[:10]
 
-        # Display spectrum IDs for top-10 matches
-        print([s[0].metadata['spectrumid'] for s in spectrum0_top10])
+        # Display spectrum IDs for top-10 matches (only works if metadata contains "spectrum_id" field)
+        print([s[0].metadata['spectrum_id'] for s in spectrum0_top10])
 
     Should output
 
@@ -79,7 +80,7 @@ def spectrum_processing(s):
         ['CCMSLIB00001058300', 'CCMSLIB00001058289', 'CCMSLIB00001058303', ...
 
     """
-    def __init__(self, model: Word2Vec, intensity_weighting_power: Union[float, int] = 0,
+    def __init__(self, model: Union[Word2Vec, Word2VecLight], intensity_weighting_power: Union[float, int] = 0,
                  allowed_missing_percentage: Union[float, int] = 10, progress_bar: bool = False):
         """
 

diff --git a/spec2vec/__init__.py b/spec2vec/__init__.py
@@ -1,3 +1,4 @@
+from . import serialization
 from .__version__ import __version__
 from .Document import Document
 from .logging_functions import _init_logger
@@ -13,6 +14,7 @@
     "__version__",
     "calc_vector",
     "Document",
+    "serialization",
     "SpectrumDocument",
     "Spec2Vec",
 ]
diff --git a/spec2vec/__version__.py b/spec2vec/__version__.py
@@ -1 +1 @@
-__version__ = '0.6.0'
+__version__ = '0.7.0'
diff --git a/spec2vec/serialization/__init__.py b/spec2vec/serialization/__init__.py
@@ -0,0 +1,15 @@
+"""
+Functions for exporting and importing a trained :class:`~gensim.models.Word2Vec` model to and from disk.
+##############################################################################################################
+
+These functions export and import a trained :class:`~gensim.models.Word2Vec` model to and from disk
+without pickling the model. The model is stored in two files: `.json` for metadata and `.npy` for weights.
+"""
+from .model_exporting import export_model
+from .model_importing import Word2VecLight, import_model
+
+
+__all__ = [
+    "export_model",
+    "import_model",
+    "Word2VecLight"
+    ]
diff --git a/spec2vec/serialization/model_exporting.py b/spec2vec/serialization/model_exporting.py
@@ -0,0 +1,94 @@
+import json
+import os
+from copy import deepcopy
+from typing import Union
+import numpy as np
+import scipy.sparse
+from gensim.models import Word2Vec
+
+
+def export_model(model: Word2Vec,
+                 output_model_file: Union[str, os.PathLike],
+                 output_weights_file: Union[str, os.PathLike]):
+    """
+    Write a lightweight version of a :class:`~gensim.models.Word2Vec` model to disk. Such a model can be read to
+    calculate scores but is not capable of further training.
+
+    The attributes of the model's keyed vectors are written as json and the weights as a `.npy` file. The original
+    array type of the weights is recorded in the json under the key ``"__weights_format"``.
+
+    Parameters
+    ----------
+    model:
+        :class:`~gensim.models.Word2Vec` trained model. The export works on a deep copy of this object.
+    output_model_file:
+        A path of json file to save the model.
+    output_weights_file:
+        A path of `.npy` file to save the model's weights.
+    """
+    # Work on a copy: the steps below remove entries from the keyed vectors' attribute dictionary.
+    model = deepcopy(model)
+    keyedvectors = extract_keyedvectors(model)
+    # The weights are stored separately from the json metadata.
+    weights = keyedvectors.pop("vectors")
+    # Remember the original array type so that it can be restored when the model is imported.
+    keyedvectors["__weights_format"] = get_weights_format(weights)
+
+    save_model(keyedvectors, output_model_file)
+    save_weights(weights, output_weights_file)
+
+
+def save_weights(weights: Union[np.ndarray, scipy.sparse.csr_matrix, scipy.sparse.csc_matrix],
+                 output_weights_file: Union[str, os.PathLike]):
+    """
+    Write model's weights to disk in `.npy` dense array format. If the weights are sparse, they are converted to dense
+    prior to saving.
+
+    Parameters
+    ----------
+    weights:
+        Model's weights as a numpy array or a scipy CSR/CSC sparse matrix.
+    output_weights_file:
+        A path of `.npy` file to save the model's weights.
+    """
+    if isinstance(weights, (scipy.sparse.csr_matrix, scipy.sparse.csc_matrix)):
+        # Only a dense array is written to the `.npy` file, so sparse matrices are densified first.
+        weights = weights.toarray()
+
+    # Pickling is switched off explicitly: the weights must be storable as a plain array.
+    np.save(output_weights_file, weights, allow_pickle=False)
+
+
+def save_model(keyedvectors: dict, output_model_file: Union[str, os.PathLike]):
+    """
+    Write model's metadata to disk in json format.
+
+    Parameters
+    ----------
+    keyedvectors:
+        Dictionary with the model's metadata. It is passed to :func:`json.dump` as is.
+    output_model_file:
+        A path of json file to save the metadata. The file is written with UTF-8 encoding.
+    """
+    with open(output_model_file, "w", encoding="utf-8") as f:
+        json.dump(keyedvectors, f)
+
+
+def get_weights_format(weights: Union[np.ndarray, scipy.sparse.csr_matrix, scipy.sparse.csc_matrix]) -> str:
+    """
+    Get the array format of the model's weights.
+
+    Parameters
+    ----------
+    weights:
+        Model's weights.
+
+    Returns
+    -------
+    weights_format:
+        Format of the model's weights: ``"np.ndarray"``, ``"csr_matrix"`` or ``"csc_matrix"``.
+
+    Raises
+    ------
+    NotImplementedError
+        If the weights are neither a numpy array nor a scipy CSR/CSC sparse matrix.
+    """
+    # The returned strings are the keys the importer uses to rebuild the array type.
+    if isinstance(weights, np.ndarray):
+        return "np.ndarray"
+    if isinstance(weights, scipy.sparse.csr_matrix):
+        return "csr_matrix"
+    if isinstance(weights, scipy.sparse.csc_matrix):
+        return "csc_matrix"
+    raise NotImplementedError("The model's weights format is not supported.")
+
+
+def extract_keyedvectors(model: Word2Vec) -> dict:
+    """
+    Extract :class:`~gensim.models.KeyedVectors` object from the model, convert it to a dictionary and
+    remove redundant keys.
+
+    Parameters
+    ----------
+    model:
+        :class:`~gensim.models.Word2Vec` trained model.
+
+    Returns
+    -------
+    keyedvectors:
+        Dictionary representation of :class:`~gensim.models.KeyedVectors` without redundant keys.
+    """
+    # NOTE(review): this is the attribute dictionary of ``model.wv`` itself, not a copy. The pops below (and any
+    # later change to the returned dictionary) therefore modify the passed model; pass a copy of the model if the
+    # original has to stay intact.
+    keyedvectors = model.wv.__dict__
+    # Both keys are optional; a missing key is ignored.
+    keyedvectors.pop("vectors_lockf", None)
+    keyedvectors.pop("expandos", None)
+    return keyedvectors
diff --git a/spec2vec/serialization/model_importing.py b/spec2vec/serialization/model_importing.py
@@ -0,0 +1,85 @@
+import json
+import os
+from typing import Union
+import numpy as np
+import scipy.sparse
+from gensim.models import KeyedVectors
+
+
+class Word2VecLight:
+    """
+    A lightweight version of :class:`~gensim.models.Word2Vec`. The objects of this class follow the interface of the
+    original :class:`~gensim.models.Word2Vec` to the point necessary to calculate Spec2Vec scores. The model cannot be
+    used for further training.
+
+    The constructor sets a single attribute, ``wv``, a :class:`~gensim.models.KeyedVectors` object assembled from the
+    metadata dictionary and the weights.
+    """
+
+    def __init__(self, model: dict, weights: Union[np.ndarray, scipy.sparse.csr_matrix, scipy.sparse.csc_matrix]):
+        """
+
+        Parameters
+        ----------
+        model:
+            A dictionary containing the model's metadata.
+        weights:
+            A numpy array or a scipy sparse matrix containing the model's weights.
+
+        Raises
+        ------
+        ValueError
+            If the keys of ``model`` do not match the expected keys.
+        """
+        # from_dict has to run before with_weights, because from_dict replaces the builder's attribute dictionary.
+        self.wv: KeyedVectors = self._KeyedVectorsBuilder().from_dict(model).with_weights(weights).build()
+
+    class _KeyedVectorsBuilder:
+        """Helper that assembles a :class:`~gensim.models.KeyedVectors` object from metadata and weights."""
+
+        def __init__(self):
+            self.vector_size = None
+            self.weights = None
+
+        def build(self) -> KeyedVectors:
+            """Create the :class:`~gensim.models.KeyedVectors` object from the attributes collected so far."""
+            keyed_vectors = KeyedVectors(self.vector_size)
+            # The builder's attribute dictionary is handed over as is (shared, not copied), so the result carries
+            # every metadata entry plus ``weights``.
+            keyed_vectors.__dict__ = self.__dict__
+            keyed_vectors.vectors = self.weights
+            return keyed_vectors
+
+        def from_dict(self, dictionary: dict):
+            """
+            Use ``dictionary`` as the builder's attributes after checking that it has exactly the expected keys.
+
+            Returns the builder itself to allow chaining.
+
+            Raises
+            ------
+            ValueError
+                If the keys of ``dictionary`` differ from the expected keys.
+            """
+            expected_keys = {"vector_size", "__numpys", "__scipys", "__ignoreds", "__recursive_saveloads",
+                             "index_to_key", "norms", "key_to_index", "next_index", "__weights_format"}
+            if dictionary.keys() == expected_keys:
+                # NOTE(review): the dictionary object itself becomes the attribute dictionary (no copy), so later
+                # attribute assignments on the builder also show up in the caller's dictionary.
+                self.__dict__ = dictionary
+            else:
+                raise ValueError("The keys of model's dictionary representation do not match the expected keys.")
+            return self
+
+        def with_weights(self, weights: Union[np.ndarray, scipy.sparse.csr_matrix, scipy.sparse.csc_matrix]):
+            """Store the weights to be used as the vectors of the built object and return the builder itself."""
+            self.weights = weights
+            return self
+
+
+def import_model(model_file: Union[str, os.PathLike], weights_file: Union[str, os.PathLike]) -> Word2VecLight:
+    """
+    Read a lightweight version of a :class:`~gensim.models.Word2Vec` model from disk.
+
+    Parameters
+    ----------
+    model_file:
+        A path of json file to load the model.
+    weights_file:
+        A path of `.npy` file to load the model's weights.
+
+    Returns
+    -------
+    :class:`~spec2vec.serialization.model_importing.Word2VecLight` – a lightweight version of a
+    :class:`~gensim.models.Word2Vec`
+
+    Raises
+    ------
+    KeyError
+        If the json file has no ``"__weights_format"`` entry.
+    ValueError
+        If the keys of the json file do not match the keys expected by
+        :class:`~spec2vec.serialization.model_importing.Word2VecLight`.
+    """
+    with open(model_file, "r", encoding="utf-8") as f:
+        model: dict = json.load(f)
+
+    # The metadata tells in which array type the dense `.npy` weights have to be wrapped.
+    weights = load_weights(weights_file, model["__weights_format"])
+    return Word2VecLight(model, weights)
+
+
+def load_weights(weights_file: Union[str, os.PathLike],
+                 weights_format: str) -> Union[np.ndarray, scipy.sparse.csr_matrix, scipy.sparse.csc_matrix]:
+    """
+    Read model's weights from a `.npy` file and convert them to the requested array format.
+
+    Parameters
+    ----------
+    weights_file:
+        A path of `.npy` file to load the model's weights.
+    weights_format:
+        Target format of the weights: ``"np.ndarray"``, ``"csr_matrix"`` or ``"csc_matrix"``.
+
+    Returns
+    -------
+    weights:
+        Model's weights in the requested format.
+
+    Raises
+    ------
+    KeyError
+        If ``weights_format`` is not one of the supported formats.
+    """
+    # Pickled content is refused when reading the file.
+    weights: np.ndarray = np.load(weights_file, allow_pickle=False)
+
+    # Maps the stored format name to the constructor that rebuilds that array type from the dense array.
+    weights_array_builder = {"csr_matrix": scipy.sparse.csr_matrix,
+                            "csc_matrix": scipy.sparse.csc_matrix,
+                            "np.ndarray": lambda x: x}
+    weights = weights_array_builder[weights_format](weights)
+
+    return weights
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -0,0 +1,8 @@
+from pathlib import Path
+import pytest
+
+
+@pytest.fixture(scope="module")
+def test_dir(request) -> Path:
+    """
+    Return the directory of the currently running test script.
+
+    The fixture is module-scoped; the directory is the parent of the path reported by ``request.fspath``.
+    """
+    return Path(request.fspath).parent
diff --git a/tests/data/model.json b/tests/data/model.json
diff --git a/tests/pesticides.mgf → tests/data/pesticides.mgf b/tests/pesticides.mgf → tests/data/pesticides.mgf
diff --git a/tests/data/weights.npy b/tests/data/weights.npy
-Original file line number
+Diff line change
@@ Expand Up / @@ -11,6 +11,7 @@ dependencies: @@
       - numpy
       - pip
       - python >=3.7
+      - scipy
       - tqdm
       - pip:
         - -e ..[dev]