Merge pull request #2944 from gojomo/ft_save_after_update_vocab

[MRG] fix save_facebook_model failure after update-vocab & other initialization streamlining
piskvorky · Oct 15, 2020 · ea87470 · ea87470
2 parents 1d0a8bd + 9cd75c3
commit ea87470
Show file tree

Hide file tree

Showing 8 changed files with 176 additions and 381 deletions.
diff --git a/gensim/models/doc2vec.py b/gensim/models/doc2vec.py
@@ -286,6 +286,9 @@ def __init__(self, documents=None, corpus_file=None, vector_size=100, dm_mean=No
 
         self.vector_size = vector_size
         self.dv = dv or KeyedVectors(self.vector_size, mapfile_path=dv_mapfile)
+        # EXPERIMENTAL lockf feature; create minimal no-op lockf arrays (1 element of 1.0)
+        # advanced users should directly resize/adjust as desired after any vocab growth
+        self.dv.vectors_lockf = np.ones(1, dtype=REAL)  # 0.0 values suppress word-backprop-updates; 1.0 allows
 
         super(Doc2Vec, self).__init__(
             sentences=corpus_iterable,
@@ -330,11 +333,10 @@ def _clear_post_train(self):
         self.wv.norms = None
         self.dv.norms = None
 
-    def reset_weights(self):
-        super(Doc2Vec, self).reset_weights()
-        self.dv.resize_vectors()
-        self.dv.randomly_initialize_vectors()
-        self.dv.vectors_lockf = np.ones(1, dtype=REAL)  # 0.0 values suppress word-backprop-updates; 1.0 allows
+    def init_weights(self):
+        super(Doc2Vec, self).init_weights()
+        # to not use an identical rnd stream as words, deterministically change seed (w/ 1000th prime)
+        self.dv.resize_vectors(seed=self.seed + 7919)
 
     def reset_from(self, other_model):
         """Copy shareable data structures from another (possibly pre-trained) model.
@@ -359,7 +361,7 @@ def reset_from(self, other_model):
         self.dv.key_to_index = other_model.dv.key_to_index
         self.dv.index_to_key = other_model.dv.index_to_key
         self.dv.expandos = other_model.dv.expandos
-        self.reset_weights()
+        self.init_weights()
 
     def _do_train_epoch(self, corpus_file, thread_id, offset, cython_vocab, thread_private_mem, cur_epoch,
                         total_examples=None, total_words=None, offsets=None, start_doctags=None, **kwargs):

diff --git a/gensim/models/fasttext.py b/gensim/models/fasttext.py
diff --git a/gensim/models/keyedvectors.py b/gensim/models/keyedvectors.py
@@ -191,7 +191,7 @@
 
 
 class KeyedVectors(utils.SaveLoad):
-    def __init__(self, vector_size, count=0, dtype=REAL, mapfile_path=None):
+    def __init__(self, vector_size, count=0, dtype=np.float32, mapfile_path=None):
         """Mapping between keys (such as words)  and vectors for :class:`~gensim.models.Word2Vec`
         and related models.
 
@@ -204,6 +204,18 @@ def __init__(self, vector_size, count=0, dtype=REAL, mapfile_path=None):
         types, as the type and storage array for such attributes is established by the 1st time such
         `attr` is set.
 
+        Parameters
+        ----------
+        vector_size : int
+            Intended number of dimensions for all contained vectors.
+        count : int, optional
+            If provided, vectors wil be pre-allocated for at least this many vectors. (Otherwise
+            they can be added later.)
+        dtype : type, optional
+            Vector dimensions will default to `np.float32` (AKA `REAL` in some Gensim code) unless
+            another type is provided here.
+        mapfile_path : string, optional
+            FIXME: UNDER CONSTRUCTION / WILL CHANGE PRE-4.0.0 PER #2955 / #2975
         """
         self.vector_size = vector_size
         # pre-allocating `index_to_key` to full size helps avoid redundant re-allocations, esp for `expandos`
@@ -337,34 +349,16 @@ def get_vecattr(self, key, attr):
         index = self.get_index(key)
         return self.expandos[attr][index]
 
-    def resize_vectors(self):
-        """Make underlying vectors match index_to_key size."""
-        target_count = len(self.index_to_key)
-        prev_count = len(self.vectors)
-        if prev_count == target_count:
-            return ()
-        prev_vectors = self.vectors
-        if hasattr(self, 'mapfile_path') and self.mapfile_path:
-            self.vectors = np.memmap(self.mapfile_path, shape=(target_count, self.vector_size), mode='w+', dtype=REAL)
-        else:
-            self.vectors = np.zeros((target_count, self.vector_size), dtype=REAL)
-        self.vectors[0: min(prev_count, target_count), ] = prev_vectors[0: min(prev_count, target_count), ]
-        self.allocate_vecattrs()
-        self.norms = None
-        return range(prev_count, target_count)
+    def resize_vectors(self, seed=0):
+        """Make underlying vectors match index_to_key size; random-initialize any new rows."""
 
-    def randomly_initialize_vectors(self, indexes=None, seed=0):
-        """Initialize vectors with low-magnitude random vectors, as is typical for pre-trained
-        Word2Vec and related models.
+        target_shape = (len(self.index_to_key), self.vector_size)
+        self.vectors = prep_vectors(target_shape, prior_vectors=self.vectors, seed=seed)
+        # FIXME BEFORE 4.0.0 PER #2955 / #2975 : support memmap & cleanup
+#        if hasattr(self, 'mapfile_path') and self.mapfile_path:
+#            self.vectors = np.memmap(self.mapfile_path, shape=(target_count, self.vector_size), mode='w+', dtype=REAL)
 
-        """
-        if indexes is None:
-            indexes = range(0, len(self.vectors))
-        for i in indexes:
-            self.vectors[i] = pseudorandom_weak_vector(
-                self.vectors.shape[1],
-                seed_string=str(self.index_to_key[i]) + str(seed),
-            )
+        self.allocate_vecattrs()
         self.norms = None
 
     def __len__(self):
@@ -1526,7 +1520,7 @@ def save_word2vec_format(
             (in case word vectors are appended with document vectors afterwards).
         write_header : bool, optional
             If False, don't write the 1st line declaring the count of vectors and dimensions.
-        TODO: doc prefix, append, sort_attr
+        FIXME: doc prefix, append, sort_attr
         """
         if total_vec is None:
             total_vec = len(self.index_to_key)
@@ -1918,3 +1912,21 @@ def pseudorandom_weak_vector(size, seed_string=None, hashfxn=hash):
     else:
         once = utils.default_prng
     return (once.random(size).astype(REAL) - 0.5) / size
+
+
+def prep_vectors(target_shape, prior_vectors=None, seed=0, dtype=REAL):
+    """FIXME: NAME/DOCS CHANGES PRE-4.0.0 FOR #2955/#2975 MMAP & OTHER INITIALIZATION CLEANUP WORK
+    Return a numpy array of the given shape. Reuse prior_vectors object or values
+    to extent possible. Initialize new values randomly if requested."""
+    if prior_vectors is None:
+        prior_vectors = np.zeros((0, 0))
+    if prior_vectors.shape == target_shape:
+        return prior_vectors
+    target_count, vector_size = target_shape
+    rng = np.random.default_rng(seed=seed)  # use new instance of numpy's recommended generator/algorithm
+    new_vectors = rng.random(target_shape, dtype=dtype)  # [0.0, 1.0)
+    new_vectors *= 2.0  # [0.0, 2.0)
+    new_vectors -= 1.0  # [-1.0, 1.0)
+    new_vectors /= vector_size
+    new_vectors[0:prior_vectors.shape[0], 0:prior_vectors.shape[1]] = prior_vectors
+    return new_vectors