[MRG] fix save_facebook_model failure after update-vocab & other initialization streamlining #2944

Merged 17 commits on Oct 15, 2020
Changes from all commits
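The headline fix is easiest to see as code. Below is a minimal sketch of the failing sequence named in the title, using gensim's FastText API; the corpus, sizes, and output path are illustrative, not taken from this PR:

```python
from gensim.test.utils import common_texts
from gensim.models.fasttext import FastText, save_facebook_model

model = FastText(vector_size=16, min_count=1)
model.build_vocab(common_texts)
model.train(common_texts, total_examples=len(common_texts), epochs=2)

# grow the vocabulary in place, then save in Facebook's .bin format;
# per this PR's title, the save step used to fail after an update-vocab
model.build_vocab([["new", "unseen", "words"]], update=True)
save_facebook_model(model, "model_after_update.bin")
```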
14 changes: 8 additions & 6 deletions gensim/models/doc2vec.py

@@ -286,6 +286,9 @@ def __init__(self, documents=None, corpus_file=None, vector_size=100, dm_mean=None,
 
         self.vector_size = vector_size
         self.dv = dv or KeyedVectors(self.vector_size, mapfile_path=dv_mapfile)
+        # EXPERIMENTAL lockf feature; create minimal no-op lockf arrays (1 element of 1.0)
+        # advanced users should directly resize/adjust as desired after any vocab growth
+        self.dv.vectors_lockf = np.ones(1, dtype=REAL)  # 0.0 values suppress word-backprop-updates; 1.0 allows
 
         super(Doc2Vec, self).__init__(
             sentences=corpus_iterable,
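The new comment leaves "directly resize/adjust" to the reader. A hedged sketch of one way an advanced user might do that after growing the vocab; `model`, `more_corpus`, and the frozen word are illustrative stand-ins, while the lockf semantics (0.0 suppresses updates, 1.0 allows) come from the comment in the diff:

```python
import numpy as np

model.build_vocab(more_corpus, update=True)  # vocab growth
# expand the minimal 1-element no-op array to one lock factor per vector
model.wv.vectors_lockf = np.ones(len(model.wv), dtype=np.float32)
# freeze a single word's vector against further backprop-updates
model.wv.vectors_lockf[model.wv.get_index('frozen_word')] = 0.0
```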
@@ -330,11 +333,10 @@ def _clear_post_train(self):
         self.wv.norms = None
         self.dv.norms = None
 
-    def reset_weights(self):
-        super(Doc2Vec, self).reset_weights()
-        self.dv.resize_vectors()
-        self.dv.randomly_initialize_vectors()
-        self.dv.vectors_lockf = np.ones(1, dtype=REAL)  # 0.0 values suppress word-backprop-updates; 1.0 allows
+    def init_weights(self):
+        super(Doc2Vec, self).init_weights()
+        # to not use an identical rnd stream as words, deterministically change seed (w/ 1000th prime)
+        self.dv.resize_vectors(seed=self.seed + 7919)
 
     def reset_from(self, other_model):
        """Copy shareable data structures from another (possibly pre-trained) model.
@@ -359,7 +361,7 @@ def reset_from(self, other_model):
         self.dv.key_to_index = other_model.dv.key_to_index
         self.dv.index_to_key = other_model.dv.index_to_key
         self.dv.expandos = other_model.dv.expandos
-        self.reset_weights()
+        self.init_weights()
 
     def _do_train_epoch(self, corpus_file, thread_id, offset, cython_vocab, thread_private_mem, cur_epoch,
                         total_examples=None, total_words=None, offsets=None, start_doctags=None, **kwargs):
288 changes: 28 additions & 260 deletions gensim/models/fasttext.py

Large diffs are not rendered by default.

68 changes: 40 additions & 28 deletions gensim/models/keyedvectors.py

@@ -191,7 +191,7 @@
 
 
 class KeyedVectors(utils.SaveLoad):
-    def __init__(self, vector_size, count=0, dtype=REAL, mapfile_path=None):
+    def __init__(self, vector_size, count=0, dtype=np.float32, mapfile_path=None):
         """Mapping between keys (such as words) and vectors for :class:`~gensim.models.Word2Vec`
         and related models.
@@ -204,6 +204,18 @@ def __init__(self, vector_size, count=0, dtype=REAL, mapfile_path=None):
         types, as the type and storage array for such attributes is established by the 1st time such
         `attr` is set.
 
+        Parameters
+        ----------
+        vector_size : int
+            Intended number of dimensions for all contained vectors.
+        count : int, optional
+            If provided, vectors will be pre-allocated for at least this many vectors. (Otherwise
+            they can be added later.)
+        dtype : type, optional
+            Vector dimensions will default to `np.float32` (AKA `REAL` in some Gensim code) unless
+            another type is provided here.
+        mapfile_path : string, optional
+            FIXME: UNDER CONSTRUCTION / WILL CHANGE PRE-4.0.0 PER #2955 / #2975
+        """
         self.vector_size = vector_size
         # pre-allocating `index_to_key` to full size helps avoid redundant re-allocations, esp for `expandos`
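A quick construction sketch using the newly documented parameters (values illustrative):

```python
import numpy as np
from gensim.models import KeyedVectors

kv = KeyedVectors(vector_size=8, count=0, dtype=np.float32)
print(len(kv), kv.vectors.shape)  # 0 (0, 8): dimensionality fixed, no vectors allocated yet
```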
@@ -337,34 +349,16 @@ def get_vecattr(self, key, attr):
         index = self.get_index(key)
         return self.expandos[attr][index]
 
-    def resize_vectors(self):
-        """Make underlying vectors match index_to_key size."""
-        target_count = len(self.index_to_key)
-        prev_count = len(self.vectors)
-        if prev_count == target_count:
-            return ()
-        prev_vectors = self.vectors
-        if hasattr(self, 'mapfile_path') and self.mapfile_path:
-            self.vectors = np.memmap(self.mapfile_path, shape=(target_count, self.vector_size), mode='w+', dtype=REAL)
-        else:
-            self.vectors = np.zeros((target_count, self.vector_size), dtype=REAL)
-        self.vectors[0: min(prev_count, target_count), ] = prev_vectors[0: min(prev_count, target_count), ]
-        self.allocate_vecattrs()
-        self.norms = None
-        return range(prev_count, target_count)
-
-    def randomly_initialize_vectors(self, indexes=None, seed=0):
-        """Initialize vectors with low-magnitude random vectors, as is typical for pre-trained
-        Word2Vec and related models.
-
-        """
-        if indexes is None:
-            indexes = range(0, len(self.vectors))
-        for i in indexes:
-            self.vectors[i] = pseudorandom_weak_vector(
-                self.vectors.shape[1],
-                seed_string=str(self.index_to_key[i]) + str(seed),
-            )
+    def resize_vectors(self, seed=0):
+        """Make underlying vectors match index_to_key size; random-initialize any new rows."""
+
+        target_shape = (len(self.index_to_key), self.vector_size)
+        self.vectors = prep_vectors(target_shape, prior_vectors=self.vectors, seed=seed)
+        # FIXME BEFORE 4.0.0 PER #2955 / #2975 : support memmap & cleanup
+        # if hasattr(self, 'mapfile_path') and self.mapfile_path:
+        #     self.vectors = np.memmap(self.mapfile_path, shape=(target_count, self.vector_size), mode='w+', dtype=REAL)
+        self.allocate_vecattrs()
+        self.norms = None
 
     def __len__(self):
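The net effect: one call now both resizes and randomly initializes any new rows, where callers previously needed resize_vectors() followed by randomly_initialize_vectors(). A hedged sketch of the new flow, hand-editing the key index the way build_vocab normally would (names and values illustrative):

```python
import numpy as np
from gensim.models import KeyedVectors

kv = KeyedVectors(vector_size=4)
kv.index_to_key = ['alpha', 'beta']
kv.key_to_index = {key: i for i, key in enumerate(kv.index_to_key)}
kv.resize_vectors(seed=0)        # allocates & randomly initializes shape (2, 4)
old_alpha = kv.vectors[0].copy()

kv.index_to_key.append('gamma')  # simulate vocabulary growth
kv.key_to_index['gamma'] = 2
kv.resize_vectors(seed=0)        # now (3, 4); only the new row is freshly random
assert np.array_equal(kv.vectors[0], old_alpha)  # prior rows preserved
```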
@@ -1526,7 +1520,7 @@ def save_word2vec_format(
             (in case word vectors are appended with document vectors afterwards).
         write_header : bool, optional
             If False, don't write the 1st line declaring the count of vectors and dimensions.
-        TODO: doc prefix, append, sort_attr
+        FIXME: doc prefix, append, sort_attr
         """
         if total_vec is None:
             total_vec = len(self.index_to_key)
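For the write_header flag documented above, a one-line usage sketch (filename illustrative):

```python
# write plain-text vectors without the leading "<count> <dimensions>" line
kv.save_word2vec_format('vectors.txt', binary=False, write_header=False)
```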
@@ -1918,3 +1912,21 @@ def pseudorandom_weak_vector(size, seed_string=None, hashfxn=hash):
     else:
         once = utils.default_prng
     return (once.random(size).astype(REAL) - 0.5) / size
+
+
+def prep_vectors(target_shape, prior_vectors=None, seed=0, dtype=REAL):
+    """FIXME: NAME/DOCS CHANGES PRE-4.0.0 FOR #2955/#2975 MMAP & OTHER INITIALIZATION CLEANUP WORK
+    Return a numpy array of the given shape. Reuse prior_vectors object or values
+    to the extent possible. Initialize new values randomly if requested."""
+    if prior_vectors is None:
+        prior_vectors = np.zeros((0, 0))
+    if prior_vectors.shape == target_shape:
+        return prior_vectors
+    target_count, vector_size = target_shape
+    rng = np.random.default_rng(seed=seed)  # use new instance of numpy's recommended generator/algorithm
+    new_vectors = rng.random(target_shape, dtype=dtype)  # [0.0, 1.0)
+    new_vectors *= 2.0  # [0.0, 2.0)
+    new_vectors -= 1.0  # [-1.0, 1.0)
+    new_vectors /= vector_size
+    new_vectors[0:prior_vectors.shape[0], 0:prior_vectors.shape[1]] = prior_vectors
+    return new_vectors

Review conversation on prep_vectors:

Collaborator:
This seems to be a public API function. We should document the parameters so they appear in the documentation.

Collaborator (author):
It ultimately may not be, pending the mmap work & other initialization clean-up, which might also jostle the internal names a bit. (At the moment, this is only called from resize_vectors(), which may be the preferable public entry point, because outside callers are less likely to have a prior_vectors.)

Collaborator:
OK, but this stuff will show up in the docs, right?

I can think of several better ways forward:

  • Make the docstring a code-comment so it doesn't show up in the docs
  • Mark the function as internal
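A usage sketch of prep_vectors as it stands in this diff, assuming it remains importable from gensim.models.keyedvectors (the author's reply above suggests the name/visibility may still change):

```python
import numpy as np
from gensim.models.keyedvectors import prep_vectors

old = np.zeros((2, 4), dtype=np.float32)
grown = prep_vectors((5, 4), prior_vectors=old, seed=0)

assert grown.shape == (5, 4)
assert np.array_equal(grown[:2], old)      # prior values copied over
assert np.abs(grown[2:]).max() <= 1.0 / 4  # new rows low-magnitude: [-1, 1) / vector_size
```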