Commit fa9dfcf

Initialize position-dependent weights with uniform distribution
1 parent 94a57ff commit fa9dfcf

File tree

1 file changed: +32 −31 lines changed

gensim/models/fasttext.py

Lines changed: 32 additions & 31 deletions
@@ -275,6 +275,7 @@
 
 """
 
+from math import sqrt
 import logging
 import os
 
@@ -482,7 +483,9 @@ def __init__(self, sentences=None, corpus_file=None, sg=0, hs=0, vector_size=100
         # with no eligible char-ngram lengths, no buckets need be allocated
         bucket = 0
 
-        self.wv = FastTextKeyedVectors(vector_size, position_dependent_vector_size, min_n, max_n, bucket)
+        self.wv = FastTextKeyedVectors(
+            vector_size, position_dependent_weights, position_dependent_vector_size, min_n, max_n,
+            bucket)
         self.wv.bucket = bucket
 
         super(FastText, self).__init__(
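
The call site above threads the new position_dependent_weights flag through to FastTextKeyedVectors, ahead of position_dependent_vector_size. A minimal construction sketch — the argument values are hypothetical, only the parameter order comes from this diff:

    wv = FastTextKeyedVectors(
        vector_size=100,                     # dimensionality of all vectors
        position_dependent_weights=True,     # new flag, stored on the instance
        position_dependent_vector_size=100,  # how many features are positional
        min_n=3, max_n=6,                    # char n-gram length bounds
        bucket=2000000)                      # count of n-gram hash buckets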
@@ -505,9 +508,7 @@ def prepare_weights(self, update=False):
         """
         super(FastText, self).prepare_weights(update=update)
         if not update:
-            self.wv.init_ngrams_weights(self.seed)
-            if self.position_dependent_weights:
-                self.wv.init_positional_weights(self.window)
+            self.wv.init_ngrams_weights(self.seed, self.window)
         # EXPERIMENTAL lockf feature; create minimal no-op lockf arrays (1 element of 1.0)
         # advanced users should directly resize/adjust as necessary
         self.wv.vectors_vocab_lockf = ones(1, dtype=REAL)
@@ -598,9 +599,7 @@ def build_vocab(self, corpus_iterable=None, corpus_file=None, update=False, prog
 
         """
         if not update:
-            self.wv.init_ngrams_weights(self.seed)
-            if self.position_dependent_weights:
-                self.wv.init_positional_weights(self.window)
+            self.wv.init_ngrams_weights(self.seed, self.window)
         elif not len(self.wv):
             raise RuntimeError(
                 "You cannot do an online vocabulary-update of a model which has no prior vocabulary. "
@@ -1172,7 +1171,7 @@ def save_facebook_model(model, path, encoding="utf-8", lr_update_rate=100, word_
 
 
 class FastTextKeyedVectors(KeyedVectors):
-    def __init__(self, vector_size, position_dependent_vector_size, min_n, max_n, bucket):
+    def __init__(self, vector_size, position_dependent_weights, position_dependent_vector_size, min_n, max_n, bucket):
         """Vectors and vocab for :class:`~gensim.models.fasttext.FastText`.
 
         Implements significant parts of the FastText algorithm. For example,
@@ -1188,6 +1187,8 @@ def __init__(self, vector_size, position_dependent_vector_size, min_n, max_n, bu
         ----------
         vector_size : int
             The dimensionality of all vectors.
+        position_dependent_weights : bool
+            Whether position-dependent weight vectors will also be stored.
         position_dependent_vector_size : int
             How many of the trained vector features should be
             position-dependent. Decreasing the number of position-dependent
@@ -1222,11 +1223,12 @@ def __init__(self, vector_size, position_dependent_vector_size, min_n, max_n, bu
 
         """
         super(FastTextKeyedVectors, self).__init__(vector_size=vector_size)
+        self.position_dependent_weights = position_dependent_weights
         self.position_dependent_vector_size = position_dependent_vector_size  # fka pdw_size
         self.vectors_vocab = None  # fka syn0_vocab
         self.vectors_ngrams = None  # fka syn0_ngrams
+        self.vectors_positions = None  # fka syn0_positions
         self.buckets_word = None
-        self.vectors_positions = None
         self.min_n = min_n
         self.max_n = max_n
         self.bucket = bucket  # count of buckets, fka num_ngram_vectors
@@ -1343,7 +1345,7 @@ def get_vector(self, word, use_norm=False):
         else:
             return word_vec
 
-    def init_ngrams_weights(self, seed):
+    def init_ngrams_weights(self, seed, window):
         """Initialize the vocabulary and ngrams weights prior to training.
 
         Creates the weight matrices and initializes them with uniform random values.
@@ -1352,6 +1354,8 @@ def init_ngrams_weights(self, seed):
         ----------
         seed : float
             The seed for the PRNG.
+        window : int
+            The size of the window used during training.
 
         Note
         ----
@@ -1362,10 +1366,9 @@
 
         rand_obj = np.random.default_rng(seed=seed)  # use new instance of numpy's recommended generator/algorithm
 
-        lo, hi = -1.0 / self.vector_size, 1.0 / self.vector_size
         vocab_shape = (len(self), self.vector_size)
         ngrams_shape = (self.bucket, self.vector_size)
-        self.vectors_vocab = rand_obj.uniform(lo, hi, vocab_shape).astype(REAL)
+        positions_shape = (2 * window, self.position_dependent_vector_size)
         #
         # We could have initialized vectors_ngrams at construction time, but we
         # do it here for two reasons:
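
positions_shape allocates one row per context slot: window slots to the left of the centre word and window slots to the right, hence 2 * window rows. (Reading the rows as left/right offsets is an inference from the shape; the diff itself only shows the allocation.) For example:

    window = 5
    position_dependent_vector_size = 100
    positions_shape = (2 * window, position_dependent_vector_size)  # (10, 100)
    # e.g. rows 0-4 for offsets -5..-1, rows 5-9 for offsets +1..+5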
@@ -1375,23 +1378,15 @@
         # vectors_ngrams, and vectors_vocab cannot happen at construction
         # time because the vocab is not initialized at that stage.
         #
+        if self.position_dependent_weights:
+            hi = sqrt(sqrt(3.0) / self.vector_size)
+            lo = -hi
+            self.vectors_positions = rand_obj.uniform(lo, hi, positions_shape).astype(REAL)
+        else:
+            lo, hi = -1.0 / self.vector_size, 1.0 / self.vector_size
+        self.vectors_vocab = rand_obj.uniform(lo, hi, vocab_shape).astype(REAL)
         self.vectors_ngrams = rand_obj.uniform(lo, hi, ngrams_shape).astype(REAL)
 
-    def init_positional_weights(self, window):
-        """Initialize the positional weights prior to training.
-
-        Creates the weight matrix and initializes it with uniform random values.
-
-        Parameters
-        ----------
-        window : int
-            The size of the window used during the training.
-
-        """
-
-        positional_shape = (2 * window, self.position_dependent_vector_size)
-        self.vectors_positions = np.ones(positional_shape, dtype=REAL)
-
     def update_ngrams_weights(self, seed, old_vocab_len):
         """Update the vocabulary weights for training continuation.
 
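The bound hi = sqrt(sqrt(3.0) / self.vector_size) looks odd at first, but it makes the variances work out (this derivation is my reading of the constant, not stated in the commit). A uniform(-a, a) draw has variance a**2 / 3, so the standard bound a = 1/d gives variance 1/(3*d**2). With position-dependent weights, a trained feature is effectively the elementwise product of a word feature and a positional feature, and for two independent zero-mean factors the product's variance is the product of the variances. Choosing a = sqrt(sqrt(3)/d) gives each factor variance sqrt(3)/(3*d), so the product has variance 1/(3*d**2) — exactly the standard initialization; hence the squared flag in _pad_random below. A quick numpy check:

    import numpy as np

    d = 100
    rng = np.random.default_rng(0)

    # Standard init: uniform(-1/d, 1/d) has variance (1/d)**2 / 3.
    standard = rng.uniform(-1.0 / d, 1.0 / d, 1_000_000)

    # "Squared" init: each factor is uniform(-a, a) with a = sqrt(sqrt(3)/d);
    # the elementwise product of two independent draws should match the above.
    a = np.sqrt(np.sqrt(3.0) / d)
    product = rng.uniform(-a, a, 1_000_000) * rng.uniform(-a, a, 1_000_000)

    print(standard.var(), product.var())  # both around 3.3e-05
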
@@ -1413,7 +1408,8 @@ def update_ngrams_weights(self, seed, old_vocab_len):
         rand_obj.seed(seed)
 
         new_vocab = len(self) - old_vocab_len
-        self.vectors_vocab = _pad_random(self.vectors_vocab, new_vocab, rand_obj)
+        self.vectors_vocab = _pad_random(self.vectors_vocab, new_vocab, rand_obj,
+                                         squared=self.position_dependent_weights)
 
     def init_post_load(self, fb_vectors):
         """Perform initialization after loading a native Facebook model.
@@ -1480,11 +1476,16 @@ def recalc_char_ngram_buckets(self):
         )
 
 
-def _pad_random(m, new_rows, rand):
+def _pad_random(m, new_rows, rand, squared=False):
     """Pad a matrix with additional rows filled with random values."""
     _, columns = m.shape
-    low, high = -1.0 / columns, 1.0 / columns
-    suffix = rand.uniform(low, high, (new_rows, columns)).astype(REAL)
+    shape = (new_rows, columns)
+    if squared:
+        high = sqrt(sqrt(3.0) / columns)
+        low = -high
+    else:
+        low, high = -1.0 / columns, 1.0 / columns
+    suffix = rand.uniform(low, high, shape).astype(REAL)
     return vstack([m, suffix])
 
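A hedged usage sketch of the extended helper (the matrix contents and seed are made up; update_ngrams_weights above calls rand_obj.seed(seed), so rand is the legacy numpy RandomState interface):

    import numpy as np

    vectors_vocab = np.zeros((3, 100), dtype=np.float32)  # 3 existing rows
    rand = np.random.RandomState(42)

    # Pad with 2 new rows; squared=True selects the wider sqrt(sqrt(3)/columns)
    # bound so padded rows match the position-dependent initialization above.
    padded = _pad_random(vectors_vocab, 2, rand, squared=True)
    print(padded.shape)  # (5, 100)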
