275
275
276
276
"""
277
277
278
+ from math import sqrt
278
279
import logging
279
280
import os
280
281
@@ -482,7 +483,9 @@ def __init__(self, sentences=None, corpus_file=None, sg=0, hs=0, vector_size=100
482
483
# with no eligible char-ngram lengths, no buckets need be allocated
483
484
bucket = 0
484
485
485
- self .wv = FastTextKeyedVectors (vector_size , position_dependent_vector_size , min_n , max_n , bucket )
486
+ self .wv = FastTextKeyedVectors (
487
+ vector_size , position_dependent_weights , position_dependent_vector_size , min_n , max_n ,
488
+ bucket )
486
489
self .wv .bucket = bucket
487
490
488
491
super (FastText , self ).__init__ (
@@ -505,9 +508,7 @@ def prepare_weights(self, update=False):
505
508
"""
506
509
super (FastText , self ).prepare_weights (update = update )
507
510
if not update :
508
- self .wv .init_ngrams_weights (self .seed )
509
- if self .position_dependent_weights :
510
- self .wv .init_positional_weights (self .window )
511
+ self .wv .init_ngrams_weights (self .seed , self .window )
511
512
# EXPERIMENTAL lockf feature; create minimal no-op lockf arrays (1 element of 1.0)
512
513
# advanced users should directly resize/adjust as necessary
513
514
self .wv .vectors_vocab_lockf = ones (1 , dtype = REAL )
@@ -598,9 +599,7 @@ def build_vocab(self, corpus_iterable=None, corpus_file=None, update=False, prog
598
599
599
600
"""
600
601
if not update :
601
- self .wv .init_ngrams_weights (self .seed )
602
- if self .position_dependent_weights :
603
- self .wv .init_positional_weights (self .window )
602
+ self .wv .init_ngrams_weights (self .seed , self .window )
604
603
elif not len (self .wv ):
605
604
raise RuntimeError (
606
605
"You cannot do an online vocabulary-update of a model which has no prior vocabulary. "
@@ -1172,7 +1171,7 @@ def save_facebook_model(model, path, encoding="utf-8", lr_update_rate=100, word_
1172
1171
1173
1172
1174
1173
class FastTextKeyedVectors (KeyedVectors ):
1175
- def __init__ (self , vector_size , position_dependent_vector_size , min_n , max_n , bucket ):
1174
+ def __init__ (self , vector_size , position_dependent_weights , position_dependent_vector_size , min_n , max_n , bucket ):
1176
1175
"""Vectors and vocab for :class:`~gensim.models.fasttext.FastText`.
1177
1176
1178
1177
Implements significant parts of the FastText algorithm. For example,
@@ -1188,6 +1187,8 @@ def __init__(self, vector_size, position_dependent_vector_size, min_n, max_n, bu
1188
1187
----------
1189
1188
vector_size : int
1190
1189
The dimensionality of all vectors.
1190
+ position_dependent_weights : bool
1191
+ Whether position-dependent weight vectors will also be stored.
1191
1192
position_dependent_vector_size : int
1192
1193
How many features of the trained vector features should be
1193
1194
position-dependent. Decreasing the number of position-dependent
@@ -1222,11 +1223,12 @@ def __init__(self, vector_size, position_dependent_vector_size, min_n, max_n, bu
1222
1223
1223
1224
"""
1224
1225
super (FastTextKeyedVectors , self ).__init__ (vector_size = vector_size )
1226
+ self .position_dependent_weights = position_dependent_weights
1225
1227
self .position_dependent_vector_size = position_dependent_vector_size # fka pdw_size
1226
1228
self .vectors_vocab = None # fka syn0_vocab
1227
1229
self .vectors_ngrams = None # fka syn0_ngrams
1230
+ self .vectors_positions = None # fka syn0_positions
1228
1231
self .buckets_word = None
1229
- self .vectors_positions = None
1230
1232
self .min_n = min_n
1231
1233
self .max_n = max_n
1232
1234
self .bucket = bucket # count of buckets, fka num_ngram_vectors
@@ -1343,7 +1345,7 @@ def get_vector(self, word, use_norm=False):
1343
1345
else :
1344
1346
return word_vec
1345
1347
1346
- def init_ngrams_weights (self , seed ):
1348
+ def init_ngrams_weights (self , seed , window ):
1347
1349
"""Initialize the vocabulary and ngrams weights prior to training.
1348
1350
1349
1351
Creates the weight matrices and initializes them with uniform random values.
@@ -1352,6 +1354,8 @@ def init_ngrams_weights(self, seed):
1352
1354
----------
1353
1355
seed : int
1354
1356
The seed for the PRNG.
1357
+ window : int
1358
+ The size of the window used during training.
1355
1359
1356
1360
Note
1357
1361
----
@@ -1362,10 +1366,9 @@ def init_ngrams_weights(self, seed):
1362
1366
1363
1367
rand_obj = np .random .default_rng (seed = seed ) # use new instance of numpy's recommended generator/algorithm
1364
1368
1365
- lo , hi = - 1.0 / self .vector_size , 1.0 / self .vector_size
1366
1369
vocab_shape = (len (self ), self .vector_size )
1367
1370
ngrams_shape = (self .bucket , self .vector_size )
1368
- self . vectors_vocab = rand_obj . uniform ( lo , hi , vocab_shape ). astype ( REAL )
1371
+ positions_shape = ( 2 * window , self . position_dependent_vector_size )
1369
1372
#
1370
1373
# We could have initialized vectors_ngrams at construction time, but we
1371
1374
# do it here for two reasons:
@@ -1375,23 +1378,15 @@ def init_ngrams_weights(self, seed):
1375
1378
# vectors_ngrams, and vectors_vocab cannot happen at construction
1376
1379
# time because the vocab is not initialized at that stage.
1377
1380
#
1381
+ if self .position_dependent_weights :
1382
+ hi = sqrt (sqrt (3.0 ) / self .vector_size )
1383
+ lo = - hi
1384
+ self .vectors_positions = rand_obj .uniform (lo , hi , positions_shape ).astype (REAL )
1385
+ else :
1386
+ lo , hi = - 1.0 / self .vector_size , 1.0 / self .vector_size
1387
+ self .vectors_vocab = rand_obj .uniform (lo , hi , vocab_shape ).astype (REAL )
1378
1388
self .vectors_ngrams = rand_obj .uniform (lo , hi , ngrams_shape ).astype (REAL )
1379
1389
1380
- def init_positional_weights (self , window ):
1381
- """Initialize the positional weights prior to training.
1382
-
1383
- Creates the weight matrix and initializes it with uniform random values.
1384
-
1385
- Parameters
1386
- ----------
1387
- window : int
1388
- The size of the window used during the training.
1389
-
1390
- """
1391
-
1392
- positional_shape = (2 * window , self .position_dependent_vector_size )
1393
- self .vectors_positions = np .ones (positional_shape , dtype = REAL )
1394
-
1395
1390
def update_ngrams_weights (self , seed , old_vocab_len ):
1396
1391
"""Update the vocabulary weights for training continuation.
1397
1392
@@ -1413,7 +1408,8 @@ def update_ngrams_weights(self, seed, old_vocab_len):
1413
1408
rand_obj .seed (seed )
1414
1409
1415
1410
new_vocab = len (self ) - old_vocab_len
1416
- self .vectors_vocab = _pad_random (self .vectors_vocab , new_vocab , rand_obj )
1411
+ self .vectors_vocab = _pad_random (self .vectors_vocab , new_vocab , rand_obj ,
1412
+ squared = self .position_dependent_weights )
1417
1413
1418
1414
def init_post_load (self , fb_vectors ):
1419
1415
"""Perform initialization after loading a native Facebook model.
@@ -1480,11 +1476,16 @@ def recalc_char_ngram_buckets(self):
1480
1476
)
1481
1477
1482
1478
1483
- def _pad_random (m , new_rows , rand ):
1479
+ def _pad_random (m , new_rows , rand , squared = False ):
1484
1480
"""Pad a matrix with additional rows filled with random values."""
1485
1481
_ , columns = m .shape
1486
- low , high = - 1.0 / columns , 1.0 / columns
1487
- suffix = rand .uniform (low , high , (new_rows , columns )).astype (REAL )
1482
+ shape = (new_rows , columns )
1483
+ if squared :
1484
+ high = sqrt (sqrt (3.0 ) / columns )
1485
+ low = - high
1486
+ else :
1487
+ low , high = - 1.0 / columns , 1.0 / columns
1488
+ suffix = rand .uniform (low , high , shape ).astype (REAL )
1488
1489
return vstack ([m , suffix ])
1489
1490
1490
1491
0 commit comments