Commit dc56ae2

Support position-dependent weighting with fastText CBOW and negatives
1 parent c0e0169 commit dc56ae2

File tree

3 files changed: +84 -21 lines


gensim/models/fasttext.py

Lines changed: 39 additions & 3 deletions
@@ -312,7 +312,7 @@ def __init__(self, sentences=None, corpus_file=None, sg=0, hs=0, vector_size=100
                  max_vocab_size=None, word_ngrams=1, sample=1e-3, seed=1, workers=3, min_alpha=0.0001,
                  negative=5, ns_exponent=0.75, cbow_mean=1, hashfxn=hash, epochs=5, null_word=0, min_n=3, max_n=6,
                  sorted_vocab=1, bucket=2000000, trim_rule=None, batch_words=MAX_WORDS_IN_BATCH, callbacks=(),
-                 max_final_vocab=None):
+                 max_final_vocab=None, position_dependent_weights=0):
         """Train, use and evaluate word representations learned using the method
         described in `Enriching Word Vectors with Subword Information <https://arxiv.org/abs/1607.04606>`_,
         aka FastText.
@@ -421,6 +421,14 @@ def __init__(self, sentences=None, corpus_file=None, sg=0, hs=0, vector_size=100
             ``min_count``. If the specified ``min_count`` is more than the
             automatically calculated ``min_count``, the former will be used.
             Set to ``None`` if not required.
+        position_dependent_weights : {1,0}, optional
+            If 1, positional vectors are computed in addition to the word and n-gram vectors and are used
+            to weight the context words during training; if 0, all context words are weighted uniformly.
+
+            Notes
+            -----
+            Positional vectors are only implemented for CBOW with negative sampling, not SG or hierarchical softmax.
+            Locking positional vectors is not supported. The implementation does not use BLAS primitives.

         Examples
         --------
@@ -451,6 +459,10 @@ def __init__(self, sentences=None, corpus_file=None, sg=0, hs=0, vector_size=100
         self.callbacks = callbacks
         if word_ngrams != 1:
             raise NotImplementedError("Gensim's FastText implementation does not yet support word_ngrams != 1.")
+        if position_dependent_weights and (sg or hs):
+            raise NotImplementedError("Gensim's FastText implementation does not yet support position-dependent "
+                                      "weighting with SG or hierarchical softmax.")
+        self.position_dependent_weights = position_dependent_weights
         self.word_ngrams = word_ngrams
         if max_n < min_n:
             # with no eligible char-ngram lengths, no buckets need be allocated
@@ -468,7 +480,8 @@ def __init__(self, sentences=None, corpus_file=None, sg=0, hs=0, vector_size=100
             seed=seed, hs=hs, negative=negative, cbow_mean=cbow_mean, min_alpha=min_alpha)

     def prepare_weights(self, update=False):
-        """In addition to superclass allocations, compute ngrams of all words present in vocabulary.
+        """In addition to superclass allocations, compute ngrams of all words present in vocabulary
+        and initialize positional vectors.

         Parameters
         ----------
@@ -479,6 +492,8 @@ def prepare_weights(self, update=False):
         super(FastText, self).prepare_weights(update=update)
         if not update:
             self.wv.init_ngrams_weights(self.seed)
+            if self.position_dependent_weights:
+                self.wv.init_positional_weights(self.seed, self.window)
         # EXPERIMENTAL lockf feature; create minimal no-op lockf arrays (1 element of 1.0)
         # advanced users should directly resize/adjust as necessary
         self.wv.vectors_vocab_lockf = ones(1, dtype=REAL)
@@ -570,6 +585,8 @@ def build_vocab(self, corpus_iterable=None, corpus_file=None, update=False, prog
         """
         if not update:
             self.wv.init_ngrams_weights(self.seed)
+            if self.position_dependent_weights:
+                self.wv.init_positional_weights(self.seed, self.window)
         elif not len(self.wv):
             raise RuntimeError(
                 "You cannot do an online vocabulary-update of a model which has no prior vocabulary. "
@@ -1190,6 +1207,7 @@ def __init__(self, vector_size, min_n, max_n, bucket):
         self.vectors_vocab = None  # fka syn0_vocab
         self.vectors_ngrams = None  # fka syn0_ngrams
         self.buckets_word = None
+        self.vectors_positions = None
         self.min_n = min_n
         self.max_n = max_n
         self.bucket = bucket  # count of buckets, fka num_ngram_vectors
@@ -1329,7 +1347,6 @@ def init_ngrams_weights(self, seed):
         vocab_shape = (len(self), self.vector_size)
         ngrams_shape = (self.bucket, self.vector_size)
         self.vectors_vocab = rand_obj.uniform(lo, hi, vocab_shape).astype(REAL)
-
         #
         # We could have initialized vectors_ngrams at construction time, but we
         # do it here for two reasons:
@@ -1341,6 +1358,25 @@
         #
         self.vectors_ngrams = rand_obj.uniform(lo, hi, ngrams_shape).astype(REAL)

+    def init_positional_weights(self, seed, window):
+        """Initialize the positional weights prior to training.
+
+        Creates the weight matrix and initializes it with uniform random values.
+
+        Parameters
+        ----------
+        seed : float
+            The seed for the PRNG.
+        window : int
+            The size of the window used during training.
+
+        """
+        rand_obj = np.random.default_rng(seed=seed)  # use new instance of numpy's recommended generator/algorithm
+
+        lo, hi = -1.0 / self.vector_size, 1.0 / self.vector_size
+        positional_shape = (2 * window, self.vector_size)
+        self.vectors_positions = rand_obj.uniform(lo, hi, positional_shape).astype(REAL)
+
     def update_ngrams_weights(self, seed, old_vocab_len):
         """Update the vocabulary weights for training continuation.

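For reference, a minimal usage sketch of the new flag, assuming a build of this branch (the corpus and the tiny hyperparameters are arbitrary; common_texts is gensim's bundled toy corpus):

from gensim.models import FastText
from gensim.test.utils import common_texts

# CBOW (sg=0) with negative sampling (hs=0, negative > 0) is the only mode the
# guard above permits together with position-dependent weighting.
model = FastText(
    sentences=common_texts,
    vector_size=12,
    window=3,
    min_count=1,
    sg=0,
    hs=0,
    negative=5,
    position_dependent_weights=1,
)

# init_positional_weights allocates one positional vector per context slot.
assert model.wv.vectors_positions.shape == (2 * model.window, model.vector_size)
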
gensim/models/fasttext_inner.pxd

Lines changed: 6 additions & 5 deletions
@@ -46,17 +46,18 @@ cdef struct FastTextConfig:
     #
     # Model parameters. These get copied as-is from the Python model.
     #
-    int sg, hs, negative, sample, size, window, cbow_mean, workers
+    int sg, hs, pdw, negative, sample, size, window, cbow_mean, workers
     REAL_t alpha

     #
-    # The syn0_vocab and syn0_ngrams arrays store vectors for vocabulary terms
-    # and ngrams, respectively, as 1D arrays in scanline order. For example,
-    # syn0_vocab[i * size : (i + 1) * size] contains the elements for the ith
-    # vocab term.
+    # The syn0_vocab, syn0_ngrams, and syn0_positions arrays store vectors for
+    # vocabulary terms, ngrams, and positions, respectively, as 1D arrays in
+    # scanline order. For example, syn0_vocab[i * size : (i + 1) * size]
+    # contains the elements for the ith vocab term.
     #
     REAL_t *syn0_vocab
     REAL_t *syn0_ngrams
+    REAL_t *syn0_positions

     #
     # EXPERIMENTAL
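The scanline-order comment above is the key to reading the indexing in the Cython code; a small NumPy sketch, with illustrative names:

import numpy as np

size = 4                                 # vector dimensionality, c.size in the struct
matrix = np.arange(3 * size, dtype=np.float32).reshape(3, size)
flat = matrix.ravel()                    # the flat 1D view the C code receives

# Row i of the logical matrix lives at flat[i * size : (i + 1) * size], which is
# exactly how syn0_vocab, syn0_ngrams, and syn0_positions are indexed below.
i = 2
assert np.array_equal(flat[i * size:(i + 1) * size], matrix[i])
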

gensim/models/fasttext_inner.pyx

Lines changed: 39 additions & 13 deletions
@@ -242,22 +242,32 @@ cdef void fasttext_fast_sentence_cbow_neg(FastTextConfig *c, int i, int j, int k

     cdef long long row2
     cdef unsigned long long modulo = 281474976710655ULL
-    cdef REAL_t f, g, count, inv_count = 1.0, label, f_dot
+    cdef REAL_t f, g, count, inv_count = 1.0, label, f_dot, positional_feature
     cdef np.uint32_t target_index, word_index
-    cdef int d, m
+    cdef int d, m, n, o

     word_index = c.indexes[i]

     memset(c.neu1, 0, c.size * cython.sizeof(REAL_t))
     count = <REAL_t>0.0
+    n = j - i + c.window
     for m in range(j, k):
         if m == i:
             continue
         count += ONEF
-        our_saxpy(&c.size, &ONEF, &c.syn0_vocab[c.indexes[m] * c.size], &ONE, c.neu1, &ONE)
-        for d in range(c.subwords_idx_len[m]):
-            count += ONEF
-            our_saxpy(&c.size, &ONEF, &c.syn0_ngrams[c.subwords_idx[m][d] * c.size], &ONE, c.neu1, &ONE)
+        if c.pdw:
+            for d in range(c.size):  # TODO make into a Hadamard product using a BLAS primitive: DSBMV, followed by SAXPY
+                c.neu1[d] += c.syn0_vocab[c.indexes[m] * c.size + d] * c.syn0_positions[n * c.size + d]
+            for o in range(c.subwords_idx_len[m]):
+                count += ONEF
+                for d in range(c.size):  # TODO make into a Hadamard product using a BLAS primitive: DSBMV, followed by SAXPY
+                    c.neu1[d] += c.syn0_ngrams[c.subwords_idx[m][o] * c.size + d] * c.syn0_positions[n * c.size + d]
+        else:
+            our_saxpy(&c.size, &ONEF, &c.syn0_vocab[c.indexes[m] * c.size], &ONE, c.neu1, &ONE)
+            for o in range(c.subwords_idx_len[m]):
+                count += ONEF
+                our_saxpy(&c.size, &ONEF, &c.syn0_ngrams[c.subwords_idx[m][o] * c.size], &ONE, c.neu1, &ONE)
+        n += 1

     if count > (<REAL_t>0.5):
         inv_count = ONEF / count
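In the c.pdw branch above, the word and n-gram vectors of context slot n are multiplied elementwise (a Hadamard product) by that slot's positional vector before being accumulated into the hidden layer neu1. A NumPy sketch of the same accumulation, with illustrative names:

import numpy as np

def pdw_cbow_hidden(word_vecs, ngram_vecs, positions, cbow_mean=True):
    # word_vecs[n]: vector of the word in context slot n
    # ngram_vecs[n]: list of vectors for that word's char n-grams
    # positions[n]: positional vector of slot n (row n of vectors_positions)
    neu1 = np.zeros_like(positions[0])
    count = 0.0
    for n, (wv, ngrams) in enumerate(zip(word_vecs, ngram_vecs)):
        count += 1.0
        neu1 += wv * positions[n]        # Hadamard product, then accumulate
        for gv in ngrams:
            count += 1.0
            neu1 += gv * positions[n]
    if cbow_mean and count > 0.5:
        neu1 /= count                    # the inv_count averaging
    return neu1
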
@@ -293,16 +303,29 @@ cdef void fasttext_fast_sentence_cbow_neg(FastTextConfig *c, int i, int j, int k
     if not c.cbow_mean:  # divide error over summed window vectors
         sscal(&c.size, &inv_count, c.work, &ONE)

-    for m in range(j,k):
+    n = j - i + c.window
+    for m in range(j, k):
         if m == i:
             continue
-        our_saxpy(
-            &c.size, &c.vocab_lockf[c.indexes[m] % c.vocab_lockf_len], c.work, &ONE,
-            &c.syn0_vocab[c.indexes[m]*c.size], &ONE)
-        for d in range(c.subwords_idx_len[m]):
+        if c.pdw:
+            for d in range(c.size):  # TODO make into a Hadamard product using a BLAS primitive: DSBMV, followed by SAXPY
+                positional_feature = c.syn0_positions[n * c.size + d]
+                c.syn0_positions[n * c.size + d] += c.work[d] * c.syn0_vocab[c.indexes[m] * c.size + d]
+                c.syn0_vocab[c.indexes[m] * c.size + d] += c.vocab_lockf[c.indexes[m] % c.vocab_lockf_len] * c.work[d] * positional_feature
+            for o in range(c.subwords_idx_len[m]):
+                for d in range(c.size):  # TODO make into two Hadamard products using a BLAS primitive: DSBMV, followed by SAXPY
+                    positional_feature = c.syn0_positions[n * c.size + d]
+                    c.syn0_positions[n * c.size + d] += c.work[d] * c.syn0_ngrams[c.subwords_idx[m][o] * c.size + d]
+                    c.syn0_ngrams[c.subwords_idx[m][o] * c.size + d] += c.ngrams_lockf[c.subwords_idx[m][o] % c.ngrams_lockf_len] * c.work[d] * positional_feature
+        else:
             our_saxpy(
-                &c.size, &c.ngrams_lockf[c.subwords_idx[m][d] % c.ngrams_lockf_len], c.work, &ONE,
-                &c.syn0_ngrams[c.subwords_idx[m][d]*c.size], &ONE)
+                &c.size, &c.vocab_lockf[c.indexes[m] % c.vocab_lockf_len], c.work, &ONE,
+                &c.syn0_vocab[c.indexes[m] * c.size], &ONE)
+            for o in range(c.subwords_idx_len[m]):
+                our_saxpy(
+                    &c.size, &c.ngrams_lockf[c.subwords_idx[m][o] % c.ngrams_lockf_len], c.work, &ONE,
+                    &c.syn0_ngrams[c.subwords_idx[m][o] * c.size], &ONE)
+        n += 1


 cdef void fasttext_fast_sentence_cbow_hs(FastTextConfig *c, int i, int j, int k) nogil:
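The update step mirrors the forward pass: the scaled error in work flows into both the positional vector of slot n and the word (or n-gram) vector, and the word-side update deliberately uses the pre-update positional values, which is what the positional_feature temporary preserves. A per-slot NumPy sketch, with illustrative names:

import numpy as np

def pdw_update(work, word_vec, position, lockf=1.0):
    # work: error vector for this training example; arrays are updated in place
    old_position = position.copy()             # plays the role of positional_feature
    position += work * word_vec                # gradient step for the positional vector
    word_vec += lockf * work * old_position    # gradient step for the word vector
    return word_vec, position
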
@@ -398,9 +421,12 @@ cdef void init_ft_config(FastTextConfig *c, model, alpha, _work, _neu1):
     c.cbow_mean = model.cbow_mean
     c.window = model.window
     c.workers = model.workers
+    c.pdw = model.position_dependent_weights

     c.syn0_vocab = <REAL_t *>(np.PyArray_DATA(model.wv.vectors_vocab))
     c.syn0_ngrams = <REAL_t *>(np.PyArray_DATA(model.wv.vectors_ngrams))
+    if c.pdw:
+        c.syn0_positions = <REAL_t *>(np.PyArray_DATA(model.wv.vectors_positions))

     # EXPERIMENTAL lockf scaled suppression/enablement of training
     c.vocab_lockf = <REAL_t *>(np.PyArray_DATA(model.wv.vectors_vocab_lockf))
