#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Authors: Chinmaya Pancholi <chinmayapancholi13@gmail.com>, Shiva Manne <s.manne@rare-technologies.com>
# Copyright (C) 2017 RaRe Technologies s.r.o.

"""
Warnings
--------
.. deprecated:: 3.3.0
    Use :mod:`gensim.models.fasttext` instead.


Learn word representations via fastText's skip-gram and CBOW models, using either
hierarchical softmax or negative sampling [1]_.

Notes
-----
There are more ways to get word vectors in Gensim than just FastText: see the wrappers for
VarEmbed and WordRank, or :mod:`~gensim.models.word2vec`.

This module allows training a word embedding from a training corpus, with the additional ability
to obtain word vectors for out-of-vocabulary words.

For a tutorial on Gensim's native fastText implementation, refer to the notebook [2]_.

**Make sure you have a C compiler before installing Gensim, to use the optimized (compiled) fastText training.**

.. [1] P. Bojanowski, E. Grave, A. Joulin, T. Mikolov
       Enriching Word Vectors with Subword Information. In arXiv preprint arXiv:1607.04606.
       https://arxiv.org/abs/1607.04606

.. [2] https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/FastText_Tutorial.ipynb

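Examples
--------
A minimal sketch of training this (deprecated) implementation on a toy corpus and querying an
out-of-vocabulary word; the corpus and the query word below are purely illustrative:

>>> from gensim.models.deprecated.fasttext import FastText
>>> sentences = [["cat", "say", "meow"], ["dog", "say", "woof"]]
>>> model = FastText(sentences, min_count=1)
>>> oov_vector = model['woofing']  # built from the character ngrams of an out-of-vocabulary word
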
"""

import logging

import numpy as np
from numpy import zeros, ones, vstack, sum as np_sum, empty, float32 as REAL

from gensim.models.deprecated.word2vec import Word2Vec, train_sg_pair, train_cbow_pair
from gensim.models.deprecated.fasttext_wrapper import FastTextKeyedVectors
from gensim.models.deprecated.fasttext_wrapper import FastText as Ft_Wrapper, compute_ngrams, ft_hash
from gensim.models.fasttext import FastText as NewFastText

logger = logging.getLogger(__name__)

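# FAST_VERSION = -1 conventionally marks that no optimized (compiled) routines are in use;
# this deprecated module hard-codes it, matching the pure-Python training functions defined below.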
FAST_VERSION = -1
MAX_WORDS_IN_BATCH = 10000


def load_old_fasttext(*args, **kwargs):
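    """Load a model saved with this deprecated :class:`FastText` implementation and convert it
    to a current :class:`gensim.models.fasttext.FastText` instance.

    All positional and keyword arguments are forwarded to :meth:`FastText.load`. Hyperparameters,
    trainable weights and vocabulary attributes are copied over to the new model.

    Returns
    -------
    :class:`gensim.models.fasttext.FastText`
        The converted model.

    """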
    old_model = FastText.load(*args, **kwargs)
    params = {
        'size': old_model.vector_size,
        'alpha': old_model.alpha,
        'window': old_model.window,
        'min_count': old_model.min_count,
        'max_vocab_size': old_model.__dict__.get('max_vocab_size', None),
        'sample': old_model.sample,
        'seed': old_model.seed,
        'workers': old_model.workers,
        'min_alpha': old_model.min_alpha,
        'sg': old_model.sg,
        'hs': old_model.hs,
        'negative': old_model.negative,
        'cbow_mean': old_model.cbow_mean,
        'hashfxn': old_model.hashfxn,
        'iter': old_model.iter,
        'null_word': old_model.null_word,
        'sorted_vocab': old_model.sorted_vocab,
        'batch_words': old_model.batch_words,
        'min_n': old_model.min_n,
        'max_n': old_model.max_n,
        'word_ngrams': old_model.word_ngrams,
        'bucket': old_model.bucket
    }
    new_model = NewFastText(**params)
    # set trainables attributes
    new_model.wv.vectors = old_model.wv.syn0
    new_model.wv.vectors_vocab = old_model.wv.syn0_vocab
    new_model.wv.vectors_ngrams = old_model.wv.syn0_ngrams
    if hasattr(old_model.wv, 'syn0norm'):
        new_model.wv.vectors_norm = old_model.wv.syn0norm
    if hasattr(old_model, 'syn1'):
        new_model.trainables.syn1 = old_model.syn1
    if hasattr(old_model, 'syn1neg'):
        new_model.trainables.syn1neg = old_model.syn1neg
    if hasattr(old_model, 'syn0_lockf'):
        new_model.trainables.vectors_lockf = old_model.syn0_lockf

    if hasattr(old_model, 'syn0_vocab_lockf'):
        new_model.trainables.vectors_vocab_lockf = old_model.syn0_vocab_lockf
    if hasattr(old_model, 'syn0_ngrams_lockf'):
        new_model.trainables.vectors_ngrams_lockf = old_model.syn0_ngrams_lockf
    if hasattr(old_model.wv, 'syn0_vocab_norm'):
        new_model.trainables.vectors_vocab_norm = old_model.wv.syn0_vocab_norm
    if hasattr(old_model.wv, 'syn0_ngrams_norm'):
        new_model.trainables.vectors_ngrams_norm = old_model.wv.syn0_ngrams_norm

    # set vocabulary attributes
    new_model.wv.vocab = old_model.wv.vocab
    new_model.wv.index2word = old_model.wv.index2word
    new_model.vocabulary.cum_table = old_model.cum_table

    new_model.wv.hash2index = old_model.wv.hash2index

    new_model.train_count = old_model.train_count
    new_model.corpus_count = old_model.corpus_count
    new_model.running_training_loss = old_model.running_training_loss
    new_model.total_train_time = old_model.total_train_time
    new_model.min_alpha_yet_reached = old_model.min_alpha_yet_reached
    new_model.model_trimmed_post_training = old_model.model_trimmed_post_training

    new_model.trainables.num_ngram_vectors = old_model.num_ngram_vectors

    return new_model


def train_batch_cbow(model, sentences, alpha, work=None, neu1=None):
    """Update the CBOW model by training on a sequence of sentences.

    Each sentence is a list of string tokens, which are looked up in the model's
    vocab dictionary. Called internally from :meth:`gensim.models.fasttext.FastText.train()`.

    This is the non-optimized, pure Python version. If you have cython installed, gensim
    will use the optimized version from fasttext_inner instead.

    Parameters
    ----------
    model : :class:`~gensim.models.fasttext.FastText`
        `FastText` instance.
    sentences : iterable of iterables
        Iterable of the sentences directly from disk/network.
    alpha : float
        Learning rate.
    work : :class:`numpy.ndarray`
        Private working memory for each worker.
    neu1 : :class:`numpy.ndarray`
        Private working memory for each worker.

    Returns
    -------
    int
        Effective number of words trained.

    """
    result = 0
    for sentence in sentences:
        word_vocabs = [model.wv.vocab[w] for w in sentence if w in model.wv.vocab and
                       model.wv.vocab[w].sample_int > model.random.rand() * 2**32]
        for pos, word in enumerate(word_vocabs):
            reduced_window = model.random.randint(model.window)
            start = max(0, pos - model.window + reduced_window)
            window_pos = enumerate(word_vocabs[start:(pos + model.window + 1 - reduced_window)], start)
            word2_indices = [word2.index for pos2, word2 in window_pos if (word2 is not None and pos2 != pos)]

            word2_subwords = []
            vocab_subwords_indices = []
            ngrams_subwords_indices = []

            for index in word2_indices:
                vocab_subwords_indices += [index]
                word2_subwords += model.wv.ngrams_word[model.wv.index2word[index]]

            for subword in word2_subwords:
                ngrams_subwords_indices.append(model.wv.ngrams[subword])

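            # The CBOW context representation is the sum of two parts: the full-word
            # ("vocab") vectors of the context words and the vectors of all their
            # character ngrams; with cbow_mean=1 the sum is averaged below.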
            l1_vocab = np_sum(model.wv.syn0_vocab[vocab_subwords_indices], axis=0)  # 1 x vector_size
            l1_ngrams = np_sum(model.wv.syn0_ngrams[ngrams_subwords_indices], axis=0)  # 1 x vector_size

            l1 = np_sum([l1_vocab, l1_ngrams], axis=0)
            subwords_indices = [vocab_subwords_indices] + [ngrams_subwords_indices]
            if (subwords_indices[0] or subwords_indices[1]) and model.cbow_mean:
                l1 /= (len(subwords_indices[0]) + len(subwords_indices[1]))

            # train on the sliding window for target word
            train_cbow_pair(model, word, subwords_indices, l1, alpha, is_ft=True)
        result += len(word_vocabs)
    return result


def train_batch_sg(model, sentences, alpha, work=None, neu1=None):
    """Update the skip-gram model by training on a sequence of sentences.

    Each sentence is a list of string tokens, which are looked up in the model's
    vocab dictionary. Called internally from :meth:`gensim.models.fasttext.FastText.train()`.

    This is the non-optimized, pure Python version. If you have cython installed, gensim
    will use the optimized version from fasttext_inner instead.

    Parameters
    ----------
    model : :class:`~gensim.models.fasttext.FastText`
        `FastText` instance.
    sentences : iterable of iterables
        Iterable of the sentences directly from disk/network.
    alpha : float
        Learning rate.
    work : :class:`numpy.ndarray`
        Private working memory for each worker.
    neu1 : :class:`numpy.ndarray`
        Private working memory for each worker.

    Returns
    -------
    int
        Effective number of words trained.

    """
    result = 0
    for sentence in sentences:
        word_vocabs = [model.wv.vocab[w] for w in sentence if w in model.wv.vocab and
                       model.wv.vocab[w].sample_int > model.random.rand() * 2**32]
        for pos, word in enumerate(word_vocabs):
            reduced_window = model.random.randint(model.window)  # `b` in the original word2vec code
            # now go over all words from the (reduced) window, predicting each one in turn
            start = max(0, pos - model.window + reduced_window)

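            # In skip-gram the input representation of the current word is its own
            # ("vocab") index plus the indices of all of its character ngrams; each
            # word in the reduced context window is then predicted from that input.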
            subwords_indices = [word.index]
            word2_subwords = model.wv.ngrams_word[model.wv.index2word[word.index]]

            for subword in word2_subwords:
                subwords_indices.append(model.wv.ngrams[subword])

            for pos2, word2 in enumerate(word_vocabs[start:(pos + model.window + 1 - reduced_window)], start):
                if pos2 != pos:  # don't train on the `word` itself
                    train_sg_pair(model, model.wv.index2word[word2.index], subwords_indices, alpha, is_ft=True)

        result += len(word_vocabs)
    return result


class FastText(Word2Vec):
    """Class for training, using and evaluating word representations learned using the method
    described in [1]_, aka FastText.

    The model can be stored/loaded via its :meth:`~gensim.models.fasttext.FastText.save()` and
    :meth:`~gensim.models.fasttext.FastText.load()` methods, or loaded in a format compatible with the original
    fasttext implementation via :meth:`~gensim.models.fasttext.FastText.load_fasttext_format()`.

    """
    def __init__(
            self, sentences=None, sg=0, hs=0, size=100, alpha=0.025, window=5, min_count=5,
            max_vocab_size=None, word_ngrams=1, sample=1e-3, seed=1, workers=3, min_alpha=0.0001,
            negative=5, cbow_mean=1, hashfxn=hash, iter=5, null_word=0, min_n=3, max_n=6, sorted_vocab=1,
            bucket=2000000, trim_rule=None, batch_words=MAX_WORDS_IN_BATCH):
        """Initialize the model from an iterable of `sentences`. Each sentence is a
        list of words (unicode strings) that will be used for training.

        Parameters
        ----------
        sentences : iterable of iterables
            The `sentences` iterable can be simply a list of lists of tokens, but for larger corpora,
            consider an iterable that streams the sentences directly from disk/network.
            See :class:`~gensim.models.word2vec.BrownCorpus`, :class:`~gensim.models.word2vec.Text8Corpus`
            or :class:`~gensim.models.word2vec.LineSentence` in :mod:`~gensim.models.word2vec` module for such
            examples.
            If you don't supply `sentences`, the model is left uninitialized -- use this if you plan to
            initialize it in some other way.
        sg : int {1, 0}
            Defines the training algorithm. If 1, skip-gram is used; otherwise, CBOW is employed.
        size : int
            Dimensionality of the feature vectors.
        window : int
            The maximum distance between the current and predicted word within a sentence.
        alpha : float
            The initial learning rate.
        min_alpha : float
            Learning rate will linearly drop to `min_alpha` as training progresses.
        seed : int
            Seed for the random number generator. Initial vectors for each word are seeded with a hash of
            the concatenation of word + `str(seed)`. Note that for a fully deterministically-reproducible run,
            you must also limit the model to a single worker thread (`workers=1`), to eliminate ordering jitter
            from OS thread scheduling. (In Python 3, reproducibility between interpreter launches also requires
            use of the `PYTHONHASHSEED` environment variable to control hash randomization).
        min_count : int
            Ignores all words with total frequency lower than this.
        max_vocab_size : int
            Limits the RAM during vocabulary building; if there are more unique
            words than this, then prune the infrequent ones. Every 10 million word types need about 1GB of RAM.
            Set to `None` for no limit.
        sample : float
            The threshold for configuring which higher-frequency words are randomly downsampled;
            useful range is (0, 1e-5).
        workers : int
            Use this many worker threads to train the model (=faster training with multicore machines).
        hs : int {1,0}
            If 1, hierarchical softmax will be used for model training.
            If set to 0, and `negative` is non-zero, negative sampling will be used.
        negative : int
            If > 0, negative sampling will be used; the int for negative specifies how many "noise words"
            should be drawn (usually between 5-20).
            If set to 0, no negative sampling is used.
        cbow_mean : int {1,0}
            If 0, use the sum of the context word vectors. If 1, use the mean; only applies when CBOW is used.
        hashfxn : function
            Hash function to use to randomly initialize weights, for increased training reproducibility.
        iter : int
            Number of iterations (epochs) over the corpus.
        trim_rule : function
            Vocabulary trimming rule, specifies whether certain words should remain in the vocabulary,
            be trimmed away, or handled using the default (discard if word count < min_count).
            Can be None (min_count will be used, look to :func:`~gensim.utils.keep_vocab_item`),
            or a callable that accepts parameters (word, count, min_count) and returns either
            :attr:`gensim.utils.RULE_DISCARD`, :attr:`gensim.utils.RULE_KEEP` or :attr:`gensim.utils.RULE_DEFAULT`.
            Note: The rule, if given, is only used to prune vocabulary during build_vocab() and is not stored as
            part of the model.
        sorted_vocab : int {1,0}
            If 1, sort the vocabulary by descending frequency before assigning word indexes.
        batch_words : int
            Target size (in words) for batches of examples passed to worker threads (and
            thus cython routines). (Larger batches will be passed if individual
            texts are longer than 10000 words, but the standard cython code truncates to that maximum.)
        min_n : int
            Minimum length of char ngrams to be used for training word representations.
        max_n : int
            Maximum length of char ngrams to be used for training word representations. Set `max_n` to be
            less than `min_n` to avoid char ngrams being used.
        word_ngrams : int {1,0}
            If 1, enrich word vectors with subword (ngram) information.
            If 0, this is equivalent to word2vec.
        bucket : int
            Character ngrams are hashed into a fixed number of buckets, in order to limit the
            memory usage of the model. This option specifies the number of buckets used by the model.

        Examples
        --------
        Initialize and train a `FastText` model

        >>> from gensim.models import FastText
        >>> sentences = [["cat", "say", "meow"], ["dog", "say", "woof"]]
        >>>
        >>> model = FastText(sentences, min_count=1)
        >>> say_vector = model['say']  # get vector for word
        >>> of_vector = model['of']  # get vector for out-of-vocab word

        """
        # fastText specific params
        self.bucket = bucket
        self.word_ngrams = word_ngrams
        self.min_n = min_n
        self.max_n = max_n
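        # With subword enrichment disabled there are no ngram vectors to hash,
        # so no hashing buckets are needed.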
        if self.word_ngrams <= 1 and self.max_n == 0:
            self.bucket = 0

        super(FastText, self).__init__(
            sentences=sentences, size=size, alpha=alpha, window=window, min_count=min_count,
            max_vocab_size=max_vocab_size, sample=sample, seed=seed, workers=workers, min_alpha=min_alpha,
            sg=sg, hs=hs, negative=negative, cbow_mean=cbow_mean, hashfxn=hashfxn, iter=iter, null_word=null_word,
            trim_rule=trim_rule, sorted_vocab=sorted_vocab, batch_words=batch_words)

    def initialize_word_vectors(self):
        """Initialize a :class:`FastTextKeyedVectors` instance to store all vocab/ngram vectors for the model."""
        self.wv = FastTextKeyedVectors()
        self.wv.min_n = self.min_n
        self.wv.max_n = self.max_n

    def build_vocab(self, sentences, keep_raw_vocab=False, trim_rule=None, progress_per=10000, update=False):
        """Build vocabulary from a sequence of sentences (can be a once-only generator stream).
        Each sentence must be a list of unicode strings.

        Parameters
        ----------
        sentences : iterable of iterables
            The `sentences` iterable can be simply a list of lists of tokens, but for larger corpora,
            consider an iterable that streams the sentences directly from disk/network.
            See :class:`~gensim.models.word2vec.BrownCorpus`, :class:`~gensim.models.word2vec.Text8Corpus`
            or :class:`~gensim.models.word2vec.LineSentence` in :mod:`~gensim.models.word2vec` module for such
            examples.
        keep_raw_vocab : bool
            If not true, delete the raw vocabulary after the scaling is done and free up RAM.
        trim_rule : function
            Vocabulary trimming rule, specifies whether certain words should remain in the vocabulary,
            be trimmed away, or handled using the default (discard if word count < min_count).
            Can be None (min_count will be used, look to :func:`~gensim.utils.keep_vocab_item`),
            or a callable that accepts parameters (word, count, min_count) and returns either
            :attr:`gensim.utils.RULE_DISCARD`, :attr:`gensim.utils.RULE_KEEP` or :attr:`gensim.utils.RULE_DEFAULT`.
            Note: The rule, if given, is only used to prune vocabulary during build_vocab() and is not stored as
            part of the model.
        progress_per : int
            Indicates how many words to process before showing/updating the progress.
        update : bool
            If true, the new words in `sentences` will be added to the model's vocab.

        Example
        -------
        Train a model and update vocab for online training

        >>> from gensim.models import FastText
        >>> sentences_1 = [["cat", "say", "meow"], ["dog", "say", "woof"]]
        >>> sentences_2 = [["dude", "say", "wazzup!"]]
        >>>
        >>> model = FastText(min_count=1)
        >>> model.build_vocab(sentences_1)
        >>> model.train(sentences_1, total_examples=model.corpus_count, epochs=model.iter)
        >>> model.build_vocab(sentences_2, update=True)
        >>> model.train(sentences_2, total_examples=model.corpus_count, epochs=model.iter)

        """
        if update:
            if not len(self.wv.vocab):
                raise RuntimeError(
                    "You cannot do an online vocabulary-update of a model which has no prior vocabulary. "
                    "First build the vocabulary of your model with a corpus "
                    "before doing an online update.")
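            # Remember the pre-update sizes so init_ngrams() knows how many new vocab
            # and ngram rows to append for the incremental (online) vocabulary.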
            self.old_vocab_len = len(self.wv.vocab)
            self.old_hash2index_len = len(self.wv.hash2index)

        super(FastText, self).build_vocab(
            sentences, keep_raw_vocab=keep_raw_vocab, trim_rule=trim_rule, progress_per=progress_per, update=update)
        self.init_ngrams(update=update)

    def init_ngrams(self, update=False):
        """Compute ngrams for all words present in the vocabulary and store vectors for only those ngrams.
        Vectors for other ngrams are initialized with a random uniform distribution in FastText.

        Parameters
        ----------
        update : bool
            If True, the new vocab words and their new ngram word vectors are initialized
            with a random uniform distribution and updated/added to the existing vocab word and ngram vectors.

        """
        if not update:
            self.wv.ngrams = {}
            self.wv.syn0_vocab = empty((len(self.wv.vocab), self.vector_size), dtype=REAL)
            self.syn0_vocab_lockf = ones((len(self.wv.vocab), self.vector_size), dtype=REAL)

            self.wv.syn0_ngrams = empty((self.bucket, self.vector_size), dtype=REAL)
            self.syn0_ngrams_lockf = ones((self.bucket, self.vector_size), dtype=REAL)

            all_ngrams = []
            for w, v in self.wv.vocab.items():
                self.wv.ngrams_word[w] = compute_ngrams(w, self.min_n, self.max_n)
                all_ngrams += self.wv.ngrams_word[w]

            all_ngrams = list(set(all_ngrams))
            self.num_ngram_vectors = len(all_ngrams)
            logger.info("Total number of ngrams is %d", len(all_ngrams))

            self.wv.hash2index = {}
            ngram_indices = []
            new_hash_count = 0
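            # Each ngram is mapped to one of `bucket` rows via ft_hash(ngram) % bucket;
            # ngrams whose hashes collide share the same vector row, as in the original
            # fastText implementation. hash2index remaps the used buckets to a compact range.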
            for i, ngram in enumerate(all_ngrams):
                ngram_hash = ft_hash(ngram) % self.bucket
                if ngram_hash in self.wv.hash2index:
                    self.wv.ngrams[ngram] = self.wv.hash2index[ngram_hash]
                else:
                    ngram_indices.append(ngram_hash % self.bucket)
                    self.wv.hash2index[ngram_hash] = new_hash_count
                    self.wv.ngrams[ngram] = self.wv.hash2index[ngram_hash]
                    new_hash_count = new_hash_count + 1

            self.wv.syn0_ngrams = self.wv.syn0_ngrams.take(ngram_indices, axis=0)
            self.syn0_ngrams_lockf = self.syn0_ngrams_lockf.take(ngram_indices, axis=0)
            self.reset_ngram_weights()
        else:
            new_ngrams = []
            for w, v in self.wv.vocab.items():
                self.wv.ngrams_word[w] = compute_ngrams(w, self.min_n, self.max_n)
                new_ngrams += [ng for ng in self.wv.ngrams_word[w] if ng not in self.wv.ngrams]

            new_ngrams = list(set(new_ngrams))
            logger.info("Number of new ngrams is %d", len(new_ngrams))
            new_hash_count = 0
            for i, ngram in enumerate(new_ngrams):
                ngram_hash = ft_hash(ngram) % self.bucket
                if ngram_hash not in self.wv.hash2index:
                    self.wv.hash2index[ngram_hash] = new_hash_count + self.old_hash2index_len
                    self.wv.ngrams[ngram] = self.wv.hash2index[ngram_hash]
                    new_hash_count = new_hash_count + 1
                else:
                    self.wv.ngrams[ngram] = self.wv.hash2index[ngram_hash]

            rand_obj = np.random
            rand_obj.seed(self.seed)
            new_vocab_rows = rand_obj.uniform(
                -1.0 / self.vector_size, 1.0 / self.vector_size,
                (len(self.wv.vocab) - self.old_vocab_len, self.vector_size)
            ).astype(REAL)
            new_vocab_lockf_rows = ones((len(self.wv.vocab) - self.old_vocab_len, self.vector_size), dtype=REAL)
            new_ngram_rows = rand_obj.uniform(
                -1.0 / self.vector_size, 1.0 / self.vector_size,
                (len(self.wv.hash2index) - self.old_hash2index_len, self.vector_size)
            ).astype(REAL)
            new_ngram_lockf_rows = ones(
                (len(self.wv.hash2index) - self.old_hash2index_len, self.vector_size),
                dtype=REAL)

            self.wv.syn0_vocab = vstack([self.wv.syn0_vocab, new_vocab_rows])
            self.syn0_vocab_lockf = vstack([self.syn0_vocab_lockf, new_vocab_lockf_rows])
            self.wv.syn0_ngrams = vstack([self.wv.syn0_ngrams, new_ngram_rows])
            self.syn0_ngrams_lockf = vstack([self.syn0_ngrams_lockf, new_ngram_lockf_rows])

    def reset_ngram_weights(self):
        """Reset all projection weights to an initial (untrained) state,
        but keep the existing vocabulary and its ngrams.

        """
        rand_obj = np.random
        rand_obj.seed(self.seed)
        for index in range(len(self.wv.vocab)):
            self.wv.syn0_vocab[index] = rand_obj.uniform(
                -1.0 / self.vector_size, 1.0 / self.vector_size, self.vector_size
            ).astype(REAL)
        for index in range(len(self.wv.hash2index)):
            self.wv.syn0_ngrams[index] = rand_obj.uniform(
                -1.0 / self.vector_size, 1.0 / self.vector_size, self.vector_size
            ).astype(REAL)

    def _do_train_job(self, sentences, alpha, inits):
        """Train a single batch of sentences. Return a 2-tuple `(effective word count after
        ignoring unknown words and sentence length trimming, total word count)`.

        Parameters
        ----------
        sentences : iterable of iterables
            The `sentences` iterable can be simply a list of lists of tokens, but for larger corpora,
            consider an iterable that streams the sentences directly from disk/network.
            See :class:`~gensim.models.word2vec.BrownCorpus`, :class:`~gensim.models.word2vec.Text8Corpus`
            or :class:`~gensim.models.word2vec.LineSentence` in :mod:`~gensim.models.word2vec` module for such
            examples.
        alpha : float
            The current learning rate.
        inits : (:class:`numpy.ndarray`, :class:`numpy.ndarray`)
            Each worker's private work memory.

        Returns
        -------
        (int, int)
            Tuple of (effective word count after ignoring unknown words and sentence length trimming,
            total word count).

        """
        work, neu1 = inits
        tally = 0
        if self.sg:
            tally += train_batch_sg(self, sentences, alpha, work, neu1)
        else:
            tally += train_batch_cbow(self, sentences, alpha, work, neu1)

        return tally, self._raw_word_count(sentences)

    def train(self, sentences, total_examples=None, total_words=None,
              epochs=None, start_alpha=None, end_alpha=None,
              word_count=0, queue_factor=2, report_delay=1.0):
        """Update the model's neural weights from a sequence of sentences (can be a once-only generator stream).
        For FastText, each sentence must be a list of unicode strings. (Subclasses may accept other examples.)

        To support linear learning-rate decay from (initial) alpha to min_alpha, and accurate
        progress-percentage logging, either total_examples (count of sentences) or total_words (count of
        raw words in sentences) **MUST** be provided (if the corpus is the same as was provided to
        :meth:`~gensim.models.fasttext.FastText.build_vocab()`, the count of examples in that corpus
        will be available in the model's :attr:`corpus_count` property).

        To avoid common mistakes around the model's ability to do multiple training passes itself, an
        explicit `epochs` argument **MUST** be provided. In the common and recommended case,
        where :meth:`~gensim.models.fasttext.FastText.train()` is only called once,
        the model's cached `iter` value should be supplied as the `epochs` value.

        Parameters
        ----------
        sentences : iterable of iterables
            The `sentences` iterable can be simply a list of lists of tokens, but for larger corpora,
            consider an iterable that streams the sentences directly from disk/network.
            See :class:`~gensim.models.word2vec.BrownCorpus`, :class:`~gensim.models.word2vec.Text8Corpus`
            or :class:`~gensim.models.word2vec.LineSentence` in :mod:`~gensim.models.word2vec` module for such
            examples.
        total_examples : int
            Count of sentences.
        total_words : int
            Count of raw words in sentences.
        epochs : int
            Number of iterations (epochs) over the corpus.
        start_alpha : float
            Initial learning rate.
        end_alpha : float
            Final learning rate. Drops linearly from `start_alpha`.
        word_count : int
            Count of words already trained. Set this to 0 for the usual
            case of training on all words in sentences.
        queue_factor : int
            Multiplier for size of queue (number of workers * queue_factor).
        report_delay : float
            Seconds to wait before reporting progress.

        Examples
        --------
        >>> from gensim.models import FastText
        >>> sentences = [["cat", "say", "meow"], ["dog", "say", "woof"]]
        >>>
        >>> model = FastText(min_count=1)
        >>> model.build_vocab(sentences)
        >>> model.train(sentences, total_examples=model.corpus_count, epochs=model.iter)

        """
        self.neg_labels = []
        if self.negative > 0:
            # precompute negative labels optimization for pure-python training
            self.neg_labels = zeros(self.negative + 1)
            self.neg_labels[0] = 1.

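        # Note: this deprecated implementation delegates to Word2Vec.train using the model's
        # own cached corpus_count, iter, alpha and min_alpha (the corresponding arguments
        # passed to this method are not forwarded), then recomputes the full word vectors
        # from the trained vocab and ngram components.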
        Word2Vec.train(
            self, sentences, total_examples=self.corpus_count, epochs=self.iter,
            start_alpha=self.alpha, end_alpha=self.min_alpha)
        self.get_vocab_word_vecs()

    def __getitem__(self, word):
        """Get the `word`'s representation in vector space, as a 1D numpy array.

        Parameters
        ----------
        word : str
            A single word whose vector needs to be returned.

        Returns
        -------
        :class:`numpy.ndarray`
            The word's representation in vector space, as a 1D numpy array.

        Raises
        ------
        KeyError
            For words with all ngrams absent, a KeyError is raised.

        Example
        -------
        >>> from gensim.models import FastText
        >>> from gensim.test.utils import datapath
        >>>
        >>> trained_model = FastText.load_fasttext_format(datapath('lee_fasttext'))
        >>> meow_vector = trained_model['hello']  # get vector for word

        """
        return self.word_vec(word)

    def get_vocab_word_vecs(self):
        """Calculate vectors for words in the vocabulary and store them in `wv.syn0`."""
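        # A word's full vector is the average of its own ("vocab") vector and all of its
        # character-ngram vectors, hence the division by len(ngrams) + 1 below.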
        for w, v in self.wv.vocab.items():
            word_vec = np.copy(self.wv.syn0_vocab[v.index])
            ngrams = self.wv.ngrams_word[w]
            ngram_weights = self.wv.syn0_ngrams
            for ngram in ngrams:
                word_vec += ngram_weights[self.wv.ngrams[ngram]]
            word_vec /= (len(ngrams) + 1)
            self.wv.syn0[v.index] = word_vec

    def word_vec(self, word, use_norm=False):
        """Get the word's representation in vector space, as a 1D numpy array.

        Parameters
        ----------
        word : str
            A single word whose vector needs to be returned.
        use_norm : bool
            If True, returns the normalized vector.

        Returns
        -------
        :class:`numpy.ndarray`
            The word's representation in vector space, as a 1D numpy array.

        Raises
        ------
        KeyError
            For words with all ngrams absent, a KeyError is raised.

        Example
        -------
        >>> from gensim.models import FastText
        >>> sentences = [["cat", "say", "meow"], ["dog", "say", "woof"]]
        >>>
        >>> model = FastText(sentences, min_count=1)
        >>> meow_vector = model.word_vec('meow')  # get vector for word

        """
        return FastTextKeyedVectors.word_vec(self.wv, word, use_norm=use_norm)

    @classmethod
    def load_fasttext_format(cls, *args, **kwargs):
        """Load a :class:`~gensim.models.fasttext.FastText` model from a format compatible with
        the original fasttext implementation.

        Parameters
        ----------
        fname : str
            Path to the file.

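        Example
        -------
        A short illustration, using the small `lee_fasttext` test data shipped with gensim
        (the same file used in the examples above):

        >>> from gensim.models import FastText
        >>> from gensim.test.utils import datapath
        >>>
        >>> model = FastText.load_fasttext_format(datapath('lee_fasttext'))
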
        """
        return Ft_Wrapper.load_fasttext_format(*args, **kwargs)

    def save(self, *args, **kwargs):
        """Save the model. This saved model can be loaded again using :func:`~gensim.models.fasttext.FastText.load`,
        which supports online training and getting vectors for out-of-vocabulary words.

        Parameters
        ----------
        fname : str
            Path to the file.

        """
        kwargs['ignore'] = kwargs.get('ignore', ['syn0norm', 'syn0_vocab_norm', 'syn0_ngrams_norm'])
        super(FastText, self).save(*args, **kwargs)