#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Author: Shiva Manne <manneshiva@gmail.com>
# Copyright (C) 2018 RaRe Technologies s.r.o.
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
"""This module contains base classes required for implementing \*2vec algorithms.
The class hierarchy is designed to facilitate adding more concrete implementations for creating embeddings.
In the most general case, the purpose of this class is to transform an arbitrary representation to a numerical vector
(embedding). This is represented by the base :class:`~gensim.models.base_any2vec.BaseAny2VecModel`. The input space in
most cases (in the NLP field at least) is plain text. For this reason, we enrich the class hierarchy with the abstract
:class:`~gensim.models.base_any2vec.BaseWordEmbeddingsModel` to be used as a base for models where the input
space is text.
Notes
-----
Even though this is the usual case, not all embeddings transform text, such as the
:class:`~gensim.models.poincare.PoincareModel` that embeds graphs.
See Also
--------
:class:`~gensim.models.word2vec.Word2Vec`.
Word2Vec model - embeddings for words.
:class:`~gensim.models.fasttext.FastText`.
FastText model - embeddings for words (ngram-based).
:class:`~gensim.models.doc2vec.Doc2Vec`.
Doc2Vec model - embeddings for documents.
:class:`~gensim.models.poincare.PoincareModel`
Poincare model - embeddings for graphs.
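Examples
--------
A minimal, purely illustrative sketch using the concrete :class:`~gensim.models.word2vec.Word2Vec` subclass
(the toy corpus below is made up for demonstration):
>>> from gensim.models import Word2Vec
>>> sentences = [["cat", "say", "meow"], ["dog", "say", "woof"]]
>>> model = Word2Vec(sentences, min_count=1)
>>> vector = model.wv["cat"]  # numpy vector of length `vector_size` for the word "cat"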
"""
from gensim import utils
import logging
from timeit import default_timer
import threading
from six.moves import xrange
from six import itervalues
from gensim import matutils
from numpy import float32 as REAL, ones, random, dtype, zeros
from types import GeneratorType
from gensim.utils import deprecated
import warnings
try:
from queue import Queue
except ImportError:
from Queue import Queue
logger = logging.getLogger(__name__)
class BaseAny2VecModel(utils.SaveLoad):
"""Base class for training, using and evaluating \*2vec model.
Contains implementation for multi-threaded training. The purpose of this class is to provide a
reference interface for concrete embedding implementations, whether the input space is a corpus
of words, documents or anything else. At the same time, functionality that we expect to be common
for those implementations is provided here to avoid code duplication.
In the special but usual case where the input space consists of words, a more specialized layer
is provided; consider inheriting from :class:`~gensim.models.base_any2vec.BaseWordEmbeddingsModel`.
Notes
-----
A subclass should initialize the following attributes:
* self.kv - keyed vectors in model (see :class:`~gensim.models.keyedvectors.Word2VecKeyedVectors` as example)
* self.vocabulary - vocabulary (see :class:`~gensim.models.word2vec.Word2VecVocab` as example)
* self.trainables - internal matrices (see :class:`~gensim.models.word2vec.Word2VecTrainables` as example)
"""
def __init__(self, workers=3, vector_size=100, epochs=5, callbacks=(), batch_words=10000):
"""
Parameters
----------
workers : int, optional
Number of working threads, used for multithreading.
vector_size : int, optional
Dimensionality of the feature vectors.
epochs : int, optional
Number of iterations (epochs) of training through the corpus.
callbacks : list of :class:`~gensim.models.callbacks.CallbackAny2Vec`, optional
List of callbacks that need to be executed/run at specific stages during training.
batch_words : int, optional
Number of words to be processed by a single job.
"""
self.vector_size = int(vector_size)
self.workers = int(workers)
self.epochs = epochs
self.train_count = 0
self.total_train_time = 0
self.batch_words = batch_words
self.model_trimmed_post_training = False
self.callbacks = callbacks
def _get_job_params(self, cur_epoch):
"""Get job parameters required for each batch."""
raise NotImplementedError()
def _set_train_params(self, **kwargs):
"""Set model parameters required for training."""
raise NotImplementedError()
def _update_job_params(self, job_params, epoch_progress, cur_epoch):
"""Get updated job parameters based on the epoch_progress and cur_epoch."""
raise NotImplementedError()
def _get_thread_working_mem(self):
"""Get private working memory per thread."""
raise NotImplementedError()
def _raw_word_count(self, job):
"""Get the number of words in a given job."""
raise NotImplementedError()
def _clear_post_train(self):
"""Resets certain properties of the model post training. eg. `keyedvectors.vectors_norm`."""
raise NotImplementedError()
def _do_train_job(self, data_iterable, job_parameters, thread_private_mem):
"""Train a single batch. Return 2-tuple `(effective word count, total word count)`."""
raise NotImplementedError()
def _check_training_sanity(self, epochs=None, total_examples=None, total_words=None, **kwargs):
"""Check that the training parameters provided make sense. e.g. raise error if `epochs` not provided."""
raise NotImplementedError()
def _worker_loop(self, job_queue, progress_queue):
"""Train the model, lifting batches of data from the queue.
This function will be called in parallel by multiple workers (threads or processes) to make
optimal use of multicore machines.
Parameters
----------
job_queue : Queue of (list of object, dict of (str, int))
A queue of jobs still to be processed. The worker will take up jobs from this queue.
Each job is represented by a tuple where the first element is the corpus chunk to be processed and
the second is the dictionary of parameters.
progress_queue : Queue of (int, int, int)
A queue of progress reports. Each report is represented as a tuple of these 3 elements:
* Size of data chunk processed, for example number of sentences in the corpus chunk.
* Effective word count used in training (after ignoring unknown words and trimming the sentence length).
* Total word count used in training.
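Examples
--------
Purely illustrative shapes of the items flowing through the two queues (the values are made up; concrete
subclasses may use a plain float learning rate instead of a dict for the job parameters):
>>> job = ([["first", "sentence"], ["second", "sentence"]], {"alpha": 0.025})
>>> report = (2, 4, 4)  # examples processed, effective word count, raw word count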
"""
thread_private_mem = self._get_thread_working_mem()
jobs_processed = 0
while True:
job = job_queue.get()
if job is None:
progress_queue.put(None)
break # no more jobs => quit this worker
data_iterable, job_parameters = job
for callback in self.callbacks:
callback.on_batch_begin(self)
tally, raw_tally = self._do_train_job(data_iterable, job_parameters, thread_private_mem)
for callback in self.callbacks:
callback.on_batch_end(self)
progress_queue.put((len(data_iterable), tally, raw_tally)) # report back progress
jobs_processed += 1
logger.debug("worker exiting, processed %i jobs", jobs_processed)
def _job_producer(self, data_iterator, job_queue, cur_epoch=0, total_examples=None, total_words=None):
"""Fill the jobs queue using the data found in the input stream.
Each job is represented by a tuple where the first element is the corpus chunk to be processed and
the second is a dictionary of parameters.
Parameters
----------
data_iterator : iterable of list of objects
The input dataset. This will be split in chunks and these chunks will be pushed to the queue.
job_queue : Queue of (list of object, dict of (str, int))
A queue of jobs still to be processed. The worker will take up jobs from this queue.
Each job is represented by a tuple where the first element is the corpus chunk to be processed and
the second is the dictionary of parameters.
cur_epoch : int, optional
The current training epoch, needed to compute the training parameters for each job.
For example in many implementations the learning rate would be dropping with the number of epochs.
total_examples : int, optional
Count of objects in the `data_iterator`. In the usual case this would correspond to the number of sentences
in a corpus. Used to log progress.
total_words : int, optional
Count of total objects in `data_iterator`. In the usual case this would correspond to the number of raw
words in a corpus. Used to log progress.
"""
job_batch, batch_size = [], 0
pushed_words, pushed_examples = 0, 0
next_job_params = self._get_job_params(cur_epoch)
job_no = 0
for data_idx, data in enumerate(data_iterator):
data_length = self._raw_word_count([data])
# can we fit this sentence into the existing job batch?
if batch_size + data_length <= self.batch_words:
# yes => add it to the current job
job_batch.append(data)
batch_size += data_length
else:
job_no += 1
job_queue.put((job_batch, next_job_params))
# update the learning rate for the next job
if total_examples:
# examples-based decay
pushed_examples += len(job_batch)
epoch_progress = 1.0 * pushed_examples / total_examples
else:
# words-based decay
pushed_words += self._raw_word_count(job_batch)
epoch_progress = 1.0 * pushed_words / total_words
next_job_params = self._update_job_params(next_job_params, epoch_progress, cur_epoch)
# add the sentence that didn't fit as the first item of a new job
job_batch, batch_size = [data], data_length
# add the last job too (may be significantly smaller than batch_words)
if job_batch:
job_no += 1
job_queue.put((job_batch, next_job_params))
if job_no == 0 and self.train_count == 0:
logger.warning(
"train() called with an empty iterator (if not intended, "
"be sure to provide a corpus that offers restartable iteration = an iterable)."
)
# give the workers heads up that they can finish -- no more work!
for _ in xrange(self.workers):
job_queue.put(None)
logger.debug("job loop exiting, total %i jobs", job_no)
def _log_progress(self, job_queue, progress_queue, cur_epoch, example_count, total_examples,
raw_word_count, total_words, trained_word_count, elapsed):
raise NotImplementedError()
def _log_epoch_end(self, cur_epoch, example_count, total_examples, raw_word_count, total_words,
trained_word_count, elapsed):
raise NotImplementedError()
def _log_train_end(self, raw_word_count, trained_word_count, total_elapsed, job_tally):
raise NotImplementedError()
def _log_epoch_progress(self, progress_queue, job_queue, cur_epoch=0, total_examples=None, total_words=None,
report_delay=1.0):
"""Get the progress report for a single training epoch.
Parameters
----------
progress_queue : Queue of (int, int, int)
A queue of progress reports. Each report is represented as a tuple of these 3 elements:
* size of data chunk processed, for example number of sentences in the corpus chunk.
* Effective word count used in training (after ignoring unknown words and trimming the sentence length).
* Total word count used in training.
job_queue : Queue of (list of object, dict of (str, int))
A queue of jobs still to be processed. The worker will take up jobs from this queue.
Each job is represented by a tuple where the first element is the corpus chunk to be processed and
the second is the dictionary of parameters.
cur_epoch : int, optional
The current training epoch, needed to compute the training parameters for each job.
For example in many implementations the learning rate would be dropping with the number of epochs.
total_examples : int, optional
Count of objects in the `data_iterator`. In the usual case this would correspond to the number of sentences
in a corpus. Used to log progress.
total_words : int, optional
Count of total objects in `data_iterator`. In the usual case this would correspond to the number of raw
words in a corpus. Used to log progress.
report_delay : float, optional
Number of seconds between two consecutive progress report messages in the logger.
Returns
-------
(int, int, int)
The epoch report consisting of three elements:
* size of data chunk processed, for example number of sentences in the corpus chunk.
* Effective word count used in training (after ignoring unknown words and trimming the sentence length).
* Total word count used in training.
"""
example_count, trained_word_count, raw_word_count = 0, 0, 0
start, next_report = default_timer() - 0.00001, 1.0
job_tally = 0
unfinished_worker_count = self.workers
while unfinished_worker_count > 0:
report = progress_queue.get() # blocks if workers too slow
if report is None: # a thread reporting that it finished
unfinished_worker_count -= 1
logger.info("worker thread finished; awaiting finish of %i more threads", unfinished_worker_count)
continue
examples, trained_words, raw_words = report
job_tally += 1
# update progress stats
example_count += examples
trained_word_count += trained_words # only words in vocab & sampled
raw_word_count += raw_words
# log progress once every report_delay seconds
elapsed = default_timer() - start
if elapsed >= next_report:
self._log_progress(
job_queue, progress_queue, cur_epoch, example_count, total_examples,
raw_word_count, total_words, trained_word_count, elapsed)
next_report = elapsed + report_delay
# all done; report the final stats
elapsed = default_timer() - start
self._log_epoch_end(
cur_epoch, example_count, total_examples, raw_word_count, total_words,
trained_word_count, elapsed)
self.total_train_time += elapsed
return trained_word_count, raw_word_count, job_tally
def _train_epoch(self, data_iterable, cur_epoch=0, total_examples=None,
total_words=None, queue_factor=2, report_delay=1.0):
"""Train the model for a single epoch.
Parameters
----------
data_iterable : iterable of list of object
The input corpus. This will be split in chunks and these chunks will be pushed to the queue.
cur_epoch : int, optional
The current training epoch, needed to compute the training parameters for each job.
For example in many implementations the learning rate would be dropping with the number of epochs.
total_examples : int, optional
Count of objects in the `data_iterator`. In the usual case this would correspond to the number of sentences
in a corpus, used to log progress.
total_words : int, optional
Count of total objects in `data_iterator`. In the usual case this would correspond to the number of raw
words in a corpus, used to log progress.
queue_factor : int, optional
Multiplier for size of queue -> size = number of workers * queue_factor.
report_delay : float, optional
Number of seconds between two consecutive progress report messages in the logger.
Returns
-------
(int, int, int)
The training report for this epoch consisting of three elements:
* Size of data chunk processed, for example number of sentences in the corpus chunk.
* Effective word count used in training (after ignoring unknown words and trimming the sentence length).
* Total word count used in training.
"""
job_queue = Queue(maxsize=queue_factor * self.workers)
progress_queue = Queue(maxsize=(queue_factor + 1) * self.workers)
workers = [
threading.Thread(
target=self._worker_loop,
args=(job_queue, progress_queue,))
for _ in xrange(self.workers)
]
workers.append(threading.Thread(
target=self._job_producer,
args=(data_iterable, job_queue),
kwargs={'cur_epoch': cur_epoch, 'total_examples': total_examples, 'total_words': total_words}))
for thread in workers:
thread.daemon = True # make interrupting the process with ctrl+c easier
thread.start()
trained_word_count, raw_word_count, job_tally = self._log_epoch_progress(
progress_queue, job_queue, cur_epoch=cur_epoch, total_examples=total_examples, total_words=total_words,
report_delay=report_delay)
return trained_word_count, raw_word_count, job_tally
def train(self, data_iterable, epochs=None, total_examples=None,
total_words=None, queue_factor=2, report_delay=1.0, callbacks=(), **kwargs):
"""Train the model for multiple epochs using multiple workers.
Parameters
----------
data_iterable : iterable of list of object
The input corpus. This will be split in chunks and these chunks will be pushed to the queue.
epochs : int, optional
Number of epochs (training iterations over the whole input) of training.
total_examples : int, optional
Count of objects in the `data_iterator`. In the usual case this would correspond to the number of sentences
in a corpus, used to log progress.
total_words : int, optional
Count of total objects in `data_iterator`. In the usual case this would correspond to the number of raw
words in a corpus, used to log progress.
queue_factor : int, optional
Multiplier for size of queue -> size = number of workers * queue_factor.
report_delay : float, optional
Number of seconds between two consecutive progress report messages in the logger.
callbacks : list of :class:`~gensim.models.callbacks.CallbackAny2Vec`, optional
List of callbacks to execute at specific stages during training.
**kwargs : object
Additional keyword parameters for the specific model inheriting from this class.
Returns
-------
(int, int)
The total training report consisting of two elements:
* size of total data processed, for example number of sentences in the whole corpus.
* Effective word count used in training (after ignoring unknown words and trimming the sentence length).
"""
self._set_train_params(**kwargs)
if callbacks:
self.callbacks = callbacks
self.epochs = epochs
self._check_training_sanity(
epochs=epochs,
total_examples=total_examples,
total_words=total_words, **kwargs)
for callback in self.callbacks:
callback.on_train_begin(self)
trained_word_count = 0
raw_word_count = 0
start = default_timer() - 0.00001
job_tally = 0
for cur_epoch in range(self.epochs):
for callback in self.callbacks:
callback.on_epoch_begin(self)
trained_word_count_epoch, raw_word_count_epoch, job_tally_epoch = self._train_epoch(
data_iterable, cur_epoch=cur_epoch, total_examples=total_examples, total_words=total_words,
queue_factor=queue_factor, report_delay=report_delay)
trained_word_count += trained_word_count_epoch
raw_word_count += raw_word_count_epoch
job_tally += job_tally_epoch
for callback in self.callbacks:
callback.on_epoch_end(self)
# Log overall time
total_elapsed = default_timer() - start
self._log_train_end(raw_word_count, trained_word_count, total_elapsed, job_tally)
self.train_count += 1 # number of times train() has been called
self._clear_post_train()
for callback in self.callbacks:
callback.on_train_end(self)
return trained_word_count, raw_word_count
@classmethod
def load(cls, fname_or_handle, **kwargs):
"""Load a previously saved object (using :meth:`gensim.models.base_any2vec.BaseAny2VecModel.save`) from a file.
Parameters
----------
fname_or_handle : {str, file-like object}
Path to file that contains needed object or handle to an open file.
**kwargs : object
Keyword arguments propagated to :meth:`~gensim.utils.SaveLoad.load`.
See Also
--------
:meth:`~gensim.models.base_any2vec.BaseAny2VecModel.save`
Method for saving a model.
Returns
-------
object
Object loaded from `fname_or_handle`.
Raises
------
IOError
When methods are called on an instance (should be called on a class, this is a class method).
"""
return super(BaseAny2VecModel, cls).load(fname_or_handle, **kwargs)
def save(self, fname_or_handle, **kwargs):
""""Save the object to file.
Parameters
----------
fname_or_handle : {str, file-like object}
Path to file where the model will be persisted.
**kwargs : object
Keyword arguments propagated to :meth:`~gensim.utils.SaveLoad.save`.
See Also
--------
:meth:`~gensim.models.base_any2vec.BaseAny2VecModel.load`
Method for loading a model saved by the current method.
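Examples
--------
A minimal round-trip sketch using the concrete :class:`~gensim.models.word2vec.Word2Vec` subclass
(the corpus and the file name below are made up for illustration):
>>> from gensim.models import Word2Vec
>>> model = Word2Vec([["cat", "say", "meow"], ["dog", "say", "woof"]], min_count=1)
>>> model.save("toy_model")
>>> loaded = Word2Vec.load("toy_model")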
"""
super(BaseAny2VecModel, self).save(fname_or_handle, **kwargs)
class BaseWordEmbeddingsModel(BaseAny2VecModel):
"""Base class containing common methods for training, using & evaluating word embeddings learning models.
See Also
--------
:class:`~gensim.models.word2vec.Word2Vec`.
Word2Vec model - embeddings for words.
:class:`~gensim.models.fasttext.FastText`.
FastText model - embeddings for words (ngram-based).
:class:`~gensim.models.doc2vec.Doc2Vec`.
Doc2Vec model - embeddings for documents.
:class:`~gensim.models.poincare.PoincareModel`
Poincare model - embeddings for graphs.
"""
def _clear_post_train(self):
raise NotImplementedError()
def _do_train_job(self, data_iterable, job_parameters, thread_private_mem):
raise NotImplementedError()
def _set_train_params(self, **kwargs):
raise NotImplementedError()
def __init__(self, sentences=None, workers=3, vector_size=100, epochs=5, callbacks=(), batch_words=10000,
trim_rule=None, sg=0, alpha=0.025, window=5, seed=1, hs=0, negative=5, ns_exponent=0.75, cbow_mean=1,
min_alpha=0.0001, compute_loss=False, fast_version=0, **kwargs):
"""
Parameters
----------
sentences : iterable of list of str, optional
Can be simply a list of lists of tokens, but for larger corpora,
consider an iterable that streams the sentences directly from disk/network.
See :class:`~gensim.models.word2vec.BrownCorpus`, :class:`~gensim.models.word2vec.Text8Corpus`
or :class:`~gensim.models.word2vec.LineSentence` for such examples.
workers : int, optional
Number of working threads, used for multithreading.
vector_size : int, optional
Dimensionality of the feature vectors.
epochs : int, optional
Number of iterations (epochs) of training through the corpus.
callbacks : list of :class:`~gensim.models.callbacks.CallbackAny2Vec`, optional
List of callbacks that need to be executed/run at specific stages during training.
batch_words : int, optional
Number of words to be processed by a single job.
trim_rule : function, optional
Vocabulary trimming rule, specifies whether certain words should remain in the vocabulary,
be trimmed away, or handled using the default (discard if word count < min_count).
Can be None (min_count will be used, look to :func:`~gensim.utils.keep_vocab_item`),
or a callable that accepts parameters (word, count, min_count) and returns either
:attr:`gensim.utils.RULE_DISCARD`, :attr:`gensim.utils.RULE_KEEP` or :attr:`gensim.utils.RULE_DEFAULT`.
The rule, if given, is only used to prune vocabulary during current method call and is not stored as part
of the model.
The input parameters are of the following types:
* `word` (str) - the word we are examining
* `count` (int) - the word's frequency count in the corpus
* `min_count` (int) - the minimum count threshold.
sg : {1, 0}, optional
Defines the training algorithm. If 1, skip-gram is used, otherwise, CBOW is employed.
alpha : float, optional
The beginning learning rate. This will linearly reduce with iterations until it reaches `min_alpha`.
window : int, optional
The maximum distance between the current and predicted word within a sentence.
seed : int, optional
Seed for the random number generator. Initial vectors for each word are seeded with a hash of
the concatenation of word + `str(seed)`.
Note that for a fully deterministically-reproducible run, you must also limit the model to a single worker
thread (`workers=1`), to eliminate ordering jitter from OS thread scheduling.
In Python 3, reproducibility between interpreter launches also requires use of the `PYTHONHASHSEED`
environment variable to control hash randomization.
hs : {1,0}, optional
If 1, hierarchical softmax will be used for model training.
If set to 0, and `negative` is non-zero, negative sampling will be used.
negative : int, optional
If > 0, negative sampling will be used, the int for negative specifies how many "noise words"
should be drawn (usually between 5-20).
If set to 0, no negative sampling is used.
cbow_mean : {1,0}, optional
If 0, use the sum of the context word vectors. If 1, use the mean. Only applies when CBOW is used.
min_alpha : float, optional
Final learning rate. Drops linearly with the number of iterations from `alpha`.
compute_loss : bool, optional
If True, loss will be computed while training the Word2Vec model and stored in
:attr:`~gensim.models.base_any2vec.BaseWordEmbeddingsModel.running_training_loss` attribute.
fast_version : {-1, 1}, optional
Whether or not the fast cython implementation of the internal training methods is available. 1 means it is.
**kwargs : object
Keyword arguments needed to allow child classes to accept more arguments.
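Examples
--------
Illustrative only -- this base class is not meant to be instantiated directly; a concrete subclass such as
:class:`~gensim.models.word2vec.Word2Vec` accepts the same hyper-parameters (the toy corpus below is made up):
>>> from gensim.models import Word2Vec
>>> sentences = [["human", "interface", "computer"], ["survey", "user", "computer", "system"]]
>>> model = Word2Vec(sentences, sg=1, window=3, negative=5, min_count=1)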
"""
self.sg = int(sg)
if vector_size % 4 != 0:
logger.warning("consider setting layer size to a multiple of 4 for greater performance")
self.alpha = float(alpha)
self.window = int(window)
self.random = random.RandomState(seed)
self.min_alpha = float(min_alpha)
self.hs = int(hs)
self.negative = int(negative)
self.ns_exponent = ns_exponent
self.cbow_mean = int(cbow_mean)
self.compute_loss = bool(compute_loss)
self.running_training_loss = 0
self.min_alpha_yet_reached = float(alpha)
self.corpus_count = 0
super(BaseWordEmbeddingsModel, self).__init__(
workers=workers, vector_size=vector_size, epochs=epochs, callbacks=callbacks, batch_words=batch_words)
if fast_version < 0:
warnings.warn(
"C extension not loaded, training will be slow. "
"Install a C compiler and reinstall gensim for fast training."
)
self.neg_labels = []
if self.negative > 0:
# precompute negative labels optimization for pure-python training
self.neg_labels = zeros(self.negative + 1)
self.neg_labels[0] = 1.
if sentences is not None:
if isinstance(sentences, GeneratorType):
raise TypeError("You can't pass a generator as the sentences argument. Try an iterator.")
self.build_vocab(sentences, trim_rule=trim_rule)
self.train(
sentences, total_examples=self.corpus_count, epochs=self.epochs, start_alpha=self.alpha,
end_alpha=self.min_alpha, compute_loss=compute_loss)
else:
if trim_rule is not None:
logger.warning(
"The rule, if given, is only used to prune vocabulary during build_vocab() "
"and is not stored as part of the model. Model initialized without sentences. "
"trim_rule provided, if any, will be ignored.")
# for backward compatibility (aliases pointing to corresponding variables in trainables, vocabulary)
@property
@deprecated("Attribute will be removed in 4.0.0, use self.epochs instead")
def iter(self):
return self.epochs
@iter.setter
@deprecated("Attribute will be removed in 4.0.0, use self.epochs instead")
def iter(self, value):
self.epochs = value
@property
@deprecated("Attribute will be removed in 4.0.0, use self.trainables.syn1 instead")
def syn1(self):
return self.trainables.syn1
@syn1.setter
@deprecated("Attribute will be removed in 4.0.0, use self.trainables.syn1 instead")
def syn1(self, value):
self.trainables.syn1 = value
@syn1.deleter
@deprecated("Attribute will be removed in 4.0.0, use self.trainables.syn1 instead")
def syn1(self):
del self.trainables.syn1
@property
@deprecated("Attribute will be removed in 4.0.0, use self.trainables.syn1neg instead")
def syn1neg(self):
return self.trainables.syn1neg
@syn1neg.setter
@deprecated("Attribute will be removed in 4.0.0, use self.trainables.syn1neg instead")
def syn1neg(self, value):
self.trainables.syn1neg = value
@syn1neg.deleter
@deprecated("Attribute will be removed in 4.0.0, use self.trainables.syn1neg instead")
def syn1neg(self):
del self.trainables.syn1neg
@property
@deprecated("Attribute will be removed in 4.0.0, use self.trainables.vectors_lockf instead")
def syn0_lockf(self):
return self.trainables.vectors_lockf
@syn0_lockf.setter
@deprecated("Attribute will be removed in 4.0.0, use self.trainables.vectors_lockf instead")
def syn0_lockf(self, value):
self.trainables.vectors_lockf = value
@syn0_lockf.deleter
@deprecated("Attribute will be removed in 4.0.0, use self.trainables.vectors_lockf instead")
def syn0_lockf(self):
del self.trainables.vectors_lockf
@property
@deprecated("Attribute will be removed in 4.0.0, use self.trainables.layer1_size instead")
def layer1_size(self):
return self.trainables.layer1_size
@layer1_size.setter
@deprecated("Attribute will be removed in 4.0.0, use self.trainables.layer1_size instead")
def layer1_size(self, value):
self.trainables.layer1_size = value
@property
@deprecated("Attribute will be removed in 4.0.0, use self.trainables.hashfxn instead")
def hashfxn(self):
return self.trainables.hashfxn
@hashfxn.setter
@deprecated("Attribute will be removed in 4.0.0, use self.trainables.hashfxn instead")
def hashfxn(self, value):
self.trainables.hashfxn = value
@property
@deprecated("Attribute will be removed in 4.0.0, use self.vocabulary.sample instead")
def sample(self):
return self.vocabulary.sample
@sample.setter
@deprecated("Attribute will be removed in 4.0.0, use self.vocabulary.sample instead")
def sample(self, value):
self.vocabulary.sample = value
@property
@deprecated("Attribute will be removed in 4.0.0, use self.vocabulary.min_count instead")
def min_count(self):
return self.vocabulary.min_count
@min_count.setter
@deprecated("Attribute will be removed in 4.0.0, use self.vocabulary.min_count instead")
def min_count(self, value):
self.vocabulary.min_count = value
@property
@deprecated("Attribute will be removed in 4.0.0, use self.vocabulary.cum_table instead")
def cum_table(self):
return self.vocabulary.cum_table
@cum_table.setter
@deprecated("Attribute will be removed in 4.0.0, use self.vocabulary.cum_table instead")
def cum_table(self, value):
self.vocabulary.cum_table = value
@cum_table.deleter
@deprecated("Attribute will be removed in 4.0.0, use self.vocabulary.cum_table instead")
def cum_table(self):
del self.vocabulary.cum_table
def __str__(self):
"""Get a human readable representation of the object.
Returns
-------
str
A human readable string containing the class name, as well as the size of dictionary, number of
features and starting learning rate used by the object.
"""
return "%s(vocab=%s, size=%s, alpha=%s)" % (
self.__class__.__name__, len(self.wv.index2word), self.vector_size, self.alpha
)
def build_vocab(self, sentences, update=False, progress_per=10000, keep_raw_vocab=False, trim_rule=None, **kwargs):
"""Build vocabulary from a sequence of sentences (can be a once-only generator stream).
Parameters
----------
sentences : iterable of list of str
Can be simply a list of lists of tokens, but for larger corpora,
consider an iterable that streams the sentences directly from disk/network.
See :class:`~gensim.models.word2vec.BrownCorpus`, :class:`~gensim.models.word2vec.Text8Corpus`
or :class:`~gensim.models.word2vec.LineSentence` module for such examples.
update : bool, optional
If true, the new words in `sentences` will be added to model's vocab.
progress_per : int, optional
Indicates how many words to process before showing/updating the progress.
keep_raw_vocab : bool, optional
If False, the raw vocabulary will be deleted after the scaling is done to free up RAM.
trim_rule : function, optional
Vocabulary trimming rule, specifies whether certain words should remain in the vocabulary,
be trimmed away, or handled using the default (discard if word count < min_count).
Can be None (min_count will be used, look to :func:`~gensim.utils.keep_vocab_item`),
or a callable that accepts parameters (word, count, min_count) and returns either
:attr:`gensim.utils.RULE_DISCARD`, :attr:`gensim.utils.RULE_KEEP` or :attr:`gensim.utils.RULE_DEFAULT`.
The rule, if given, is only used to prune vocabulary during current method call and is not stored as part
of the model.
The input parameters are of the following types:
* `word` (str) - the word we are examining
* `count` (int) - the word's frequency count in the corpus
* `min_count` (int) - the minimum count threshold.
**kwargs : object
Keyword arguments propagated to `self.vocabulary.prepare_vocab`.
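Examples
--------
For illustration, the usual call sequence with the concrete :class:`~gensim.models.word2vec.Word2Vec`
subclass and a made-up toy corpus:
>>> from gensim.models import Word2Vec
>>> sentences = [["cat", "say", "meow"], ["dog", "say", "woof"]]
>>> model = Word2Vec(min_count=1)  # no corpus passed yet, so nothing is trained at this point
>>> model.build_vocab(sentences)
>>> trained_words, raw_words = model.train(sentences, total_examples=model.corpus_count, epochs=model.epochs)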
"""
total_words, corpus_count = self.vocabulary.scan_vocab(
sentences, progress_per=progress_per, trim_rule=trim_rule)
self.corpus_count = corpus_count
report_values = self.vocabulary.prepare_vocab(
self.hs, self.negative, self.wv, update=update, keep_raw_vocab=keep_raw_vocab,
trim_rule=trim_rule, **kwargs)
report_values['memory'] = self.estimate_memory(vocab_size=report_values['num_retained_words'])
self.trainables.prepare_weights(self.hs, self.negative, self.wv, update=update, vocabulary=self.vocabulary)
def build_vocab_from_freq(self, word_freq, keep_raw_vocab=False, corpus_count=None, trim_rule=None, update=False):
"""Build vocabulary from a dictionary of word frequencies.
Parameters
----------
word_freq : dict of (str, int)
A mapping from a word in the vocabulary to its frequency count.
keep_raw_vocab : bool, optional
If False, delete the raw vocabulary after the scaling is done to free up RAM.
corpus_count : int, optional
Even if no corpus is provided, this argument can set corpus_count explicitly.
trim_rule : function, optional
Vocabulary trimming rule, specifies whether certain words should remain in the vocabulary,
be trimmed away, or handled using the default (discard if word count < min_count).
Can be None (min_count will be used, look to :func:`~gensim.utils.keep_vocab_item`),
or a callable that accepts parameters (word, count, min_count) and returns either
:attr:`gensim.utils.RULE_DISCARD`, :attr:`gensim.utils.RULE_KEEP` or :attr:`gensim.utils.RULE_DEFAULT`.
The rule, if given, is only used to prune vocabulary during current method call and is not stored as part
of the model.
The input parameters are of the following types:
* `word` (str) - the word we are examining
* `count` (int) - the word's frequency count in the corpus
* `min_count` (int) - the minimum count threshold.
update : bool, optional
If true, the new provided words in `word_freq` dict will be added to model's vocab.
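Examples
--------
For illustration only (the word counts below are made up):
>>> from gensim.models import Word2Vec
>>> model = Word2Vec(min_count=1)
>>> model.build_vocab_from_freq({"Word1": 15, "Word2": 20})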
"""
logger.info("Processing provided word frequencies")
# Instead of scanning text, this assigns the provided word frequency dictionary (word_freq)
# directly as the raw vocab.
raw_vocab = word_freq
logger.info(
"collected %i different raw word, with total frequency of %i",
len(raw_vocab), sum(itervalues(raw_vocab))
)
# Since no sentences are provided, this is to control the corpus_count.
self.corpus_count = corpus_count or 0
self.vocabulary.raw_vocab = raw_vocab
# trim by min_count & precalculate downsampling
report_values = self.vocabulary.prepare_vocab(
self.hs, self.negative, self.wv, keep_raw_vocab=keep_raw_vocab,
trim_rule=trim_rule, update=update)
report_values['memory'] = self.estimate_memory(vocab_size=report_values['num_retained_words'])
self.trainables.prepare_weights(
self.hs, self.negative, self.wv, update=update, vocabulary=self.vocabulary) # build tables & arrays
def estimate_memory(self, vocab_size=None, report=None):
"""Estimate required memory for a model using current settings and provided vocabulary size.
Parameters
----------
vocab_size : int, optional
Number of unique tokens in the vocabulary
report : dict of (str, int), optional
A dictionary from string representations of the model's memory consuming members to their size in bytes.
Returns
-------
dict of (str, int)
A dictionary from string representations of the model's memory consuming members to their size in bytes.
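Examples
--------
For illustration with a made-up toy corpus (the exact byte counts depend on the vocabulary size,
`vector_size` and the chosen training mode):
>>> from gensim.models import Word2Vec
>>> model = Word2Vec([["cat", "say", "meow"], ["dog", "say", "woof"]], min_count=1)
>>> report = model.estimate_memory()
>>> sorted(report)  # with the default configuration (hs=0, negative=5)
['syn1neg', 'total', 'vectors', 'vocab']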
"""
vocab_size = vocab_size or len(self.wv.vocab)
report = report or {}
report['vocab'] = vocab_size * (700 if self.hs else 500)
report['vectors'] = vocab_size * self.vector_size * dtype(REAL).itemsize
if self.hs:
report['syn1'] = vocab_size * self.trainables.layer1_size * dtype(REAL).itemsize
if self.negative:
report['syn1neg'] = vocab_size * self.trainables.layer1_size * dtype(REAL).itemsize
report['total'] = sum(report.values())
logger.info(
"estimated required memory for %i words and %i dimensions: %i bytes",
vocab_size, self.vector_size, report['total']
)
return report
def train(self, sentences, total_examples=None, total_words=None,
epochs=None, start_alpha=None, end_alpha=None, word_count=0,
queue_factor=2, report_delay=1.0, compute_loss=False, callbacks=()):
"""Train the model. If the hyper-parameters are passed, they override the ones set in the constructor.
Parameters
----------
sentences : iterable of list of str
Can be simply a list of lists of tokens, but for larger corpora,
consider an iterable that streams the sentences directly from disk/network.
See :class:`~gensim.models.word2vec.BrownCorpus`, :class:`~gensim.models.word2vec.Text8Corpus`
or :class:`~gensim.models.word2vec.LineSentence` module for such examples.
total_examples : int, optional
Count of sentences.
total_words : int, optional
Count of raw words in sentences.
epochs : int, optional
Number of iterations (epochs) over the corpus.
start_alpha : float, optional
Initial learning rate.
end_alpha : float, optional
Final learning rate. Drops linearly with the number of iterations from `start_alpha`.
word_count : int, optional
Count of words already trained. Leave this to 0 for the usual case of training on all words in sentences.
queue_factor : int, optional
Multiplier for size of queue -> size = number of workers * queue_factor.
report_delay : float, optional
Seconds to wait before reporting progress.
compute_loss : bool, optional
If True, loss will be computed while training the Word2Vec model and stored in
:attr:`~gensim.models.base_any2vec.BaseWordEmbeddingsModel.running_training_loss`.
callbacks : list of :class:`~gensim.models.callbacks.CallbackAny2Vec`, optional
List of callbacks that need to be executed/run at specific stages during training.
Returns
-------
(int, int)
Tuple of (effective word count after ignoring unknown words and sentence length trimming, total word count).
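Examples
--------
For illustration, training an already-initialized :class:`~gensim.models.word2vec.Word2Vec` model on a
made-up toy corpus; `start_alpha` and `end_alpha` may additionally be passed to override the learning-rate
schedule set in the constructor:
>>> from gensim.models import Word2Vec
>>> sentences = [["cat", "say", "meow"], ["dog", "say", "woof"]]
>>> model = Word2Vec(min_count=1)
>>> model.build_vocab(sentences)
>>> trained_words, raw_words = model.train(sentences, total_examples=model.corpus_count, epochs=5)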
"""
self.alpha = start_alpha or self.alpha
self.min_alpha = end_alpha or self.min_alpha
self.compute_loss = compute_loss
self.running_training_loss = 0.0
return super(BaseWordEmbeddingsModel, self).train(
sentences, total_examples=total_examples, total_words=total_words,
epochs=epochs, start_alpha=start_alpha, end_alpha=end_alpha, word_count=word_count,
queue_factor=queue_factor, report_delay=report_delay, compute_loss=compute_loss, callbacks=callbacks)
def _get_job_params(self, cur_epoch):
"""Get the learning rate used in the current epoch.
Parameters
----------
cur_epoch : int
Current iteration through the corpus
Returns
-------
float
The learning rate for this epoch (it is linearly reduced with epochs from `self.alpha` to `self.min_alpha`).
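Notes
-----
For illustration, with the class defaults `alpha=0.025`, `min_alpha=0.0001` and `epochs=5`, epoch `cur_epoch=2`
starts at ``0.025 - (0.025 - 0.0001) * 2 / 5 = 0.01504``.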
"""
alpha = self.alpha - ((self.alpha - self.min_alpha) * float(cur_epoch) / self.epochs)
return alpha
def _update_job_params(self, job_params, epoch_progress, cur_epoch):
"""Get the correct learning rate for the next iteration.
Parameters
----------
job_params : dict of (str, obj)
UNUSED.
epoch_progress : float
Ratio of finished work in the current epoch.
cur_epoch : int
Number of current iteration.
Returns
-------
float
The learning rate to be used in the next training epoch.
"""
start_alpha = self.alpha
end_alpha = self.min_alpha
progress = (cur_epoch + epoch_progress) / self.epochs
next_alpha = start_alpha - (start_alpha - end_alpha) * progress
next_alpha = max(end_alpha, next_alpha)
self.min_alpha_yet_reached = next_alpha
return next_alpha
def _get_thread_working_mem(self):
"""Computes the memory used per worker thread.
Returns
-------
(np.ndarray, np.ndarray)
Each worker thread's private working memory.
"""
work = matutils.zeros_aligned(self.trainables.layer1_size, dtype=REAL) # per-thread private work memory
neu1 = matutils.zeros_aligned(self.trainables.layer1_size, dtype=REAL)
return work, neu1
def _raw_word_count(self, job):
"""Get the number of words in a given job.
Parameters
----------
job: iterable of list of str
The corpus chunk processed in a single batch.
Returns
-------
int
Number of raw words in the corpus chunk.
"""
return sum(len(sentence) for sentence in job)
def _check_training_sanity(self, epochs=None, total_examples=None, total_words=None, **kwargs):
"""Checks whether the training parameters make sense.
Called right before training starts in :meth:`~gensim.models.base_any2vec.BaseWordEmbeddingsModel.train`
and raises warning or errors depending on the severity of the issue in case an inconsistent parameter
combination is detected.
Parameters
----------
epochs : int, optional
Number of training epochs. Must have a (non None) value.
total_examples : int, optional
Number of documents in the corpus. Either `total_examples` or `total_words` **must** be supplied.
total_words : int, optional
Number of words in the corpus. Either `total_examples` or `total_words` **must** be supplied.
**kwargs : object
Unused. Present to preserve signature among base and inherited implementations.
Raises
------
RuntimeError
If one of the required pre- or post-training processing steps has not been performed.
ValueError
If the combination of input parameters is inconsistent.
"""
if self.alpha > self.min_alpha_yet_reached:
logger.warning("Effective 'alpha' higher than previous training cycles")
if self.model_trimmed_post_training:
raise RuntimeError("Parameters for training were discarded using model_trimmed_post_training method")
if not self.wv.vocab: # should be set by `build_vocab`
raise RuntimeError("you must first build vocabulary before training the model")
if not len(self.wv.vectors):
raise RuntimeError("you must initialize vectors before training the model")
if not hasattr(self, 'corpus_count'):
raise ValueError(
"The number of examples in the training corpus is missing. "
"Please make sure this is set inside `build_vocab` function."
"Call the `build_vocab` function before calling `train`."
)
if total_words is None and total_examples is None:
raise ValueError(
"You must specify either total_examples or total_words, for proper job parameters updation"
"and progress calculations. "
"The usual value is total_examples=model.corpus_count."
)
if epochs is None:
raise ValueError("You must specify an explict epochs count. The usual value is epochs=model.epochs.")
logger.info(
"training model with %i workers on %i vocabulary and %i features, "
"using sg=%s hs=%s sample=%s negative=%s window=%s",
self.workers, len(self.wv.vocab), self.trainables.layer1_size, self.sg,
self.hs, self.vocabulary.sample, self.negative, self.window
)
@classmethod
def load(cls, *args, **kwargs):
"""Load a previously saved object (using :meth:`~gensim.models.base_any2vec.BaseWordEmbeddingsModel.save`) from file.
Also initializes extra instance attributes in case the loaded model does not include them.
`*args` or `**kwargs` **MUST** include the fname argument (path to saved file).
See :meth:`~gensim.utils.SaveLoad.load`.
Parameters
----------
*args : object
Positional arguments passed to :meth:`~gensim.utils.SaveLoad.load`.
**kwargs : object
Keyword arguments passed to :meth:`~gensim.utils.SaveLoad.load`.
See Also
--------
:meth:`~gensim.models.base_any2vec.BaseWordEmbeddingsModel.save`
Method for saving a model.
Returns
-------
:class:`~gensim.models.base_any2vec.BaseWordEmbeddingsModel`
Model loaded from disk.
Raises
------
IOError
When this method is called on an instance (it should be called on the class, as it is a class method).
"""
model = super(BaseWordEmbeddingsModel, cls).load(*args, **kwargs)
if not hasattr(model, 'ns_exponent'):
model.ns_exponent = 0.75
if not hasattr(model.vocabulary, 'ns_exponent'):
model.vocabulary.ns_exponent = 0.75
if model.negative and hasattr(model.wv, 'index2word'):
model.vocabulary.make_cum_table(model.wv) # rebuild cum_table from vocabulary
if not hasattr(model, 'corpus_count'):
model.corpus_count = None
if not hasattr(model.trainables, 'vectors_lockf') and hasattr(model.wv, 'vectors'):
model.trainables.vectors_lockf = ones(len(model.wv.vectors), dtype=REAL)
if not hasattr(model, 'random'):
model.random = random.RandomState(model.trainables.seed)
if not hasattr(model, 'train_count'):
model.train_count = 0
model.total_train_time = 0
return model
def _log_progress(self, job_queue, progress_queue, cur_epoch, example_count, total_examples,
raw_word_count, total_words, trained_word_count, elapsed):
"""Callback used to log progress for long running jobs.
Parameters
----------
job_queue : Queue of (list of object, dict of (str, float))
The queue of jobs still to be performed by workers. Each job is represented as a tuple containing
the batch of data to be processed and the parameters to be used for the processing as a dict.
progress_queue : Queue of (int, int, int)
A queue of progress reports. Each report is represented as a tuple of these 3 elements:
* size of data chunk processed, for example number of sentences in the corpus chunk.
* Effective word count used in training (after ignoring unknown words and trimming the sentence length).
* Total word count used in training.
cur_epoch : int
The current training iteration through the corpus.
example_count : int
Number of examples (could be sentences for example) processed until now.
total_examples : int
Number of all examples present in the input corpus.
raw_word_count : int
Number of words used in training until now.
total_words : int
Number of all words in the input corpus.
trained_word_count : int
Number of effective words used in training until now (after ignoring unknown words and trimming
the sentence length).
elapsed : int
Elapsed time since the beginning of training in seconds.
"""
if total_examples:
# examples-based progress %
logger.info(
"EPOCH %i - PROGRESS: at %.2f%% examples, %.0f words/s, in_qsize %i, out_qsize %i",
cur_epoch + 1, 100.0 * example_count / total_examples, trained_word_count / elapsed,
utils.qsize(job_queue), utils.qsize(progress_queue)
)
else:
# words-based progress %
logger.info(
"EPOCH %i - PROGRESS: at %.2f%% words, %.0f words/s, in_qsize %i, out_qsize %i",
cur_epoch + 1, 100.0 * raw_word_count / total_words, trained_word_count / elapsed,
utils.qsize(job_queue), utils.qsize(progress_queue)
)
def _log_epoch_end(self, cur_epoch, example_count, total_examples, raw_word_count, total_words,
trained_word_count, elapsed):
"""Callback used to log the end of a training epoch.
Parameters
----------
cur_epoch : int
The current training iteration through the corpus.
example_count : int
Number of examples (could be sentences for example) processed until now.
total_examples : int
Number of all examples present in the input corpus.
raw_word_count : int
Number of words used in training until now.
total_words : int
Number of all words in the input corpus.
trained_word_count : int
Number of effective words used in training until now (after ignoring unknown words and trimming
the sentence length).
elapsed : int
Elapsed time since the beginning of training in seconds.
Warnings
--------
A warning is logged if the observed example or word counts do not match the expected totals, which indicates the corpus changed while the epoch was running.
"""
logger.info(
"EPOCH - %i : training on %i raw words (%i effective words) took %.1fs, %.0f effective words/s",
cur_epoch + 1, raw_word_count, trained_word_count, elapsed, trained_word_count / elapsed
)
# check that the input corpus hasn't changed during iteration
if total_examples and total_examples != example_count:
logger.warning(
"EPOCH - %i : supplied example count (%i) did not equal expected count (%i)", cur_epoch + 1,
example_count, total_examples
)
if total_words and total_words != raw_word_count:
logger.warning(
"EPOCH - %i : supplied raw word count (%i) did not equal expected count (%i)", cur_epoch + 1,
raw_word_count, total_words
)
def _log_train_end(self, raw_word_count, trained_word_count, total_elapsed, job_tally):
"""Callback to log the end of training.
Parameters
----------
raw_word_count : int
Number of words used in the whole training.
trained_word_count : int
Number of effective words used in training (after ignoring unknown words and trimming the sentence length).
total_elapsed : int
Total time spent during training in seconds.
job_tally : int
Total number of jobs processed during training.
"""
logger.info(
"training on a %i raw words (%i effective words) took %.1fs, %.0f effective words/s",
raw_word_count, trained_word_count, total_elapsed, trained_word_count / total_elapsed
)
if job_tally < 10 * self.workers:
logger.warning(
"under 10 jobs per worker: consider setting a smaller `batch_words' for smoother alpha decay"
)
# for backward compatibility
@deprecated("Method will be removed in 4.0.0, use self.wv.most_similar() instead")
def most_similar(self, positive=None, negative=None, topn=10, restrict_vocab=None, indexer=None):
"""Deprecated, use self.wv.most_similar() instead.
Refer to the documentation for :meth:`~gensim.models.keyedvectors.WordEmbeddingsKeyedVectors.most_similar`.
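For example, a call such as ``model.most_similar(positive=['cat'])`` should be replaced with
``model.wv.most_similar(positive=['cat'])`` (the token 'cat' is just a placeholder for a word present in the
trained vocabulary).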
"""
return self.wv.most_similar(positive, negative, topn, restrict_vocab, indexer)
@deprecated("Method will be removed in 4.0.0, use self.wv.wmdistance() instead")
def wmdistance(self, document1, document2):
"""Deprecated, use self.wv.wmdistance() instead.
Refer to the documentation for :meth:`~gensim.models.keyedvectors.WordEmbeddingsKeyedVectors.wmdistance`.
"""
return self.wv.wmdistance(document1, document2)
@deprecated("Method will be removed in 4.0.0, use self.wv.most_similar_cosmul() instead")
def most_similar_cosmul(self, positive=None, negative=None, topn=10):
"""Deprecated, use self.wv.most_similar_cosmul() instead.
Refer to the documentation for
:meth:`~gensim.models.keyedvectors.WordEmbeddingsKeyedVectors.most_similar_cosmul`.
"""
return self.wv.most_similar_cosmul(positive, negative, topn)
@deprecated("Method will be removed in 4.0.0, use self.wv.similar_by_word() instead")
def similar_by_word(self, word, topn=10, restrict_vocab=None):
"""Deprecated, use self.wv.similar_by_word() instead.
Refer to the documentation for :meth:`~gensim.models.keyedvectors.WordEmbeddingsKeyedVectors.similar_by_word`.
"""
return self.wv.similar_by_word(word, topn, restrict_vocab)
@deprecated("Method will be removed in 4.0.0, use self.wv.similar_by_vector() instead")
def similar_by_vector(self, vector, topn=10, restrict_vocab=None):
"""Deprecated, use self.wv.similar_by_vector() instead.
Refer to the documentation for :meth:`~gensim.models.keyedvectors.WordEmbeddingsKeyedVectors.similar_by_vector`.
"""
return self.wv.similar_by_vector(vector, topn, restrict_vocab)
@deprecated("Method will be removed in 4.0.0, use self.wv.doesnt_match() instead")
def doesnt_match(self, words):
"""Deprecated, use self.wv.doesnt_match() instead.
Refer to the documentation for :meth:`~gensim.models.keyedvectors.WordEmbeddingsKeyedVectors.doesnt_match`.
"""
return self.wv.doesnt_match(words)
@deprecated("Method will be removed in 4.0.0, use self.wv.similarity() instead")
def similarity(self, w1, w2):
"""Deprecated, use self.wv.similarity() instead.
Refer to the documentation for :meth:`~gensim.models.keyedvectors.WordEmbeddingsKeyedVectors.similarity`.
"""
return self.wv.similarity(w1, w2)
@deprecated("Method will be removed in 4.0.0, use self.wv.n_similarity() instead")
def n_similarity(self, ws1, ws2):
"""Deprecated, use self.wv.n_similarity() instead.
Refer to the documentation for :meth:`~gensim.models.keyedvectors.WordEmbeddingsKeyedVectors.n_similarity`.
"""
return self.wv.n_similarity(ws1, ws2)
@deprecated("Method will be removed in 4.0.0, use self.wv.evaluate_word_pairs() instead")
def evaluate_word_pairs(self, pairs, delimiter='\t', restrict_vocab=300000,
case_insensitive=True, dummy4unknown=False):
"""Deprecated, use self.wv.evaluate_word_pairs() instead.
Refer to the documentation for
:meth:`~gensim.models.keyedvectors.WordEmbeddingsKeyedVectors.evaluate_word_pairs`.
"""
return self.wv.evaluate_word_pairs(pairs, delimiter, restrict_vocab, case_insensitive, dummy4unknown)