#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Author: Shiva Manne <manneshiva@gmail.com>
# Copyright (C) 2018 RaRe Technologies s.r.o.
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html

"""This module contains base classes required for implementing \*2vec algorithms.

The class hierarchy is designed to facilitate adding more concrete implementations for creating embeddings.
In the most general case, the purpose of this class is to transform an arbitrary representation to a numerical vector
(embedding). This is represented by the base :class:`~gensim.models.base_any2vec.BaseAny2VecModel`. The input space in
most cases (in the NLP field at least) is plain text. For this reason, we enrich the class hierarchy with the abstract
:class:`~gensim.models.base_any2vec.BaseWordEmbeddingsModel` to be used as a base for models where the input
space is text.

Notes
-----
Even though this is the usual case, not all embeddings transform text, such as the
:class:`~gensim.models.poincare.PoincareModel` that embeds graphs.

See Also
--------
:class:`~gensim.models.word2vec.Word2Vec`.
    Word2Vec model - embeddings for words.
:class:`~gensim.models.fasttext.FastText`.
    FastText model - embeddings for words (ngram-based).
:class:`~gensim.models.doc2vec.Doc2Vec`.
    Doc2Vec model - embeddings for documents.
:class:`~gensim.models.poincare.PoincareModel`
    Poincare model - embeddings for graphs.
"""
|
||
|
|
||
|
from gensim import utils
|
||
|
import logging
|
||
|
from timeit import default_timer
|
||
|
import threading
|
||
|
from six.moves import xrange
|
||
|
from six import itervalues
|
||
|
from gensim import matutils
|
||
|
from numpy import float32 as REAL, ones, random, dtype, zeros
|
||
|
from types import GeneratorType
|
||
|
from gensim.utils import deprecated
|
||
|
import warnings
|
||
|
|
||
|
try:
|
||
|
from queue import Queue
|
||
|
except ImportError:
|
||
|
from Queue import Queue
|
||
|
|
||
|
logger = logging.getLogger(__name__)
|
||
|
|
||
|
|
||
|
class BaseAny2VecModel(utils.SaveLoad):
|
||
|
"""Base class for training, using and evaluating \*2vec model.
|
||
|
|
||
|
Contains implementation for multi-threaded training. The purpose of this class is to provide a
|
||
|
reference interface for concrete embedding implementations, whether the input space is a corpus
|
||
|
of words, documents or anything else. At the same time, functionality that we expect to be common
|
||
|
for those implementations is provided here to avoid code duplication.
|
||
|
|
||
|
In the special but usual case where the input space consists of words, a more specialized layer
|
||
|
is provided, consider inheriting from :class:`~gensim.models.base_any2vec.BaseWordEmbeddingsModel`
|
||
|
|
||
|
Notes
|
||
|
-----
|
||
|
A subclass should initialize the following attributes:
|
||
|
|
||
|
* self.kv - keyed vectors in model (see :class:`~gensim.models.keyedvectors.Word2VecKeyedVectors` as example)
|
||
|
* self.vocabulary - vocabulary (see :class:`~gensim.models.word2vec.Word2VecVocab` as example)
|
||
|
* self.trainables - internal matrices (see :class:`~gensim.models.word2vec.Word2VecTrainables` as example)
|
||
|
|
||
|
"""
|
||
|
    def __init__(self, workers=3, vector_size=100, epochs=5, callbacks=(), batch_words=10000):
        """

        Parameters
        ----------
        workers : int, optional
            Number of working threads, used for multithreading.
        vector_size : int, optional
            Dimensionality of the feature vectors.
        epochs : int, optional
            Number of iterations (epochs) of training through the corpus.
        callbacks : list of :class:`~gensim.models.callbacks.CallbackAny2Vec`, optional
            List of callbacks that need to be executed/run at specific stages during training.
        batch_words : int, optional
            Number of words to be processed by a single job.

        """
        self.vector_size = int(vector_size)
        self.workers = int(workers)
        self.epochs = epochs
        self.train_count = 0
        self.total_train_time = 0
        self.batch_words = batch_words
        self.model_trimmed_post_training = False
        self.callbacks = callbacks

    def _get_job_params(self, cur_epoch):
        """Get job parameters required for each batch."""
        raise NotImplementedError()

    def _set_train_params(self, **kwargs):
        """Set model parameters required for training."""
        raise NotImplementedError()

    def _update_job_params(self, job_params, epoch_progress, cur_epoch):
        """Get updated job parameters based on the epoch_progress and cur_epoch."""
        raise NotImplementedError()

    def _get_thread_working_mem(self):
        """Get private working memory per thread."""
        raise NotImplementedError()

    def _raw_word_count(self, job):
        """Get the number of words in a given job."""
        raise NotImplementedError()

    def _clear_post_train(self):
        """Reset certain properties of the model post training, e.g. `keyedvectors.vectors_norm`."""
        raise NotImplementedError()

    def _do_train_job(self, data_iterable, job_parameters, thread_private_mem):
        """Train a single batch. Return 2-tuple `(effective word count, total word count)`."""
        raise NotImplementedError()

    def _check_training_sanity(self, epochs=None, total_examples=None, total_words=None, **kwargs):
        """Check that the training parameters provided make sense, e.g. raise an error if `epochs` not provided."""
        raise NotImplementedError()

    def _worker_loop(self, job_queue, progress_queue):
        """Train the model, lifting batches of data from the queue.

        This function will be called in parallel by multiple workers (threads or processes) to make
        optimal use of multicore machines.

        Parameters
        ----------
        job_queue : Queue of (list of objects, (str, int))
            A queue of jobs still to be processed. The worker will take up jobs from this queue.
            Each job is represented by a tuple where the first element is the corpus chunk to be processed and
            the second is the dictionary of parameters.
        progress_queue : Queue of (int, int, int)
            A queue of progress reports. Each report is represented as a tuple of these 3 elements:
                * Size of data chunk processed, for example number of sentences in the corpus chunk.
                * Effective word count used in training (after ignoring unknown words and trimming the sentence length).
                * Total word count used in training.

        """
        thread_private_mem = self._get_thread_working_mem()
        jobs_processed = 0
        while True:
            job = job_queue.get()
            if job is None:
                progress_queue.put(None)
                break  # no more jobs => quit this worker
            data_iterable, job_parameters = job

            for callback in self.callbacks:
                callback.on_batch_begin(self)

            tally, raw_tally = self._do_train_job(data_iterable, job_parameters, thread_private_mem)

            for callback in self.callbacks:
                callback.on_batch_end(self)

            progress_queue.put((len(data_iterable), tally, raw_tally))  # report back progress
            jobs_processed += 1
        logger.debug("worker exiting, processed %i jobs", jobs_processed)

    def _job_producer(self, data_iterator, job_queue, cur_epoch=0, total_examples=None, total_words=None):
        """Fill the jobs queue using the data found in the input stream.

        Each job is represented by a tuple where the first element is the corpus chunk to be processed and
        the second is a dictionary of parameters.

        Parameters
        ----------
        data_iterator : iterable of list of objects
            The input dataset. This will be split in chunks and these chunks will be pushed to the queue.
        job_queue : Queue of (list of object, dict of (str, int))
            A queue of jobs still to be processed. The worker will take up jobs from this queue.
            Each job is represented by a tuple where the first element is the corpus chunk to be processed and
            the second is the dictionary of parameters.
        cur_epoch : int, optional
            The current training epoch, needed to compute the training parameters for each job.
            For example in many implementations the learning rate would be dropping with the number of epochs.
        total_examples : int, optional
            Count of objects in the `data_iterator`. In the usual case this would correspond to the number of sentences
            in a corpus. Used to log progress.
        total_words : int, optional
            Count of total objects in `data_iterator`. In the usual case this would correspond to the number of raw
            words in a corpus. Used to log progress.

        """
        job_batch, batch_size = [], 0
        pushed_words, pushed_examples = 0, 0
        next_job_params = self._get_job_params(cur_epoch)
        job_no = 0

        for data_idx, data in enumerate(data_iterator):
            data_length = self._raw_word_count([data])

            # can we fit this sentence into the existing job batch?
            if batch_size + data_length <= self.batch_words:
                # yes => add it to the current job
                job_batch.append(data)
                batch_size += data_length
            else:
                job_no += 1
                job_queue.put((job_batch, next_job_params))

                # update the learning rate for the next job
                if total_examples:
                    # examples-based decay
                    pushed_examples += len(job_batch)
                    epoch_progress = 1.0 * pushed_examples / total_examples
                else:
                    # words-based decay
                    pushed_words += self._raw_word_count(job_batch)
                    epoch_progress = 1.0 * pushed_words / total_words
                next_job_params = self._update_job_params(next_job_params, epoch_progress, cur_epoch)

                # add the sentence that didn't fit as the first item of a new job
                job_batch, batch_size = [data], data_length
        # add the last job too (may be significantly smaller than batch_words)
        if job_batch:
            job_no += 1
            job_queue.put((job_batch, next_job_params))

        if job_no == 0 and self.train_count == 0:
            logger.warning(
                "train() called with an empty iterator (if not intended, "
                "be sure to provide a corpus that offers restartable iteration = an iterable)."
            )

        # give the workers heads up that they can finish -- no more work!
        for _ in xrange(self.workers):
            job_queue.put(None)
        logger.debug("job loop exiting, total %i jobs", job_no)

    def _log_progress(self, job_queue, progress_queue, cur_epoch, example_count, total_examples,
                      raw_word_count, total_words, trained_word_count, elapsed):
        raise NotImplementedError()

    def _log_epoch_end(self, cur_epoch, example_count, total_examples, raw_word_count, total_words,
                       trained_word_count, elapsed):
        raise NotImplementedError()

    def _log_train_end(self, raw_word_count, trained_word_count, total_elapsed, job_tally):
        raise NotImplementedError()

    def _log_epoch_progress(self, progress_queue, job_queue, cur_epoch=0, total_examples=None, total_words=None,
                            report_delay=1.0):
        """Get the progress report for a single training epoch.

        Parameters
        ----------
        progress_queue : Queue of (int, int, int)
            A queue of progress reports. Each report is represented as a tuple of these 3 elements:
                * Size of data chunk processed, for example number of sentences in the corpus chunk.
                * Effective word count used in training (after ignoring unknown words and trimming the sentence length).
                * Total word count used in training.
        job_queue : Queue of (list of object, dict of (str, int))
            A queue of jobs still to be processed. The worker will take up jobs from this queue.
            Each job is represented by a tuple where the first element is the corpus chunk to be processed and
            the second is the dictionary of parameters.
        cur_epoch : int, optional
            The current training epoch, needed to compute the training parameters for each job.
            For example in many implementations the learning rate would be dropping with the number of epochs.
        total_examples : int, optional
            Count of objects in the `data_iterator`. In the usual case this would correspond to the number of sentences
            in a corpus. Used to log progress.
        total_words : int, optional
            Count of total objects in `data_iterator`. In the usual case this would correspond to the number of raw
            words in a corpus. Used to log progress.
        report_delay : float, optional
            Number of seconds between two consecutive progress report messages in the logger.

        Returns
        -------
        (int, int, int)
            The epoch report consisting of three elements:
                * Size of data chunk processed, for example number of sentences in the corpus chunk.
                * Effective word count used in training (after ignoring unknown words and trimming the sentence length).
                * Total word count used in training.

        """
        example_count, trained_word_count, raw_word_count = 0, 0, 0
        start, next_report = default_timer() - 0.00001, 1.0
        job_tally = 0
        unfinished_worker_count = self.workers

        while unfinished_worker_count > 0:
            report = progress_queue.get()  # blocks if workers too slow
            if report is None:  # a thread reporting that it finished
                unfinished_worker_count -= 1
                logger.info("worker thread finished; awaiting finish of %i more threads", unfinished_worker_count)
                continue
            examples, trained_words, raw_words = report
            job_tally += 1

            # update progress stats
            example_count += examples
            trained_word_count += trained_words  # only words in vocab & sampled
            raw_word_count += raw_words

            # log progress once every report_delay seconds
            elapsed = default_timer() - start
            if elapsed >= next_report:
                self._log_progress(
                    job_queue, progress_queue, cur_epoch, example_count, total_examples,
                    raw_word_count, total_words, trained_word_count, elapsed)
                next_report = elapsed + report_delay
        # all done; report the final stats
        elapsed = default_timer() - start
        self._log_epoch_end(
            cur_epoch, example_count, total_examples, raw_word_count, total_words,
            trained_word_count, elapsed)
        self.total_train_time += elapsed
        return trained_word_count, raw_word_count, job_tally

    def _train_epoch(self, data_iterable, cur_epoch=0, total_examples=None,
                     total_words=None, queue_factor=2, report_delay=1.0):
        """Train the model for a single epoch.

        Parameters
        ----------
        data_iterable : iterable of list of object
            The input corpus. This will be split in chunks and these chunks will be pushed to the queue.
        cur_epoch : int, optional
            The current training epoch, needed to compute the training parameters for each job.
            For example in many implementations the learning rate would be dropping with the number of epochs.
        total_examples : int, optional
            Count of objects in the `data_iterator`. In the usual case this would correspond to the number of sentences
            in a corpus, used to log progress.
        total_words : int, optional
            Count of total objects in `data_iterator`. In the usual case this would correspond to the number of raw
            words in a corpus, used to log progress.
        queue_factor : int, optional
            Multiplier for size of queue -> size = number of workers * queue_factor.
        report_delay : float, optional
            Number of seconds between two consecutive progress report messages in the logger.

        Returns
        -------
        (int, int, int)
            The training report for this epoch consisting of three elements:
                * Size of data chunk processed, for example number of sentences in the corpus chunk.
                * Effective word count used in training (after ignoring unknown words and trimming the sentence length).
                * Total word count used in training.

        """
        job_queue = Queue(maxsize=queue_factor * self.workers)
        progress_queue = Queue(maxsize=(queue_factor + 1) * self.workers)

        workers = [
            threading.Thread(
                target=self._worker_loop,
                args=(job_queue, progress_queue,))
            for _ in xrange(self.workers)
        ]

        workers.append(threading.Thread(
            target=self._job_producer,
            args=(data_iterable, job_queue),
            kwargs={'cur_epoch': cur_epoch, 'total_examples': total_examples, 'total_words': total_words}))

        for thread in workers:
            thread.daemon = True  # make interrupting the process with ctrl+c easier
            thread.start()

        trained_word_count, raw_word_count, job_tally = self._log_epoch_progress(
            progress_queue, job_queue, cur_epoch=cur_epoch, total_examples=total_examples, total_words=total_words,
            report_delay=report_delay)

        return trained_word_count, raw_word_count, job_tally

    def train(self, data_iterable, epochs=None, total_examples=None,
              total_words=None, queue_factor=2, report_delay=1.0, callbacks=(), **kwargs):
        """Train the model for multiple epochs using multiple workers.

        Parameters
        ----------
        data_iterable : iterable of list of object
            The input corpus. This will be split in chunks and these chunks will be pushed to the queue.
        epochs : int, optional
            Number of epochs (training iterations over the whole input) of training.
        total_examples : int, optional
            Count of objects in the `data_iterator`. In the usual case this would correspond to the number of sentences
            in a corpus, used to log progress.
        total_words : int, optional
            Count of total objects in `data_iterator`. In the usual case this would correspond to the number of raw
            words in a corpus, used to log progress.
        queue_factor : int, optional
            Multiplier for size of queue -> size = number of workers * queue_factor.
        report_delay : float, optional
            Number of seconds between two consecutive progress report messages in the logger.
        callbacks : list of :class:`~gensim.models.callbacks.CallbackAny2Vec`, optional
            List of callbacks to execute at specific stages during training.
        **kwargs : object
            Additional key word parameters for the specific model inheriting from this class.

        Returns
        -------
        (int, int)
            The total training report consisting of two elements:
                * Size of total data processed, for example number of sentences in the whole corpus.
                * Effective word count used in training (after ignoring unknown words and trimming the sentence length).

        """
        self._set_train_params(**kwargs)
        if callbacks:
            self.callbacks = callbacks
        self.epochs = epochs
        self._check_training_sanity(
            epochs=epochs,
            total_examples=total_examples,
            total_words=total_words, **kwargs)

        for callback in self.callbacks:
            callback.on_train_begin(self)

        trained_word_count = 0
        raw_word_count = 0
        start = default_timer() - 0.00001
        job_tally = 0

        for cur_epoch in range(self.epochs):
            for callback in self.callbacks:
                callback.on_epoch_begin(self)

            trained_word_count_epoch, raw_word_count_epoch, job_tally_epoch = self._train_epoch(
                data_iterable, cur_epoch=cur_epoch, total_examples=total_examples, total_words=total_words,
                queue_factor=queue_factor, report_delay=report_delay)
            trained_word_count += trained_word_count_epoch
            raw_word_count += raw_word_count_epoch
            job_tally += job_tally_epoch

            for callback in self.callbacks:
                callback.on_epoch_end(self)

        # Log overall time
        total_elapsed = default_timer() - start
        self._log_train_end(raw_word_count, trained_word_count, total_elapsed, job_tally)

        self.train_count += 1  # number of times train() has been called
        self._clear_post_train()

        for callback in self.callbacks:
            callback.on_train_end(self)
        return trained_word_count, raw_word_count

    @classmethod
    def load(cls, fname_or_handle, **kwargs):
        """Load a previously saved object (using :meth:`gensim.models.base_any2vec.BaseAny2VecModel.save`) from a file.

        Parameters
        ----------
        fname_or_handle : {str, file-like object}
            Path to file that contains needed object or handle to an open file.
        **kwargs : object
            Keyword arguments propagated to :meth:`~gensim.utils.SaveLoad.load`.

        See Also
        --------
        :meth:`~gensim.models.base_any2vec.BaseAny2VecModel.save`
            Method for saving a model.

        Returns
        -------
        object
            Object loaded from `fname_or_handle`.

        Raises
        ------
        IOError
            When methods are called on an instance (should be called on a class, this is a class method).

        """
        return super(BaseAny2VecModel, cls).load(fname_or_handle, **kwargs)

    def save(self, fname_or_handle, **kwargs):
        """Save the object to file.

        Parameters
        ----------
        fname_or_handle : {str, file-like object}
            Path to file where the model will be persisted.
        **kwargs : object
            Key word arguments propagated to :meth:`~gensim.utils.SaveLoad.save`.

        See Also
        --------
        :meth:`~gensim.models.base_any2vec.BaseAny2VecModel.load`
            Method for loading a previously saved model.

        """
        super(BaseAny2VecModel, self).save(fname_or_handle, **kwargs)


class BaseWordEmbeddingsModel(BaseAny2VecModel):
    """Base class containing common methods for training, using & evaluating word embeddings learning models.

    See Also
    --------
    :class:`~gensim.models.word2vec.Word2Vec`.
        Word2Vec model - embeddings for words.
    :class:`~gensim.models.fasttext.FastText`.
        FastText model - embeddings for words (ngram-based).
    :class:`~gensim.models.doc2vec.Doc2Vec`.
        Doc2Vec model - embeddings for documents.
    :class:`~gensim.models.poincare.PoincareModel`
        Poincare model - embeddings for graphs.

    """
    def _clear_post_train(self):
        raise NotImplementedError()

    def _do_train_job(self, data_iterable, job_parameters, thread_private_mem):
        raise NotImplementedError()

    def _set_train_params(self, **kwargs):
        raise NotImplementedError()

    def __init__(self, sentences=None, workers=3, vector_size=100, epochs=5, callbacks=(), batch_words=10000,
                 trim_rule=None, sg=0, alpha=0.025, window=5, seed=1, hs=0, negative=5, ns_exponent=0.75, cbow_mean=1,
                 min_alpha=0.0001, compute_loss=False, fast_version=0, **kwargs):
        """

        Parameters
        ----------
        sentences : iterable of list of str, optional
            Can be simply a list of lists of tokens, but for larger corpora,
            consider an iterable that streams the sentences directly from disk/network.
            See :class:`~gensim.models.word2vec.BrownCorpus`, :class:`~gensim.models.word2vec.Text8Corpus`
            or :class:`~gensim.models.word2vec.LineSentence` for such examples.
        workers : int, optional
            Number of working threads, used for multithreading.
        vector_size : int, optional
            Dimensionality of the feature vectors.
        epochs : int, optional
            Number of iterations (epochs) of training through the corpus.
        callbacks : list of :class:`~gensim.models.callbacks.CallbackAny2Vec`, optional
            List of callbacks that need to be executed/run at specific stages during training.
        batch_words : int, optional
            Number of words to be processed by a single job.
        trim_rule : function, optional
            Vocabulary trimming rule, specifies whether certain words should remain in the vocabulary,
            be trimmed away, or handled using the default (discard if word count < min_count).
            Can be None (min_count will be used, look to :func:`~gensim.utils.keep_vocab_item`),
            or a callable that accepts parameters (word, count, min_count) and returns either
            :attr:`gensim.utils.RULE_DISCARD`, :attr:`gensim.utils.RULE_KEEP` or :attr:`gensim.utils.RULE_DEFAULT`.
            The rule, if given, is only used to prune vocabulary during current method call and is not stored as part
            of the model.

            The input parameters are of the following types:
                * `word` (str) - the word we are examining
                * `count` (int) - the word's frequency count in the corpus
                * `min_count` (int) - the minimum count threshold.

        sg : {1, 0}, optional
            Defines the training algorithm. If 1, skip-gram is used, otherwise, CBOW is employed.
        alpha : float, optional
            The beginning learning rate. This will linearly reduce with iterations until it reaches `min_alpha`.
        window : int, optional
            The maximum distance between the current and predicted word within a sentence.
        seed : int, optional
            Seed for the random number generator. Initial vectors for each word are seeded with a hash of
            the concatenation of word + `str(seed)`.
            Note that for a fully deterministically-reproducible run, you must also limit the model to a single worker
            thread (`workers=1`), to eliminate ordering jitter from OS thread scheduling.
            In Python 3, reproducibility between interpreter launches also requires use of the `PYTHONHASHSEED`
            environment variable to control hash randomization.
        hs : {1, 0}, optional
            If 1, hierarchical softmax will be used for model training.
            If set to 0, and `negative` is non-zero, negative sampling will be used.
        negative : int, optional
            If > 0, negative sampling will be used, the int for negative specifies how many "noise words"
            should be drawn (usually between 5-20).
            If set to 0, no negative sampling is used.
        cbow_mean : {1, 0}, optional
            If 0, use the sum of the context word vectors. If 1, use the mean; only applies when CBOW is used.
        min_alpha : float, optional
            Final learning rate. Drops linearly with the number of iterations from `alpha`.
        compute_loss : bool, optional
            If True, loss will be computed while training the Word2Vec model and stored in the
            :attr:`~gensim.models.base_any2vec.BaseWordEmbeddingsModel.running_training_loss` attribute.
        fast_version : {-1, 1}, optional
            Whether or not the fast cython implementation of the internal training methods is available. 1 means it is.
        **kwargs : object
            Key word arguments needed to allow children classes to accept more arguments.

        """
        self.sg = int(sg)
        if vector_size % 4 != 0:
            logger.warning("consider setting layer size to a multiple of 4 for greater performance")
        self.alpha = float(alpha)
        self.window = int(window)
        self.random = random.RandomState(seed)
        self.min_alpha = float(min_alpha)
        self.hs = int(hs)
        self.negative = int(negative)
        self.ns_exponent = ns_exponent
        self.cbow_mean = int(cbow_mean)
        self.compute_loss = bool(compute_loss)
        self.running_training_loss = 0
        self.min_alpha_yet_reached = float(alpha)
        self.corpus_count = 0

        super(BaseWordEmbeddingsModel, self).__init__(
            workers=workers, vector_size=vector_size, epochs=epochs, callbacks=callbacks, batch_words=batch_words)

        if fast_version < 0:
            warnings.warn(
                "C extension not loaded, training will be slow. "
                "Install a C compiler and reinstall gensim for fast training."
            )
        self.neg_labels = []
        if self.negative > 0:
            # precompute negative labels optimization for pure-python training
            self.neg_labels = zeros(self.negative + 1)
            self.neg_labels[0] = 1.

        if sentences is not None:
            if isinstance(sentences, GeneratorType):
                raise TypeError("You can't pass a generator as the sentences argument. Try an iterator.")
            self.build_vocab(sentences, trim_rule=trim_rule)
            self.train(
                sentences, total_examples=self.corpus_count, epochs=self.epochs, start_alpha=self.alpha,
                end_alpha=self.min_alpha, compute_loss=compute_loss)
        else:
            if trim_rule is not None:
                logger.warning(
                    "The rule, if given, is only used to prune vocabulary during build_vocab() "
                    "and is not stored as part of the model. Model initialized without sentences. "
                    "trim_rule provided, if any, will be ignored.")

    # for backward compatibility (aliases pointing to corresponding variables in trainables, vocabulary)
    @property
    @deprecated("Attribute will be removed in 4.0.0, use self.epochs instead")
    def iter(self):
        return self.epochs

    @iter.setter
    @deprecated("Attribute will be removed in 4.0.0, use self.epochs instead")
    def iter(self, value):
        self.epochs = value

    @property
    @deprecated("Attribute will be removed in 4.0.0, use self.trainables.syn1 instead")
    def syn1(self):
        return self.trainables.syn1

    @syn1.setter
    @deprecated("Attribute will be removed in 4.0.0, use self.trainables.syn1 instead")
    def syn1(self, value):
        self.trainables.syn1 = value

    @syn1.deleter
    @deprecated("Attribute will be removed in 4.0.0, use self.trainables.syn1 instead")
    def syn1(self):
        del self.trainables.syn1

    @property
    @deprecated("Attribute will be removed in 4.0.0, use self.trainables.syn1neg instead")
    def syn1neg(self):
        return self.trainables.syn1neg

    @syn1neg.setter
    @deprecated("Attribute will be removed in 4.0.0, use self.trainables.syn1neg instead")
    def syn1neg(self, value):
        self.trainables.syn1neg = value

    @syn1neg.deleter
    @deprecated("Attribute will be removed in 4.0.0, use self.trainables.syn1neg instead")
    def syn1neg(self):
        del self.trainables.syn1neg

    @property
    @deprecated("Attribute will be removed in 4.0.0, use self.trainables.vectors_lockf instead")
    def syn0_lockf(self):
        return self.trainables.vectors_lockf

    @syn0_lockf.setter
    @deprecated("Attribute will be removed in 4.0.0, use self.trainables.vectors_lockf instead")
    def syn0_lockf(self, value):
        self.trainables.vectors_lockf = value

    @syn0_lockf.deleter
    @deprecated("Attribute will be removed in 4.0.0, use self.trainables.vectors_lockf instead")
    def syn0_lockf(self):
        del self.trainables.vectors_lockf

    @property
    @deprecated("Attribute will be removed in 4.0.0, use self.trainables.layer1_size instead")
    def layer1_size(self):
        return self.trainables.layer1_size

    @layer1_size.setter
    @deprecated("Attribute will be removed in 4.0.0, use self.trainables.layer1_size instead")
    def layer1_size(self, value):
        self.trainables.layer1_size = value

    @property
    @deprecated("Attribute will be removed in 4.0.0, use self.trainables.hashfxn instead")
    def hashfxn(self):
        return self.trainables.hashfxn

    @hashfxn.setter
    @deprecated("Attribute will be removed in 4.0.0, use self.trainables.hashfxn instead")
    def hashfxn(self, value):
        self.trainables.hashfxn = value

    @property
    @deprecated("Attribute will be removed in 4.0.0, use self.vocabulary.sample instead")
    def sample(self):
        return self.vocabulary.sample

    @sample.setter
    @deprecated("Attribute will be removed in 4.0.0, use self.vocabulary.sample instead")
    def sample(self, value):
        self.vocabulary.sample = value

    @property
    @deprecated("Attribute will be removed in 4.0.0, use self.vocabulary.min_count instead")
    def min_count(self):
        return self.vocabulary.min_count

    @min_count.setter
    @deprecated("Attribute will be removed in 4.0.0, use self.vocabulary.min_count instead")
    def min_count(self, value):
        self.vocabulary.min_count = value

    @property
    @deprecated("Attribute will be removed in 4.0.0, use self.vocabulary.cum_table instead")
    def cum_table(self):
        return self.vocabulary.cum_table

    @cum_table.setter
    @deprecated("Attribute will be removed in 4.0.0, use self.vocabulary.cum_table instead")
    def cum_table(self, value):
        self.vocabulary.cum_table = value

    @cum_table.deleter
    @deprecated("Attribute will be removed in 4.0.0, use self.vocabulary.cum_table instead")
    def cum_table(self):
        del self.vocabulary.cum_table

    def __str__(self):
        """Get a human readable representation of the object.

        Returns
        -------
        str
            A human readable string containing the class name, as well as the size of dictionary, number of
            features and starting learning rate used by the object.

        """
        return "%s(vocab=%s, size=%s, alpha=%s)" % (
            self.__class__.__name__, len(self.wv.index2word), self.vector_size, self.alpha
        )

    def build_vocab(self, sentences, update=False, progress_per=10000, keep_raw_vocab=False, trim_rule=None, **kwargs):
        """Build vocabulary from a sequence of sentences (can be a once-only generator stream).

        Parameters
        ----------
        sentences : iterable of list of str
            Can be simply a list of lists of tokens, but for larger corpora,
            consider an iterable that streams the sentences directly from disk/network.
            See :class:`~gensim.models.word2vec.BrownCorpus`, :class:`~gensim.models.word2vec.Text8Corpus`
            or :class:`~gensim.models.word2vec.LineSentence` module for such examples.
        update : bool, optional
            If true, the new words in `sentences` will be added to model's vocab.
        progress_per : int, optional
            Indicates how many words to process before showing/updating the progress.
        keep_raw_vocab : bool, optional
            If False, the raw vocabulary will be deleted after the scaling is done to free up RAM.
        trim_rule : function, optional
            Vocabulary trimming rule, specifies whether certain words should remain in the vocabulary,
            be trimmed away, or handled using the default (discard if word count < min_count).
            Can be None (min_count will be used, look to :func:`~gensim.utils.keep_vocab_item`),
            or a callable that accepts parameters (word, count, min_count) and returns either
            :attr:`gensim.utils.RULE_DISCARD`, :attr:`gensim.utils.RULE_KEEP` or :attr:`gensim.utils.RULE_DEFAULT`.
            The rule, if given, is only used to prune vocabulary during current method call and is not stored as part
            of the model.

            The input parameters are of the following types:
                * `word` (str) - the word we are examining
                * `count` (int) - the word's frequency count in the corpus
                * `min_count` (int) - the minimum count threshold.

        **kwargs : object
            Key word arguments propagated to `self.vocabulary.prepare_vocab`.

        """
        total_words, corpus_count = self.vocabulary.scan_vocab(
            sentences, progress_per=progress_per, trim_rule=trim_rule)
        self.corpus_count = corpus_count
        report_values = self.vocabulary.prepare_vocab(
            self.hs, self.negative, self.wv, update=update, keep_raw_vocab=keep_raw_vocab,
            trim_rule=trim_rule, **kwargs)
        report_values['memory'] = self.estimate_memory(vocab_size=report_values['num_retained_words'])
        self.trainables.prepare_weights(self.hs, self.negative, self.wv, update=update, vocabulary=self.vocabulary)

    def build_vocab_from_freq(self, word_freq, keep_raw_vocab=False, corpus_count=None, trim_rule=None, update=False):
        """Build vocabulary from a dictionary of word frequencies.

        Parameters
        ----------
        word_freq : dict of (str, int)
            A mapping from a word in the vocabulary to its frequency count.
        keep_raw_vocab : bool, optional
            If False, delete the raw vocabulary after the scaling is done to free up RAM.
        corpus_count : int, optional
            Even if no corpus is provided, this argument can set corpus_count explicitly.
        trim_rule : function, optional
            Vocabulary trimming rule, specifies whether certain words should remain in the vocabulary,
            be trimmed away, or handled using the default (discard if word count < min_count).
            Can be None (min_count will be used, look to :func:`~gensim.utils.keep_vocab_item`),
            or a callable that accepts parameters (word, count, min_count) and returns either
            :attr:`gensim.utils.RULE_DISCARD`, :attr:`gensim.utils.RULE_KEEP` or :attr:`gensim.utils.RULE_DEFAULT`.
            The rule, if given, is only used to prune vocabulary during current method call and is not stored as part
            of the model.

            The input parameters are of the following types:
                * `word` (str) - the word we are examining
                * `count` (int) - the word's frequency count in the corpus
                * `min_count` (int) - the minimum count threshold.

        update : bool, optional
            If true, the new provided words in `word_freq` dict will be added to model's vocab.
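
        Examples
        --------
        An illustrative sketch with a concrete subclass (the words and counts below are made up):

        .. sourcecode:: pycon

            >>> from gensim.models import Word2Vec
            >>>
            >>> model = Word2Vec()
            >>> model.build_vocab_from_freq({"Word1": 15, "Word2": 20})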
"""
|
||
|
logger.info("Processing provided word frequencies")
|
||
|
# Instead of scanning text, this will assign provided word frequencies dictionary(word_freq)
|
||
|
# to be directly the raw vocab
|
||
|
raw_vocab = word_freq
|
||
|
logger.info(
|
||
|
"collected %i different raw word, with total frequency of %i",
|
||
|
len(raw_vocab), sum(itervalues(raw_vocab))
|
||
|
)
|
||
|
|
||
|
# Since no sentences are provided, this is to control the corpus_count.
|
||
|
self.corpus_count = corpus_count or 0
|
||
|
self.vocabulary.raw_vocab = raw_vocab
|
||
|
|
||
|
# trim by min_count & precalculate downsampling
|
||
|
report_values = self.vocabulary.prepare_vocab(
|
||
|
self.hs, self.negative, self.wv, keep_raw_vocab=keep_raw_vocab,
|
||
|
trim_rule=trim_rule, update=update)
|
||
|
report_values['memory'] = self.estimate_memory(vocab_size=report_values['num_retained_words'])
|
||
|
self.trainables.prepare_weights(
|
||
|
self.hs, self.negative, self.wv, update=update, vocabulary=self.vocabulary) # build tables & arrays
|
||
|
|
||
|
def estimate_memory(self, vocab_size=None, report=None):
|
||
|
"""Estimate required memory for a model using current settings and provided vocabulary size.
|
||
|
|
||
|
Parameters
|
||
|
----------
|
||
|
vocab_size : int, optional
|
||
|
Number of unique tokens in the vocabulary
|
||
|
report : dict of (str, int), optional
|
||
|
A dictionary from string representations of the model's memory consuming members to their size in bytes.
|
||
|
|
||
|
Returns
|
||
|
-------
|
||
|
dict of (str, int)
|
||
|
A dictionary from string representations of the model's memory consuming members to their size in bytes.
|
||
|
|
||
|
"""
|
||
|
vocab_size = vocab_size or len(self.wv.vocab)
|
||
|
report = report or {}
|
||
|
report['vocab'] = vocab_size * (700 if self.hs else 500)
|
||
|
report['vectors'] = vocab_size * self.vector_size * dtype(REAL).itemsize
|
||
|
if self.hs:
|
||
|
report['syn1'] = vocab_size * self.trainables.layer1_size * dtype(REAL).itemsize
|
||
|
if self.negative:
|
||
|
report['syn1neg'] = vocab_size * self.trainables.layer1_size * dtype(REAL).itemsize
|
||
|
report['total'] = sum(report.values())
|
||
|
logger.info(
|
||
|
"estimated required memory for %i words and %i dimensions: %i bytes",
|
||
|
vocab_size, self.vector_size, report['total']
|
||
|
)
|
||
|
return report
|
||
|
|
||
|

    def train(self, sentences, total_examples=None, total_words=None,
              epochs=None, start_alpha=None, end_alpha=None, word_count=0,
              queue_factor=2, report_delay=1.0, compute_loss=False, callbacks=()):
        """Train the model. If the hyper-parameters are passed, they override the ones set in the constructor.

        Parameters
        ----------
        sentences : iterable of list of str
            Can be simply a list of lists of tokens, but for larger corpora,
            consider an iterable that streams the sentences directly from disk/network.
            See :class:`~gensim.models.word2vec.BrownCorpus`, :class:`~gensim.models.word2vec.Text8Corpus`
            or :class:`~gensim.models.word2vec.LineSentence` module for such examples.
        total_examples : int, optional
            Count of sentences.
        total_words : int, optional
            Count of raw words in sentences.
        epochs : int, optional
            Number of iterations (epochs) over the corpus.
        start_alpha : float, optional
            Initial learning rate.
        end_alpha : float, optional
            Final learning rate. Drops linearly with the number of iterations from `start_alpha`.
        word_count : int, optional
            Count of words already trained. Leave this to 0 for the usual case of training on all words in sentences.
        queue_factor : int, optional
            Multiplier for size of queue -> size = number of workers * queue_factor.
        report_delay : float, optional
            Seconds to wait before reporting progress.
        compute_loss : bool, optional
            If True, loss will be computed while training the Word2Vec model and stored in
            :attr:`~gensim.models.base_any2vec.BaseWordEmbeddingsModel.running_training_loss`.
        callbacks : list of :class:`~gensim.models.callbacks.CallbackAny2Vec`, optional
            List of callbacks that need to be executed/run at specific stages during training.

        Returns
        -------
        (int, int)
            Tuple of (effective word count after ignoring unknown words and sentence length trimming, total word count).
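
        Examples
        --------
        A minimal sketch with a toy corpus (the vocabulary must be built before training):

        .. sourcecode:: pycon

            >>> from gensim.models import Word2Vec
            >>>
            >>> sentences = [["cat", "say", "meow"], ["dog", "say", "woof"]]
            >>> model = Word2Vec(min_count=1)
            >>> model.build_vocab(sentences)
            >>> model.train(sentences, total_examples=model.corpus_count, epochs=model.epochs)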
"""
|
||
|
|
||
|
self.alpha = start_alpha or self.alpha
|
||
|
self.min_alpha = end_alpha or self.min_alpha
|
||
|
self.compute_loss = compute_loss
|
||
|
self.running_training_loss = 0.0
|
||
|
return super(BaseWordEmbeddingsModel, self).train(
|
||
|
sentences, total_examples=total_examples, total_words=total_words,
|
||
|
epochs=epochs, start_alpha=start_alpha, end_alpha=end_alpha, word_count=word_count,
|
||
|
queue_factor=queue_factor, report_delay=report_delay, compute_loss=compute_loss, callbacks=callbacks)
|
||
|
|
||
|
def _get_job_params(self, cur_epoch):
|
||
|
"""Get the learning rate used in the current epoch.
|
||
|
|
||
|
Parameters
|
||
|
----------
|
||
|
cur_epoch : int
|
||
|
Current iteration through the corpus
|
||
|
|
||
|
Returns
|
||
|
-------
|
||
|
float
|
||
|
The learning rate for this epoch (it is linearly reduced with epochs from `self.alpha` to `self.min_alpha`).
|
||
|
|
||
|
"""
|
||
|
alpha = self.alpha - ((self.alpha - self.min_alpha) * float(cur_epoch) / self.epochs)
|
||
|
return alpha
|
||
|
|
||
|
def _update_job_params(self, job_params, epoch_progress, cur_epoch):
|
||
|
"""Get the correct learning rate for the next iteration.
|
||
|
|
||
|
Parameters
|
||
|
----------
|
||
|
job_params : dict of (str, obj)
|
||
|
UNUSED.
|
||
|
epoch_progress : float
|
||
|
Ratio of finished work in the current epoch.
|
||
|
cur_epoch : int
|
||
|
Number of current iteration.
|
||
|
|
||
|
Returns
|
||
|
-------
|
||
|
float
|
||
|
The learning rate to be used in the next training epoch.
|
||
|
|
||
|
"""
|
||
|
start_alpha = self.alpha
|
||
|
end_alpha = self.min_alpha
|
||
|
progress = (cur_epoch + epoch_progress) / self.epochs
|
||
|
next_alpha = start_alpha - (start_alpha - end_alpha) * progress
|
||
|
next_alpha = max(end_alpha, next_alpha)
|
||
|
self.min_alpha_yet_reached = next_alpha
|
||
|
return next_alpha
|
||
|
|
||
|
def _get_thread_working_mem(self):
|
||
|
"""Computes the memory used per worker thread.
|
||
|
|
||
|
Returns
|
||
|
-------
|
||
|
(np.ndarray, np.ndarray)
|
||
|
Each worker threads private work memory.
|
||
|
|
||
|
"""
|
||
|
work = matutils.zeros_aligned(self.trainables.layer1_size, dtype=REAL) # per-thread private work memory
|
||
|
neu1 = matutils.zeros_aligned(self.trainables.layer1_size, dtype=REAL)
|
||
|
return work, neu1
|
||
|
|
||
|
def _raw_word_count(self, job):
|
||
|
"""Get the number of words in a given job.
|
||
|
|
||
|
Parameters
|
||
|
----------
|
||
|
job: iterable of list of str
|
||
|
The corpus chunk processed in a single batch.
|
||
|
|
||
|
Returns
|
||
|
-------
|
||
|
int
|
||
|
Number of raw words in the corpus chunk.
|
||
|
|
||
|
"""
|
||
|
return sum(len(sentence) for sentence in job)
|
||
|
|
||
|

    def _check_training_sanity(self, epochs=None, total_examples=None, total_words=None, **kwargs):
        """Check whether the training parameters make sense.

        Called right before training starts in :meth:`~gensim.models.base_any2vec.BaseWordEmbeddingsModel.train`
        and raises warnings or errors depending on the severity of the issue in case an inconsistent parameter
        combination is detected.

        Parameters
        ----------
        epochs : int, optional
            Number of training epochs. Must have a (non None) value.
        total_examples : int, optional
            Number of documents in the corpus. Either `total_examples` or `total_words` **must** be supplied.
        total_words : int, optional
            Number of words in the corpus. Either `total_examples` or `total_words` **must** be supplied.
        **kwargs : object
            Unused. Present to preserve signature among base and inherited implementations.

        Raises
        ------
        RuntimeError
            If one of the required training pre/post processing steps has not been performed.
        ValueError
            If the combination of input parameters is inconsistent.

        """
        if self.alpha > self.min_alpha_yet_reached:
            logger.warning("Effective 'alpha' higher than previous training cycles")
        if self.model_trimmed_post_training:
            raise RuntimeError("Parameters for training were discarded using model_trimmed_post_training method")

        if not self.wv.vocab:  # should be set by `build_vocab`
            raise RuntimeError("you must first build vocabulary before training the model")
        if not len(self.wv.vectors):
            raise RuntimeError("you must initialize vectors before training the model")

        if not hasattr(self, 'corpus_count'):
            raise ValueError(
                "The number of examples in the training corpus is missing. "
                "Please make sure this is set inside `build_vocab` function. "
                "Call the `build_vocab` function before calling `train`."
            )

        if total_words is None and total_examples is None:
            raise ValueError(
                "You must specify either total_examples or total_words, for proper job parameters update "
                "and progress calculations. "
                "The usual value is total_examples=model.corpus_count."
            )
        if epochs is None:
            raise ValueError("You must specify an explicit epochs count. The usual value is epochs=model.epochs.")
        logger.info(
            "training model with %i workers on %i vocabulary and %i features, "
            "using sg=%s hs=%s sample=%s negative=%s window=%s",
            self.workers, len(self.wv.vocab), self.trainables.layer1_size, self.sg,
            self.hs, self.vocabulary.sample, self.negative, self.window
        )

    @classmethod
    def load(cls, *args, **kwargs):
        """Load a previously saved object (using :meth:`~gensim.models.base_any2vec.BaseWordEmbeddingsModel.save`) from file.

        Also initializes extra instance attributes in case the loaded model does not include them.
        `*args` or `**kwargs` **MUST** include the fname argument (path to saved file).
        See :meth:`~gensim.utils.SaveLoad.load`.

        Parameters
        ----------
        *args : object
            Positional arguments passed to :meth:`~gensim.utils.SaveLoad.load`.
        **kwargs : object
            Key word arguments passed to :meth:`~gensim.utils.SaveLoad.load`.

        See Also
        --------
        :meth:`~gensim.models.base_any2vec.BaseWordEmbeddingsModel.save`
            Method for saving a model.

        Returns
        -------
        :class:`~gensim.models.base_any2vec.BaseWordEmbeddingsModel`
            Model loaded from disk.

        Raises
        ------
        IOError
            When methods are called on an instance (should be called on the class, as this is a class method).

        """
        model = super(BaseWordEmbeddingsModel, cls).load(*args, **kwargs)
        if not hasattr(model, 'ns_exponent'):
            model.ns_exponent = 0.75
        if not hasattr(model.vocabulary, 'ns_exponent'):
            model.vocabulary.ns_exponent = 0.75
        if model.negative and hasattr(model.wv, 'index2word'):
            model.vocabulary.make_cum_table(model.wv)  # rebuild cum_table from vocabulary
        if not hasattr(model, 'corpus_count'):
            model.corpus_count = None
        if not hasattr(model.trainables, 'vectors_lockf') and hasattr(model.wv, 'vectors'):
            model.trainables.vectors_lockf = ones(len(model.wv.vectors), dtype=REAL)
        if not hasattr(model, 'random'):
            model.random = random.RandomState(model.trainables.seed)
        if not hasattr(model, 'train_count'):
            model.train_count = 0
            model.total_train_time = 0
        return model

    def _log_progress(self, job_queue, progress_queue, cur_epoch, example_count, total_examples,
                      raw_word_count, total_words, trained_word_count, elapsed):
        """Callback used to log progress for long running jobs.

        Parameters
        ----------
        job_queue : Queue of (list of object, dict of (str, float))
            The queue of jobs still to be performed by workers. Each job is represented as a tuple containing
            the batch of data to be processed and the parameters to be used for the processing as a dict.
        progress_queue : Queue of (int, int, int)
            A queue of progress reports. Each report is represented as a tuple of these 3 elements:
                * Size of data chunk processed, for example number of sentences in the corpus chunk.
                * Effective word count used in training (after ignoring unknown words and trimming the sentence length).
                * Total word count used in training.
        cur_epoch : int
            The current training iteration through the corpus.
        example_count : int
            Number of examples (could be sentences for example) processed until now.
        total_examples : int
            Number of all examples present in the input corpus.
        raw_word_count : int
            Number of words used in training until now.
        total_words : int
            Number of all words in the input corpus.
        trained_word_count : int
            Number of effective words used in training until now (after ignoring unknown words and trimming
            the sentence length).
        elapsed : int
            Elapsed time since the beginning of training in seconds.

        """
        if total_examples:
            # examples-based progress %
            logger.info(
                "EPOCH %i - PROGRESS: at %.2f%% examples, %.0f words/s, in_qsize %i, out_qsize %i",
                cur_epoch + 1, 100.0 * example_count / total_examples, trained_word_count / elapsed,
                utils.qsize(job_queue), utils.qsize(progress_queue)
            )
        else:
            # words-based progress %
            logger.info(
                "EPOCH %i - PROGRESS: at %.2f%% words, %.0f words/s, in_qsize %i, out_qsize %i",
                cur_epoch + 1, 100.0 * raw_word_count / total_words, trained_word_count / elapsed,
                utils.qsize(job_queue), utils.qsize(progress_queue)
            )

    def _log_epoch_end(self, cur_epoch, example_count, total_examples, raw_word_count, total_words,
                       trained_word_count, elapsed):
        """Callback used to log the end of a training epoch.

        Parameters
        ----------
        cur_epoch : int
            The current training iteration through the corpus.
        example_count : int
            Number of examples (could be sentences for example) processed until now.
        total_examples : int
            Number of all examples present in the input corpus.
        raw_word_count : int
            Number of words used in training until now.
        total_words : int
            Number of all words in the input corpus.
        trained_word_count : int
            Number of effective words used in training until now (after ignoring unknown words and trimming
            the sentence length).
        elapsed : int
            Elapsed time since the beginning of training in seconds.

        Warnings
        --------
        Logs a warning in case the input corpus was changed while the epoch was running.

        """
        logger.info(
            "EPOCH - %i : training on %i raw words (%i effective words) took %.1fs, %.0f effective words/s",
            cur_epoch + 1, raw_word_count, trained_word_count, elapsed, trained_word_count / elapsed
        )

        # check that the input corpus hasn't changed during iteration
        if total_examples and total_examples != example_count:
            logger.warning(
                "EPOCH - %i : supplied example count (%i) did not equal expected count (%i)", cur_epoch + 1,
                example_count, total_examples
            )
        if total_words and total_words != raw_word_count:
            logger.warning(
                "EPOCH - %i : supplied raw word count (%i) did not equal expected count (%i)", cur_epoch + 1,
                raw_word_count, total_words
            )

    def _log_train_end(self, raw_word_count, trained_word_count, total_elapsed, job_tally):
        """Callback to log the end of training.

        Parameters
        ----------
        raw_word_count : int
            Number of words used in the whole training.
        trained_word_count : int
            Number of effective words used in training (after ignoring unknown words and trimming the sentence length).
        total_elapsed : int
            Total time spent during training in seconds.
        job_tally : int
            Total number of jobs processed during training.

        """
        logger.info(
            "training on %i raw words (%i effective words) took %.1fs, %.0f effective words/s",
            raw_word_count, trained_word_count, total_elapsed, trained_word_count / total_elapsed
        )
        if job_tally < 10 * self.workers:
            logger.warning(
                "under 10 jobs per worker: consider setting a smaller `batch_words' for smoother alpha decay"
            )

    # for backward compatibility
    @deprecated("Method will be removed in 4.0.0, use self.wv.most_similar() instead")
    def most_similar(self, positive=None, negative=None, topn=10, restrict_vocab=None, indexer=None):
        """Deprecated, use self.wv.most_similar() instead.

        Refer to the documentation for :meth:`~gensim.models.keyedvectors.WordEmbeddingsKeyedVectors.most_similar`.

        """
        return self.wv.most_similar(positive, negative, topn, restrict_vocab, indexer)

    @deprecated("Method will be removed in 4.0.0, use self.wv.wmdistance() instead")
    def wmdistance(self, document1, document2):
        """Deprecated, use self.wv.wmdistance() instead.

        Refer to the documentation for :meth:`~gensim.models.keyedvectors.WordEmbeddingsKeyedVectors.wmdistance`.

        """
        return self.wv.wmdistance(document1, document2)

    @deprecated("Method will be removed in 4.0.0, use self.wv.most_similar_cosmul() instead")
    def most_similar_cosmul(self, positive=None, negative=None, topn=10):
        """Deprecated, use self.wv.most_similar_cosmul() instead.

        Refer to the documentation for
        :meth:`~gensim.models.keyedvectors.WordEmbeddingsKeyedVectors.most_similar_cosmul`.

        """
        return self.wv.most_similar_cosmul(positive, negative, topn)

    @deprecated("Method will be removed in 4.0.0, use self.wv.similar_by_word() instead")
    def similar_by_word(self, word, topn=10, restrict_vocab=None):
        """Deprecated, use self.wv.similar_by_word() instead.

        Refer to the documentation for :meth:`~gensim.models.keyedvectors.WordEmbeddingsKeyedVectors.similar_by_word`.

        """
        return self.wv.similar_by_word(word, topn, restrict_vocab)

    @deprecated("Method will be removed in 4.0.0, use self.wv.similar_by_vector() instead")
    def similar_by_vector(self, vector, topn=10, restrict_vocab=None):
        """Deprecated, use self.wv.similar_by_vector() instead.

        Refer to the documentation for :meth:`~gensim.models.keyedvectors.WordEmbeddingsKeyedVectors.similar_by_vector`.

        """
        return self.wv.similar_by_vector(vector, topn, restrict_vocab)

    @deprecated("Method will be removed in 4.0.0, use self.wv.doesnt_match() instead")
    def doesnt_match(self, words):
        """Deprecated, use self.wv.doesnt_match() instead.

        Refer to the documentation for :meth:`~gensim.models.keyedvectors.WordEmbeddingsKeyedVectors.doesnt_match`.

        """
        return self.wv.doesnt_match(words)

    @deprecated("Method will be removed in 4.0.0, use self.wv.similarity() instead")
    def similarity(self, w1, w2):
        """Deprecated, use self.wv.similarity() instead.

        Refer to the documentation for :meth:`~gensim.models.keyedvectors.WordEmbeddingsKeyedVectors.similarity`.

        """
        return self.wv.similarity(w1, w2)

    @deprecated("Method will be removed in 4.0.0, use self.wv.n_similarity() instead")
    def n_similarity(self, ws1, ws2):
        """Deprecated, use self.wv.n_similarity() instead.

        Refer to the documentation for :meth:`~gensim.models.keyedvectors.WordEmbeddingsKeyedVectors.n_similarity`.

        """
        return self.wv.n_similarity(ws1, ws2)

    @deprecated("Method will be removed in 4.0.0, use self.wv.evaluate_word_pairs() instead")
    def evaluate_word_pairs(self, pairs, delimiter='\t', restrict_vocab=300000,
                            case_insensitive=True, dummy4unknown=False):
        """Deprecated, use self.wv.evaluate_word_pairs() instead.

        Refer to the documentation for
        :meth:`~gensim.models.keyedvectors.WordEmbeddingsKeyedVectors.evaluate_word_pairs`.

        """
        return self.wv.evaluate_word_pairs(pairs, delimiter, restrict_vocab, case_insensitive, dummy4unknown)