laywerrobot/lib/python3.6/site-packages/gensim/models/deprecated/fasttext_wrapper.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Author: Jayant Jain <jayantjain1992@gmail.com>
# Copyright (C) 2017 Radim Rehurek <me@radimrehurek.com>
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
"""
Warnings
--------
.. deprecated:: 3.2.0
Use :mod:`gensim.models.fasttext` instead.
Python wrapper around word representation learning from FastText, a library for efficient learning
of word representations and sentence classification [1].
This module allows training a word embedding from a training corpus with the additional ability
to obtain word vectors for out-of-vocabulary words, using the fastText C implementation.
The wrapped model can NOT be updated with new documents for online training -- use gensim's
`Word2Vec` for that.
Example:
>>> from gensim.models.wrappers import FastText
>>> model = FastText.train('/Users/kofola/fastText/fasttext', corpus_file='text8')
>>> print model['forests'] # prints vector for given out-of-vocabulary word
.. [1] https://github.com/facebookresearch/fastText#enriching-word-vectors-with-subword-information
"""

import logging
import tempfile
import os
import struct

import numpy as np
from numpy import float32 as REAL, sqrt, newaxis

from gensim import utils
from gensim.models.deprecated.keyedvectors import KeyedVectors, Vocab
from gensim.models.deprecated.word2vec import Word2Vec

logger = logging.getLogger(__name__)

try:
    FileNotFoundError
except NameError:
    FileNotFoundError = IOError

FASTTEXT_FILEFORMAT_MAGIC = 793712314
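# The value above mirrors the file-format magic constant in the fastText C++ sources;
# newer .bin files start with it, and load_model_params() below uses it to tell the
# newer header layout apart from the older one.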


class FastTextKeyedVectors(KeyedVectors):
    """
    Class to contain vectors, vocab and ngrams for the FastText training class and other methods not directly
    involved in training such as most_similar().

    Subclasses KeyedVectors to implement oov lookups, storing ngrams and other FastText specific methods.
    """
    def __init__(self):
        super(FastTextKeyedVectors, self).__init__()
        self.syn0_vocab = None
        self.syn0_vocab_norm = None
        self.syn0_ngrams = None
        self.syn0_ngrams_norm = None
        self.ngrams = {}
        self.hash2index = {}
        self.ngrams_word = {}
        self.min_n = 0
        self.max_n = 0

    def save(self, *args, **kwargs):
        # don't bother storing the cached normalized vectors
        kwargs['ignore'] = kwargs.get('ignore', ['syn0norm', 'syn0_vocab_norm', 'syn0_ngrams_norm'])
        super(FastTextKeyedVectors, self).save(*args, **kwargs)

    def word_vec(self, word, use_norm=False):
        """
        Accept a single word as input.
        Returns the word's representation in vector space, as a 1D numpy array.

        The word can be out-of-vocabulary as long as ngrams for the word are present.
        For words with all ngrams absent, a KeyError is raised.

        Example::

          >>> trained_model['office']
          array([ -1.40128313e-02, ...])

        """
        if word in self.vocab:
            return super(FastTextKeyedVectors, self).word_vec(word, use_norm)
        else:
            word_vec = np.zeros(self.syn0_ngrams.shape[1], dtype=np.float32)
            ngrams = compute_ngrams(word, self.min_n, self.max_n)
            ngrams = [ng for ng in ngrams if ng in self.ngrams]
            if use_norm:
                ngram_weights = self.syn0_ngrams_norm
            else:
                ngram_weights = self.syn0_ngrams
            for ngram in ngrams:
                word_vec += ngram_weights[self.ngrams[ngram]]
            if word_vec.any():
                return word_vec / len(ngrams)
            else:  # No ngrams of the word are present in self.ngrams
                raise KeyError('all ngrams for word %s absent from model' % word)
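    # Illustrative sketch of the out-of-vocabulary branch above (the word is made up):
    # a query such as
    #
    #     >>> model.wv.word_vec('forestless')
    #
    # sums the vectors of every known char ngram of '<forestless>' and divides by the
    # number of matched ngrams; if none of its ngrams were seen during training, the
    # KeyError above is raised instead.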

    def init_sims(self, replace=False):
        """
        Precompute L2-normalized vectors.

        If `replace` is set, forget the original vectors and only keep the normalized
        ones = saves lots of memory!

        Note that you **cannot continue training** after doing a replace. The model becomes
        effectively read-only = you can only call `most_similar`, `similarity` etc.
        """
        super(FastTextKeyedVectors, self).init_sims(replace)
        if getattr(self, 'syn0_ngrams_norm', None) is None or replace:
            logger.info("precomputing L2-norms of ngram weight vectors")
            if replace:
                for i in range(self.syn0_ngrams.shape[0]):
                    self.syn0_ngrams[i, :] /= sqrt((self.syn0_ngrams[i, :] ** 2).sum(-1))
                self.syn0_ngrams_norm = self.syn0_ngrams
            else:
                self.syn0_ngrams_norm = \
                    (self.syn0_ngrams / sqrt((self.syn0_ngrams ** 2).sum(-1))[..., newaxis]).astype(REAL)
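    # With replace=True, the loop above normalizes syn0_ngrams in place, so
    # syn0_ngrams_norm ends up being the very same array object rather than a copy --
    # hence the "cannot continue training" warning in the docstring.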

    def __contains__(self, word):
        """
        Check if `word` or any character ngrams in `word` are present in the vocabulary.
        A vector for the word is guaranteed to exist if `__contains__` returns True.
        """
        if word in self.vocab:
            return True
        else:
            char_ngrams = compute_ngrams(word, self.min_n, self.max_n)
            return any(ng in self.ngrams for ng in char_ngrams)

    @classmethod
    def load_word2vec_format(cls, *args, **kwargs):
        """Not supported. Use gensim.models.KeyedVectors.load_word2vec_format instead."""
        raise NotImplementedError("Not supported. Use gensim.models.KeyedVectors.load_word2vec_format instead.")


class FastText(Word2Vec):
    """
    Class for word vector training using FastText. Communication between FastText and Python
    takes place by working with data files on disk and calling the FastText binary with
    subprocess.call().

    Implements functionality similar to [fasttext.py](https://github.com/salestock/fastText.py),
    improving speed and scope of functionality like `most_similar`, `similarity` by extracting vectors
    into a numpy matrix.

    Warnings
    --------
    .. deprecated:: 3.2.0
        Use :class:`gensim.models.fasttext.FastText` instead of :class:`gensim.models.wrappers.fasttext.FastText`.
    """
    def initialize_word_vectors(self):
        self.wv = FastTextKeyedVectors()

    @classmethod
    def train(cls, ft_path, corpus_file, output_file=None, model='cbow', size=100, alpha=0.025, window=5,
              min_count=5, word_ngrams=1, loss='ns', sample=1e-3, negative=5, iter=5, min_n=3, max_n=6,
              sorted_vocab=1, threads=12):
        """
        `ft_path` is the path to the FastText executable, e.g. `/home/kofola/fastText/fasttext`.
        `corpus_file` is the filename of the text file to be used for training the FastText model.
        Expects file to contain utf-8 encoded text.
        `model` defines the training algorithm. By default, cbow is used. Accepted values are
        'cbow', 'skipgram'.
        `size` is the dimensionality of the feature vectors.
        `window` is the maximum distance between the current and predicted word within a sentence.
        `alpha` is the initial learning rate.
        `min_count` = ignore all words with total occurrences lower than this.
        `word_ngrams` = max length of word ngram.
        `loss` = defines training objective. Allowed values are `hs` (hierarchical softmax),
        `ns` (negative sampling) and `softmax`. Defaults to `ns`.
        `sample` = threshold for configuring which higher-frequency words are randomly downsampled;
        default is 1e-3, useful range is (0, 1e-5).
        `negative` = the value for negative specifies how many "noise words" should be drawn
        (usually between 5-20). Default is 5. If set to 0, no negative sampling is used.
        Only relevant when `loss` is set to `ns`.
        `iter` = number of iterations (epochs) over the corpus. Default is 5.
        `min_n` = min length of char ngrams to be used for training word representations. Default is 3.
        `max_n` = max length of char ngrams to be used for training word representations. Set `max_n` to be
        less than `min_n` to avoid char ngrams being used. Default is 6.
        `sorted_vocab` = if 1 (default), sort the vocabulary by descending frequency before
        assigning word indexes.
        `threads` = number of threads to use. Default is 12.
        """
        ft_path = ft_path
        output_file = output_file or os.path.join(tempfile.gettempdir(), 'ft_model')
        ft_args = {
            'input': corpus_file,
            'output': output_file,
            'lr': alpha,
            'dim': size,
            'ws': window,
            'epoch': iter,
            'minCount': min_count,
            'wordNgrams': word_ngrams,
            'neg': negative,
            'loss': loss,
            'minn': min_n,
            'maxn': max_n,
            'thread': threads,
            't': sample
        }
        cmd = [ft_path, model]
        for option, value in ft_args.items():
            cmd.append("-%s" % option)
            cmd.append(str(value))
        utils.check_output(args=cmd)
        model = cls.load_fasttext_format(output_file)
        cls.delete_training_files(output_file)
        return model
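    # For example, train(ft_path='/home/kofola/fastText/fasttext', corpus_file='text8',
    # model='skipgram') shells out to roughly the following command (argument order
    # follows the ft_args dict above; the default output path is typically /tmp/ft_model
    # on Linux):
    #
    #     /home/kofola/fastText/fasttext skipgram -input text8 -output /tmp/ft_model \
    #         -lr 0.025 -dim 100 -ws 5 -epoch 5 -minCount 5 -wordNgrams 1 -neg 5 \
    #         -loss ns -minn 3 -maxn 6 -thread 12 -t 0.001
    #
    # and then loads the resulting .bin file via load_fasttext_format().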

    def save(self, *args, **kwargs):
        # don't bother storing the cached normalized vectors
        kwargs['ignore'] = kwargs.get('ignore', ['syn0norm', 'syn0_vocab_norm', 'syn0_ngrams_norm'])
        super(FastText, self).save(*args, **kwargs)

    @classmethod
    def load_fasttext_format(cls, model_file, encoding='utf8'):
        """
        Load the input-hidden weight matrix from the fastText output files.

        Note that due to limitations in the FastText API, you cannot continue training
        with a model loaded this way, though you can query for word similarity etc.

        `model_file` is the path to the FastText output files.
        FastText outputs two model files - `/path/to/model.vec` and `/path/to/model.bin`.
        Expected value for this example: `/path/to/model` or `/path/to/model.bin`,
        as gensim requires only the `.bin` file to load the entire fastText model.
        """
        model = cls()
        if not model_file.endswith('.bin'):
            model_file += '.bin'
        model.file_name = model_file
        model.load_binary_data(encoding=encoding)
        return model

    @classmethod
    def load(cls, *args, **kwargs):
        model = super(FastText, cls).load(*args, **kwargs)
        if hasattr(model.wv, 'syn0_all'):
            # presumably a model saved by an older version: map its `syn0_all`
            # attribute onto the current `syn0_ngrams` name
            setattr(model.wv, 'syn0_ngrams', model.wv.syn0_all)
            delattr(model.wv, 'syn0_all')
        return model

    @classmethod
    def delete_training_files(cls, model_file):
        """Deletes the files created by FastText training."""
        try:
            os.remove('%s.vec' % model_file)
            os.remove('%s.bin' % model_file)
        except FileNotFoundError:
            logger.debug('Training files %s not found when attempting to delete', model_file)
            pass

    def load_binary_data(self, encoding='utf8'):
        """Loads data from the output binary file created by FastText training."""
        # TODO use smart_open again when https://github.com/RaRe-Technologies/smart_open/issues/207 will be fixed
        with open(self.file_name, 'rb') as f:
            self.load_model_params(f)
            self.load_dict(f, encoding=encoding)
            self.load_vectors(f)

    def load_model_params(self, file_handle):
        magic, version = self.struct_unpack(file_handle, '@2i')
        if magic == FASTTEXT_FILEFORMAT_MAGIC:  # newer format
            self.new_format = True
            dim, ws, epoch, min_count, neg, _, loss, model, bucket, minn, maxn, _, t = \
                self.struct_unpack(file_handle, '@12i1d')
        else:  # older format
            self.new_format = False
            dim = magic
            ws = version
            epoch, min_count, neg, _, loss, model, bucket, minn, maxn, _, t = \
                self.struct_unpack(file_handle, '@10i1d')
        # Parameters stored by [Args::save](https://github.com/facebookresearch/fastText/blob/master/src/args.cc)
        self.vector_size = dim
        self.window = ws
        self.iter = epoch
        self.min_count = min_count
        self.negative = neg
        self.hs = loss == 1
        self.sg = model == 2
        self.bucket = bucket
        self.wv.min_n = minn
        self.wv.max_n = maxn
        self.sample = t

    def load_dict(self, file_handle, encoding='utf8'):
        vocab_size, nwords, nlabels = self.struct_unpack(file_handle, '@3i')
        # Vocab stored by [Dictionary::save](https://github.com/facebookresearch/fastText/blob/master/src/dictionary.cc)
        if nlabels > 0:
            raise NotImplementedError("Supervised fastText models are not supported")
        logger.info("loading %s words for fastText model from %s", vocab_size, self.file_name)
        self.struct_unpack(file_handle, '@1q')  # number of tokens
        if self.new_format:
            pruneidx_size, = self.struct_unpack(file_handle, '@q')
        for i in range(vocab_size):
            word_bytes = b''
            char_byte = file_handle.read(1)
            # Read vocab word
            while char_byte != b'\x00':
                word_bytes += char_byte
                char_byte = file_handle.read(1)
            word = word_bytes.decode(encoding)
            count, _ = self.struct_unpack(file_handle, '@qb')
            self.wv.vocab[word] = Vocab(index=i, count=count)
            self.wv.index2word.append(word)
        assert len(self.wv.vocab) == nwords, (
            'mismatch between final vocab size ({} words), '
            'and expected number of words ({} words)'.format(len(self.wv.vocab), nwords))
        if len(self.wv.vocab) != vocab_size:
            # expecting to log this warning only for pretrained french vector, wiki.fr
            logger.warning(
                "mismatch between final vocab size (%s words), and expected vocab size (%s words)",
                len(self.wv.vocab), vocab_size
            )
        if self.new_format:
            for j in range(pruneidx_size):
                self.struct_unpack(file_handle, '@2i')

    def load_vectors(self, file_handle):
        if self.new_format:
            self.struct_unpack(file_handle, '@?')  # bool quant_input in fasttext.cc
        num_vectors, dim = self.struct_unpack(file_handle, '@2q')
        # Vectors stored by [Matrix::save](https://github.com/facebookresearch/fastText/blob/master/src/matrix.cc)
        assert self.vector_size == dim, (
            'mismatch between vector size in model params ({}) and model vectors ({})'
            .format(self.vector_size, dim)
        )
        float_size = struct.calcsize('@f')
        if float_size == 4:
            dtype = np.dtype(np.float32)
        elif float_size == 8:
            dtype = np.dtype(np.float64)
        self.num_original_vectors = num_vectors
        self.wv.syn0_ngrams = np.fromfile(file_handle, dtype=dtype, count=num_vectors * dim)
        self.wv.syn0_ngrams = self.wv.syn0_ngrams.reshape((num_vectors, dim))
        assert self.wv.syn0_ngrams.shape == (self.bucket + len(self.wv.vocab), self.vector_size), \
            'mismatch between actual weight matrix shape {} and expected shape {}' \
            .format(
                self.wv.syn0_ngrams.shape, (self.bucket + len(self.wv.vocab), self.vector_size)
            )
        self.init_ngrams()

    def struct_unpack(self, file_handle, fmt):
        num_bytes = struct.calcsize(fmt)
        return struct.unpack(fmt, file_handle.read(num_bytes))

    def init_ngrams(self):
        """
        Computes ngrams of all words present in vocabulary and stores vectors for only those ngrams.
        Vectors for other ngrams are initialized with a random uniform distribution in FastText. These
        vectors are discarded here to save space.
        """
        self.wv.ngrams = {}
        all_ngrams = []
        self.wv.syn0 = np.zeros((len(self.wv.vocab), self.vector_size), dtype=REAL)
        for w, vocab in self.wv.vocab.items():
            all_ngrams += compute_ngrams(w, self.wv.min_n, self.wv.max_n)
            self.wv.syn0[vocab.index] += np.array(self.wv.syn0_ngrams[vocab.index])
        all_ngrams = set(all_ngrams)
        self.num_ngram_vectors = len(all_ngrams)
        ngram_indices = []
        for i, ngram in enumerate(all_ngrams):
            ngram_hash = ft_hash(ngram)
            # ngram vectors live after the len(vocab) word vectors in the full matrix,
            # at row len(vocab) + (hash of the ngram modulo the number of buckets)
            ngram_indices.append(len(self.wv.vocab) + ngram_hash % self.bucket)
            self.wv.ngrams[ngram] = i
        # keep only the rows for ngrams that actually occur in the vocabulary
        self.wv.syn0_ngrams = self.wv.syn0_ngrams.take(ngram_indices, axis=0)
        ngram_weights = self.wv.syn0_ngrams
        logger.info(
            "loading weights for %s words for fastText model from %s",
            len(self.wv.vocab), self.file_name
        )
        for w, vocab in self.wv.vocab.items():
            word_ngrams = compute_ngrams(w, self.wv.min_n, self.wv.max_n)
            for word_ngram in word_ngrams:
                self.wv.syn0[vocab.index] += np.array(ngram_weights[self.wv.ngrams[word_ngram]])
            # average the word's own vector together with all of its ngram vectors
            self.wv.syn0[vocab.index] /= (len(word_ngrams) + 1)
        logger.info(
            "loaded %s weight matrix for fastText model from %s",
            self.wv.syn0.shape, self.file_name
        )


def compute_ngrams(word, min_n, max_n):
    BOW, EOW = ('<', '>')  # Used by FastText to attach to all words as prefix and suffix
    extended_word = BOW + word + EOW
    ngrams = []
    for ngram_length in range(min_n, min(len(extended_word), max_n) + 1):
        for i in range(0, len(extended_word) - ngram_length + 1):
            ngrams.append(extended_word[i:i + ngram_length])
    return ngrams
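# For instance, compute_ngrams('word', 3, 6) wraps the word as '<word>' and returns
# ['<wo', 'wor', 'ord', 'rd>', '<wor', 'word', 'ord>', '<word', 'word>', '<word>'].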


def ft_hash(string):
    """
    Reproduces [hash method](https://github.com/facebookresearch/fastText/blob/master/src/dictionary.cc)
    used in fastText.
    """
    # Runtime warnings for integer overflow are raised, this is expected behaviour. These warnings are suppressed.
    old_settings = np.seterr(all='ignore')
    h = np.uint32(2166136261)  # 32-bit FNV-1a offset basis
    for c in string:
        h = h ^ np.uint32(ord(c))
        h = h * np.uint32(16777619)  # 32-bit FNV prime
    np.seterr(**old_settings)
    return h
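
# Illustrative end-to-end usage of this module (paths are placeholders):
#
#     >>> from gensim.models.deprecated.fasttext_wrapper import FastText
#     >>> model = FastText.load_fasttext_format('/path/to/model')  # reads /path/to/model.bin
#     >>> 'forests' in model.wv          # True if the word or any of its char ngrams is known
#     >>> model.wv.word_vec('forests')   # works for out-of-vocabulary words too, via char ngrams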