#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Author: Jayant Jain <jayantjain1992@gmail.com>
# Copyright (C) 2017 Radim Rehurek <me@radimrehurek.com>
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html

"""
Warnings
--------
.. deprecated:: 3.2.0
    Use :mod:`gensim.models.fasttext` instead.

Python wrapper around word representation learning from FastText, a library for efficient learning
of word representations and sentence classification [1].

This module allows training a word embedding from a training corpus, with the additional ability
to obtain word vectors for out-of-vocabulary words, using the fastText C++ implementation.

The wrapped model can NOT be updated with new documents for online training -- use gensim's
`Word2Vec` for that.

Example:

>>> from gensim.models.wrappers import FastText
>>> model = FastText.train('/Users/kofola/fastText/fasttext', corpus_file='text8')
>>> print(model['forests'])  # prints vector for given out-of-vocabulary word

.. [1] https://github.com/facebookresearch/fastText#enriching-word-vectors-with-subword-information

"""

import logging
import tempfile
import os
import struct

import numpy as np
from numpy import float32 as REAL, sqrt, newaxis

from gensim import utils
from gensim.models.deprecated.keyedvectors import KeyedVectors, Vocab
from gensim.models.deprecated.word2vec import Word2Vec

logger = logging.getLogger(__name__)

try:
    FileNotFoundError
except NameError:
    FileNotFoundError = IOError

FASTTEXT_FILEFORMAT_MAGIC = 793712314


class FastTextKeyedVectors(KeyedVectors):
    """
    Class to contain vectors, vocab and ngrams for the FastText training class and other methods not directly
    involved in training, such as most_similar().

    Subclasses KeyedVectors to implement OOV lookups, storing ngrams and other FastText-specific methods.

    """

    def __init__(self):
        super(FastTextKeyedVectors, self).__init__()
        self.syn0_vocab = None
        self.syn0_vocab_norm = None
        self.syn0_ngrams = None
        self.syn0_ngrams_norm = None
        self.ngrams = {}
        self.hash2index = {}
        self.ngrams_word = {}
        self.min_n = 0
        self.max_n = 0

    def save(self, *args, **kwargs):
        # don't bother storing the cached normalized vectors
        kwargs['ignore'] = kwargs.get('ignore', ['syn0norm', 'syn0_vocab_norm', 'syn0_ngrams_norm'])
        super(FastTextKeyedVectors, self).save(*args, **kwargs)

    def word_vec(self, word, use_norm=False):
        """
        Accept a single word as input.
        Returns the word's representation in vector space, as a 1D numpy array.

        The word can be out-of-vocabulary as long as ngrams for the word are present.
        For words with all ngrams absent, a KeyError is raised.

        Example::

            >>> trained_model['office']
            array([ -1.40128313e-02, ...])

        """
        if word in self.vocab:
            return super(FastTextKeyedVectors, self).word_vec(word, use_norm)
        else:
            word_vec = np.zeros(self.syn0_ngrams.shape[1], dtype=np.float32)
            ngrams = compute_ngrams(word, self.min_n, self.max_n)
            ngrams = [ng for ng in ngrams if ng in self.ngrams]
            if use_norm:
                ngram_weights = self.syn0_ngrams_norm
            else:
                ngram_weights = self.syn0_ngrams
            for ngram in ngrams:
                word_vec += ngram_weights[self.ngrams[ngram]]
            if word_vec.any():
                return word_vec / len(ngrams)
            else:  # No ngrams of the word are present in self.ngrams
                raise KeyError('all ngrams for word %s absent from model' % word)
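
    # A minimal sketch of the OOV branch above, assuming a `trained_model` whose ngram
    # inventory covers the query word (the word and the printed values are hypothetical):
    #
    #     >>> 'forestry' in trained_model.vocab
    #     False
    #     >>> trained_model.word_vec('forestry')  # average of the vectors of known char ngrams of '<forestry>'
    #     array([ 0.0221, -0.0134, ...], dtype=float32)
    #
    # If none of the word's ngrams are present in the model, the lookup raises KeyError instead.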

    def init_sims(self, replace=False):
        """
        Precompute L2-normalized vectors.

        If `replace` is set, forget the original vectors and only keep the normalized
        ones (saves lots of memory!).

        Note that you **cannot continue training** after doing a replace. The model becomes
        effectively read-only: you can only call `most_similar`, `similarity` etc.

        """
        super(FastTextKeyedVectors, self).init_sims(replace)
        if getattr(self, 'syn0_ngrams_norm', None) is None or replace:
            logger.info("precomputing L2-norms of ngram weight vectors")
            if replace:
                for i in range(self.syn0_ngrams.shape[0]):
                    self.syn0_ngrams[i, :] /= sqrt((self.syn0_ngrams[i, :] ** 2).sum(-1))
                self.syn0_ngrams_norm = self.syn0_ngrams
            else:
                self.syn0_ngrams_norm = \
                    (self.syn0_ngrams / sqrt((self.syn0_ngrams ** 2).sum(-1))[..., newaxis]).astype(REAL)

    def __contains__(self, word):
        """
        Check if `word` or any character ngrams in `word` are present in the vocabulary.
        A vector for the word is guaranteed to exist if `__contains__` returns True.

        """
        if word in self.vocab:
            return True
        else:
            char_ngrams = compute_ngrams(word, self.min_n, self.max_n)
            return any(ng in self.ngrams for ng in char_ngrams)

    @classmethod
    def load_word2vec_format(cls, *args, **kwargs):
        """Not supported. Use gensim.models.KeyedVectors.load_word2vec_format instead."""
        raise NotImplementedError("Not supported. Use gensim.models.KeyedVectors.load_word2vec_format instead.")


class FastText(Word2Vec):
    """
    Class for word vector training using FastText. Communication between FastText and Python
    takes place by working with data files on disk and calling the FastText binary with
    subprocess.call().
    Implements functionality similar to [fasttext.py](https://github.com/salestock/fastText.py),
    improving speed and scope of functionality like `most_similar`, `similarity` by extracting the vectors
    into a numpy matrix.

    Warnings
    --------
    .. deprecated:: 3.2.0
        Use :class:`gensim.models.fasttext.FastText` instead of :class:`gensim.models.wrappers.fasttext.FastText`.

    """

    def initialize_word_vectors(self):
        self.wv = FastTextKeyedVectors()

    @classmethod
    def train(cls, ft_path, corpus_file, output_file=None, model='cbow', size=100, alpha=0.025, window=5, min_count=5,
              word_ngrams=1, loss='ns', sample=1e-3, negative=5, iter=5, min_n=3, max_n=6, sorted_vocab=1, threads=12):
        """
        `ft_path` is the path to the FastText executable, e.g. `/home/kofola/fastText/fasttext`.

        `corpus_file` is the filename of the text file to be used for training the FastText model.
        Expects file to contain utf-8 encoded text.

        `model` defines the training algorithm. By default, cbow is used. Accepted values are
        'cbow', 'skipgram'.

        `size` is the dimensionality of the feature vectors.

        `window` is the maximum distance between the current and predicted word within a sentence.

        `alpha` is the initial learning rate.

        `min_count` = ignore all words with total occurrences lower than this.

        `word_ngrams` = max length of word ngram.

        `loss` = defines the training objective. Allowed values are `hs` (hierarchical softmax),
        `ns` (negative sampling) and `softmax`. Defaults to `ns`.

        `sample` = threshold for configuring which higher-frequency words are randomly downsampled;
        default is 1e-3, useful range is (0, 1e-5).

        `negative` = the value for negative specifies how many "noise words" should be drawn
        (usually between 5-20). Default is 5. If set to 0, no negative sampling is used.
        Only relevant when `loss` is set to `ns`.

        `iter` = number of iterations (epochs) over the corpus. Default is 5.

        `min_n` = min length of char ngrams to be used for training word representations. Default is 3.

        `max_n` = max length of char ngrams to be used for training word representations. Set `max_n` to be
        less than `min_n` to avoid char ngrams being used. Default is 6.

        `sorted_vocab` = if 1 (default), sort the vocabulary by descending frequency before
        assigning word indexes.

        `threads` = number of threads to use. Default is 12.

        """
        output_file = output_file or os.path.join(tempfile.gettempdir(), 'ft_model')
        ft_args = {
            'input': corpus_file,
            'output': output_file,
            'lr': alpha,
            'dim': size,
            'ws': window,
            'epoch': iter,
            'minCount': min_count,
            'wordNgrams': word_ngrams,
            'neg': negative,
            'loss': loss,
            'minn': min_n,
            'maxn': max_n,
            'thread': threads,
            't': sample
        }
        cmd = [ft_path, model]
        for option, value in ft_args.items():
            cmd.append("-%s" % option)
            cmd.append(str(value))

        utils.check_output(args=cmd)
        model = cls.load_fasttext_format(output_file)
        cls.delete_training_files(output_file)
        return model
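
    # Illustration only: with the defaults above, the command assembled by `train` would look
    # roughly like the following (flag order follows dict iteration order; paths are hypothetical):
    #
    #     /home/kofola/fastText/fasttext cbow -input text8 -output /tmp/ft_model \
    #         -lr 0.025 -dim 100 -ws 5 -epoch 5 -minCount 5 -wordNgrams 1 -neg 5 \
    #         -loss ns -minn 3 -maxn 6 -thread 12 -t 0.001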

    def save(self, *args, **kwargs):
        # don't bother storing the cached normalized vectors
        kwargs['ignore'] = kwargs.get('ignore', ['syn0norm', 'syn0_vocab_norm', 'syn0_ngrams_norm'])
        super(FastText, self).save(*args, **kwargs)

    @classmethod
    def load_fasttext_format(cls, model_file, encoding='utf8'):
        """
        Load the input-hidden weight matrix from the fastText output files.

        Note that due to limitations in the FastText API, you cannot continue training
        with a model loaded this way, though you can query for word similarity etc.

        `model_file` is the path to the FastText output files.
        FastText outputs two model files - `/path/to/model.vec` and `/path/to/model.bin`.

        Expected value for this example: `/path/to/model` or `/path/to/model.bin`,
        as gensim requires only the `.bin` file to load the entire fastText model.

        """
        model = cls()
        if not model_file.endswith('.bin'):
            model_file += '.bin'
        model.file_name = model_file
        model.load_binary_data(encoding=encoding)
        return model
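
    # A hedged usage sketch (the path is hypothetical; only the .bin file is actually read):
    #
    #     >>> model = FastText.load_fasttext_format('/path/to/wiki.en')  # resolves to /path/to/wiki.en.bin
    #     >>> model.wv.most_similar('forest')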

    @classmethod
    def load(cls, *args, **kwargs):
        model = super(FastText, cls).load(*args, **kwargs)
        if hasattr(model.wv, 'syn0_all'):
            setattr(model.wv, 'syn0_ngrams', model.wv.syn0_all)
            delattr(model.wv, 'syn0_all')
        return model

    @classmethod
    def delete_training_files(cls, model_file):
        """Deletes the files created by FastText training."""
        try:
            os.remove('%s.vec' % model_file)
            os.remove('%s.bin' % model_file)
        except FileNotFoundError:
            logger.debug('Training files %s not found when attempting to delete', model_file)

    def load_binary_data(self, encoding='utf8'):
        """Loads data from the output binary file created by FastText training."""

        # TODO use smart_open again when https://github.com/RaRe-Technologies/smart_open/issues/207 is fixed
        with open(self.file_name, 'rb') as f:
            self.load_model_params(f)
            self.load_dict(f, encoding=encoding)
            self.load_vectors(f)

    def load_model_params(self, file_handle):
        magic, version = self.struct_unpack(file_handle, '@2i')
        if magic == FASTTEXT_FILEFORMAT_MAGIC:  # newer format
            self.new_format = True
            dim, ws, epoch, min_count, neg, _, loss, model, bucket, minn, maxn, _, t = \
                self.struct_unpack(file_handle, '@12i1d')
        else:  # older format
            self.new_format = False
            dim = magic
            ws = version
            epoch, min_count, neg, _, loss, model, bucket, minn, maxn, _, t = self.struct_unpack(file_handle, '@10i1d')
        # Parameters stored by [Args::save](https://github.com/facebookresearch/fastText/blob/master/src/args.cc)
        self.vector_size = dim
        self.window = ws
        self.iter = epoch
        self.min_count = min_count
        self.negative = neg
        self.hs = loss == 1
        self.sg = model == 2
        self.bucket = bucket
        self.wv.min_n = minn
        self.wv.max_n = maxn
        self.sample = t

    def load_dict(self, file_handle, encoding='utf8'):
        vocab_size, nwords, nlabels = self.struct_unpack(file_handle, '@3i')
        # Vocab stored by [Dictionary::save](https://github.com/facebookresearch/fastText/blob/master/src/dictionary.cc)
        if nlabels > 0:
            raise NotImplementedError("Supervised fastText models are not supported")
        logger.info("loading %s words for fastText model from %s", vocab_size, self.file_name)

        self.struct_unpack(file_handle, '@1q')  # number of tokens
        if self.new_format:
            pruneidx_size, = self.struct_unpack(file_handle, '@q')
        for i in range(vocab_size):
            word_bytes = b''
            char_byte = file_handle.read(1)
            # Read vocab word
            while char_byte != b'\x00':
                word_bytes += char_byte
                char_byte = file_handle.read(1)
            word = word_bytes.decode(encoding)
            count, _ = self.struct_unpack(file_handle, '@qb')

            self.wv.vocab[word] = Vocab(index=i, count=count)
            self.wv.index2word.append(word)

        assert len(self.wv.vocab) == nwords, (
            'mismatch between final vocab size ({} words), '
            'and expected number of words ({} words)'.format(len(self.wv.vocab), nwords))
        if len(self.wv.vocab) != vocab_size:
            # expecting to log this warning only for pretrained French vectors, wiki.fr
            logger.warning(
                "mismatch between final vocab size (%s words), and expected vocab size (%s words)",
                len(self.wv.vocab), vocab_size
            )

        if self.new_format:
            for j in range(pruneidx_size):
                self.struct_unpack(file_handle, '@2i')

    def load_vectors(self, file_handle):
        if self.new_format:
            self.struct_unpack(file_handle, '@?')  # bool quant_input in fasttext.cc
        num_vectors, dim = self.struct_unpack(file_handle, '@2q')
        # Vectors stored by [Matrix::save](https://github.com/facebookresearch/fastText/blob/master/src/matrix.cc)
        assert self.vector_size == dim, (
            'mismatch between vector size in model params ({}) and model vectors ({})'
            .format(self.vector_size, dim)
        )
        float_size = struct.calcsize('@f')
        if float_size == 4:
            dtype = np.dtype(np.float32)
        elif float_size == 8:
            dtype = np.dtype(np.float64)

        self.num_original_vectors = num_vectors
        self.wv.syn0_ngrams = np.fromfile(file_handle, dtype=dtype, count=num_vectors * dim)
        self.wv.syn0_ngrams = self.wv.syn0_ngrams.reshape((num_vectors, dim))
        assert self.wv.syn0_ngrams.shape == (self.bucket + len(self.wv.vocab), self.vector_size), \
            'mismatch between actual weight matrix shape {} and expected shape {}'\
            .format(
                self.wv.syn0_ngrams.shape, (self.bucket + len(self.wv.vocab), self.vector_size)
            )

        self.init_ngrams()

    def struct_unpack(self, file_handle, fmt):
        num_bytes = struct.calcsize(fmt)
        return struct.unpack(fmt, file_handle.read(num_bytes))
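
    # For example, struct_unpack(f, '@2i') reads two native C ints (typically 32-bit) and
    # struct_unpack(f, '@1q') reads one 64-bit int, mirroring the types written by fastText's
    # save routines; struct.calcsize determines how many bytes to consume from the file handle.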

    def init_ngrams(self):
        """
        Computes ngrams of all words present in vocabulary and stores vectors for only those ngrams.
        Vectors for other ngrams are initialized with a random uniform distribution in FastText. These
        vectors are discarded here to save space.

        """
        self.wv.ngrams = {}
        all_ngrams = []
        self.wv.syn0 = np.zeros((len(self.wv.vocab), self.vector_size), dtype=REAL)

        for w, vocab in self.wv.vocab.items():
            all_ngrams += compute_ngrams(w, self.wv.min_n, self.wv.max_n)
            self.wv.syn0[vocab.index] += np.array(self.wv.syn0_ngrams[vocab.index])

        all_ngrams = set(all_ngrams)
        self.num_ngram_vectors = len(all_ngrams)
        ngram_indices = []
        for i, ngram in enumerate(all_ngrams):
            ngram_hash = ft_hash(ngram)
            ngram_indices.append(len(self.wv.vocab) + ngram_hash % self.bucket)
            self.wv.ngrams[ngram] = i
        self.wv.syn0_ngrams = self.wv.syn0_ngrams.take(ngram_indices, axis=0)

        ngram_weights = self.wv.syn0_ngrams

        logger.info(
            "loading weights for %s words for fastText model from %s",
            len(self.wv.vocab), self.file_name
        )

        for w, vocab in self.wv.vocab.items():
            word_ngrams = compute_ngrams(w, self.wv.min_n, self.wv.max_n)
            for word_ngram in word_ngrams:
                self.wv.syn0[vocab.index] += np.array(ngram_weights[self.wv.ngrams[word_ngram]])

            self.wv.syn0[vocab.index] /= (len(word_ngrams) + 1)
        logger.info(
            "loaded %s weight matrix for fastText model from %s",
            self.wv.syn0.shape, self.file_name
        )


def compute_ngrams(word, min_n, max_n):
    BOW, EOW = ('<', '>')  # Used by FastText to attach to all words as prefix and suffix
    extended_word = BOW + word + EOW
    ngrams = []
    for ngram_length in range(min_n, min(len(extended_word), max_n) + 1):
        for i in range(0, len(extended_word) - ngram_length + 1):
            ngrams.append(extended_word[i:i + ngram_length])
    return ngrams
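
# Illustrative values only: with min_n=3, max_n=4 the word 'word' is padded to '<word>', and
# compute_ngrams('word', 3, 4) returns
#     ['<wo', 'wor', 'ord', 'rd>', '<wor', 'word', 'ord>']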


def ft_hash(string):
    """
    Reproduces [hash method](https://github.com/facebookresearch/fastText/blob/master/src/dictionary.cc)
    used in fastText.

    """
    # Runtime warnings for integer overflow are raised, this is expected behaviour. These warnings are suppressed.
    old_settings = np.seterr(all='ignore')
    h = np.uint32(2166136261)
    for c in string:
        h = h ^ np.uint32(ord(c))
        h = h * np.uint32(16777619)
    np.seterr(**old_settings)
    return h
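
# A rough sketch of how this hash is used by FastText.init_ngrams above (the bucket size is
# hypothetical; it is normally read from the model's .bin header):
#
#     >>> bucket = 2000000
#     >>> row = len(model.wv.vocab) + ft_hash('<wo') % bucket
#     >>> # `row` indexes the raw syn0_ngrams matrix loaded by load_vectors()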