#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Author: Shiva Manne <s.manne@rare-technologies.com>
# Copyright (C) 2018 RaRe Technologies s.r.o.

"""General functions used for any2vec models."""

import logging
import numpy as np
from gensim import utils

from numpy import zeros, dtype, float32 as REAL, ascontiguousarray, fromstring

from six.moves import xrange
from six import iteritems

logger = logging.getLogger(__name__)

try:
    from gensim.models._utils_any2vec import ft_hash as _ft_hash, compute_ngrams as _compute_ngrams
except ImportError:
    FAST_VERSION = -1

    # failed... fall back to plain python
    def _ft_hash(string):
        """Calculate hash based on `string`.
        Reproduce `hash method from Facebook fastText implementation
        <https://github.com/facebookresearch/fastText/blob/master/src/dictionary.cc>`_.

        Parameters
        ----------
        string : str
            The string whose hash needs to be calculated.

        Returns
        -------
        int
            The hash of the string.
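
        Examples
        --------
        Doctest-style sanity check (the expected value is the standard 32-bit
        FNV-1a hash of ``'a'``, which this fallback reproduces):

        >>> int(_ft_hash('a'))
        3826002220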

        """
        # Integer overflow is expected behaviour here, so the runtime warnings it raises are suppressed.
        old_settings = np.seterr(all='ignore')
        h = np.uint32(2166136261)  # FNV-1a 32-bit offset basis
        for c in string:
            h = h ^ np.uint32(ord(c))
            h = h * np.uint32(16777619)  # FNV-1a 32-bit prime
        np.seterr(**old_settings)
        return h

    def _compute_ngrams(word, min_n, max_n):
        """Get the list of all possible ngrams for a given word.

        Parameters
        ----------
        word : str
            The word whose ngrams need to be computed.
        min_n : int
            Minimum character length of the ngrams.
        max_n : int
            Maximum character length of the ngrams.

        Returns
        -------
        list of str
            Sequence of character ngrams.
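
        Examples
        --------
        For example (fastText's own defaults are ``min_n=3, max_n=6``; a shorter
        range is used here for brevity):

        >>> _compute_ngrams('word', 3, 4)
        ['<wo', 'wor', 'ord', 'rd>', '<wor', 'word', 'ord>']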

        """
        BOW, EOW = ('<', '>')  # Used by FastText to attach to all words as prefix and suffix
        extended_word = BOW + word + EOW
        ngrams = []
        for ngram_length in range(min_n, min(len(extended_word), max_n) + 1):
            for i in range(0, len(extended_word) - ngram_length + 1):
                ngrams.append(extended_word[i:i + ngram_length])
        return ngrams


def _save_word2vec_format(fname, vocab, vectors, fvocab=None, binary=False, total_vec=None):
    """Store the input-hidden weight matrix in the same format used by the original
    C word2vec-tool, for compatibility.

    Parameters
    ----------
    fname : str
        The file path used to save the vectors in.
    vocab : dict
        The vocabulary of words.
    vectors : numpy.array
        The vectors to be stored.
    fvocab : str, optional
        File path used to save the vocabulary.
    binary : bool, optional
        If True, the data will be saved in binary word2vec format, else it will be saved in plain text.
    total_vec : int, optional
        Explicitly specify total number of vectors
        (in case word vectors are appended with document vectors afterwards).
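
    Examples
    --------
    A minimal sketch; the two-word vocabulary and the output path below are made
    up for illustration:

    >>> from gensim.models.keyedvectors import Vocab
    >>> vocab = {'night': Vocab(index=0, count=2), 'day': Vocab(index=1, count=1)}
    >>> vectors = np.ones((2, 5), dtype=REAL)
    >>> _save_word2vec_format('/tmp/vectors.txt', vocab, vectors, binary=False)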

    """
    if not (vocab or vectors):
        raise RuntimeError("no input")
    if total_vec is None:
        total_vec = len(vocab)
    vector_size = vectors.shape[1]
    if fvocab is not None:
        logger.info("storing vocabulary in %s", fvocab)
        with utils.smart_open(fvocab, 'wb') as vout:
            for word, vocab_ in sorted(iteritems(vocab), key=lambda item: -item[1].count):
                vout.write(utils.to_utf8("%s %s\n" % (word, vocab_.count)))
    logger.info("storing %sx%s projection weights into %s", total_vec, vector_size, fname)
    assert (len(vocab), vector_size) == vectors.shape
    with utils.smart_open(fname, 'wb') as fout:
        fout.write(utils.to_utf8("%s %s\n" % (total_vec, vector_size)))
        # store in sorted order: most frequent words at the top
        for word, vocab_ in sorted(iteritems(vocab), key=lambda item: -item[1].count):
            row = vectors[vocab_.index]
            if binary:
                row = row.astype(REAL)
                fout.write(utils.to_utf8(word) + b" " + row.tostring())
            else:
                fout.write(utils.to_utf8("%s %s\n" % (word, ' '.join(repr(val) for val in row))))


def _load_word2vec_format(cls, fname, fvocab=None, binary=False, encoding='utf8', unicode_errors='strict',
                          limit=None, datatype=REAL):
    """Load the input-hidden weight matrix from the original C word2vec-tool format.

    Note that the information stored in the file is incomplete (the binary tree is missing),
    so while you can query for word similarity etc., you cannot continue training
    with a model loaded this way.

    Parameters
    ----------
    fname : str
        The file path to the saved word2vec-format file.
    fvocab : str, optional
        File path to the vocabulary. Word counts are read from `fvocab`, if set
        (this is the file generated by the `-save-vocab` flag of the original C tool).
    binary : bool, optional
        If True, the data is in binary word2vec format, else it is in plain text.
    encoding : str, optional
        If you trained the C model using a non-utf8 encoding for words, specify that encoding in `encoding`.
    unicode_errors : str, optional
        Default 'strict'; a string suitable to be passed as the `errors`
        argument to the unicode() (Python 2.x) or str() (Python 3.x) function. If your source
        file may include word tokens truncated in the middle of a multibyte unicode character
        (as is common from the original word2vec.c tool), 'ignore' or 'replace' may help.
    limit : int, optional
        Sets a maximum number of word-vectors to read from the file. The default,
        None, means read all.
    datatype : type, optional
        (Experimental) Can coerce dimensions to a non-default float type (such as `np.float16`) to save memory.
        Such types may result in much slower bulk operations or incompatibility with optimized routines.

    Returns
    -------
    object
        Returns the loaded model as an instance of :class:`cls`.
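
    Examples
    --------
    An illustrative call; the class and the file path below are placeholder
    assumptions, not prescribed usage:

    >>> from gensim.models.keyedvectors import Word2VecKeyedVectors
    >>> kv = _load_word2vec_format(Word2VecKeyedVectors, '/tmp/vectors.txt', binary=False)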

    """
    from gensim.models.keyedvectors import Vocab
    counts = None
    if fvocab is not None:
        logger.info("loading word counts from %s", fvocab)
        counts = {}
        with utils.smart_open(fvocab) as fin:
            for line in fin:
                word, count = utils.to_unicode(line).strip().split()
                counts[word] = int(count)

    logger.info("loading projection weights from %s", fname)
    with utils.smart_open(fname) as fin:
        header = utils.to_unicode(fin.readline(), encoding=encoding)
        vocab_size, vector_size = (int(x) for x in header.split())  # throws for invalid file format
        if limit:
            vocab_size = min(vocab_size, limit)
        result = cls(vector_size)
        result.vector_size = vector_size
        result.vectors = zeros((vocab_size, vector_size), dtype=datatype)

        def add_word(word, weights):
            word_id = len(result.vocab)
            if word in result.vocab:
                logger.warning("duplicate word '%s' in %s, ignoring all but first", word, fname)
                return
            if counts is None:
                # most common scenario: no vocab file given. just make up some bogus counts, in descending order
                result.vocab[word] = Vocab(index=word_id, count=vocab_size - word_id)
            elif word in counts:
                # use count from the vocab file
                result.vocab[word] = Vocab(index=word_id, count=counts[word])
            else:
                # vocab file given, but word is missing -- set count to None (TODO: or raise?)
                logger.warning("vocabulary file is incomplete: '%s' is missing", word)
                result.vocab[word] = Vocab(index=word_id, count=None)
            result.vectors[word_id] = weights
            result.index2word.append(word)
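
        # Note: each entry in the binary format is the word as plain text, a single
        # space, then `vector_size` float32 values as raw bytes; the loop below parses
        # exactly that layout, reading the word one character at a time.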
        if binary:
            binary_len = dtype(REAL).itemsize * vector_size
            for _ in xrange(vocab_size):
                # mixed text and binary: read text first, then binary
                word = []
                while True:
                    ch = fin.read(1)
                    if ch == b' ':
                        break
                    if ch == b'':
                        raise EOFError("unexpected end of input; is count incorrect or file otherwise damaged?")
                    if ch != b'\n':  # ignore newlines in front of words (some binary files have them)
                        word.append(ch)
                word = utils.to_unicode(b''.join(word), encoding=encoding, errors=unicode_errors)
                weights = fromstring(fin.read(binary_len), dtype=REAL).astype(datatype)
                add_word(word, weights)
        else:
            for line_no in xrange(vocab_size):
                line = fin.readline()
                if line == b'':
                    raise EOFError("unexpected end of input; is count incorrect or file otherwise damaged?")
                parts = utils.to_unicode(line.rstrip(), encoding=encoding, errors=unicode_errors).split(" ")
                if len(parts) != vector_size + 1:
                    raise ValueError("invalid vector on line %s (is this really the text format?)" % line_no)
                word, weights = parts[0], [datatype(x) for x in parts[1:]]
                add_word(word, weights)
    if result.vectors.shape[0] != len(result.vocab):
        logger.info(
            "duplicate words detected, shrinking matrix size from %i to %i",
            result.vectors.shape[0], len(result.vocab)
        )
        result.vectors = ascontiguousarray(result.vectors[: len(result.vocab)])
    assert (len(result.vocab), vector_size) == result.vectors.shape

    logger.info("loaded %s matrix from %s", result.vectors.shape, fname)
    return result