#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (C) 2016 Radim Rehurek <me@radimrehurek.com>
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html

"""
Warnings
--------
.. deprecated:: 3.3.0
   Use :mod:`gensim.models.keyedvectors` instead.


Word vector storage and similarity look-ups.
Common code independent of the way the vectors are trained (Word2Vec, FastText, WordRank, VarEmbed etc.)

The word vectors are considered read-only in this class.

Initialize the vectors by training e.g. Word2Vec::

>>> model = Word2Vec(sentences, size=100, window=5, min_count=5, workers=4)
>>> word_vectors = model.wv

Persist the word vectors to disk with::

>>> word_vectors.save(fname)
>>> word_vectors = KeyedVectors.load(fname)

The vectors can also be instantiated from an existing file on disk
in Google's original word2vec C format as a KeyedVectors instance::

>>> from gensim.models.keyedvectors import KeyedVectors
>>> word_vectors = KeyedVectors.load_word2vec_format('/tmp/vectors.txt', binary=False)  # C text format
>>> word_vectors = KeyedVectors.load_word2vec_format('/tmp/vectors.bin', binary=True)  # C binary format

You can perform various syntactic/semantic NLP word tasks with the vectors. Some of them
are already built-in::

>>> word_vectors.most_similar(positive=['woman', 'king'], negative=['man'])
[('queen', 0.50882536), ...]

>>> word_vectors.most_similar_cosmul(positive=['woman', 'king'], negative=['man'])
[('queen', 0.71382287), ...]

>>> word_vectors.doesnt_match("breakfast cereal dinner lunch".split())
'cereal'

>>> word_vectors.similarity('woman', 'man')
0.73723527

Correlation with human opinion on word similarity::

>>> word_vectors.evaluate_word_pairs(os.path.join(module_path, 'test_data', 'wordsim353.tsv'))
0.51, 0.62, 0.13

And on analogies::

>>> word_vectors.accuracy(os.path.join(module_path, 'test_data', 'questions-words.txt'))

and so on.

"""
from __future__ import division  # py3 "true division"

import logging

try:
    from queue import Queue, Empty
except ImportError:
    from Queue import Queue, Empty  # noqa:F401

# If the pyemd C extension is available, import it.
# If pyemd is needed but not installed, an ImportError will be raised in wmdistance.
try:
    from pyemd import emd
    PYEMD_EXT = True
except ImportError:
    PYEMD_EXT = False

from numpy import dot, zeros, dtype, float32 as REAL,\
    double, array, vstack, fromstring, sqrt, newaxis,\
    ndarray, sum as np_sum, prod, ascontiguousarray,\
    argmax
import numpy as np

from gensim import utils, matutils  # utility fnc for pickling, common scipy operations etc
from gensim.corpora.dictionary import Dictionary
from six import string_types, iteritems
from six.moves import xrange
from scipy import stats


logger = logging.getLogger(__name__)


class Vocab(object):
    """
    A single vocabulary item, used internally for collecting per-word frequency/sampling info,
    and for constructing binary trees (incl. both word leaves and inner nodes).

    """

    def __init__(self, **kwargs):
        self.count = 0
        self.__dict__.update(kwargs)

    def __lt__(self, other):  # used for sorting in a priority queue
        return self.count < other.count

    def __str__(self):
        vals = ['%s:%r' % (key, self.__dict__[key]) for key in sorted(self.__dict__) if not key.startswith('_')]
        return "%s(%s)" % (self.__class__.__name__, ', '.join(vals))


class KeyedVectorsBase(utils.SaveLoad):
    """
    Base class to contain vectors and vocab for any set of vectors which are each associated with a key.

    """

    def __init__(self):
        self.syn0 = []
        self.vocab = {}
        self.index2word = []
        self.vector_size = None

    def save_word2vec_format(self, fname, fvocab=None, binary=False, total_vec=None):
        """
        Store the input-hidden weight matrix in the same format used by the original
        C word2vec-tool, for compatibility.

        `fname` is the file used to save the vectors in
        `fvocab` is an optional file used to save the vocabulary
        `binary` is an optional boolean indicating whether the data is to be saved
        in binary word2vec format (default: False)
        `total_vec` is an optional parameter to explicitly specify total no. of vectors
        (in case word vectors are appended with document vectors afterwards)

        """
        if total_vec is None:
            total_vec = len(self.vocab)
        vector_size = self.syn0.shape[1]
        if fvocab is not None:
            logger.info("storing vocabulary in %s", fvocab)
            with utils.smart_open(fvocab, 'wb') as vout:
                for word, vocab in sorted(iteritems(self.vocab), key=lambda item: -item[1].count):
                    vout.write(utils.to_utf8("%s %s\n" % (word, vocab.count)))
        logger.info("storing %sx%s projection weights into %s", total_vec, vector_size, fname)
        assert (len(self.vocab), vector_size) == self.syn0.shape
        with utils.smart_open(fname, 'wb') as fout:
            fout.write(utils.to_utf8("%s %s\n" % (total_vec, vector_size)))
            # store in sorted order: most frequent words at the top
            for word, vocab in sorted(iteritems(self.vocab), key=lambda item: -item[1].count):
                row = self.syn0[vocab.index]
                if binary:
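                    # binary mode mirrors the C tool's layout: the utf8-encoded word, one space,
                    # then the vector's raw bytes (float32 when syn0 is kept in the default REAL dtype)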
                    fout.write(utils.to_utf8(word) + b" " + row.tostring())
                else:
                    fout.write(utils.to_utf8("%s %s\n" % (word, ' '.join("%f" % val for val in row))))

    @classmethod
    def load_word2vec_format(cls, fname, fvocab=None, binary=False, encoding='utf8', unicode_errors='strict',
                             limit=None, datatype=REAL):
        """
        Load the input-hidden weight matrix from the original C word2vec-tool format.

        Note that the information stored in the file is incomplete (the binary tree is missing),
        so while you can query for word similarity etc., you cannot continue training
        with a model loaded this way.

        `binary` is a boolean indicating whether the data is in binary word2vec format.
        Word counts are read from `fvocab` filename, if set (this is the file generated
        by `-save-vocab` flag of the original C tool).

        If you trained the C model using non-utf8 encoding for words, specify that
        encoding in `encoding`.

        `unicode_errors`, default 'strict', is a string suitable to be passed as the `errors`
        argument to the unicode() (Python 2.x) or str() (Python 3.x) function. If your source
        file may include word tokens truncated in the middle of a multibyte unicode character
        (as is common from the original word2vec.c tool), 'ignore' or 'replace' may help.

        `limit` sets a maximum number of word-vectors to read from the file. The default,
        None, means read all.

        `datatype` (experimental) can coerce dimensions to a non-default float type (such
        as np.float16) to save memory. (Such types may result in much slower bulk operations
        or incompatibility with optimized routines.)

        """
        counts = None
        if fvocab is not None:
            logger.info("loading word counts from %s", fvocab)
            counts = {}
            with utils.smart_open(fvocab) as fin:
                for line in fin:
                    word, count = utils.to_unicode(line).strip().split()
                    counts[word] = int(count)

        logger.info("loading projection weights from %s", fname)
        with utils.smart_open(fname) as fin:
            header = utils.to_unicode(fin.readline(), encoding=encoding)
            vocab_size, vector_size = (int(x) for x in header.split())  # throws for invalid file format
            if limit:
                vocab_size = min(vocab_size, limit)
            result = cls()
            result.vector_size = vector_size
            result.syn0 = zeros((vocab_size, vector_size), dtype=datatype)

            def add_word(word, weights):
                word_id = len(result.vocab)
                if word in result.vocab:
                    logger.warning("duplicate word '%s' in %s, ignoring all but first", word, fname)
                    return
                if counts is None:
                    # most common scenario: no vocab file given. just make up some bogus counts, in descending order
                    result.vocab[word] = Vocab(index=word_id, count=vocab_size - word_id)
                elif word in counts:
                    # use count from the vocab file
                    result.vocab[word] = Vocab(index=word_id, count=counts[word])
                else:
                    # vocab file given, but word is missing -- set count to None (TODO: or raise?)
                    logger.warning("vocabulary file is incomplete: '%s' is missing", word)
                    result.vocab[word] = Vocab(index=word_id, count=None)
                result.syn0[word_id] = weights
                result.index2word.append(word)

            if binary:
                binary_len = dtype(REAL).itemsize * vector_size
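                # each record is a space-terminated word followed by `binary_len` raw bytes,
                # i.e. `vector_size` consecutive float32 values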
                for _ in xrange(vocab_size):
                    # mixed text and binary: read text first, then binary
                    word = []
                    while True:
                        ch = fin.read(1)
                        if ch == b' ':
                            break
                        if ch == b'':
                            raise EOFError("unexpected end of input; is count incorrect or file otherwise damaged?")
                        if ch != b'\n':  # ignore newlines in front of words (some binary files have)
                            word.append(ch)
                    word = utils.to_unicode(b''.join(word), encoding=encoding, errors=unicode_errors)
                    weights = fromstring(fin.read(binary_len), dtype=REAL)
                    add_word(word, weights)
            else:
                for line_no in xrange(vocab_size):
                    line = fin.readline()
                    if line == b'':
                        raise EOFError("unexpected end of input; is count incorrect or file otherwise damaged?")
                    parts = utils.to_unicode(line.rstrip(), encoding=encoding, errors=unicode_errors).split(" ")
                    if len(parts) != vector_size + 1:
                        raise ValueError("invalid vector on line %s (is this really the text format?)" % line_no)
                    word, weights = parts[0], [REAL(x) for x in parts[1:]]
                    add_word(word, weights)
        if result.syn0.shape[0] != len(result.vocab):
            logger.info(
                "duplicate words detected, shrinking matrix size from %i to %i",
                result.syn0.shape[0], len(result.vocab)
            )
            result.syn0 = ascontiguousarray(result.syn0[: len(result.vocab)])
        assert (len(result.vocab), vector_size) == result.syn0.shape

        logger.info("loaded %s matrix from %s", result.syn0.shape, fname)
        return result

    def similarity(self, w1, w2):
        """
        Compute similarity between vectors of two input words.
        To be implemented by child class.

        """
        raise NotImplementedError

    def distance(self, w1, w2):
        """
        Compute distance between vectors of two input words.
        To be implemented by child class.

        """
        raise NotImplementedError

    def distances(self, word_or_vector, other_words=()):
        """
        Compute distances from given word or vector to all words in `other_words`.
        If `other_words` is empty, return distance between `word_or_vector` and all words in vocab.
        To be implemented by child class.

        """
        raise NotImplementedError

    def word_vec(self, word):
        """
        Accept a single word as input.
        Returns the word's representation in vector space, as a 1D numpy array.

        Example::

            >>> trained_model.word_vec('office')
            array([ -1.40128313e-02, ...])

        """
        if word in self.vocab:
            result = self.syn0[self.vocab[word].index]
            result.setflags(write=False)
            return result
        else:
            raise KeyError("word '%s' not in vocabulary" % word)

    def __getitem__(self, words):
        """
        Accept a single word or a list of words as input.

        If a single word: returns the word's representation in vector space, as
        a 1D numpy array.

        Multiple words: return the words' representations in vector space, as a
        2d numpy array: #words x #vector_size. Matrix rows are in the same order
        as in input.

        Example::

            >>> trained_model['office']
            array([ -1.40128313e-02, ...])

            >>> trained_model[['office', 'products']]
            array([ -1.40128313e-02, ...]
                [ -1.70425311e-03, ...]
                ...)

        """
        if isinstance(words, string_types):
            # allow calls like trained_model['office'], as a shorthand for trained_model[['office']]
            return self.word_vec(words)

        return vstack([self.word_vec(word) for word in words])

    def __contains__(self, word):
        return word in self.vocab

    def most_similar_to_given(self, w1, word_list):
        """Return the word from word_list most similar to w1.

        Args:
            w1 (str): a word
            word_list (list): list of words containing a word most similar to w1

        Returns:
            the word in word_list with the highest similarity to w1

        Raises:
            KeyError: If w1 or any word in word_list is not in the vocabulary

        Example::

            >>> trained_model.most_similar_to_given('music', ['water', 'sound', 'backpack', 'mouse'])
            'sound'

            >>> trained_model.most_similar_to_given('snake', ['food', 'pencil', 'animal', 'phone'])
            'animal'

        """
        return word_list[argmax([self.similarity(w1, word) for word in word_list])]

    def words_closer_than(self, w1, w2):
        """
        Returns all words that are closer to `w1` than `w2` is to `w1`.

        Parameters
        ----------
        w1 : str
            Input word.
        w2 : str
            Input word.

        Returns
        -------
        list (str)
            List of words that are closer to `w1` than `w2` is to `w1`.

        Examples
        --------

        >>> model.words_closer_than('carnivore.n.01', 'mammal.n.01')
        ['dog.n.01', 'canine.n.02']

        """
        all_distances = self.distances(w1)
        w1_index = self.vocab[w1].index
        w2_index = self.vocab[w2].index
        closer_node_indices = np.where(all_distances < all_distances[w2_index])[0]
        return [self.index2word[index] for index in closer_node_indices if index != w1_index]

    def rank(self, w1, w2):
        """
        Rank of the distance of `w2` from `w1`, in relation to distances of all words from `w1`.

        Parameters
        ----------
        w1 : str
            Input word.
        w2 : str
            Input word.

        Returns
        -------
        int
            Rank of `w2` from `w1` in relation to all other nodes.

        Examples
        --------

        >>> model.rank('mammal.n.01', 'carnivore.n.01')
        3

        """
        return len(self.words_closer_than(w1, w2)) + 1


class EuclideanKeyedVectors(KeyedVectorsBase):
    """
    Class to contain vectors and vocab for the Word2Vec training class and other w2v methods not directly
    involved in training, such as most_similar().
    """

    def __init__(self):
        super(EuclideanKeyedVectors, self).__init__()
        self.syn0norm = None

    @property
    def wv(self):
        return self

    def save(self, *args, **kwargs):
        # don't bother storing the cached normalized vectors
        kwargs['ignore'] = kwargs.get('ignore', ['syn0norm'])
        super(EuclideanKeyedVectors, self).save(*args, **kwargs)

    def word_vec(self, word, use_norm=False):
        """
        Accept a single word as input.
        Returns the word's representation in vector space, as a 1D numpy array.

        If `use_norm` is True, returns the normalized word vector.

        Example::

            >>> trained_model['office']
            array([ -1.40128313e-02, ...])

        """
        if word in self.vocab:
            if use_norm:
                result = self.syn0norm[self.vocab[word].index]
            else:
                result = self.syn0[self.vocab[word].index]

            result.setflags(write=False)
            return result
        else:
            raise KeyError("word '%s' not in vocabulary" % word)

    def most_similar(self, positive=None, negative=None, topn=10, restrict_vocab=None, indexer=None):
        """
        Find the top-N most similar words. Positive words contribute positively towards the
        similarity, negative words negatively.

        This method computes cosine similarity between a simple mean of the projection
        weight vectors of the given words and the vectors for each word in the model.
        The method corresponds to the `word-analogy` and `distance` scripts in the original
        word2vec implementation.

        If topn is False, most_similar returns the vector of similarity scores.

        `restrict_vocab` is an optional integer which limits the range of vectors which
        are searched for most-similar values. For example, restrict_vocab=10000 would
        only check the first 10000 word vectors in the vocabulary order. (This may be
        meaningful if you've sorted the vocabulary by descending frequency.)

        Example::

            >>> trained_model.most_similar(positive=['woman', 'king'], negative=['man'])
            [('queen', 0.50882536), ...]

        """
        if positive is None:
            positive = []
        if negative is None:
            negative = []

        self.init_sims()

        if isinstance(positive, string_types) and not negative:
            # allow calls like most_similar('dog'), as a shorthand for most_similar(['dog'])
            positive = [positive]

        # add weights for each word, if not already present; default to 1.0 for positive and -1.0 for negative words
        positive = [
            (word, 1.0) if isinstance(word, string_types + (ndarray,)) else word
            for word in positive
        ]
        negative = [
            (word, -1.0) if isinstance(word, string_types + (ndarray,)) else word
            for word in negative
        ]

        # compute the weighted average of all words
        all_words, mean = set(), []
        for word, weight in positive + negative:
            if isinstance(word, ndarray):
                mean.append(weight * word)
            else:
                mean.append(weight * self.word_vec(word, use_norm=True))
                if word in self.vocab:
                    all_words.add(self.vocab[word].index)
        if not mean:
            raise ValueError("cannot compute similarity with no input")
        mean = matutils.unitvec(array(mean).mean(axis=0)).astype(REAL)
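        # both `mean` and the rows of `syn0norm` have unit L2 norm, so the dot products
        # computed below are cosine similarities (the additive "3CosAdd" analogy objective)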

        if indexer is not None:
            return indexer.most_similar(mean, topn)

        limited = self.syn0norm if restrict_vocab is None else self.syn0norm[:restrict_vocab]
        dists = dot(limited, mean)
        if not topn:
            return dists
        best = matutils.argsort(dists, topn=topn + len(all_words), reverse=True)
        # ignore (don't return) words from the input
        result = [(self.index2word[sim], float(dists[sim])) for sim in best if sim not in all_words]
        return result[:topn]

    def similar_by_word(self, word, topn=10, restrict_vocab=None):
        """
        Find the top-N most similar words.

        If topn is False, similar_by_word returns the vector of similarity scores.

        `restrict_vocab` is an optional integer which limits the range of vectors which
        are searched for most-similar values. For example, restrict_vocab=10000 would
        only check the first 10000 word vectors in the vocabulary order. (This may be
        meaningful if you've sorted the vocabulary by descending frequency.)

        Example::

            >>> trained_model.similar_by_word('graph')
            [('user', 0.9999163150787354), ...]

        """
        return self.most_similar(positive=[word], topn=topn, restrict_vocab=restrict_vocab)

    def similar_by_vector(self, vector, topn=10, restrict_vocab=None):
        """
        Find the top-N most similar words by vector.

        If topn is False, similar_by_vector returns the vector of similarity scores.

        `restrict_vocab` is an optional integer which limits the range of vectors which
        are searched for most-similar values. For example, restrict_vocab=10000 would
        only check the first 10000 word vectors in the vocabulary order. (This may be
        meaningful if you've sorted the vocabulary by descending frequency.)

        Example::

            >>> trained_model.similar_by_vector([1,2])
            [('survey', 0.9942699074745178), ...]

        """
        return self.most_similar(positive=[vector], topn=topn, restrict_vocab=restrict_vocab)

    def wmdistance(self, document1, document2):
        """
        Compute the Word Mover's Distance between two documents. When using this
        code, please consider citing the following papers:

        .. Ofir Pele and Michael Werman, "A linear time histogram metric for improved SIFT matching".
        .. Ofir Pele and Michael Werman, "Fast and robust earth mover's distances".
        .. Matt Kusner et al. "From Word Embeddings To Document Distances".

        Note that if one of the documents has no words that exist in the
        Word2Vec vocab, `float('inf')` (i.e. infinity) will be returned.

        This method only works if `pyemd` is installed (can be installed via pip, but requires a C compiler).

        Example:
            >>> # Train word2vec model.
            >>> model = Word2Vec(sentences)

            >>> # Some sentences to test.
            >>> sentence_obama = 'Obama speaks to the media in Illinois'.lower().split()
            >>> sentence_president = 'The president greets the press in Chicago'.lower().split()

            >>> # Remove their stopwords.
            >>> from nltk.corpus import stopwords
            >>> stopwords = stopwords.words('english')
            >>> sentence_obama = [w for w in sentence_obama if w not in stopwords]
            >>> sentence_president = [w for w in sentence_president if w not in stopwords]

            >>> # Compute WMD.
            >>> distance = model.wmdistance(sentence_obama, sentence_president)
        """

        if not PYEMD_EXT:
            raise ImportError("Please install pyemd Python package to compute WMD.")

        # Remove out-of-vocabulary words.
        len_pre_oov1 = len(document1)
        len_pre_oov2 = len(document2)
        document1 = [token for token in document1 if token in self]
        document2 = [token for token in document2 if token in self]
        diff1 = len_pre_oov1 - len(document1)
        diff2 = len_pre_oov2 - len(document2)
        if diff1 > 0 or diff2 > 0:
            logger.info('Removed %d and %d OOV words from document 1 and 2 (respectively).', diff1, diff2)

        if len(document1) == 0 or len(document2) == 0:
            logger.info(
                "At least one of the documents had no words that were in the vocabulary. "
                "Aborting (returning inf)."
            )
            return float('inf')

        dictionary = Dictionary(documents=[document1, document2])
        vocab_len = len(dictionary)

        if vocab_len == 1:
            # Both documents are composed of a single unique token
            return 0.0

        # Sets for faster look-up.
        docset1 = set(document1)
        docset2 = set(document2)

        # Compute distance matrix.
        distance_matrix = zeros((vocab_len, vocab_len), dtype=double)
        for i, t1 in dictionary.items():
            for j, t2 in dictionary.items():
                if t1 not in docset1 or t2 not in docset2:
                    continue
                # Compute Euclidean distance between word vectors.
                distance_matrix[i, j] = sqrt(np_sum((self[t1] - self[t2])**2))

        if np_sum(distance_matrix) == 0.0:
            # `emd` gets stuck if the distance matrix contains only zeros.
            logger.info('The distance matrix is all zeros. Aborting (returning inf).')
            return float('inf')

        def nbow(document):
            d = zeros(vocab_len, dtype=double)
            nbow = dictionary.doc2bow(document)  # Word frequencies.
            doc_len = len(document)
            for idx, freq in nbow:
                d[idx] = freq / float(doc_len)  # Normalized word frequencies.
            return d

        # Compute nBOW representation of documents.
        d1 = nbow(document1)
        d2 = nbow(document2)
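        # WMD is the minimum cumulative cost of "moving" one document's nBOW histogram into
        # the other's, with the word-to-word Euclidean distances as the transport cost;
        # `emd` solves this optimal-transport problem exactly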

        # Compute WMD.
        return emd(d1, d2, distance_matrix)

    def most_similar_cosmul(self, positive=None, negative=None, topn=10):
        """
        Find the top-N most similar words, using the multiplicative combination objective
        proposed by Omer Levy and Yoav Goldberg in [4]_. Positive words still contribute
        positively towards the similarity, negative words negatively, but with less
        susceptibility to one large distance dominating the calculation.

        In the common analogy-solving case of two positive and one negative examples,
        this method is equivalent to the "3CosMul" objective (equation (4)) of Levy and Goldberg.

        Additional positive or negative examples contribute to the numerator or denominator,
        respectively - a potentially sensible but untested extension of the method. (With
        a single positive example, rankings will be the same as in the default most_similar.)

        Example::

            >>> trained_model.most_similar_cosmul(positive=['baghdad', 'england'], negative=['london'])
            [(u'iraq', 0.8488819003105164), ...]

        .. [4] Omer Levy and Yoav Goldberg. Linguistic Regularities in Sparse and Explicit Word Representations, 2014.

        """
        if positive is None:
            positive = []
        if negative is None:
            negative = []

        self.init_sims()

        if isinstance(positive, string_types) and not negative:
            # allow calls like most_similar_cosmul('dog'), as a shorthand for most_similar_cosmul(['dog'])
            positive = [positive]

        all_words = {
            self.vocab[word].index for word in positive + negative
            if not isinstance(word, ndarray) and word in self.vocab
        }

        positive = [
            self.word_vec(word, use_norm=True) if isinstance(word, string_types) else word
            for word in positive
        ]
        negative = [
            self.word_vec(word, use_norm=True) if isinstance(word, string_types) else word
            for word in negative
        ]

        if not positive:
            raise ValueError("cannot compute similarity with no input")

        # equation (4) of Levy & Goldberg "Linguistic Regularities...",
        # with distances shifted to [0,1] per footnote (7)
        pos_dists = [((1 + dot(self.syn0norm, term)) / 2) for term in positive]
        neg_dists = [((1 + dot(self.syn0norm, term)) / 2) for term in negative]
        dists = prod(pos_dists, axis=0) / (prod(neg_dists, axis=0) + 0.000001)
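        # the small constant in the denominator guards against division by zero when all
        # similarities to the negative terms are (near) zero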

        if not topn:
            return dists
        best = matutils.argsort(dists, topn=topn + len(all_words), reverse=True)
        # ignore (don't return) words from the input
        result = [(self.index2word[sim], float(dists[sim])) for sim in best if sim not in all_words]
        return result[:topn]

    def doesnt_match(self, words):
        """
        Which word from the given list doesn't go with the others?

        Example::

            >>> trained_model.doesnt_match("breakfast cereal dinner lunch".split())
            'cereal'

        """
        self.init_sims()

        used_words = [word for word in words if word in self]
        if len(used_words) != len(words):
            ignored_words = set(words) - set(used_words)
            logger.warning("vectors for words %s are not present in the model, ignoring these words", ignored_words)
        if not used_words:
            raise ValueError("cannot select a word from an empty list")
        vectors = vstack([self.word_vec(word, use_norm=True) for word in used_words]).astype(REAL)
        mean = matutils.unitvec(vectors.mean(axis=0)).astype(REAL)
        dists = dot(vectors, mean)
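        # the "odd one out" is the word whose normalized vector is least similar
        # to the mean of all the given words' vectors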
        return sorted(zip(dists, used_words))[0][1]

    @staticmethod
    def cosine_similarities(vector_1, vectors_all):
        """
        Return cosine similarities between one vector and a set of other vectors.

        Parameters
        ----------
        vector_1 : numpy.array
            vector from which similarities are to be computed.
            expected shape (dim,)
        vectors_all : numpy.array
            for each row in vectors_all, similarity to vector_1 is computed.
            expected shape (num_vectors, dim)

        Returns
        -------
        numpy.array
            Contains cosine similarity between vector_1 and each row in vectors_all.
            shape (num_vectors,)

        """
        norm = np.linalg.norm(vector_1)
        all_norms = np.linalg.norm(vectors_all, axis=1)
        dot_products = dot(vectors_all, vector_1)
        similarities = dot_products / (norm * all_norms)
        return similarities

    def distances(self, word_or_vector, other_words=()):
        """
        Compute cosine distances from given word or vector to all words in `other_words`.
        If `other_words` is empty, return distance between `word_or_vector` and all words in vocab.

        Parameters
        ----------
        word_or_vector : str or numpy.array
            Word or vector from which distances are to be computed.

        other_words : iterable(str) or None
            For each word in `other_words` distance from `word_or_vector` is computed.
            If None or empty, distance of `word_or_vector` from all words in vocab is computed (including itself).

        Returns
        -------
        numpy.array
            Array containing distances to all words in `other_words` from input `word_or_vector`,
            in the same order as `other_words`.

        Notes
        -----
        Raises KeyError if either `word_or_vector` or any word in `other_words` is absent from vocab.

        """
        if isinstance(word_or_vector, string_types):
            input_vector = self.word_vec(word_or_vector)
        else:
            input_vector = word_or_vector
        if not other_words:
            other_vectors = self.syn0
        else:
            other_indices = [self.vocab[word].index for word in other_words]
            other_vectors = self.syn0[other_indices]
        return 1 - self.cosine_similarities(input_vector, other_vectors)

    def distance(self, w1, w2):
        """
        Compute cosine distance between two words.

        Example::

            >>> trained_model.distance('woman', 'man')
            0.34

            >>> trained_model.distance('woman', 'woman')
            0.0

        """
        return 1 - self.similarity(w1, w2)

    def similarity(self, w1, w2):
        """
        Compute cosine similarity between two words.

        Example::

            >>> trained_model.similarity('woman', 'man')
            0.73723527

            >>> trained_model.similarity('woman', 'woman')
            1.0

        """
        return dot(matutils.unitvec(self[w1]), matutils.unitvec(self[w2]))

    def n_similarity(self, ws1, ws2):
        """
        Compute cosine similarity between two sets of words.

        Example::

            >>> trained_model.n_similarity(['sushi', 'shop'], ['japanese', 'restaurant'])
            0.61540466561049689

            >>> trained_model.n_similarity(['restaurant', 'japanese'], ['japanese', 'restaurant'])
            1.0000000000000004

            >>> trained_model.n_similarity(['sushi'], ['restaurant']) == trained_model.similarity('sushi', 'restaurant')
            True

        """
        if not (len(ws1) and len(ws2)):
            raise ZeroDivisionError('At least one of the passed lists is empty.')
        v1 = [self[word] for word in ws1]
        v2 = [self[word] for word in ws2]
        return dot(matutils.unitvec(array(v1).mean(axis=0)), matutils.unitvec(array(v2).mean(axis=0)))

    @staticmethod
    def log_accuracy(section):
        correct, incorrect = len(section['correct']), len(section['incorrect'])
        if correct + incorrect > 0:
            logger.info(
                "%s: %.1f%% (%i/%i)",
                section['section'], 100.0 * correct / (correct + incorrect), correct, correct + incorrect
            )

    def accuracy(self, questions, restrict_vocab=30000, most_similar=most_similar, case_insensitive=True):
        """
        Compute accuracy of the model. `questions` is a filename where lines are
        4-tuples of words, split into sections by ": SECTION NAME" lines.
        See questions-words.txt in
        https://storage.googleapis.com/google-code-archive-source/v2/code.google.com/word2vec/source-archive.zip
        for an example.

        The accuracy is reported (=printed to log and returned as a list) for each
        section separately, plus there's one aggregate summary at the end.

        Use `restrict_vocab` to ignore all questions containing a word not in the first `restrict_vocab`
        words (default 30,000). This may be meaningful if you've sorted the vocabulary by descending frequency.
        In case `case_insensitive` is True, the first `restrict_vocab` words are taken, and then
        case normalization is performed.

        Use `case_insensitive` to convert all words in questions and vocab to their uppercase form before
        evaluating the accuracy (default True). Useful in case of case-mismatch between training tokens
        and question words. In case of multiple case variants of a single word, the vector for the first
        occurrence (also the most frequent if vocabulary is sorted) is taken.

        This method corresponds to the `compute-accuracy` script of the original C word2vec.

        """
        ok_vocab = [(w, self.vocab[w]) for w in self.index2word[:restrict_vocab]]
        ok_vocab = {w.upper(): v for w, v in reversed(ok_vocab)} if case_insensitive else dict(ok_vocab)

        sections, section = [], None
        for line_no, line in enumerate(utils.smart_open(questions)):
            # TODO: use level3 BLAS (=evaluate multiple questions at once), for speed
            line = utils.to_unicode(line)
            if line.startswith(': '):
                # a new section starts => store the old section
                if section:
                    sections.append(section)
                    self.log_accuracy(section)
                section = {'section': line.lstrip(': ').strip(), 'correct': [], 'incorrect': []}
            else:
                if not section:
                    raise ValueError("missing section header before line #%i in %s" % (line_no, questions))
                try:
                    if case_insensitive:
                        a, b, c, expected = [word.upper() for word in line.split()]
                    else:
                        a, b, c, expected = [word for word in line.split()]
                except ValueError:
                    logger.info("skipping invalid line #%i in %s", line_no, questions)
                    continue
                if a not in ok_vocab or b not in ok_vocab or c not in ok_vocab or expected not in ok_vocab:
                    logger.debug("skipping line #%i with OOV words: %s", line_no, line.strip())
                    continue

                original_vocab = self.vocab
                self.vocab = ok_vocab
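                # temporarily swap in the restricted (and possibly case-folded) vocab so that
                # `most_similar` only ranks candidates from it; the original vocab is restored below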
                ignore = {a, b, c}  # input words to be ignored
                predicted = None
                # find the most likely prediction, ignoring OOV words and input words
                sims = most_similar(self, positive=[b, c], negative=[a], topn=False, restrict_vocab=restrict_vocab)
                self.vocab = original_vocab
                for index in matutils.argsort(sims, reverse=True):
                    predicted = self.index2word[index].upper() if case_insensitive else self.index2word[index]
                    if predicted in ok_vocab and predicted not in ignore:
                        if predicted != expected:
                            logger.debug("%s: expected %s, predicted %s", line.strip(), expected, predicted)
                        break
                if predicted == expected:
                    section['correct'].append((a, b, c, expected))
                else:
                    section['incorrect'].append((a, b, c, expected))
        if section:
            # store the last section, too
            sections.append(section)
            self.log_accuracy(section)

        total = {
            'section': 'total',
            'correct': sum((s['correct'] for s in sections), []),
            'incorrect': sum((s['incorrect'] for s in sections), []),
        }
        self.log_accuracy(total)
        sections.append(total)
        return sections

    @staticmethod
    def log_evaluate_word_pairs(pearson, spearman, oov, pairs):
        logger.info('Pearson correlation coefficient against %s: %.4f', pairs, pearson[0])
        logger.info('Spearman rank-order correlation coefficient against %s: %.4f', pairs, spearman[0])
        logger.info('Pairs with unknown words ratio: %.1f%%', oov)

    def evaluate_word_pairs(self, pairs, delimiter='\t', restrict_vocab=300000,
                            case_insensitive=True, dummy4unknown=False):
        """
        Compute correlation of the model with human similarity judgments. `pairs` is a filename of a dataset where
        lines are 3-tuples, each consisting of a word pair and a similarity value, separated by `delimiter`.
        An example dataset is included in Gensim (test/test_data/wordsim353.tsv). More datasets can be found at
        http://technion.ac.il/~ira.leviant/MultilingualVSMdata.html or https://www.cl.cam.ac.uk/~fh295/simlex.html.

        The model is evaluated using Pearson correlation coefficient and Spearman rank-order correlation coefficient
        between the similarities from the dataset and the similarities produced by the model itself.
        The results are printed to log and returned as a triple (pearson, spearman, ratio of pairs with unknown words).

        Use `restrict_vocab` to ignore all word pairs containing a word not in the first `restrict_vocab`
        words (default 300,000). This may be meaningful if you've sorted the vocabulary by descending frequency.
        If `case_insensitive` is True, the first `restrict_vocab` words are taken, and then case normalization
        is performed.

        Use `case_insensitive` to convert all words in the pairs and vocab to their uppercase form before
        evaluating the model (default True). Useful when you expect case-mismatch between training tokens
        and word pairs in the dataset. If there are multiple case variants of a single word, the vector for the first
        occurrence (also the most frequent if vocabulary is sorted) is taken.

        Use `dummy4unknown=True` to produce zero-valued similarities for pairs with out-of-vocabulary words.
        Otherwise (default False), these pairs are skipped entirely.
        """
        ok_vocab = [(w, self.vocab[w]) for w in self.index2word[:restrict_vocab]]
        ok_vocab = {w.upper(): v for w, v in reversed(ok_vocab)} if case_insensitive else dict(ok_vocab)

        similarity_gold = []
        similarity_model = []
        oov = 0

        original_vocab = self.vocab
        self.vocab = ok_vocab
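        # evaluate against the restricted (and possibly case-folded) vocab only;
        # the original vocab is restored right after the loop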

        for line_no, line in enumerate(utils.smart_open(pairs)):
            line = utils.to_unicode(line)
            if line.startswith('#'):
                # May be a comment
                continue
            else:
                try:
                    if case_insensitive:
                        a, b, sim = [word.upper() for word in line.split(delimiter)]
                    else:
                        a, b, sim = [word for word in line.split(delimiter)]
                    sim = float(sim)
                except (ValueError, TypeError):
                    logger.info('skipping invalid line #%d in %s', line_no, pairs)
                    continue
                if a not in ok_vocab or b not in ok_vocab:
                    oov += 1
                    if dummy4unknown:
                        similarity_model.append(0.0)
                        similarity_gold.append(sim)
                        continue
                    else:
                        logger.debug('skipping line #%d with OOV words: %s', line_no, line.strip())
                        continue
                similarity_gold.append(sim)  # Similarity from the dataset
                similarity_model.append(self.similarity(a, b))  # Similarity from the model
        self.vocab = original_vocab
        spearman = stats.spearmanr(similarity_gold, similarity_model)
        pearson = stats.pearsonr(similarity_gold, similarity_model)
        oov_ratio = float(oov) / (len(similarity_gold) + oov) * 100

        logger.debug('Pearson correlation coefficient against %s: %f with p-value %f', pairs, pearson[0], pearson[1])
        logger.debug(
            'Spearman rank-order correlation coefficient against %s: %f with p-value %f',
            pairs, spearman[0], spearman[1]
        )
        logger.debug('Pairs with unknown words: %d', oov)
        self.log_evaluate_word_pairs(pearson, spearman, oov_ratio, pairs)
        return pearson, spearman, oov_ratio

    def init_sims(self, replace=False):
        """
        Precompute L2-normalized vectors.

        If `replace` is set, forget the original vectors and only keep the normalized
        ones = saves lots of memory!

        Note that you **cannot continue training** after doing a replace. The model becomes
        effectively read-only = you can call `most_similar`, `similarity` etc., but not `train`.

        """
        if getattr(self, 'syn0norm', None) is None or replace:
            logger.info("precomputing L2-norms of word weight vectors")
            if replace:
                for i in xrange(self.syn0.shape[0]):
                    self.syn0[i, :] /= sqrt((self.syn0[i, :] ** 2).sum(-1))
                self.syn0norm = self.syn0
            else:
                self.syn0norm = (self.syn0 / sqrt((self.syn0 ** 2).sum(-1))[..., newaxis]).astype(REAL)

    def get_keras_embedding(self, train_embeddings=False):
        """
        Return a Keras 'Embedding' layer with weights set as the Word2Vec model's learned word embeddings
        """
        try:
            from keras.layers import Embedding
        except ImportError:
            raise ImportError("Please install Keras to use this function")
        weights = self.syn0

        # set `trainable` as `False` to use the pretrained word embedding
        # No extra mem usage here as `Embedding` layer doesn't create any new matrix for weights
        layer = Embedding(
            input_dim=weights.shape[0], output_dim=weights.shape[1],
            weights=[weights], trainable=train_embeddings
        )
        return layer


# For backward compatibility
KeyedVectors = EuclideanKeyedVectors