#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (C) 2016 Radim Rehurek <me@radimrehurek.com>
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html

"""
Warnings
--------
.. deprecated:: 3.3.0
   Use :mod:`gensim.models.keyedvectors` instead.


Word vector storage and similarity look-ups.
Common code independent of the way the vectors are trained (Word2Vec, FastText, WordRank, VarEmbed etc.).

The word vectors are considered read-only in this class.

Initialize the vectors by training e.g. Word2Vec::

>>> model = Word2Vec(sentences, size=100, window=5, min_count=5, workers=4)
>>> word_vectors = model.wv

Persist the word vectors to disk with::

>>> word_vectors.save(fname)
>>> word_vectors = KeyedVectors.load(fname)

The vectors can also be instantiated from an existing file on disk,
in the original Google word2vec C format, as a KeyedVectors instance::

>>> from gensim.models.keyedvectors import KeyedVectors
>>> word_vectors = KeyedVectors.load_word2vec_format('/tmp/vectors.txt', binary=False)  # C text format
>>> word_vectors = KeyedVectors.load_word2vec_format('/tmp/vectors.bin', binary=True)  # C binary format

You can perform various syntactic/semantic NLP word tasks with the vectors. Some of them
are already built in::

>>> word_vectors.most_similar(positive=['woman', 'king'], negative=['man'])
[('queen', 0.50882536), ...]

>>> word_vectors.most_similar_cosmul(positive=['woman', 'king'], negative=['man'])
[('queen', 0.71382287), ...]

>>> word_vectors.doesnt_match("breakfast cereal dinner lunch".split())
'cereal'

>>> word_vectors.similarity('woman', 'man')
0.73723527

Correlation with human opinion on word similarity::

>>> word_vectors.evaluate_word_pairs(os.path.join(module_path, 'test_data', 'wordsim353.tsv'))
0.51, 0.62, 0.13

And on analogies::

>>> word_vectors.accuracy(os.path.join(module_path, 'test_data', 'questions-words.txt'))

and so on.

"""
from __future__ import division  # py3 "true division"

import logging

try:
    from queue import Queue, Empty
except ImportError:
    from Queue import Queue, Empty  # noqa:F401

# If the pyemd C extension is available, import it.
# If pyemd is needed but isn't installed, an ImportError will be raised in wmdistance.
try:
    from pyemd import emd
    PYEMD_EXT = True
except ImportError:
    PYEMD_EXT = False

from numpy import dot, zeros, dtype, float32 as REAL,\
    double, array, vstack, fromstring, sqrt, newaxis,\
    ndarray, sum as np_sum, prod, ascontiguousarray,\
    argmax
import numpy as np

from gensim import utils, matutils  # utility fnc for pickling, common scipy operations etc
from gensim.corpora.dictionary import Dictionary
from six import string_types, iteritems
from six.moves import xrange
from scipy import stats


logger = logging.getLogger(__name__)

class Vocab(object):
    """
    A single vocabulary item, used internally for collecting per-word frequency/sampling info,
    and for constructing binary trees (incl. both word leaves and inner nodes).

    """

    def __init__(self, **kwargs):
        self.count = 0
        self.__dict__.update(kwargs)

    def __lt__(self, other):  # used for sorting in a priority queue
        return self.count < other.count

    def __str__(self):
        vals = ['%s:%r' % (key, self.__dict__[key]) for key in sorted(self.__dict__) if not key.startswith('_')]
        return "%s(%s)" % (self.__class__.__name__, ', '.join(vals))

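
# The helper below is an illustrative sketch added as documentation (it is not part of the
# original module API): it shows how a `Vocab` entry ties together the `vocab` dict,
# `index2word` list and `syn0` matrix of an already populated KeyedVectors-like object
# (`kv` and `word` are hypothetical arguments; nothing here runs at import time).
def _vocab_layout_example(kv, word):
    entry = kv.vocab[word]                     # per-word record holding `count` and `index`
    assert kv.index2word[entry.index] == word  # index2word is the inverse of vocab[word].index
    return kv.syn0[entry.index]                # the stored vector is the matching row of syn0
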
class KeyedVectorsBase(utils.SaveLoad):
    """
    Base class to contain vectors and vocab for any set of vectors which are each associated with a key.

    """

    def __init__(self):
        self.syn0 = []
        self.vocab = {}
        self.index2word = []
        self.vector_size = None

    def save_word2vec_format(self, fname, fvocab=None, binary=False, total_vec=None):
        """
        Store the input-hidden weight matrix in the same format used by the original
        C word2vec-tool, for compatibility.

        `fname` is the file to which the vectors are saved
        `fvocab` is an optional file to which the vocabulary is saved
        `binary` is an optional boolean indicating whether the data is to be saved
        in binary word2vec format (default: False)
        `total_vec` is an optional parameter to explicitly specify the total number of vectors
        (in case word vectors are appended with document vectors afterwards)

        """
        if total_vec is None:
            total_vec = len(self.vocab)
        vector_size = self.syn0.shape[1]
        if fvocab is not None:
            logger.info("storing vocabulary in %s", fvocab)
            with utils.smart_open(fvocab, 'wb') as vout:
                for word, vocab in sorted(iteritems(self.vocab), key=lambda item: -item[1].count):
                    vout.write(utils.to_utf8("%s %s\n" % (word, vocab.count)))
        logger.info("storing %sx%s projection weights into %s", total_vec, vector_size, fname)
        assert (len(self.vocab), vector_size) == self.syn0.shape
        with utils.smart_open(fname, 'wb') as fout:
            fout.write(utils.to_utf8("%s %s\n" % (total_vec, vector_size)))
            # store in sorted order: most frequent words at the top
            for word, vocab in sorted(iteritems(self.vocab), key=lambda item: -item[1].count):
                row = self.syn0[vocab.index]
                if binary:
                    fout.write(utils.to_utf8(word) + b" " + row.tostring())
                else:
                    fout.write(utils.to_utf8("%s %s\n" % (word, ' '.join("%f" % val for val in row))))

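    # Added commentary (illustrative, not original code): with `binary=False` the method above
    # writes the plain word2vec C text format -- a "<total_vec> <vector_size>" header line
    # followed by one line per word, most frequent first, e.g. "king 0.1293 -0.0027 ...".
    # A minimal round trip, assuming `kv` is a populated KeyedVectors instance:
    #
    #     >>> kv.save_word2vec_format('/tmp/vectors.txt', binary=False)
    #     >>> kv2 = KeyedVectors.load_word2vec_format('/tmp/vectors.txt', binary=False)
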
    @classmethod
    def load_word2vec_format(cls, fname, fvocab=None, binary=False, encoding='utf8', unicode_errors='strict',
                             limit=None, datatype=REAL):
        """
        Load the input-hidden weight matrix from the original C word2vec-tool format.

        Note that the information stored in the file is incomplete (the binary tree is missing),
        so while you can query for word similarity etc., you cannot continue training
        with a model loaded this way.

        `binary` is a boolean indicating whether the data is in binary word2vec format.
        Word counts are read from `fvocab` filename, if set (this is the file generated
        by the `-save-vocab` flag of the original C tool).

        If you trained the C model using a non-utf8 encoding for words, specify that
        encoding in `encoding`.

        `unicode_errors`, default 'strict', is a string suitable to be passed as the `errors`
        argument to the unicode() (Python 2.x) or str() (Python 3.x) function. If your source
        file may include word tokens truncated in the middle of a multibyte unicode character
        (as is common from the original word2vec.c tool), 'ignore' or 'replace' may help.

        `limit` sets a maximum number of word-vectors to read from the file. The default,
        None, means read all.

        `datatype` (experimental) can coerce dimensions to a non-default float type (such
        as np.float16) to save memory. (Such types may result in much slower bulk operations
        or incompatibility with optimized routines.)

        """
        counts = None
        if fvocab is not None:
            logger.info("loading word counts from %s", fvocab)
            counts = {}
            with utils.smart_open(fvocab) as fin:
                for line in fin:
                    word, count = utils.to_unicode(line).strip().split()
                    counts[word] = int(count)

        logger.info("loading projection weights from %s", fname)
        with utils.smart_open(fname) as fin:
            header = utils.to_unicode(fin.readline(), encoding=encoding)
            vocab_size, vector_size = (int(x) for x in header.split())  # throws for invalid file format
            if limit:
                vocab_size = min(vocab_size, limit)
            result = cls()
            result.vector_size = vector_size
            result.syn0 = zeros((vocab_size, vector_size), dtype=datatype)

            def add_word(word, weights):
                word_id = len(result.vocab)
                if word in result.vocab:
                    logger.warning("duplicate word '%s' in %s, ignoring all but first", word, fname)
                    return
                if counts is None:
                    # most common scenario: no vocab file given. just make up some bogus counts, in descending order
                    result.vocab[word] = Vocab(index=word_id, count=vocab_size - word_id)
                elif word in counts:
                    # use count from the vocab file
                    result.vocab[word] = Vocab(index=word_id, count=counts[word])
                else:
                    # vocab file given, but word is missing -- set count to None (TODO: or raise?)
                    logger.warning("vocabulary file is incomplete: '%s' is missing", word)
                    result.vocab[word] = Vocab(index=word_id, count=None)
                result.syn0[word_id] = weights
                result.index2word.append(word)

            if binary:
                binary_len = dtype(REAL).itemsize * vector_size
                for _ in xrange(vocab_size):
                    # mixed text and binary: read text first, then binary
                    word = []
                    while True:
                        ch = fin.read(1)
                        if ch == b' ':
                            break
                        if ch == b'':
                            raise EOFError("unexpected end of input; is count incorrect or file otherwise damaged?")
                        if ch != b'\n':  # ignore newlines in front of words (some binary files have)
                            word.append(ch)
                    word = utils.to_unicode(b''.join(word), encoding=encoding, errors=unicode_errors)
                    weights = fromstring(fin.read(binary_len), dtype=REAL)
                    add_word(word, weights)
            else:
                for line_no in xrange(vocab_size):
                    line = fin.readline()
                    if line == b'':
                        raise EOFError("unexpected end of input; is count incorrect or file otherwise damaged?")
                    parts = utils.to_unicode(line.rstrip(), encoding=encoding, errors=unicode_errors).split(" ")
                    if len(parts) != vector_size + 1:
                        raise ValueError("invalid vector on line %s (is this really the text format?)" % line_no)
                    word, weights = parts[0], [REAL(x) for x in parts[1:]]
                    add_word(word, weights)
        if result.syn0.shape[0] != len(result.vocab):
            logger.info(
                "duplicate words detected, shrinking matrix size from %i to %i",
                result.syn0.shape[0], len(result.vocab)
            )
            result.syn0 = ascontiguousarray(result.syn0[: len(result.vocab)])
        assert (len(result.vocab), vector_size) == result.syn0.shape

        logger.info("loaded %s matrix from %s", result.syn0.shape, fname)
        return result

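    # Added usage sketch (commentary only): loading just the first 500,000 vectors of a large
    # binary file while tolerating truncated unicode tokens, using the parameters documented above:
    #
    #     >>> kv = KeyedVectors.load_word2vec_format(
    #     ...     '/tmp/vectors.bin', binary=True, limit=500000, unicode_errors='ignore')
    #     >>> kv.syn0.shape    # (min(vocab_size, limit), vector_size)
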
    def similarity(self, w1, w2):
        """
        Compute similarity between vectors of two input words.
        To be implemented by child class.

        """
        raise NotImplementedError

    def distance(self, w1, w2):
        """
        Compute distance between vectors of two input words.
        To be implemented by child class.

        """
        raise NotImplementedError

    def distances(self, word_or_vector, other_words=()):
        """
        Compute distances from the given word or vector to all words in `other_words`.
        If `other_words` is empty, return distances between `word_or_vector` and all words in vocab.
        To be implemented by child class.

        """
        raise NotImplementedError

    def word_vec(self, word):
        """
        Accept a single word as input.
        Returns the word's representation in vector space, as a 1D numpy array.

        Example::

          >>> trained_model.word_vec('office')
          array([ -1.40128313e-02, ...])

        """
        if word in self.vocab:
            result = self.syn0[self.vocab[word].index]
            result.setflags(write=False)
            return result
        else:
            raise KeyError("word '%s' not in vocabulary" % word)

    def __getitem__(self, words):
        """
        Accept a single word or a list of words as input.

        If a single word: returns the word's representation in vector space, as
        a 1D numpy array.

        Multiple words: return the words' representations in vector space, as a
        2D numpy array: #words x #vector_size. Matrix rows are in the same order
        as in the input.

        Example::

          >>> trained_model['office']
          array([ -1.40128313e-02, ...])

          >>> trained_model[['office', 'products']]
          array([ -1.40128313e-02, ...]
                [ -1.70425311e-03, ...]
                ...)

        """
        if isinstance(words, string_types):
            # allow calls like trained_model['office'], as a shorthand for trained_model[['office']]
            return self.word_vec(words)

        return vstack([self.word_vec(word) for word in words])

    def __contains__(self, word):
        return word in self.vocab

    def most_similar_to_given(self, w1, word_list):
        """Return the word from word_list that is most similar to w1.

        Args:
            w1 (str): a word
            word_list (list): list of words containing a word most similar to w1

        Returns:
            the word in word_list with the highest similarity to w1

        Raises:
            KeyError: If w1 or any word in word_list is not in the vocabulary

        Example::

          >>> trained_model.most_similar_to_given('music', ['water', 'sound', 'backpack', 'mouse'])
          'sound'

          >>> trained_model.most_similar_to_given('snake', ['food', 'pencil', 'animal', 'phone'])
          'animal'

        """
        return word_list[argmax([self.similarity(w1, word) for word in word_list])]

    def words_closer_than(self, w1, w2):
        """
        Returns all words that are closer to `w1` than `w2` is to `w1`.

        Parameters
        ----------
        w1 : str
            Input word.
        w2 : str
            Input word.

        Returns
        -------
        list (str)
            List of words that are closer to `w1` than `w2` is to `w1`.

        Examples
        --------

        >>> model.words_closer_than('carnivore.n.01', 'mammal.n.01')
        ['dog.n.01', 'canine.n.02']

        """
        all_distances = self.distances(w1)
        w1_index = self.vocab[w1].index
        w2_index = self.vocab[w2].index
        closer_node_indices = np.where(all_distances < all_distances[w2_index])[0]
        return [self.index2word[index] for index in closer_node_indices if index != w1_index]

    def rank(self, w1, w2):
        """
        Rank of the distance of `w2` from `w1`, in relation to distances of all words from `w1`.

        Parameters
        ----------
        w1 : str
            Input word.
        w2 : str
            Input word.

        Returns
        -------
        int
            Rank of `w2` from `w1` in relation to all other nodes.

        Examples
        --------

        >>> model.rank('mammal.n.01', 'carnivore.n.01')
        3

        """
        return len(self.words_closer_than(w1, w2)) + 1

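
# Illustrative sketch added as documentation (not part of the original module): the query
# helpers above are related by construction -- `rank(w1, w2)` equals one plus the number of
# vocabulary words (other than `w1` itself) that are strictly closer to `w1` than `w2` is,
# which the hypothetical helper below spells out for any concrete subclass instance `kv`.
def _rank_consistency_example(kv, w1, w2):
    closer = kv.words_closer_than(w1, w2)      # words strictly closer to w1 than w2 is
    assert kv.rank(w1, w2) == len(closer) + 1  # rank is defined in exactly these terms
    return closer
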
class EuclideanKeyedVectors(KeyedVectorsBase):
    """
    Class to contain vectors and vocab for the Word2Vec training class, and other word2vec
    methods not directly involved in training, such as most_similar().
    """

    def __init__(self):
        super(EuclideanKeyedVectors, self).__init__()
        self.syn0norm = None

    @property
    def wv(self):
        return self

    def save(self, *args, **kwargs):
        # don't bother storing the cached normalized vectors
        kwargs['ignore'] = kwargs.get('ignore', ['syn0norm'])
        super(EuclideanKeyedVectors, self).save(*args, **kwargs)

    def word_vec(self, word, use_norm=False):
        """
        Accept a single word as input.
        Returns the word's representation in vector space, as a 1D numpy array.

        If `use_norm` is True, returns the normalized word vector.

        Example::

          >>> trained_model.word_vec('office')
          array([ -1.40128313e-02, ...])

        """
        if word in self.vocab:
            if use_norm:
                result = self.syn0norm[self.vocab[word].index]
            else:
                result = self.syn0[self.vocab[word].index]

            result.setflags(write=False)
            return result
        else:
            raise KeyError("word '%s' not in vocabulary" % word)

    def most_similar(self, positive=None, negative=None, topn=10, restrict_vocab=None, indexer=None):
        """
        Find the top-N most similar words. Positive words contribute positively towards the
        similarity, negative words negatively.

        This method computes cosine similarity between a simple mean of the projection
        weight vectors of the given words and the vectors for each word in the model.
        The method corresponds to the `word-analogy` and `distance` scripts in the original
        word2vec implementation.

        If topn is False, most_similar returns the vector of similarity scores.

        `restrict_vocab` is an optional integer which limits the range of vectors which
        are searched for most-similar values. For example, restrict_vocab=10000 would
        only check the first 10000 word vectors in the vocabulary order. (This may be
        meaningful if you've sorted the vocabulary by descending frequency.)

        Example::

          >>> trained_model.most_similar(positive=['woman', 'king'], negative=['man'])
          [('queen', 0.50882536), ...]

        """
        if positive is None:
            positive = []
        if negative is None:
            negative = []

        self.init_sims()

        if isinstance(positive, string_types) and not negative:
            # allow calls like most_similar('dog'), as a shorthand for most_similar(['dog'])
            positive = [positive]

        # add weights for each word, if not already present; default to 1.0 for positive and -1.0 for negative words
        positive = [
            (word, 1.0) if isinstance(word, string_types + (ndarray,)) else word
            for word in positive
        ]
        negative = [
            (word, -1.0) if isinstance(word, string_types + (ndarray,)) else word
            for word in negative
        ]

        # compute the weighted average of all words
        all_words, mean = set(), []
        for word, weight in positive + negative:
            if isinstance(word, ndarray):
                mean.append(weight * word)
            else:
                mean.append(weight * self.word_vec(word, use_norm=True))
                if word in self.vocab:
                    all_words.add(self.vocab[word].index)
        if not mean:
            raise ValueError("cannot compute similarity with no input")
        mean = matutils.unitvec(array(mean).mean(axis=0)).astype(REAL)

        if indexer is not None:
            return indexer.most_similar(mean, topn)

        limited = self.syn0norm if restrict_vocab is None else self.syn0norm[:restrict_vocab]
        dists = dot(limited, mean)
        if not topn:
            return dists
        best = matutils.argsort(dists, topn=topn + len(all_words), reverse=True)
        # ignore (don't return) words from the input
        result = [(self.index2word[sim], float(dists[sim])) for sim in best if sim not in all_words]
        return result[:topn]

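    # Added commentary (illustrative sketch, not original code): for the usual analogy query the
    # ranking above is plain cosine similarity against a unit-length mean vector.  Ignoring the
    # filtering of input words, it is equivalent to:
    #
    #     >>> model.init_sims()
    #     >>> mean = matutils.unitvec(
    #     ...     model.word_vec('king', use_norm=True)
    #     ...     + model.word_vec('woman', use_norm=True)
    #     ...     - model.word_vec('man', use_norm=True)).astype(REAL)
    #     >>> sims = model.syn0norm.dot(mean)    # one cosine score per vocabulary word
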
    def similar_by_word(self, word, topn=10, restrict_vocab=None):
        """
        Find the top-N most similar words.

        If topn is False, similar_by_word returns the vector of similarity scores.

        `restrict_vocab` is an optional integer which limits the range of vectors which
        are searched for most-similar values. For example, restrict_vocab=10000 would
        only check the first 10000 word vectors in the vocabulary order. (This may be
        meaningful if you've sorted the vocabulary by descending frequency.)

        Example::

          >>> trained_model.similar_by_word('graph')
          [('user', 0.9999163150787354), ...]

        """
        return self.most_similar(positive=[word], topn=topn, restrict_vocab=restrict_vocab)

    def similar_by_vector(self, vector, topn=10, restrict_vocab=None):
        """
        Find the top-N most similar words by vector.

        If topn is False, similar_by_vector returns the vector of similarity scores.

        `restrict_vocab` is an optional integer which limits the range of vectors which
        are searched for most-similar values. For example, restrict_vocab=10000 would
        only check the first 10000 word vectors in the vocabulary order. (This may be
        meaningful if you've sorted the vocabulary by descending frequency.)

        Example::

          >>> trained_model.similar_by_vector([1,2])
          [('survey', 0.9942699074745178), ...]

        """
        return self.most_similar(positive=[vector], topn=topn, restrict_vocab=restrict_vocab)

    def wmdistance(self, document1, document2):
        """
        Compute the Word Mover's Distance between two documents. When using this
        code, please consider citing the following papers:

        .. Ofir Pele and Michael Werman, "A linear time histogram metric for improved SIFT matching".
        .. Ofir Pele and Michael Werman, "Fast and robust earth mover's distances".
        .. Matt Kusner et al. "From Word Embeddings To Document Distances".

        Note that if one of the documents has no words that exist in the
        Word2Vec vocab, `float('inf')` (i.e. infinity) will be returned.

        This method only works if `pyemd` is installed (can be installed via pip, but requires a C compiler).

        Example:
            >>> # Train word2vec model.
            >>> model = Word2Vec(sentences)

            >>> # Some sentences to test.
            >>> sentence_obama = 'Obama speaks to the media in Illinois'.lower().split()
            >>> sentence_president = 'The president greets the press in Chicago'.lower().split()

            >>> # Remove their stopwords.
            >>> from nltk.corpus import stopwords
            >>> stopwords = stopwords.words('english')
            >>> sentence_obama = [w for w in sentence_obama if w not in stopwords]
            >>> sentence_president = [w for w in sentence_president if w not in stopwords]

            >>> # Compute WMD.
            >>> distance = model.wmdistance(sentence_obama, sentence_president)
        """

        if not PYEMD_EXT:
            raise ImportError("Please install pyemd Python package to compute WMD.")

        # Remove out-of-vocabulary words.
        len_pre_oov1 = len(document1)
        len_pre_oov2 = len(document2)
        document1 = [token for token in document1 if token in self]
        document2 = [token for token in document2 if token in self]
        diff1 = len_pre_oov1 - len(document1)
        diff2 = len_pre_oov2 - len(document2)
        if diff1 > 0 or diff2 > 0:
            logger.info('Removed %d and %d OOV words from document 1 and 2 (respectively).', diff1, diff2)

        if len(document1) == 0 or len(document2) == 0:
            logger.info(
                "At least one of the documents had no words that were in the vocabulary. "
                "Aborting (returning inf)."
            )
            return float('inf')

        dictionary = Dictionary(documents=[document1, document2])
        vocab_len = len(dictionary)

        if vocab_len == 1:
            # Both documents are composed of a single unique token
            return 0.0

        # Sets for faster look-up.
        docset1 = set(document1)
        docset2 = set(document2)

        # Compute distance matrix.
        distance_matrix = zeros((vocab_len, vocab_len), dtype=double)
        for i, t1 in dictionary.items():
            for j, t2 in dictionary.items():
                if t1 not in docset1 or t2 not in docset2:
                    continue
                # Compute Euclidean distance between word vectors.
                distance_matrix[i, j] = sqrt(np_sum((self[t1] - self[t2])**2))

        if np_sum(distance_matrix) == 0.0:
            # `emd` gets stuck if the distance matrix contains only zeros.
            logger.info('The distance matrix is all zeros. Aborting (returning inf).')
            return float('inf')

        def nbow(document):
            d = zeros(vocab_len, dtype=double)
            nbow = dictionary.doc2bow(document)  # Word frequencies.
            doc_len = len(document)
            for idx, freq in nbow:
                d[idx] = freq / float(doc_len)  # Normalized word frequencies.
            return d

        # Compute nBOW representation of documents.
        d1 = nbow(document1)
        d2 = nbow(document2)

        # Compute WMD.
        return emd(d1, d2, distance_matrix)

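    # Added commentary (illustrative): the nested `nbow` helper above turns each document into a
    # normalized bag-of-words histogram over the shared dictionary (three distinct words -> mass
    # 1/3 each, every other entry 0); `emd` then moves that mass at the per-word Euclidean costs
    # stored in `distance_matrix`, and the minimal total transport cost is the returned distance:
    #
    #     >>> model.wmdistance('obama speaks media'.split(), 'president greets press'.split())
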
    def most_similar_cosmul(self, positive=None, negative=None, topn=10):
        """
        Find the top-N most similar words, using the multiplicative combination objective
        proposed by Omer Levy and Yoav Goldberg in [4]_. Positive words still contribute
        positively towards the similarity, negative words negatively, but with less
        susceptibility to one large distance dominating the calculation.

        In the common analogy-solving case of two positive and one negative examples,
        this method is equivalent to the "3CosMul" objective (equation (4)) of Levy and Goldberg.

        Additional positive or negative examples contribute to the numerator or denominator,
        respectively; a potentially sensible but untested extension of the method. (With
        a single positive example, rankings will be the same as in the default most_similar.)

        Example::

          >>> trained_model.most_similar_cosmul(positive=['baghdad', 'england'], negative=['london'])
          [(u'iraq', 0.8488819003105164), ...]

        .. [4] Omer Levy and Yoav Goldberg. Linguistic Regularities in Sparse and Explicit Word Representations, 2014.

        """
        if positive is None:
            positive = []
        if negative is None:
            negative = []

        self.init_sims()

        if isinstance(positive, string_types) and not negative:
            # allow calls like most_similar_cosmul('dog'), as a shorthand for most_similar_cosmul(['dog'])
            positive = [positive]

        all_words = {
            self.vocab[word].index for word in positive + negative
            if not isinstance(word, ndarray) and word in self.vocab
        }

        positive = [
            self.word_vec(word, use_norm=True) if isinstance(word, string_types) else word
            for word in positive
        ]
        negative = [
            self.word_vec(word, use_norm=True) if isinstance(word, string_types) else word
            for word in negative
        ]

        if not positive:
            raise ValueError("cannot compute similarity with no input")

        # equation (4) of Levy & Goldberg "Linguistic Regularities...",
        # with distances shifted to [0,1] per footnote (7)
        pos_dists = [((1 + dot(self.syn0norm, term)) / 2) for term in positive]
        neg_dists = [((1 + dot(self.syn0norm, term)) / 2) for term in negative]
        dists = prod(pos_dists, axis=0) / (prod(neg_dists, axis=0) + 0.000001)

        if not topn:
            return dists
        best = matutils.argsort(dists, topn=topn + len(all_words), reverse=True)
        # ignore (don't return) words from the input
        result = [(self.index2word[sim], float(dists[sim])) for sim in best if sim not in all_words]
        return result[:topn]

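    # Added commentary (illustrative): with positives b, c and negative a, the score computed
    # above for every candidate word w is the 3CosMul objective
    #
    #     ((cos(w, b) + 1) / 2) * ((cos(w, c) + 1) / 2) / ((cos(w, a) + 1) / 2 + 0.000001)
    #
    # i.e. each cosine is shifted into [0, 1] before multiplying/dividing, and the small epsilon
    # only guards against division by zero:
    #
    #     >>> model.most_similar_cosmul(positive=['woman', 'king'], negative=['man'], topn=1)
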
    def doesnt_match(self, words):
        """
        Which word from the given list doesn't go with the others?

        Example::

          >>> trained_model.doesnt_match("breakfast cereal dinner lunch".split())
          'cereal'

        """
        self.init_sims()

        used_words = [word for word in words if word in self]
        if len(used_words) != len(words):
            ignored_words = set(words) - set(used_words)
            logger.warning("vectors for words %s are not present in the model, ignoring these words", ignored_words)
        if not used_words:
            raise ValueError("cannot select a word from an empty list")
        vectors = vstack([self.word_vec(word, use_norm=True) for word in used_words]).astype(REAL)
        mean = matutils.unitvec(vectors.mean(axis=0)).astype(REAL)
        dists = dot(vectors, mean)
        return sorted(zip(dists, used_words))[0][1]

    @staticmethod
    def cosine_similarities(vector_1, vectors_all):
        """
        Return cosine similarities between one vector and a set of other vectors.

        Parameters
        ----------
        vector_1 : numpy.array
            Vector from which similarities are to be computed.
            Expected shape (dim,).
        vectors_all : numpy.array
            For each row in vectors_all, the similarity to vector_1 is computed.
            Expected shape (num_vectors, dim).

        Returns
        -------
        numpy.array
            Contains the cosine similarity between vector_1 and each row in vectors_all.
            Shape (num_vectors,).

        """
        norm = np.linalg.norm(vector_1)
        all_norms = np.linalg.norm(vectors_all, axis=1)
        dot_products = dot(vectors_all, vector_1)
        similarities = dot_products / (norm * all_norms)
        return similarities

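    # Added sketch (commentary only): the static helper above is the vectorized cosine formula
    # cos(v, M[i]) = <v, M[i]> / (||v|| * ||M[i]||), e.g. with toy vectors:
    #
    #     >>> v = np.array([1.0, 0.0])
    #     >>> M = np.array([[2.0, 0.0], [0.0, 3.0]])
    #     >>> EuclideanKeyedVectors.cosine_similarities(v, M)
    #     array([ 1.,  0.])
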
    def distances(self, word_or_vector, other_words=()):
        """
        Compute cosine distances from a given word or vector to all words in `other_words`.
        If `other_words` is empty, return distances between `word_or_vector` and all words in vocab.

        Parameters
        ----------
        word_or_vector : str or numpy.array
            Word or vector from which distances are to be computed.

        other_words : iterable(str) or None
            For each word in `other_words`, the distance from `word_or_vector` is computed.
            If None or empty, the distance of `word_or_vector` from all words in vocab is computed (including itself).

        Returns
        -------
        numpy.array
            Array containing distances to all words in `other_words` from input `word_or_vector`,
            in the same order as `other_words`.

        Notes
        -----
        Raises KeyError if either `word_or_vector` or any word in `other_words` is absent from vocab.

        """
        if isinstance(word_or_vector, string_types):
            input_vector = self.word_vec(word_or_vector)
        else:
            input_vector = word_or_vector
        if not other_words:
            other_vectors = self.syn0
        else:
            other_indices = [self.vocab[word].index for word in other_words]
            other_vectors = self.syn0[other_indices]
        return 1 - self.cosine_similarities(input_vector, other_vectors)

    def distance(self, w1, w2):
        """
        Compute cosine distance between two words.

        Example::

          >>> trained_model.distance('woman', 'man')
          0.34

          >>> trained_model.distance('woman', 'woman')
          0.0

        """
        return 1 - self.similarity(w1, w2)

    def similarity(self, w1, w2):
        """
        Compute cosine similarity between two words.

        Example::

          >>> trained_model.similarity('woman', 'man')
          0.73723527

          >>> trained_model.similarity('woman', 'woman')
          1.0

        """
        return dot(matutils.unitvec(self[w1]), matutils.unitvec(self[w2]))

    def n_similarity(self, ws1, ws2):
        """
        Compute cosine similarity between two sets of words.

        Example::

          >>> trained_model.n_similarity(['sushi', 'shop'], ['japanese', 'restaurant'])
          0.61540466561049689

          >>> trained_model.n_similarity(['restaurant', 'japanese'], ['japanese', 'restaurant'])
          1.0000000000000004

          >>> trained_model.n_similarity(['sushi'], ['restaurant']) == trained_model.similarity('sushi', 'restaurant')
          True

        """
        if not (len(ws1) and len(ws2)):
            raise ZeroDivisionError('At least one of the passed lists is empty.')
        v1 = [self[word] for word in ws1]
        v2 = [self[word] for word in ws2]
        return dot(matutils.unitvec(array(v1).mean(axis=0)), matutils.unitvec(array(v2).mean(axis=0)))

    @staticmethod
    def log_accuracy(section):
        correct, incorrect = len(section['correct']), len(section['incorrect'])
        if correct + incorrect > 0:
            logger.info(
                "%s: %.1f%% (%i/%i)",
                section['section'], 100.0 * correct / (correct + incorrect), correct, correct + incorrect
            )

    def accuracy(self, questions, restrict_vocab=30000, most_similar=most_similar, case_insensitive=True):
        """
        Compute accuracy of the model. `questions` is a filename where lines are
        4-tuples of words, split into sections by ": SECTION NAME" lines.
        See questions-words.txt in
        https://storage.googleapis.com/google-code-archive-source/v2/code.google.com/word2vec/source-archive.zip
        for an example.

        The accuracy is reported (=printed to log and returned as a list) for each
        section separately, plus there's one aggregate summary at the end.

        Use `restrict_vocab` to ignore all questions containing a word not in the first `restrict_vocab`
        words (default 30,000). This may be meaningful if you've sorted the vocabulary by descending frequency.
        In case `case_insensitive` is True, the first `restrict_vocab` words are taken first, and then
        case normalization is performed.

        Use `case_insensitive` to convert all words in questions and vocab to their uppercase form before
        evaluating the accuracy (default True). Useful in case of case-mismatch between training tokens
        and question words. In case of multiple case variants of a single word, the vector for the first
        occurrence (also the most frequent if vocabulary is sorted) is taken.

        This method corresponds to the `compute-accuracy` script of the original C word2vec.

        """
        ok_vocab = [(w, self.vocab[w]) for w in self.index2word[:restrict_vocab]]
        ok_vocab = {w.upper(): v for w, v in reversed(ok_vocab)} if case_insensitive else dict(ok_vocab)

        sections, section = [], None
        for line_no, line in enumerate(utils.smart_open(questions)):
            # TODO: use level3 BLAS (=evaluate multiple questions at once), for speed
            line = utils.to_unicode(line)
            if line.startswith(': '):
                # a new section starts => store the old section
                if section:
                    sections.append(section)
                    self.log_accuracy(section)
                section = {'section': line.lstrip(': ').strip(), 'correct': [], 'incorrect': []}
            else:
                if not section:
                    raise ValueError("missing section header before line #%i in %s" % (line_no, questions))
                try:
                    if case_insensitive:
                        a, b, c, expected = [word.upper() for word in line.split()]
                    else:
                        a, b, c, expected = [word for word in line.split()]
                except ValueError:
                    logger.info("skipping invalid line #%i in %s", line_no, questions)
                    continue
                if a not in ok_vocab or b not in ok_vocab or c not in ok_vocab or expected not in ok_vocab:
                    logger.debug("skipping line #%i with OOV words: %s", line_no, line.strip())
                    continue

                original_vocab = self.vocab
                self.vocab = ok_vocab
                ignore = {a, b, c}  # input words to be ignored
                predicted = None
                # find the most likely prediction, ignoring OOV words and input words
                sims = most_similar(self, positive=[b, c], negative=[a], topn=False, restrict_vocab=restrict_vocab)
                self.vocab = original_vocab
                for index in matutils.argsort(sims, reverse=True):
                    predicted = self.index2word[index].upper() if case_insensitive else self.index2word[index]
                    if predicted in ok_vocab and predicted not in ignore:
                        if predicted != expected:
                            logger.debug("%s: expected %s, predicted %s", line.strip(), expected, predicted)
                        break
                if predicted == expected:
                    section['correct'].append((a, b, c, expected))
                else:
                    section['incorrect'].append((a, b, c, expected))
        if section:
            # store the last section, too
            sections.append(section)
            self.log_accuracy(section)

        total = {
            'section': 'total',
            'correct': sum((s['correct'] for s in sections), []),
            'incorrect': sum((s['incorrect'] for s in sections), []),
        }
        self.log_accuracy(total)
        sections.append(total)
        return sections

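    # Added commentary (illustrative): `questions` is a plain text file in the format of the
    # original questions-words.txt, i.e. section headers and 4-word analogy lines such as
    #
    #     : capital-common-countries
    #     Athens Greece Baghdad Iraq
    #
    # and the call returns one dict per section plus a final 'total' entry:
    #
    #     >>> sections = model.accuracy('questions-words.txt')
    #     >>> sections[-1]['section']
    #     'total'
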
    @staticmethod
    def log_evaluate_word_pairs(pearson, spearman, oov, pairs):
        logger.info('Pearson correlation coefficient against %s: %.4f', pairs, pearson[0])
        logger.info('Spearman rank-order correlation coefficient against %s: %.4f', pairs, spearman[0])
        logger.info('Pairs with unknown words ratio: %.1f%%', oov)

    def evaluate_word_pairs(self, pairs, delimiter='\t', restrict_vocab=300000,
                            case_insensitive=True, dummy4unknown=False):
        """
        Compute correlation of the model with human similarity judgments. `pairs` is a filename of a dataset where
        lines are 3-tuples, each consisting of a word pair and a similarity value, separated by `delimiter`.
        An example dataset is included in Gensim (test/test_data/wordsim353.tsv). More datasets can be found at
        http://technion.ac.il/~ira.leviant/MultilingualVSMdata.html or https://www.cl.cam.ac.uk/~fh295/simlex.html.

        The model is evaluated using Pearson correlation coefficient and Spearman rank-order correlation coefficient
        between the similarities from the dataset and the similarities produced by the model itself.
        The results are printed to log and returned as a triple (pearson, spearman, ratio of pairs with unknown words).

        Use `restrict_vocab` to ignore all word pairs containing a word not in the first `restrict_vocab`
        words (default 300,000). This may be meaningful if you've sorted the vocabulary by descending frequency.
        If `case_insensitive` is True, the first `restrict_vocab` words are taken, and then case normalization
        is performed.

        Use `case_insensitive` to convert all words in the pairs and vocab to their uppercase form before
        evaluating the model (default True). Useful when you expect case-mismatch between training tokens
        and word pairs in the dataset. If there are multiple case variants of a single word, the vector for the first
        occurrence (also the most frequent if vocabulary is sorted) is taken.

        Use `dummy4unknown=True` to produce zero-valued similarities for pairs with out-of-vocabulary words.
        Otherwise (default False), these pairs are skipped entirely.
        """
        ok_vocab = [(w, self.vocab[w]) for w in self.index2word[:restrict_vocab]]
        ok_vocab = {w.upper(): v for w, v in reversed(ok_vocab)} if case_insensitive else dict(ok_vocab)

        similarity_gold = []
        similarity_model = []
        oov = 0

        original_vocab = self.vocab
        self.vocab = ok_vocab

        for line_no, line in enumerate(utils.smart_open(pairs)):
            line = utils.to_unicode(line)
            if line.startswith('#'):
                # May be a comment
                continue
            else:
                try:
                    if case_insensitive:
                        a, b, sim = [word.upper() for word in line.split(delimiter)]
                    else:
                        a, b, sim = [word for word in line.split(delimiter)]
                    sim = float(sim)
                except (ValueError, TypeError):
                    logger.info('skipping invalid line #%d in %s', line_no, pairs)
                    continue
                if a not in ok_vocab or b not in ok_vocab:
                    oov += 1
                    if dummy4unknown:
                        similarity_model.append(0.0)
                        similarity_gold.append(sim)
                        continue
                    else:
                        logger.debug('skipping line #%d with OOV words: %s', line_no, line.strip())
                        continue
                similarity_gold.append(sim)  # Similarity from the dataset
                similarity_model.append(self.similarity(a, b))  # Similarity from the model
        self.vocab = original_vocab
        spearman = stats.spearmanr(similarity_gold, similarity_model)
        pearson = stats.pearsonr(similarity_gold, similarity_model)
        oov_ratio = float(oov) / (len(similarity_gold) + oov) * 100

        logger.debug('Pearson correlation coefficient against %s: %f with p-value %f', pairs, pearson[0], pearson[1])
        logger.debug(
            'Spearman rank-order correlation coefficient against %s: %f with p-value %f',
            pairs, spearman[0], spearman[1]
        )
        logger.debug('Pairs with unknown words: %d', oov)
        self.log_evaluate_word_pairs(pearson, spearman, oov_ratio, pairs)
        return pearson, spearman, oov_ratio

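    # Added commentary (illustrative): `pairs` is a delimiter-separated file of
    # "word1<TAB>word2<TAB>human_score" lines (wordsim353.tsv uses this layout), and the method
    # returns ((pearson_r, pearson_p), (spearman_r, spearman_p), oov_ratio):
    #
    #     >>> pearson, spearman, oov = model.evaluate_word_pairs('wordsim353.tsv')
    #     >>> spearman[0]    # rank correlation between model and human judgements
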
    def init_sims(self, replace=False):
        """
        Precompute L2-normalized vectors.

        If `replace` is set, forget the original vectors and only keep the normalized
        ones (this saves lots of memory!).

        Note that you **cannot continue training** after doing a replace. The model becomes
        effectively read-only: you can call `most_similar`, `similarity` etc., but not `train`.

        """
        if getattr(self, 'syn0norm', None) is None or replace:
            logger.info("precomputing L2-norms of word weight vectors")
            if replace:
                for i in xrange(self.syn0.shape[0]):
                    self.syn0[i, :] /= sqrt((self.syn0[i, :] ** 2).sum(-1))
                self.syn0norm = self.syn0
            else:
                self.syn0norm = (self.syn0 / sqrt((self.syn0 ** 2).sum(-1))[..., newaxis]).astype(REAL)

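    # Added commentary (illustrative): after init_sims() every row of syn0norm has unit L2 norm,
    # so a dot product of two such rows *is* their cosine similarity -- that is why most_similar()
    # and friends can rank all candidates with a single matrix-vector product:
    #
    #     >>> model.init_sims()
    #     >>> np.allclose(np.linalg.norm(model.syn0norm, axis=1), 1.0)
    #     True
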
    def get_keras_embedding(self, train_embeddings=False):
        """
        Return a Keras 'Embedding' layer with weights set as the Word2Vec model's learned word embeddings.
        """
        try:
            from keras.layers import Embedding
        except ImportError:
            raise ImportError("Please install Keras to use this function")
        weights = self.syn0

        # set `trainable` as `False` to use the pretrained word embedding
        # No extra mem usage here as `Embedding` layer doesn't create any new matrix for weights
        layer = Embedding(
            input_dim=weights.shape[0], output_dim=weights.shape[1],
            weights=[weights], trainable=train_embeddings
        )
        return layer

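    # Added usage sketch (illustrative; assumes a working Keras installation and a downstream
    # model built with the standard Sequential API -- not part of the original module):
    #
    #     >>> embedding_layer = model.get_keras_embedding(train_embeddings=False)
    #     >>> from keras.models import Sequential
    #     >>> net = Sequential()
    #     >>> net.add(embedding_layer)   # maps word indices (rows of syn0) to their vectors
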

# For backward compatibility
KeyedVectors = EuclideanKeyedVectors
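
# Minimal end-to-end usage sketch, added for documentation only (the file path and word
# arguments below are placeholders, not shipped data); nothing here runs at import time.
def _usage_example(path_to_vectors, binary=True):
    """Load a word2vec-format file and run a few of the similarity queries shown above."""
    word_vectors = KeyedVectors.load_word2vec_format(path_to_vectors, binary=binary)
    print(word_vectors.most_similar(positive=['woman', 'king'], negative=['man'], topn=3))
    print(word_vectors.similarity('woman', 'man'))
    print(word_vectors.doesnt_match("breakfast cereal dinner lunch".split()))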