#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (C) 2016 Radim Rehurek <me@radimrehurek.com>
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
"""
Warnings
--------
.. deprecated:: 3.3.0
Use :mod:`gensim.models.keyedvectors` instead.
Word vector storage and similarity look-ups.
Common code independent of the way the vectors are trained (Word2Vec, FastText, WordRank, VarEmbed, etc.).
The word vectors are considered read-only in this class.
Initialize the vectors by training e.g. Word2Vec::
>>> model = Word2Vec(sentences, size=100, window=5, min_count=5, workers=4)
>>> word_vectors = model.wv
Persist the word vectors to disk with::
>>> word_vectors.save(fname)
>>> word_vectors = KeyedVectors.load(fname)
The vectors can also be instantiated from an existing file on disk
in the original Google's word2vec C format as a KeyedVectors instance::
>>> from gensim.models.keyedvectors import KeyedVectors
>>> word_vectors = KeyedVectors.load_word2vec_format('/tmp/vectors.txt', binary=False) # C text format
>>> word_vectors = KeyedVectors.load_word2vec_format('/tmp/vectors.bin', binary=True) # C binary format
You can perform various syntactic/semantic NLP word tasks with the vectors. Some of them
are already built-in::
>>> word_vectors.most_similar(positive=['woman', 'king'], negative=['man'])
[('queen', 0.50882536), ...]
>>> word_vectors.most_similar_cosmul(positive=['woman', 'king'], negative=['man'])
[('queen', 0.71382287), ...]
>>> word_vectors.doesnt_match("breakfast cereal dinner lunch".split())
'cereal'
>>> word_vectors.similarity('woman', 'man')
0.73723527
Correlation with human opinion on word similarity::
>>> word_vectors.evaluate_word_pairs(os.path.join(module_path, 'test_data', 'wordsim353.tsv'))
0.51, 0.62, 0.13
And on analogies::
>>> word_vectors.accuracy(os.path.join(module_path, 'test_data', 'questions-words.txt'))
and so on.
"""
from __future__ import division # py3 "true division"
import logging
try:
from queue import Queue, Empty
except ImportError:
from Queue import Queue, Empty # noqa:F401
# If pyemd C extension is available, import it.
# If pyemd is attempted to be used, but isn't installed, ImportError will be raised in wmdistance
try:
from pyemd import emd
PYEMD_EXT = True
except ImportError:
PYEMD_EXT = False
from numpy import dot, zeros, dtype, float32 as REAL,\
double, array, vstack, fromstring, sqrt, newaxis,\
ndarray, sum as np_sum, prod, ascontiguousarray,\
argmax
import numpy as np
from gensim import utils, matutils # utility fnc for pickling, common scipy operations etc
from gensim.corpora.dictionary import Dictionary
from six import string_types, iteritems
from six.moves import xrange
from scipy import stats
logger = logging.getLogger(__name__)
class Vocab(object):
"""
A single vocabulary item, used internally for collecting per-word frequency/sampling info,
and for constructing binary trees (incl. both word leaves and inner nodes).
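A minimal construction sketch (the attributes are whatever keyword arguments the caller passes; `index` and `count` are the ones used throughout this module)::
>>> v = Vocab(index=0, count=25)
>>> v.count, v.index
(25, 0)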
"""
def __init__(self, **kwargs):
self.count = 0
self.__dict__.update(kwargs)
def __lt__(self, other): # used for sorting in a priority queue
return self.count < other.count
def __str__(self):
vals = ['%s:%r' % (key, self.__dict__[key]) for key in sorted(self.__dict__) if not key.startswith('_')]
return "%s(%s)" % (self.__class__.__name__, ', '.join(vals))
class KeyedVectorsBase(utils.SaveLoad):
"""
Base class to contain vectors and vocab for any set of vectors which are each associated with a key.
"""
def __init__(self):
self.syn0 = []
self.vocab = {}
self.index2word = []
self.vector_size = None
def save_word2vec_format(self, fname, fvocab=None, binary=False, total_vec=None):
"""
Store the input-hidden weight matrix in the same format used by the original
C word2vec-tool, for compatibility.
`fname` is the file used to save the vectors in
`fvocab` is an optional file used to save the vocabulary
`binary` is an optional boolean indicating whether the data is to be saved
in binary word2vec format (default: False)
`total_vec` is an optional parameter to explicitly specify total no. of vectors
(in case word vectors are appended with document vectors afterwards)
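Example (a minimal sketch; the paths are hypothetical and `word_vectors` stands for an already trained or loaded instance)::
>>> word_vectors.save_word2vec_format('/tmp/vectors.txt', binary=False)  # C text format
>>> word_vectors.save_word2vec_format('/tmp/vectors.bin', fvocab='/tmp/vocab.txt', binary=True)  # C binary format plus vocab counts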
"""
if total_vec is None:
total_vec = len(self.vocab)
vector_size = self.syn0.shape[1]
if fvocab is not None:
logger.info("storing vocabulary in %s", fvocab)
with utils.smart_open(fvocab, 'wb') as vout:
for word, vocab in sorted(iteritems(self.vocab), key=lambda item: -item[1].count):
vout.write(utils.to_utf8("%s %s\n" % (word, vocab.count)))
logger.info("storing %sx%s projection weights into %s", total_vec, vector_size, fname)
assert (len(self.vocab), vector_size) == self.syn0.shape
with utils.smart_open(fname, 'wb') as fout:
fout.write(utils.to_utf8("%s %s\n" % (total_vec, vector_size)))
# store in sorted order: most frequent words at the top
for word, vocab in sorted(iteritems(self.vocab), key=lambda item: -item[1].count):
row = self.syn0[vocab.index]
if binary:
fout.write(utils.to_utf8(word) + b" " + row.tostring())
else:
fout.write(utils.to_utf8("%s %s\n" % (word, ' '.join("%f" % val for val in row))))
@classmethod
def load_word2vec_format(cls, fname, fvocab=None, binary=False, encoding='utf8', unicode_errors='strict',
limit=None, datatype=REAL):
"""
Load the input-hidden weight matrix from the original C word2vec-tool format.
Note that the information stored in the file is incomplete (the binary tree is missing),
so while you can query for word similarity etc., you cannot continue training
with a model loaded this way.
`binary` is a boolean indicating whether the data is in binary word2vec format.
Word counts are read from `fvocab` filename, if set (this is the file generated
by `-save-vocab` flag of the original C tool).
If you trained the C model using non-utf8 encoding for words, specify that
encoding in `encoding`.
`unicode_errors`, default 'strict', is a string suitable to be passed as the `errors`
argument to the unicode() (Python 2.x) or str() (Python 3.x) function. If your source
file may include word tokens truncated in the middle of a multibyte unicode character
(as is common from the original word2vec.c tool), 'ignore' or 'replace' may help.
`limit` sets a maximum number of word-vectors to read from the file. The default,
None, means read all.
`datatype` (experimental) can coerce dimensions to a non-default float type (such
as np.float16) to save memory. (Such types may result in much slower bulk operations
or incompatibility with optimized routines.)
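Example (a minimal sketch; the paths are hypothetical, mirroring the module-level examples above)::
>>> from gensim.models.deprecated.keyedvectors import KeyedVectors
>>> word_vectors = KeyedVectors.load_word2vec_format('/tmp/vectors.bin', binary=True)
>>> word_vectors = KeyedVectors.load_word2vec_format('/tmp/vectors.txt', binary=False, limit=500000)  # read at most 500k vectors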
"""
counts = None
if fvocab is not None:
logger.info("loading word counts from %s", fvocab)
counts = {}
with utils.smart_open(fvocab) as fin:
for line in fin:
word, count = utils.to_unicode(line).strip().split()
counts[word] = int(count)
logger.info("loading projection weights from %s", fname)
with utils.smart_open(fname) as fin:
header = utils.to_unicode(fin.readline(), encoding=encoding)
vocab_size, vector_size = (int(x) for x in header.split()) # throws for invalid file format
if limit:
vocab_size = min(vocab_size, limit)
result = cls()
result.vector_size = vector_size
result.syn0 = zeros((vocab_size, vector_size), dtype=datatype)
def add_word(word, weights):
word_id = len(result.vocab)
if word in result.vocab:
logger.warning("duplicate word '%s' in %s, ignoring all but first", word, fname)
return
if counts is None:
# most common scenario: no vocab file given. just make up some bogus counts, in descending order
result.vocab[word] = Vocab(index=word_id, count=vocab_size - word_id)
elif word in counts:
# use count from the vocab file
result.vocab[word] = Vocab(index=word_id, count=counts[word])
else:
# vocab file given, but word is missing -- set count to None (TODO: or raise?)
logger.warning("vocabulary file is incomplete: '%s' is missing", word)
result.vocab[word] = Vocab(index=word_id, count=None)
result.syn0[word_id] = weights
result.index2word.append(word)
if binary:
binary_len = dtype(REAL).itemsize * vector_size
for _ in xrange(vocab_size):
# mixed text and binary: read text first, then binary
word = []
while True:
ch = fin.read(1)
if ch == b' ':
break
if ch == b'':
raise EOFError("unexpected end of input; is count incorrect or file otherwise damaged?")
if ch != b'\n': # ignore newlines in front of words (some binary files have them)
word.append(ch)
word = utils.to_unicode(b''.join(word), encoding=encoding, errors=unicode_errors)
weights = fromstring(fin.read(binary_len), dtype=REAL)
add_word(word, weights)
else:
for line_no in xrange(vocab_size):
line = fin.readline()
if line == b'':
raise EOFError("unexpected end of input; is count incorrect or file otherwise damaged?")
parts = utils.to_unicode(line.rstrip(), encoding=encoding, errors=unicode_errors).split(" ")
if len(parts) != vector_size + 1:
raise ValueError("invalid vector on line %s (is this really the text format?)" % line_no)
word, weights = parts[0], [REAL(x) for x in parts[1:]]
add_word(word, weights)
if result.syn0.shape[0] != len(result.vocab):
logger.info(
"duplicate words detected, shrinking matrix size from %i to %i",
result.syn0.shape[0], len(result.vocab)
)
result.syn0 = ascontiguousarray(result.syn0[: len(result.vocab)])
assert (len(result.vocab), vector_size) == result.syn0.shape
logger.info("loaded %s matrix from %s", result.syn0.shape, fname)
return result
def similarity(self, w1, w2):
"""
Compute similarity between vectors of two input words.
To be implemented by child class.
"""
raise NotImplementedError
def distance(self, w1, w2):
"""
Compute distance between vectors of two input words.
To be implemented by child class.
"""
raise NotImplementedError
def distances(self, word_or_vector, other_words=()):
"""
Compute distances from given word or vector to all words in `other_words`.
If `other_words` is empty, return the distance between `word_or_vector` and all words in the vocab.
To be implemented by child class.
"""
raise NotImplementedError
def word_vec(self, word):
"""
Accept a single word as input.
Returns the word's representations in vector space, as a 1D numpy array.
Example::
>>> trained_model.word_vec('office')
array([ -1.40128313e-02, ...])
"""
if word in self.vocab:
result = self.syn0[self.vocab[word].index]
result.setflags(write=False)
return result
else:
raise KeyError("word '%s' not in vocabulary" % word)
def __getitem__(self, words):
"""
Accept a single word or a list of words as input.
If a single word: returns the word's representations in vector space, as
a 1D numpy array.
Multiple words: return the words' representations in vector space, as a
2d numpy array: #words x #vector_size. Matrix rows are in the same order
as in input.
Example::
>>> trained_model['office']
array([ -1.40128313e-02, ...])
>>> trained_model[['office', 'products']]
array([ -1.40128313e-02, ...]
[ -1.70425311e-03, ...]
...)
"""
if isinstance(words, string_types):
# allow calls like trained_model['office'], as a shorthand for trained_model[['office']]
return self.word_vec(words)
return vstack([self.word_vec(word) for word in words])
def __contains__(self, word):
return word in self.vocab
def most_similar_to_given(self, w1, word_list):
"""Return the word from word_list most similar to w1.
Args:
w1 (str): a word
word_list (list): list of words containing a word most similar to w1
Returns:
the word in word_list with the highest similarity to w1
Raises:
KeyError: If w1 or any word in word_list is not in the vocabulary
Example::
>>> trained_model.most_similar_to_given('music', ['water', 'sound', 'backpack', 'mouse'])
'sound'
>>> trained_model.most_similar_to_given('snake', ['food', 'pencil', 'animal', 'phone'])
'animal'
"""
return word_list[argmax([self.similarity(w1, word) for word in word_list])]
def words_closer_than(self, w1, w2):
"""
Returns all words that are closer to `w1` than `w2` is to `w1`.
Parameters
----------
w1 : str
Input word.
w2 : str
Input word.
Returns
-------
list (str)
List of words that are closer to `w1` than `w2` is to `w1`.
Examples
--------
>>> model.words_closer_than('carnivore.n.01', 'mammal.n.01')
['dog.n.01', 'canine.n.02']
"""
all_distances = self.distances(w1)
w1_index = self.vocab[w1].index
w2_index = self.vocab[w2].index
closer_node_indices = np.where(all_distances < all_distances[w2_index])[0]
return [self.index2word[index] for index in closer_node_indices if index != w1_index]
def rank(self, w1, w2):
"""
Rank of the distance of `w2` from `w1`, in relation to distances of all words from `w1`.
Parameters
----------
w1 : str
Input word.
w2 : str
Input word.
Returns
-------
int
Rank of `w2` from `w1` in relation to all other nodes.
Examples
--------
>>> model.rank('mammal.n.01', 'carnivore.n.01')
3
"""
return len(self.words_closer_than(w1, w2)) + 1
class EuclideanKeyedVectors(KeyedVectorsBase):
"""
Class to contain vectors and vocab for the Word2Vec training class and other w2v methods not directly
involved in training, such as most_similar()
"""
def __init__(self):
super(EuclideanKeyedVectors, self).__init__()
self.syn0norm = None
@property
def wv(self):
return self
def save(self, *args, **kwargs):
# don't bother storing the cached normalized vectors
kwargs['ignore'] = kwargs.get('ignore', ['syn0norm'])
super(EuclideanKeyedVectors, self).save(*args, **kwargs)
def word_vec(self, word, use_norm=False):
"""
Accept a single word as input.
Returns the word's representations in vector space, as a 1D numpy array.
If `use_norm` is True, returns the normalized word vector.
Example::
>>> trained_model['office']
array([ -1.40128313e-02, ...])
"""
if word in self.vocab:
if use_norm:
result = self.syn0norm[self.vocab[word].index]
else:
result = self.syn0[self.vocab[word].index]
result.setflags(write=False)
return result
else:
raise KeyError("word '%s' not in vocabulary" % word)
def most_similar(self, positive=None, negative=None, topn=10, restrict_vocab=None, indexer=None):
"""
Find the top-N most similar words. Positive words contribute positively towards the
similarity, negative words negatively.
This method computes cosine similarity between a simple mean of the projection
weight vectors of the given words and the vectors for each word in the model.
The method corresponds to the `word-analogy` and `distance` scripts in the original
word2vec implementation.
If topn is False, most_similar returns the vector of similarity scores.
`restrict_vocab` is an optional integer which limits the range of vectors which
are searched for most-similar values. For example, restrict_vocab=10000 would
only check the first 10000 word vectors in the vocabulary order. (This may be
meaningful if you've sorted the vocabulary by descending frequency.)
Example::
>>> trained_model.most_similar(positive=['woman', 'king'], negative=['man'])
[('queen', 0.50882536), ...]
"""
if positive is None:
positive = []
if negative is None:
negative = []
self.init_sims()
if isinstance(positive, string_types) and not negative:
# allow calls like most_similar('dog'), as a shorthand for most_similar(['dog'])
positive = [positive]
# add weights for each word, if not already present; default to 1.0 for positive and -1.0 for negative words
positive = [
(word, 1.0) if isinstance(word, string_types + (ndarray,)) else word
for word in positive
]
negative = [
(word, -1.0) if isinstance(word, string_types + (ndarray,)) else word
for word in negative
]
# compute the weighted average of all words
all_words, mean = set(), []
for word, weight in positive + negative:
if isinstance(word, ndarray):
mean.append(weight * word)
else:
mean.append(weight * self.word_vec(word, use_norm=True))
if word in self.vocab:
all_words.add(self.vocab[word].index)
if not mean:
raise ValueError("cannot compute similarity with no input")
mean = matutils.unitvec(array(mean).mean(axis=0)).astype(REAL)
if indexer is not None:
return indexer.most_similar(mean, topn)
limited = self.syn0norm if restrict_vocab is None else self.syn0norm[:restrict_vocab]
dists = dot(limited, mean)
if not topn:
return dists
best = matutils.argsort(dists, topn=topn + len(all_words), reverse=True)
# ignore (don't return) words from the input
result = [(self.index2word[sim], float(dists[sim])) for sim in best if sim not in all_words]
return result[:topn]
def similar_by_word(self, word, topn=10, restrict_vocab=None):
"""
Find the top-N most similar words.
If topn is False, similar_by_word returns the vector of similarity scores.
`restrict_vocab` is an optional integer which limits the range of vectors which
are searched for most-similar values. For example, restrict_vocab=10000 would
only check the first 10000 word vectors in the vocabulary order. (This may be
meaningful if you've sorted the vocabulary by descending frequency.)
Example::
>>> trained_model.similar_by_word('graph')
[('user', 0.9999163150787354), ...]
"""
return self.most_similar(positive=[word], topn=topn, restrict_vocab=restrict_vocab)
def similar_by_vector(self, vector, topn=10, restrict_vocab=None):
"""
Find the top-N most similar words by vector.
If topn is False, similar_by_vector returns the vector of similarity scores.
`restrict_vocab` is an optional integer which limits the range of vectors which
are searched for most-similar values. For example, restrict_vocab=10000 would
only check the first 10000 word vectors in the vocabulary order. (This may be
meaningful if you've sorted the vocabulary by descending frequency.)
Example::
>>> trained_model.similar_by_vector([1,2])
[('survey', 0.9942699074745178), ...]
"""
return self.most_similar(positive=[vector], topn=topn, restrict_vocab=restrict_vocab)
def wmdistance(self, document1, document2):
"""
Compute the Word Mover's Distance between two documents. When using this
code, please consider citing the following papers:
.. Ofir Pele and Michael Werman, "A linear time histogram metric for improved SIFT matching".
.. Ofir Pele and Michael Werman, "Fast and robust earth mover's distances".
.. Matt Kusner et al. "From Word Embeddings To Document Distances".
Note that if one of the documents has no words that exist in the
Word2Vec vocab, `float('inf')` (i.e. infinity) will be returned.
This method only works if `pyemd` is installed (can be installed via pip, but requires a C compiler).
Example:
>>> # Train word2vec model.
>>> model = Word2Vec(sentences)
>>> # Some sentences to test.
>>> sentence_obama = 'Obama speaks to the media in Illinois'.lower().split()
>>> sentence_president = 'The president greets the press in Chicago'.lower().split()
>>> # Remove their stopwords.
>>> from nltk.corpus import stopwords
>>> stopwords = stopwords.words('english')
>>> sentence_obama = [w for w in sentence_obama if w not in stopwords]
>>> sentence_president = [w for w in sentence_president if w not in stopwords]
>>> # Compute WMD.
>>> distance = model.wmdistance(sentence_obama, sentence_president)
"""
if not PYEMD_EXT:
raise ImportError("Please install pyemd Python package to compute WMD.")
# Remove out-of-vocabulary words.
len_pre_oov1 = len(document1)
len_pre_oov2 = len(document2)
document1 = [token for token in document1 if token in self]
document2 = [token for token in document2 if token in self]
diff1 = len_pre_oov1 - len(document1)
diff2 = len_pre_oov2 - len(document2)
if diff1 > 0 or diff2 > 0:
logger.info('Removed %d and %d OOV words from document 1 and 2 (respectively).', diff1, diff2)
if len(document1) == 0 or len(document2) == 0:
logger.info(
"At least one of the documents had no words that were in the vocabulary. "
"Aborting (returning inf)."
)
return float('inf')
dictionary = Dictionary(documents=[document1, document2])
vocab_len = len(dictionary)
if vocab_len == 1:
# Both documents are composed of a single unique token
return 0.0
# Sets for faster look-up.
docset1 = set(document1)
docset2 = set(document2)
# Compute distance matrix.
distance_matrix = zeros((vocab_len, vocab_len), dtype=double)
for i, t1 in dictionary.items():
for j, t2 in dictionary.items():
if t1 not in docset1 or t2 not in docset2:
continue
# Compute Euclidean distance between word vectors.
distance_matrix[i, j] = sqrt(np_sum((self[t1] - self[t2])**2))
if np_sum(distance_matrix) == 0.0:
# `emd` gets stuck if the distance matrix contains only zeros.
logger.info('The distance matrix is all zeros. Aborting (returning inf).')
return float('inf')
def nbow(document):
d = zeros(vocab_len, dtype=double)
nbow = dictionary.doc2bow(document) # Word frequencies.
doc_len = len(document)
for idx, freq in nbow:
d[idx] = freq / float(doc_len) # Normalized word frequencies.
return d
# Compute nBOW representation of documents.
d1 = nbow(document1)
d2 = nbow(document2)
# Compute WMD.
return emd(d1, d2, distance_matrix)
def most_similar_cosmul(self, positive=None, negative=None, topn=10):
"""
Find the top-N most similar words, using the multiplicative combination objective
proposed by Omer Levy and Yoav Goldberg in [4]_. Positive words still contribute
positively towards the similarity, negative words negatively, but with less
susceptibility to one large distance dominating the calculation.
In the common analogy-solving case, of two positive and one negative examples,
this method is equivalent to the "3CosMul" objective (equation (4)) of Levy and Goldberg.
Additional positive or negative examples contribute to the numerator or denominator,
respectively, a potentially sensible but untested extension of the method. (With
a single positive example, rankings will be the same as in the default most_similar.)
Example::
>>> trained_model.most_similar_cosmul(positive=['baghdad', 'england'], negative=['london'])
[(u'iraq', 0.8488819003105164), ...]
.. [4] Omer Levy and Yoav Goldberg. Linguistic Regularities in Sparse and Explicit Word Representations, 2014.
"""
if positive is None:
positive = []
if negative is None:
negative = []
self.init_sims()
if isinstance(positive, string_types) and not negative:
# allow calls like most_similar_cosmul('dog'), as a shorthand for most_similar_cosmul(['dog'])
positive = [positive]
all_words = {
self.vocab[word].index for word in positive + negative
if not isinstance(word, ndarray) and word in self.vocab
}
positive = [
self.word_vec(word, use_norm=True) if isinstance(word, string_types) else word
for word in positive
]
negative = [
self.word_vec(word, use_norm=True) if isinstance(word, string_types) else word
for word in negative
]
if not positive:
raise ValueError("cannot compute similarity with no input")
# equation (4) of Levy & Goldberg "Linguistic Regularities...",
# with distances shifted to [0,1] per footnote (7)
pos_dists = [((1 + dot(self.syn0norm, term)) / 2) for term in positive]
neg_dists = [((1 + dot(self.syn0norm, term)) / 2) for term in negative]
dists = prod(pos_dists, axis=0) / (prod(neg_dists, axis=0) + 0.000001)
if not topn:
return dists
best = matutils.argsort(dists, topn=topn + len(all_words), reverse=True)
# ignore (don't return) words from the input
result = [(self.index2word[sim], float(dists[sim])) for sim in best if sim not in all_words]
return result[:topn]
def doesnt_match(self, words):
"""
Which word from the given list doesn't go with the others?
Example::
>>> trained_model.doesnt_match("breakfast cereal dinner lunch".split())
'cereal'
"""
self.init_sims()
used_words = [word for word in words if word in self]
if len(used_words) != len(words):
ignored_words = set(words) - set(used_words)
logger.warning("vectors for words %s are not present in the model, ignoring these words", ignored_words)
if not used_words:
raise ValueError("cannot select a word from an empty list")
vectors = vstack([self.word_vec(word, use_norm=True) for word in used_words]).astype(REAL)
mean = matutils.unitvec(vectors.mean(axis=0)).astype(REAL)
dists = dot(vectors, mean)
return sorted(zip(dists, used_words))[0][1]
@staticmethod
def cosine_similarities(vector_1, vectors_all):
"""
Return cosine similarities between one vector and a set of other vectors.
Parameters
----------
vector_1 : numpy.array
vector from which similarities are to be computed.
expected shape (dim,)
vectors_all : numpy.array
for each row in vectors_all, distance from vector_1 is computed.
expected shape (num_vectors, dim)
Returns
-------
numpy.array
Contains cosine similarity between `vector_1` and each row in `vectors_all`.
shape (num_vectors,)
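Examples
--------
A minimal sketch with toy vectors (not trained embeddings).
>>> import numpy as np
>>> EuclideanKeyedVectors.cosine_similarities(np.array([1.0, 0.0]), np.array([[1.0, 0.0], [0.0, 1.0]]))
array([ 1.,  0.])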
"""
norm = np.linalg.norm(vector_1)
all_norms = np.linalg.norm(vectors_all, axis=1)
dot_products = dot(vectors_all, vector_1)
similarities = dot_products / (norm * all_norms)
return similarities
def distances(self, word_or_vector, other_words=()):
"""
Compute cosine distances from given word or vector to all words in `other_words`.
If `other_words` is empty, return the distance between `word_or_vector` and all words in the vocab.
Parameters
----------
word_or_vector : str or numpy.array
Word or vector from which distances are to be computed.
other_words : iterable(str) or None
For each word in `other_words` distance from `word_or_vector` is computed.
If None or empty, distance of `word_or_vector` from all words in vocab is computed (including itself).
Returns
-------
numpy.array
Array containing distances to all words in `other_words` from input `word_or_vector`,
in the same order as `other_words`.
Notes
-----
Raises KeyError if either `word_or_vector` or any word in `other_words` is absent from vocab.
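Examples
--------
A minimal usage sketch (assuming `trained_model` contains the listed words).
>>> trained_model.distances('woman', ['man', 'queen']).shape  # one cosine distance per word in `other_words`
(2,)
>>> trained_model.distances('woman').shape == (len(trained_model.vocab),)  # empty `other_words` means the whole vocab
True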
"""
if isinstance(word_or_vector, string_types):
input_vector = self.word_vec(word_or_vector)
else:
input_vector = word_or_vector
if not other_words:
other_vectors = self.syn0
else:
other_indices = [self.vocab[word].index for word in other_words]
other_vectors = self.syn0[other_indices]
return 1 - self.cosine_similarities(input_vector, other_vectors)
def distance(self, w1, w2):
"""
Compute cosine distance between two words.
Example::
>>> trained_model.distance('woman', 'man')
0.34
>>> trained_model.distance('woman', 'woman')
0.0
"""
return 1 - self.similarity(w1, w2)
def similarity(self, w1, w2):
"""
Compute cosine similarity between two words.
Example::
>>> trained_model.similarity('woman', 'man')
0.73723527
>>> trained_model.similarity('woman', 'woman')
1.0
"""
return dot(matutils.unitvec(self[w1]), matutils.unitvec(self[w2]))
def n_similarity(self, ws1, ws2):
"""
Compute cosine similarity between two sets of words.
Example::
>>> trained_model.n_similarity(['sushi', 'shop'], ['japanese', 'restaurant'])
0.61540466561049689
>>> trained_model.n_similarity(['restaurant', 'japanese'], ['japanese', 'restaurant'])
1.0000000000000004
>>> trained_model.n_similarity(['sushi'], ['restaurant']) == trained_model.similarity('sushi', 'restaurant')
True
"""
if not(len(ws1) and len(ws2)):
raise ZeroDivisionError('At least one of the passed lists is empty.')
v1 = [self[word] for word in ws1]
v2 = [self[word] for word in ws2]
return dot(matutils.unitvec(array(v1).mean(axis=0)), matutils.unitvec(array(v2).mean(axis=0)))
@staticmethod
def log_accuracy(section):
correct, incorrect = len(section['correct']), len(section['incorrect'])
if correct + incorrect > 0:
logger.info(
"%s: %.1f%% (%i/%i)",
section['section'], 100.0 * correct / (correct + incorrect), correct, correct + incorrect
)
def accuracy(self, questions, restrict_vocab=30000, most_similar=most_similar, case_insensitive=True):
"""
Compute accuracy of the model. `questions` is a filename where lines are
4-tuples of words, split into sections by ": SECTION NAME" lines.
See questions-words.txt in
https://storage.googleapis.com/google-code-archive-source/v2/code.google.com/word2vec/source-archive.zip
for an example.
The accuracy is reported (=printed to log and returned as a list) for each
section separately, plus there's one aggregate summary at the end.
Use `restrict_vocab` to ignore all questions containing a word not in the first `restrict_vocab`
words (default 30,000). This may be meaningful if you've sorted the vocabulary by descending frequency.
In case `case_insensitive` is True, the first `restrict_vocab` words are taken first, and then
case normalization is performed.
Use `case_insensitive` to convert all words in questions and vocab to their uppercase form before
evaluating the accuracy (default True). Useful in case of case-mismatch between training tokens
and question words. In case of multiple case variants of a single word, the vector for the first
occurrence (also the most frequent if vocabulary is sorted) is taken.
This method corresponds to the `compute-accuracy` script of the original C word2vec.
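Example (a minimal sketch; the path to the analogy file is hypothetical and `trained_model` stands for an already loaded instance)::
>>> sections = trained_model.accuracy('/tmp/questions-words.txt')
>>> sections[-1]['section']  # the aggregate summary is appended last
'total'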
"""
ok_vocab = [(w, self.vocab[w]) for w in self.index2word[:restrict_vocab]]
ok_vocab = {w.upper(): v for w, v in reversed(ok_vocab)} if case_insensitive else dict(ok_vocab)
sections, section = [], None
for line_no, line in enumerate(utils.smart_open(questions)):
# TODO: use level3 BLAS (=evaluate multiple questions at once), for speed
line = utils.to_unicode(line)
if line.startswith(': '):
# a new section starts => store the old section
if section:
sections.append(section)
self.log_accuracy(section)
section = {'section': line.lstrip(': ').strip(), 'correct': [], 'incorrect': []}
else:
if not section:
raise ValueError("missing section header before line #%i in %s" % (line_no, questions))
try:
if case_insensitive:
a, b, c, expected = [word.upper() for word in line.split()]
else:
a, b, c, expected = [word for word in line.split()]
except ValueError:
logger.info("skipping invalid line #%i in %s", line_no, questions)
continue
if a not in ok_vocab or b not in ok_vocab or c not in ok_vocab or expected not in ok_vocab:
logger.debug("skipping line #%i with OOV words: %s", line_no, line.strip())
continue
original_vocab = self.vocab
self.vocab = ok_vocab
ignore = {a, b, c} # input words to be ignored
predicted = None
# find the most likely prediction, ignoring OOV words and input words
sims = most_similar(self, positive=[b, c], negative=[a], topn=False, restrict_vocab=restrict_vocab)
self.vocab = original_vocab
for index in matutils.argsort(sims, reverse=True):
predicted = self.index2word[index].upper() if case_insensitive else self.index2word[index]
if predicted in ok_vocab and predicted not in ignore:
if predicted != expected:
logger.debug("%s: expected %s, predicted %s", line.strip(), expected, predicted)
break
if predicted == expected:
section['correct'].append((a, b, c, expected))
else:
section['incorrect'].append((a, b, c, expected))
if section:
# store the last section, too
sections.append(section)
self.log_accuracy(section)
total = {
'section': 'total',
'correct': sum((s['correct'] for s in sections), []),
'incorrect': sum((s['incorrect'] for s in sections), []),
}
self.log_accuracy(total)
sections.append(total)
return sections
@staticmethod
def log_evaluate_word_pairs(pearson, spearman, oov, pairs):
logger.info('Pearson correlation coefficient against %s: %.4f', pairs, pearson[0])
logger.info('Spearman rank-order correlation coefficient against %s: %.4f', pairs, spearman[0])
logger.info('Pairs with unknown words ratio: %.1f%%', oov)
def evaluate_word_pairs(self, pairs, delimiter='\t', restrict_vocab=300000,
case_insensitive=True, dummy4unknown=False):
"""
Compute correlation of the model with human similarity judgments. `pairs` is a filename of a dataset where
lines are 3-tuples, each consisting of a word pair and a similarity value, separated by `delimiter`.
An example dataset is included in Gensim (test/test_data/wordsim353.tsv). More datasets can be found at
http://technion.ac.il/~ira.leviant/MultilingualVSMdata.html or https://www.cl.cam.ac.uk/~fh295/simlex.html.
The model is evaluated using Pearson correlation coefficient and Spearman rank-order correlation coefficient
between the similarities from the dataset and the similarities produced by the model itself.
The results are printed to log and returned as a triple (pearson, spearman, ratio of pairs with unknown words).
Use `restrict_vocab` to ignore all word pairs containing a word not in the first `restrict_vocab`
words (default 300,000). This may be meaningful if you've sorted the vocabulary by descending frequency.
If `case_insensitive` is True, the first `restrict_vocab` words are taken, and then case normalization
is performed.
Use `case_insensitive` to convert all words in the pairs and vocab to their uppercase form before
evaluating the model (default True). Useful when you expect case-mismatch between training tokens
and words pairs in the dataset. If there are multiple case variants of a single word, the vector for the first
occurrence (also the most frequent if vocabulary is sorted) is taken.
Use `dummy4unknown=True` to produce zero-valued similarities for pairs with out-of-vocabulary words.
Otherwise (default False), these pairs are skipped entirely.
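Example (a minimal sketch; the path is hypothetical and the commented numbers are the illustrative ones from the module docstring)::
>>> pearson, spearman, oov_ratio = trained_model.evaluate_word_pairs('/tmp/wordsim353.tsv')
>>> pearson[0], spearman[0], oov_ratio  # e.g. (0.51, 0.62, 0.13); pearson[1] and spearman[1] hold the p-values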
"""
ok_vocab = [(w, self.vocab[w]) for w in self.index2word[:restrict_vocab]]
ok_vocab = {w.upper(): v for w, v in reversed(ok_vocab)} if case_insensitive else dict(ok_vocab)
similarity_gold = []
similarity_model = []
oov = 0
original_vocab = self.vocab
self.vocab = ok_vocab
for line_no, line in enumerate(utils.smart_open(pairs)):
line = utils.to_unicode(line)
if line.startswith('#'):
# May be a comment
continue
else:
try:
if case_insensitive:
a, b, sim = [word.upper() for word in line.split(delimiter)]
else:
a, b, sim = [word for word in line.split(delimiter)]
sim = float(sim)
except (ValueError, TypeError):
logger.info('skipping invalid line #%d in %s', line_no, pairs)
continue
if a not in ok_vocab or b not in ok_vocab:
oov += 1
if dummy4unknown:
similarity_model.append(0.0)
similarity_gold.append(sim)
continue
else:
logger.debug('skipping line #%d with OOV words: %s', line_no, line.strip())
continue
similarity_gold.append(sim) # Similarity from the dataset
similarity_model.append(self.similarity(a, b)) # Similarity from the model
self.vocab = original_vocab
spearman = stats.spearmanr(similarity_gold, similarity_model)
pearson = stats.pearsonr(similarity_gold, similarity_model)
oov_ratio = float(oov) / (len(similarity_gold) + oov) * 100
logger.debug('Pearson correlation coefficient against %s: %f with p-value %f', pairs, pearson[0], pearson[1])
logger.debug(
'Spearman rank-order correlation coefficient against %s: %f with p-value %f',
pairs, spearman[0], spearman[1]
)
logger.debug('Pairs with unknown words: %d', oov)
self.log_evaluate_word_pairs(pearson, spearman, oov_ratio, pairs)
return pearson, spearman, oov_ratio
def init_sims(self, replace=False):
"""
Precompute L2-normalized vectors.
If `replace` is set, forget the original vectors and only keep the normalized
ones, which saves lots of memory!
Note that you **cannot continue training** after doing a replace. The model becomes
effectively read-only: you can call `most_similar`, `similarity` etc., but not `train`.
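A minimal usage sketch (`trained_model` stands for an already loaded instance)::
>>> trained_model.init_sims(replace=True)  # L2-normalize in place and drop the raw vectors
>>> trained_model.most_similar('woman')  # similarity queries still work; further training would not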
"""
if getattr(self, 'syn0norm', None) is None or replace:
logger.info("precomputing L2-norms of word weight vectors")
if replace:
for i in xrange(self.syn0.shape[0]):
self.syn0[i, :] /= sqrt((self.syn0[i, :] ** 2).sum(-1))
self.syn0norm = self.syn0
else:
self.syn0norm = (self.syn0 / sqrt((self.syn0 ** 2).sum(-1))[..., newaxis]).astype(REAL)
def get_keras_embedding(self, train_embeddings=False):
"""
Return a Keras 'Embedding' layer with weights set as the Word2Vec model's learned word embeddings
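A minimal usage sketch (requires Keras; `trained_model` stands for an already loaded instance, and `input_dim` mirrors the constructor argument)::
>>> embedding_layer = trained_model.get_keras_embedding(train_embeddings=False)
>>> embedding_layer.input_dim == len(trained_model.vocab)
True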
"""
try:
from keras.layers import Embedding
except ImportError:
raise ImportError("Please install Keras to use this function")
weights = self.syn0
# set `trainable` as `False` to use the pretrained word embedding
# No extra mem usage here as `Embedding` layer doesn't create any new matrix for weights
layer = Embedding(
input_dim=weights.shape[0], output_dim=weights.shape[1],
weights=[weights], trainable=train_embeddings
)
return layer
# For backward compatibility
KeyedVectors = EuclideanKeyedVectors