#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Author: Jayant Jain <jayantjain1992@gmail.com>
# Copyright (C) 2017 Radim Rehurek <me@radimrehurek.com>
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html

"""Python implementation of Poincaré Embeddings.

These embeddings are better at capturing latent hierarchical information than traditional Euclidean embeddings.
The method is described in detail in `Maximilian Nickel, Douwe Kiela -
"Poincaré Embeddings for Learning Hierarchical Representations" <https://arxiv.org/abs/1705.08039>`_.

The main use-case is to automatically learn hierarchical representations of nodes from a tree-like structure,
such as a Directed Acyclic Graph (DAG), using a transitive closure of the relations. Representations of nodes in a
symmetric graph can also be learned.

This module allows training Poincaré Embeddings from a training file containing relations of a graph in a
csv-like format, or from a Python iterable of relations.


Examples
--------

Initialize and train a model from a list:

>>> from gensim.models.poincare import PoincareModel
>>> relations = [('kangaroo', 'marsupial'), ('kangaroo', 'mammal'), ('gib', 'cat')]
>>> model = PoincareModel(relations, negative=2)
>>> model.train(epochs=50)

Initialize and train a model from a file containing one relation per line:

>>> from gensim.models.poincare import PoincareModel, PoincareRelations
>>> from gensim.test.utils import datapath
>>> file_path = datapath('poincare_hypernyms.tsv')
>>> model = PoincareModel(PoincareRelations(file_path), negative=2)
>>> model.train(epochs=50)

"""

import csv
import logging
import sys
import time

import numpy as np
from collections import defaultdict, Counter
from numpy import random as np_random
from scipy.stats import spearmanr
from six import string_types
from smart_open import smart_open

from gensim import utils, matutils
from gensim.models.keyedvectors import Vocab, BaseKeyedVectors
from gensim.models.utils_any2vec import _save_word2vec_format, _load_word2vec_format
from numpy import float32 as REAL

try:
    from autograd import grad  # Only required for optionally verifying gradients while training
    from autograd import numpy as grad_np
    AUTOGRAD_PRESENT = True
except ImportError:
    AUTOGRAD_PRESENT = False

logger = logging.getLogger(__name__)


class PoincareModel(utils.SaveLoad):
    """Train, use and evaluate Poincare Embeddings.

    The model can be stored/loaded via its :meth:`~gensim.models.poincare.PoincareModel.save`
    and :meth:`~gensim.models.poincare.PoincareModel.load` methods, or stored/loaded in the word2vec format
    via `model.kv.save_word2vec_format` and :meth:`~gensim.models.poincare.PoincareKeyedVectors.load_word2vec_format`.

    Notes
    -----
    Training cannot be resumed from a model loaded via `load_word2vec_format`; if you wish to train further,
    use the :meth:`~gensim.models.poincare.PoincareModel.save` and :meth:`~gensim.models.poincare.PoincareModel.load`
    methods instead.

    An important attribute (that provides a lot of additional functionality when accessed directly) is the
    keyed vectors:

    self.kv : :class:`~gensim.models.poincare.PoincareKeyedVectors`
        This object essentially contains the mapping between nodes and embeddings, as well as the vocabulary
        of the model (the set of unique nodes seen by the model). After training, it can be used to perform
        operations on the vectors such as vector lookup, distance and similarity calculations etc.
        See the documentation of its class for usage examples.

    """
    def __init__(self, train_data, size=50, alpha=0.1, negative=10, workers=1, epsilon=1e-5, regularization_coeff=1.0,
                 burn_in=10, burn_in_alpha=0.01, init_range=(-0.001, 0.001), dtype=np.float64, seed=0):
        """Initialize and train a Poincare embedding model from an iterable of relations.

        Parameters
        ----------
        train_data : {iterable of (str, str), :class:`gensim.models.poincare.PoincareRelations`}
            Iterable of relations, e.g. a list of tuples, or a :class:`gensim.models.poincare.PoincareRelations`
            instance streaming from a file. Note that the relations are treated as ordered pairs,
            i.e. a relation (a, b) does not imply the opposite relation (b, a). In case the relations are symmetric,
            the data should contain both relations (a, b) and (b, a).
        size : int, optional
            Number of dimensions of the trained model.
        alpha : float, optional
            Learning rate for training.
        negative : int, optional
            Number of negative samples to use.
        workers : int, optional
            Number of threads to use for training the model.
        epsilon : float, optional
            Constant used for clipping embeddings below a norm of one.
        regularization_coeff : float, optional
            Coefficient used for l2-regularization while training (0 effectively disables regularization).
        burn_in : int, optional
            Number of epochs to use for burn-in initialization (0 means no burn-in).
        burn_in_alpha : float, optional
            Learning rate for burn-in initialization, ignored if `burn_in` is 0.
        init_range : 2-tuple (float, float)
            Range within which the vectors are randomly initialized.
        dtype : numpy.dtype
            The numpy dtype to use for the vectors in the model (numpy.float64, numpy.float32 etc).
            Using lower-precision floats may be useful in increasing training speed and reducing memory usage.
        seed : int, optional
            Seed for the random number generator, to ensure reproducibility.

        Examples
        --------
        Initialize a model from a list:

        >>> from gensim.models.poincare import PoincareModel
        >>> relations = [('kangaroo', 'marsupial'), ('kangaroo', 'mammal'), ('gib', 'cat')]
        >>> model = PoincareModel(relations, negative=2)

        Initialize a model from a file containing one relation per line:

        >>> from gensim.models.poincare import PoincareModel, PoincareRelations
        >>> from gensim.test.utils import datapath
        >>> file_path = datapath('poincare_hypernyms.tsv')
        >>> model = PoincareModel(PoincareRelations(file_path), negative=2)

        See :class:`~gensim.models.poincare.PoincareRelations` for more options.

        """
        self.train_data = train_data
        self.kv = PoincareKeyedVectors(size)
        self.size = size
        self.train_alpha = alpha  # Learning rate for training
        self.burn_in_alpha = burn_in_alpha  # Learning rate for burn-in
        self.alpha = alpha  # Current learning rate
        self.negative = negative
        self.workers = workers
        self.epsilon = epsilon
        self.regularization_coeff = regularization_coeff
        self.burn_in = burn_in
        self._burn_in_done = False
        self.dtype = dtype
        self.seed = seed
        self._np_random = np_random.RandomState(seed)
        self.init_range = init_range
        self._loss_grad = None
        self._load_relations()
        self._init_embeddings()

    def _load_relations(self):
        """Load relations from the train data and build vocab."""
        vocab = {}
        index2word = []
        all_relations = []  # List of all relation pairs
        node_relations = defaultdict(set)  # Mapping from node index to its related node indices

        logger.info("loading relations from train data..")
        for relation in self.train_data:
            if len(relation) != 2:
                raise ValueError('Relation pair "%s" should have exactly two items' % repr(relation))
            for item in relation:
                if item in vocab:
                    vocab[item].count += 1
                else:
                    vocab[item] = Vocab(count=1, index=len(index2word))
                    index2word.append(item)
            node_1, node_2 = relation
            node_1_index, node_2_index = vocab[node_1].index, vocab[node_2].index
            node_relations[node_1_index].add(node_2_index)
            relation = (node_1_index, node_2_index)
            all_relations.append(relation)
        logger.info("loaded %d relations from train data, %d nodes", len(all_relations), len(vocab))
        self.kv.vocab = vocab
        self.kv.index2word = index2word
        self.indices_set = set(range(len(index2word)))  # Set of all node indices
        self.indices_array = np.array(range(len(index2word)))  # Numpy array of all node indices
        self.all_relations = all_relations
        self.node_relations = node_relations
        self._init_node_probabilities()
        self._negatives_buffer = NegativesBuffer([])  # Buffer for negative samples, to reduce calls to sampling method
        self._negatives_buffer_size = 2000

    def _init_embeddings(self):
        """Randomly initialize vectors for the items in the vocab."""
        shape = (len(self.kv.index2word), self.size)
        self.kv.syn0 = self._np_random.uniform(self.init_range[0], self.init_range[1], shape).astype(self.dtype)

    def _init_node_probabilities(self):
        """Initialize a-priori probabilities."""
        counts = np.array(
            [self.kv.vocab[self.kv.index2word[i]].count for i in range(len(self.kv.index2word))],
            dtype=np.float64)
        self._node_counts_cumsum = np.cumsum(counts)
        self._node_probabilities = counts / counts.sum()
    def _get_candidate_negatives(self):
        """Get candidate negatives of size `self.negative` from the negative examples buffer.

        Returns
        -------
        numpy.array
            Array of shape (`self.negative`,) containing indices of negative nodes.

        """
        if self._negatives_buffer.num_items() < self.negative:
            # A cumsum table of counts is used instead of the standard approach of a probability cumsum table,
            # to avoid the floating point errors that result when the number of nodes is very high.
            # For reference: https://github.com/RaRe-Technologies/gensim/issues/1917
            max_cumsum_value = self._node_counts_cumsum[-1]
            uniform_numbers = self._np_random.randint(1, max_cumsum_value + 1, self._negatives_buffer_size)
            cumsum_table_indices = np.searchsorted(self._node_counts_cumsum, uniform_numbers)
            self._negatives_buffer = NegativesBuffer(cumsum_table_indices)
        return self._negatives_buffer.get_items(self.negative)

    def _sample_negatives(self, node_index):
        """Get a sample of negatives for the given node.

        Parameters
        ----------
        node_index : int
            Index of the positive node for which negative samples are to be returned.

        Returns
        -------
        numpy.array
            Array of shape (self.negative,) containing indices of negative nodes for the given node index.

        """
        node_relations = self.node_relations[node_index]
        num_remaining_nodes = len(self.kv.vocab) - len(node_relations)
        if num_remaining_nodes < self.negative:
            raise ValueError(
                'Cannot sample %d negative nodes from a set of %d possible negative nodes for %s' %
                (self.negative, num_remaining_nodes, self.kv.index2word[node_index])
            )

        positive_fraction = float(len(node_relations)) / len(self.kv.vocab)
        if positive_fraction < 0.01:
            # If the number of positive relations is a small fraction of the total nodes,
            # re-sample until no positively connected nodes are chosen.
            indices = self._get_candidate_negatives()
            unique_indices = set(indices)
            times_sampled = 1
            while (len(indices) != len(unique_indices)) or (unique_indices & node_relations):
                times_sampled += 1
                indices = self._get_candidate_negatives()
                unique_indices = set(indices)
            if times_sampled > 1:
                logger.debug('sampled %d times, positive fraction %.5f', times_sampled, positive_fraction)
        else:
            # If the number of positive relations is a significant fraction of the total nodes,
            # subtract the positively connected nodes from the set of choices and sample from the remainder.
            valid_negatives = np.array(list(self.indices_set - node_relations))
            probs = self._node_probabilities[valid_negatives]
            probs /= probs.sum()
            indices = self._np_random.choice(valid_negatives, size=self.negative, p=probs, replace=False)

        return list(indices)

    @staticmethod
    def _loss_fn(matrix, regularization_coeff=1.0):
        """Computes loss value.

        Parameters
        ----------
        matrix : numpy.array
            Array containing vectors for u, v and negative samples, of shape (2 + negative_size, dim).
        regularization_coeff : float, optional
            Coefficient to use for l2-regularization.

        Returns
        -------
        float
            Computed loss value.

        Warnings
        --------
        Only used for autograd gradients, since autograd requires a specific function signature.

        """
        vector_u = matrix[0]
        vectors_v = matrix[1:]
        euclidean_dists = grad_np.linalg.norm(vector_u - vectors_v, axis=1)
        norm = grad_np.linalg.norm(vector_u)
        all_norms = grad_np.linalg.norm(vectors_v, axis=1)
        poincare_dists = grad_np.arccosh(
            1 + 2 * (
                (euclidean_dists ** 2) / ((1 - norm ** 2) * (1 - all_norms ** 2))
            )
        )
        exp_negative_distances = grad_np.exp(-poincare_dists)
        regularization_term = regularization_coeff * grad_np.linalg.norm(vectors_v[0]) ** 2
        return -grad_np.log(exp_negative_distances[0] / (exp_negative_distances.sum())) + regularization_term

    @staticmethod
    def _clip_vectors(vectors, epsilon):
        """Clip vectors to have a norm of less than one.

        Parameters
        ----------
        vectors : numpy.array
            Can be 1-D, or 2-D (in which case the norm for each row is checked).
        epsilon : float
            Parameter for numerical stability, each dimension of the vector is reduced by `epsilon`
            if the norm of the vector is greater than or equal to 1.

        Returns
        -------
        numpy.array
            Array with norms clipped below 1.
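
        Examples
        --------
        A small sketch, clipping a vector with norm exactly 1:

        >>> import numpy as np
        >>> v = np.array([0.6, 0.8])
        >>> clipped = PoincareModel._clip_vectors(v, epsilon=1e-5)
        >>> bool(np.linalg.norm(clipped) < 1.0)
        True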

        """
        one_d = len(vectors.shape) == 1
        threshold = 1 - epsilon
        if one_d:
            norm = np.linalg.norm(vectors)
            if norm < threshold:
                return vectors
            else:
                return vectors / norm - (np.sign(vectors) * epsilon)
        else:
            norms = np.linalg.norm(vectors, axis=1)
            if (norms < threshold).all():
                return vectors
            else:
                vectors[norms >= threshold] *= (threshold / norms[norms >= threshold])[:, np.newaxis]
                vectors[norms >= threshold] -= np.sign(vectors[norms >= threshold]) * epsilon
                return vectors

    def save(self, *args, **kwargs):
        """Save complete model to disk, inherited from :class:`~gensim.utils.SaveLoad`.

        See also
        --------
        :meth:`~gensim.models.poincare.PoincareModel.load`

        Parameters
        ----------
        *args
            Positional arguments passed to :meth:`~gensim.utils.SaveLoad.save`.
        **kwargs
            Keyword arguments passed to :meth:`~gensim.utils.SaveLoad.save`.

        """
        self._loss_grad = None  # Can't pickle autograd fn to disk
        attrs_to_ignore = ['_node_probabilities', '_node_counts_cumsum']
        kwargs['ignore'] = set(list(kwargs.get('ignore', [])) + attrs_to_ignore)
        super(PoincareModel, self).save(*args, **kwargs)

    @classmethod
    def load(cls, *args, **kwargs):
        """Load model from disk, inherited from :class:`~gensim.utils.SaveLoad`.

        See also
        --------
        :meth:`~gensim.models.poincare.PoincareModel.save`

        Parameters
        ----------
        *args
            Positional arguments passed to :meth:`~gensim.utils.SaveLoad.load`.
        **kwargs
            Keyword arguments passed to :meth:`~gensim.utils.SaveLoad.load`.

        Returns
        -------
        :class:`~gensim.models.poincare.PoincareModel`
            The loaded model.

        """
        model = super(PoincareModel, cls).load(*args, **kwargs)
        model._init_node_probabilities()
        return model

    def _prepare_training_batch(self, relations, all_negatives, check_gradients=False):
        """Create a training batch and compute gradients and loss for the batch.

        Parameters
        ----------
        relations : list of tuples
            List of tuples of positive examples of the form (node_1_index, node_2_index).
        all_negatives : list of lists
            List of lists of negative samples for each node_1 in the positive examples.
        check_gradients : bool, optional
            Whether to compare the computed gradients to autograd gradients for this batch.

        Returns
        -------
        :class:`~gensim.models.poincare.PoincareBatch`
            Node indices, computed gradients and loss for the batch.

        """
        batch_size = len(relations)
        indices_u, indices_v = [], []
        for relation, negatives in zip(relations, all_negatives):
            u, v = relation
            indices_u.append(u)
            indices_v.append(v)
            indices_v.extend(negatives)

        vectors_u = self.kv.syn0[indices_u]
        vectors_v = self.kv.syn0[indices_v].reshape((batch_size, 1 + self.negative, self.size))
        vectors_v = vectors_v.swapaxes(0, 1).swapaxes(1, 2)
        batch = PoincareBatch(vectors_u, vectors_v, indices_u, indices_v, self.regularization_coeff)
        batch.compute_all()

        if check_gradients:
            self._check_gradients(relations, all_negatives, batch)

        return batch

    def _check_gradients(self, relations, all_negatives, batch, tol=1e-8):
        """Compare computed gradients for batch to autograd gradients.

        Parameters
        ----------
        relations : list of tuples
            List of tuples of positive examples of the form (node_1_index, node_2_index).
        all_negatives : list of lists
            List of lists of negative samples for each node_1 in the positive examples.
        batch : :class:`~gensim.models.poincare.PoincareBatch`
            Batch for which computed gradients are to be checked.
        tol : float, optional
            The maximum error allowed between the computed gradients and the reference ones from autograd.

        """
        if not AUTOGRAD_PRESENT:
            logger.warning('autograd could not be imported, cannot do gradient checking')
            logger.warning('please install autograd to enable gradient checking')
            return

        if self._loss_grad is None:
            self._loss_grad = grad(PoincareModel._loss_fn)

        max_diff = 0.0
        for i, (relation, negatives) in enumerate(zip(relations, all_negatives)):
            u, v = relation
            auto_gradients = self._loss_grad(
                np.vstack((self.kv.syn0[u], self.kv.syn0[[v] + negatives])), self.regularization_coeff)
            computed_gradients = np.vstack((batch.gradients_u[:, i], batch.gradients_v[:, :, i]))
            diff = np.abs(auto_gradients - computed_gradients).max()
            if diff > max_diff:
                max_diff = diff
        logger.info('max difference between computed gradients and autograd gradients: %.10f', max_diff)
        assert max_diff < tol, (
            'Max difference between computed gradients and autograd gradients %.10f, '
            'greater than tolerance %.10f' % (max_diff, tol))

    def _sample_negatives_batch(self, nodes):
        """Get negative examples for each node.

        Parameters
        ----------
        nodes : list of int
            List of node indices for which negative samples are to be returned.

        Returns
        -------
        list of lists
            Each inner list is a list of negative samples for a single node in the input list.

        """
        all_indices = [self._sample_negatives(node) for node in nodes]
        return all_indices

    def _train_on_batch(self, relations, check_gradients=False):
        """Perform training for a single training batch.

        Parameters
        ----------
        relations : list of tuples of (int, int)
            List of tuples of positive examples of the form (node_1_index, node_2_index).
        check_gradients : bool, optional
            Whether to compare the computed gradients to autograd gradients for this batch.

        Returns
        -------
        :class:`~gensim.models.poincare.PoincareBatch`
            The batch that was just trained on, containing the computed loss for the batch.

        """
        all_negatives = self._sample_negatives_batch([relation[0] for relation in relations])
        batch = self._prepare_training_batch(relations, all_negatives, check_gradients)
        self._update_vectors_batch(batch)
        return batch

    @staticmethod
    def _handle_duplicates(vector_updates, node_indices):
        """Handle occurrences of multiple updates to the same node in a batch of vector updates.

        Parameters
        ----------
        vector_updates : numpy.array
            Array with each row containing updates to be performed on a certain node.
        node_indices : list of int
            Node indices on which the above updates are to be performed.

        Notes
        -----
        Mutates the `vector_updates` array.

        Required because vectors[[2, 1, 2]] += np.array([-0.5, 1.0, 0.5]) performs only the last update
        on the row at index 2.
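
        Examples
        --------
        A small sketch of the workaround (hypothetical update values):

        >>> import numpy as np
        >>> updates = np.array([[1.0], [2.0], [3.0]])
        >>> PoincareModel._handle_duplicates(updates, [2, 1, 2])
        >>> updates.ravel().tolist()  # updates for node 2 are summed into its last occurrence
        [0.0, 2.0, 4.0]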

        """
        counts = Counter(node_indices)
        for node_index, count in counts.items():
            if count == 1:
                continue
            positions = [i for i, index in enumerate(node_indices) if index == node_index]
            # Move all updates to the same node to the last such update, zeroing all the others
            vector_updates[positions[-1]] = vector_updates[positions].sum(axis=0)
            vector_updates[positions[:-1]] = 0

    def _update_vectors_batch(self, batch):
        """Update vectors for nodes in the given batch.

        Parameters
        ----------
        batch : :class:`~gensim.models.poincare.PoincareBatch`
            Batch containing computed gradients and node indices of the batch for which updates are to be done.

        """
        grad_u, grad_v = batch.gradients_u, batch.gradients_v
        indices_u, indices_v = batch.indices_u, batch.indices_v
        batch_size = len(indices_u)

        u_updates = (self.alpha * (batch.alpha ** 2) / 4 * grad_u).T
        self._handle_duplicates(u_updates, indices_u)

        self.kv.syn0[indices_u] -= u_updates
        self.kv.syn0[indices_u] = self._clip_vectors(self.kv.syn0[indices_u], self.epsilon)

        v_updates = self.alpha * (batch.beta ** 2)[:, np.newaxis] / 4 * grad_v
        v_updates = v_updates.swapaxes(1, 2).swapaxes(0, 1)
        v_updates = v_updates.reshape(((1 + self.negative) * batch_size, self.size))
        self._handle_duplicates(v_updates, indices_v)

        self.kv.syn0[indices_v] -= v_updates
        self.kv.syn0[indices_v] = self._clip_vectors(self.kv.syn0[indices_v], self.epsilon)

    def train(self, epochs, batch_size=10, print_every=1000, check_gradients_every=None):
        """Train Poincare embeddings using loaded data and model parameters.

        Parameters
        ----------
        epochs : int
            Number of iterations (epochs) over the corpus.
        batch_size : int, optional
            Number of examples to train on in a single batch.
        print_every : int, optional
            Prints progress and average loss after every `print_every` batches.
        check_gradients_every : int or None, optional
            Compares computed gradients and autograd gradients after every `check_gradients_every` batches.
            Useful for debugging, doesn't compare by default.

        Examples
        --------
        >>> from gensim.models.poincare import PoincareModel
        >>> relations = [('kangaroo', 'marsupial'), ('kangaroo', 'mammal'), ('gib', 'cat')]
        >>> model = PoincareModel(relations, negative=2)
        >>> model.train(epochs=50)

        """
        if self.workers > 1:
            raise NotImplementedError("Multi-threaded version not implemented yet")
        # Some divide-by-zero results are handled explicitly
        old_settings = np.seterr(divide='ignore', invalid='ignore')

        logger.info(
            "training model of size %d with %d workers on %d relations for %d epochs and %d burn-in epochs, "
            "using lr=%.5f burn-in lr=%.5f negative=%d",
            self.size, self.workers, len(self.all_relations), epochs, self.burn_in,
            self.alpha, self.burn_in_alpha, self.negative
        )

        if self.burn_in > 0 and not self._burn_in_done:
            logger.info("starting burn-in (%d epochs)----------------------------------------", self.burn_in)
            self.alpha = self.burn_in_alpha
            self._train_batchwise(
                epochs=self.burn_in, batch_size=batch_size, print_every=print_every,
                check_gradients_every=check_gradients_every)
            self._burn_in_done = True
            logger.info("burn-in finished")

        self.alpha = self.train_alpha
        logger.info("starting training (%d epochs)----------------------------------------", epochs)
        self._train_batchwise(
            epochs=epochs, batch_size=batch_size, print_every=print_every,
            check_gradients_every=check_gradients_every)
        logger.info("training finished")

        np.seterr(**old_settings)

    def _train_batchwise(self, epochs, batch_size=10, print_every=1000, check_gradients_every=None):
        """Train Poincare embeddings using specified parameters.

        Parameters
        ----------
        epochs : int
            Number of iterations (epochs) over the corpus.
        batch_size : int, optional
            Number of examples to train on in a single batch.
        print_every : int, optional
            Prints progress and average loss after every `print_every` batches.
        check_gradients_every : int or None, optional
            Compares computed gradients and autograd gradients after every `check_gradients_every` batches.
            Useful for debugging, doesn't compare by default.

        """
        if self.workers > 1:
            raise NotImplementedError("Multi-threaded version not implemented yet")
        for epoch in range(1, epochs + 1):
            indices = list(range(len(self.all_relations)))
            self._np_random.shuffle(indices)
            avg_loss = 0.0
            last_time = time.time()
            for batch_num, i in enumerate(range(0, len(indices), batch_size), start=1):
                should_print = not (batch_num % print_every)
                check_gradients = bool(check_gradients_every) and (batch_num % check_gradients_every) == 0
                batch_indices = indices[i:i + batch_size]
                relations = [self.all_relations[idx] for idx in batch_indices]
                result = self._train_on_batch(relations, check_gradients=check_gradients)
                avg_loss += result.loss
                if should_print:
                    avg_loss /= print_every
                    time_taken = time.time() - last_time
                    speed = print_every * batch_size / time_taken
                    logger.info(
                        'training on epoch %d, examples #%d-#%d, loss: %.2f',
                        epoch, i, i + batch_size, avg_loss)
                    logger.info(
                        'time taken for %d examples: %.2f s, %.2f examples / s',
                        print_every * batch_size, time_taken, speed)
                    last_time = time.time()
                    avg_loss = 0.0


class PoincareBatch(object):
    """Compute Poincare distances, gradients and loss for a training batch.

    Store intermediate state to avoid recomputing multiple times.

    """
    def __init__(self, vectors_u, vectors_v, indices_u, indices_v, regularization_coeff=1.0):
        """Initialize instance with sets of vectors for which distances are to be computed.

        Parameters
        ----------
        vectors_u : numpy.array
            Vectors of all nodes `u` in the batch. Expected shape (batch_size, dim).
        vectors_v : numpy.array
            Vectors of all positively related nodes `v` and negatively sampled nodes `v'`,
            for each node `u` in the batch. Expected shape (1 + neg_size, dim, batch_size).
        indices_u : list of int
            List of node indices for each of the vectors in `vectors_u`.
        indices_v : list of lists of int
            Nested list of lists, each of which is a list of node indices
            for each of the vectors in `vectors_v` for a specific node `u`.
        regularization_coeff : float, optional
            Coefficient to use for l2-regularization.

        """
        self.vectors_u = vectors_u.T[np.newaxis, :, :]  # (1, dim, batch_size)
        self.vectors_v = vectors_v  # (1 + neg_size, dim, batch_size)
        self.indices_u = indices_u
        self.indices_v = indices_v
        self.regularization_coeff = regularization_coeff

        self.poincare_dists = None
        self.euclidean_dists = None

        self.norms_u = None
        self.norms_v = None
        self.alpha = None
        self.beta = None
        self.gamma = None

        self.gradients_u = None
        self.distance_gradients_u = None
        self.gradients_v = None
        self.distance_gradients_v = None

        self.loss = None

        self._distances_computed = False
        self._gradients_computed = False
        self._distance_gradients_computed = False
        self._loss_computed = False

    def compute_all(self):
        """Convenience method to perform all computations."""
        self.compute_distances()
        self.compute_distance_gradients()
        self.compute_gradients()
        self.compute_loss()

    def compute_distances(self):
        """Compute and store norms, euclidean distances and poincare distances between input vectors."""
        if self._distances_computed:
            return
        euclidean_dists = np.linalg.norm(self.vectors_u - self.vectors_v, axis=1)  # (1 + neg_size, batch_size)
        norms_u = np.linalg.norm(self.vectors_u, axis=1)  # (1, batch_size)
        norms_v = np.linalg.norm(self.vectors_v, axis=1)  # (1 + neg_size, batch_size)
        alpha = 1 - norms_u ** 2  # (1, batch_size)
        beta = 1 - norms_v ** 2  # (1 + neg_size, batch_size)
        gamma = 1 + 2 * (
            (euclidean_dists ** 2) / (alpha * beta)
        )  # (1 + neg_size, batch_size)
        poincare_dists = np.arccosh(gamma)  # (1 + neg_size, batch_size)
        exp_negative_distances = np.exp(-poincare_dists)  # (1 + neg_size, batch_size)
        Z = exp_negative_distances.sum(axis=0)  # (batch_size)

        self.euclidean_dists = euclidean_dists
        self.poincare_dists = poincare_dists
        self.exp_negative_distances = exp_negative_distances
        self.Z = Z
        self.norms_u = norms_u
        self.norms_v = norms_v
        self.alpha = alpha
        self.beta = beta
        self.gamma = gamma

        self._distances_computed = True

    def compute_gradients(self):
        """Compute and store gradients of loss function for all input vectors."""
        if self._gradients_computed:
            return
        self.compute_distances()
        self.compute_distance_gradients()
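
        # Chain rule: for each example the loss is d(u, v_0) + log(Z), so the gradient of the loss
        # w.r.t. the distance d(u, v_k) is [k == 0] - exp(-d(u, v_k)) / Z; the terms below multiply
        # this by the distance gradients computed in `compute_distance_gradients`.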
        # (1 + neg_size, dim, batch_size)
        gradients_v = -self.exp_negative_distances[:, np.newaxis, :] * self.distance_gradients_v
        gradients_v /= self.Z  # (1 + neg_size, dim, batch_size)
        gradients_v[0] += self.distance_gradients_v[0]
        gradients_v[0] += self.regularization_coeff * 2 * self.vectors_v[0]

        # (1 + neg_size, dim, batch_size)
        gradients_u = -self.exp_negative_distances[:, np.newaxis, :] * self.distance_gradients_u
        gradients_u /= self.Z  # (1 + neg_size, dim, batch_size)
        gradients_u = gradients_u.sum(axis=0)  # (dim, batch_size)
        gradients_u += self.distance_gradients_u[0]

        assert not np.isnan(gradients_u).any()
        assert not np.isnan(gradients_v).any()
        self.gradients_u = gradients_u
        self.gradients_v = gradients_v

        self._gradients_computed = True

    def compute_distance_gradients(self):
        """Compute and store partial derivatives of poincare distance d(u, v) w.r.t all u and all v."""
        if self._distance_gradients_computed:
            return
        self.compute_distances()

        euclidean_dists_squared = self.euclidean_dists ** 2  # (1 + neg_size, batch_size)
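        # Gradient of the Poincaré distance (a sketch of the algebra, following the Nickel & Kiela paper):
        #   dd(u, v)/du = 4 / (alpha * beta * sqrt(gamma^2 - 1)) * (((||u - v||^2 + alpha) / alpha) * u - v)
        # where alpha = 1 - ||u||^2, beta = 1 - ||v||^2 and gamma is as computed in `compute_distances`;
        # note that ||u - v||^2 + alpha = ||v||^2 - 2<u, v> + 1, the coefficient that appears in the paper.
        # The gradient w.r.t. v is symmetric, with the roles of u/alpha and v/beta swapped.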
        # (1 + neg_size, 1, batch_size)
        c_ = (4 / (self.alpha * self.beta * np.sqrt(self.gamma ** 2 - 1)))[:, np.newaxis, :]
        # (1 + neg_size, 1, batch_size)
        u_coeffs = ((euclidean_dists_squared + self.alpha) / self.alpha)[:, np.newaxis, :]
        distance_gradients_u = u_coeffs * self.vectors_u - self.vectors_v  # (1 + neg_size, dim, batch_size)
        distance_gradients_u *= c_  # (1 + neg_size, dim, batch_size)

        nan_gradients = self.gamma == 1  # (1 + neg_size, batch_size)
        if nan_gradients.any():
            distance_gradients_u.swapaxes(1, 2)[nan_gradients] = 0
        self.distance_gradients_u = distance_gradients_u

        # (1 + neg_size, 1, batch_size)
        v_coeffs = ((euclidean_dists_squared + self.beta) / self.beta)[:, np.newaxis, :]
        distance_gradients_v = v_coeffs * self.vectors_v - self.vectors_u  # (1 + neg_size, dim, batch_size)
        distance_gradients_v *= c_  # (1 + neg_size, dim, batch_size)

        if nan_gradients.any():
            distance_gradients_v.swapaxes(1, 2)[nan_gradients] = 0
        self.distance_gradients_v = distance_gradients_v

        self._distance_gradients_computed = True

    def compute_loss(self):
        """Compute and store loss value for the given batch of examples."""
        if self._loss_computed:
            return
        self.compute_distances()

        self.loss = -np.log(self.exp_negative_distances[0] / self.Z).sum()  # scalar
        self._loss_computed = True


class PoincareKeyedVectors(BaseKeyedVectors):
    """Vectors and vocab for the :class:`~gensim.models.poincare.PoincareModel` training class.

    Used to perform operations on the vectors such as vector lookup, distance calculations etc.

    """
    def __init__(self, vector_size):
        super(PoincareKeyedVectors, self).__init__(vector_size)
        self.max_distance = 0
        self.index2word = []

    @property
    def vectors(self):
        return self.syn0

    @vectors.setter
    def vectors(self, value):
        self.syn0 = value

    @property
    def index2entity(self):
        return self.index2word

    @index2entity.setter
    def index2entity(self, value):
        self.index2word = value

    def word_vec(self, word):
        """Get the word's representation in vector space, as a 1D numpy array.

        Examples
        --------
        >>> from gensim.test.utils import datapath
        >>>
        >>> # Read the sample relations file and train the model
        >>> relations = PoincareRelations(file_path=datapath('poincare_hypernyms_large.tsv'))
        >>> model = PoincareModel(train_data=relations)
        >>> model.train(epochs=50)
        >>>
        >>> # Query the trained model.
        >>> wv = model.kv.word_vec('kangaroo.n.01')

        """
        return super(PoincareKeyedVectors, self).get_vector(word)

    def words_closer_than(self, w1, w2):
        """Get all words that are closer to `w1` than `w2` is to `w1`.

        Parameters
        ----------
        w1 : str
            Input word.
        w2 : str
            Input word.

        Returns
        -------
        list (str)
            List of words that are closer to `w1` than `w2` is to `w1`.

        Examples
        --------
        >>> from gensim.test.utils import datapath
        >>>
        >>> # Read the sample relations file and train the model
        >>> relations = PoincareRelations(file_path=datapath('poincare_hypernyms_large.tsv'))
        >>> model = PoincareModel(train_data=relations)
        >>> model.train(epochs=50)
        >>>
        >>> # Which terms are closer to 'kangaroo' than 'metatherian' is to 'kangaroo'?
        >>> model.kv.words_closer_than('kangaroo.n.01', 'metatherian.n.01')
        [u'marsupial.n.01', u'phalanger.n.01']

        """
        return super(PoincareKeyedVectors, self).closer_than(w1, w2)

    def save_word2vec_format(self, fname, fvocab=None, binary=False, total_vec=None):
        """Store the input-hidden weight matrix in the same format used by the original
        C word2vec-tool, for compatibility, using :func:`~gensim.models.utils_any2vec._save_word2vec_format`.

        Parameters
        ----------
        fname : str
            Path to file that will be used for storing.
        fvocab : str, optional
            File path used to save the vocabulary.
        binary : bool, optional
            If True, the data will be saved in binary word2vec format, else it will be saved in plain text.
        total_vec : int, optional
            Explicitly specify total number of vectors
            (in case word vectors are appended with document vectors afterwards).

        """
        _save_word2vec_format(fname, self.vocab, self.syn0, fvocab=fvocab, binary=binary, total_vec=total_vec)

    @classmethod
    def load_word2vec_format(cls, fname, fvocab=None, binary=False, encoding='utf8', unicode_errors='strict',
                             limit=None, datatype=REAL):
        """Load the input-hidden weight matrix from the original C word2vec-tool format.
        Uses :func:`~gensim.models.utils_any2vec._load_word2vec_format`.

        Note that the information stored in the file is incomplete (the binary tree is missing),
        so while you can query for word similarity etc., you cannot continue training
        with a model loaded this way.

        Parameters
        ----------
        fname : str
            The file path to the saved word2vec-format file.
        fvocab : str, optional
            File path to the vocabulary. Word counts are read from `fvocab` filename, if set
            (this is the file generated by the `-save-vocab` flag of the original C tool).
        binary : bool, optional
            If True, indicates that the data is in binary word2vec format.
        encoding : str, optional
            If you trained the C model using non-utf8 encoding for words, specify that encoding in `encoding`.
        unicode_errors : str, optional
            Default 'strict'; a string suitable to be passed as the `errors`
            argument to the unicode() (Python 2.x) or str() (Python 3.x) function. If your source
            file may include word tokens truncated in the middle of a multibyte unicode character
            (as is common from the original word2vec.c tool), 'ignore' or 'replace' may help.
        limit : int, optional
            Sets a maximum number of word-vectors to read from the file. The default,
            None, means read all.
        datatype : type, optional
            (Experimental) Can coerce dimensions to a non-default float type (such as `np.float16`) to save memory.
            Such types may result in much slower bulk operations or incompatibility with optimized routines.

        Returns
        -------
        :class:`~gensim.models.poincare.PoincareKeyedVectors`
            The loaded Poincare vectors.

        """
        return _load_word2vec_format(
            cls, fname, fvocab=fvocab, binary=binary, encoding=encoding, unicode_errors=unicode_errors,
            limit=limit, datatype=datatype)

    @staticmethod
    def vector_distance(vector_1, vector_2):
        """Compute Poincare distance between two input vectors. Convenience method over `vector_distance_batch`.

        Parameters
        ----------
        vector_1 : numpy.array
            Input vector.
        vector_2 : numpy.array
            Input vector.

        Returns
        -------
        numpy.float
            Poincare distance between `vector_1` and `vector_2`.
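
        Examples
        --------
        A quick sanity sketch, the distance from the origin to itself is zero:

        >>> import numpy as np
        >>> PoincareKeyedVectors.vector_distance(np.zeros(2), np.zeros(2))
        0.0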

        """
        return PoincareKeyedVectors.vector_distance_batch(vector_1, vector_2[np.newaxis, :])[0]

    @staticmethod
    def vector_distance_batch(vector_1, vectors_all):
        """Compute Poincare distances between one vector and a set of other vectors.

        Parameters
        ----------
        vector_1 : numpy.array
            Vector from which Poincare distances are to be computed, expected shape (dim,).
        vectors_all : numpy.array
            For each row in `vectors_all`, the distance from `vector_1` is computed,
            expected shape (num_vectors, dim).

        Returns
        -------
        numpy.array
            Poincare distance between `vector_1` and each row in `vectors_all`, shape (num_vectors,).

        """
        euclidean_dists = np.linalg.norm(vector_1 - vectors_all, axis=1)
        norm = np.linalg.norm(vector_1)
        all_norms = np.linalg.norm(vectors_all, axis=1)
        return np.arccosh(
            1 + 2 * (
                (euclidean_dists ** 2) / ((1 - norm ** 2) * (1 - all_norms ** 2))
            )
        )

    def closest_child(self, node):
        """Get the node closest to `node` that is lower in the hierarchy than `node`.

        Parameters
        ----------
        node : {str, int}
            Key for node for which the closest child is to be found.

        Returns
        -------
        {str, None}
            Node closest to `node` that is lower in the hierarchy than `node`.
            If there are no nodes lower in the hierarchy, None is returned.

        """
        all_distances = self.distances(node)
        all_norms = np.linalg.norm(self.syn0, axis=1)
        node_norm = all_norms[self.vocab[node].index]
        mask = node_norm >= all_norms
        if mask.all():  # No nodes lower in the hierarchy
            return None
        all_distances = np.ma.array(all_distances, mask=mask)
        closest_child_index = np.ma.argmin(all_distances)
        return self.index2word[closest_child_index]

    def closest_parent(self, node):
        """Get the node closest to `node` that is higher in the hierarchy than `node`.

        Parameters
        ----------
        node : {str, int}
            Key for node for which the closest parent is to be found.

        Returns
        -------
        {str, None}
            Node closest to `node` that is higher in the hierarchy than `node`.
            If there are no nodes higher in the hierarchy, None is returned.

        """
        all_distances = self.distances(node)
        all_norms = np.linalg.norm(self.syn0, axis=1)
        node_norm = all_norms[self.vocab[node].index]
        mask = node_norm <= all_norms
        if mask.all():  # No nodes higher in the hierarchy
            return None
        all_distances = np.ma.array(all_distances, mask=mask)
        closest_parent_index = np.ma.argmin(all_distances)
        return self.index2word[closest_parent_index]

    def descendants(self, node, max_depth=5):
        """Get the list of recursively closest children from the given node, up to a max depth of `max_depth`.

        Parameters
        ----------
        node : {str, int}
            Key for node for which descendants are to be found.
        max_depth : int
            Maximum depth to descend to, i.e. the maximum number of descendants to return.

        Returns
        -------
        list of str
            Descendant nodes from the node `node`.

        """
        depth = 0
        descendants = []
        current_node = node
        while depth < max_depth:
            child = self.closest_child(current_node)
            if child is None:  # Reached the bottom of the hierarchy
                break
            descendants.append(child)
            current_node = child
            depth += 1
        return descendants

    def ancestors(self, node):
        """Get the list of recursively closest parents from the given node.

        Parameters
        ----------
        node : {str, int}
            Key for node for which ancestors are to be found.

        Returns
        -------
        list of str
            Ancestor nodes of the node `node`.

        """
        ancestors = []
        current_node = node
        ancestor = self.closest_parent(current_node)
        while ancestor is not None:
            ancestors.append(ancestor)
            ancestor = self.closest_parent(ancestors[-1])
        return ancestors

    def distance(self, w1, w2):
        """Calculate Poincare distance between vectors for nodes `w1` and `w2`.

        Parameters
        ----------
        w1 : {str, int}
            Key for first node.
        w2 : {str, int}
            Key for second node.

        Returns
        -------
        float
            Poincare distance between the vectors for nodes `w1` and `w2`.

        Examples
        --------
        >>> from gensim.test.utils import datapath
        >>>
        >>> # Read the sample relations file and train the model
        >>> relations = PoincareRelations(file_path=datapath('poincare_hypernyms_large.tsv'))
        >>> model = PoincareModel(train_data=relations)
        >>> model.train(epochs=50)
        >>>
        >>> # What is the distance between the words 'mammal' and 'carnivore'?
        >>> model.kv.distance('mammal.n.01', 'carnivore.n.01')
        2.9742298803339304

        Raises
        ------
        KeyError
            If either of `w1` and `w2` is absent from vocab.

        """
        vector_1 = self.word_vec(w1)
        vector_2 = self.word_vec(w2)
        return self.vector_distance(vector_1, vector_2)

    def similarity(self, w1, w2):
        """Compute similarity based on Poincare distance between vectors for nodes `w1` and `w2`.

        Parameters
        ----------
        w1 : {str, int}
            Key for first node.
        w2 : {str, int}
            Key for second node.

        Returns
        -------
        float
            Similarity between the vectors for nodes `w1` and `w2` (between 0 and 1).

        Examples
        --------
        >>> from gensim.test.utils import datapath
        >>>
        >>> # Read the sample relations file and train the model
        >>> relations = PoincareRelations(file_path=datapath('poincare_hypernyms_large.tsv'))
        >>> model = PoincareModel(train_data=relations)
        >>> model.train(epochs=50)
        >>>
        >>> # What is the similarity between the words 'mammal' and 'carnivore'?
        >>> model.kv.similarity('mammal.n.01', 'carnivore.n.01')
        0.25162107631176484

        Raises
        ------
        KeyError
            If either of `w1` and `w2` is absent from vocab.

        """
        return 1 / (1 + self.distance(w1, w2))

    def most_similar(self, node_or_vector, topn=10, restrict_vocab=None):
        """Find the top-N most similar nodes to the given node or vector, sorted in increasing order of distance.

        Parameters
        ----------
        node_or_vector : {str, int, numpy.array}
            Node key or vector for which similar nodes are to be found.
        topn : int or None, optional
            Number of similar nodes to return, if `None`, returns all.
        restrict_vocab : int or None, optional
            Optional integer which limits the range of vectors which are searched for most-similar values.
            For example, restrict_vocab=10000 would only check the first 10000 node vectors in the vocabulary order.
            This may be meaningful if vocabulary is sorted by descending frequency.

        Returns
        -------
        list of (str, float)
            List of tuples containing (node, distance) pairs in increasing order of distance.

        Examples
        --------
        >>> from gensim.test.utils import datapath
        >>>
        >>> # Read the sample relations file and train the model
        >>> relations = PoincareRelations(file_path=datapath('poincare_hypernyms_large.tsv'))
        >>> model = PoincareModel(train_data=relations)
        >>> model.train(epochs=50)
        >>>
        >>> # Which words are most similar to 'kangaroo'?
        >>> model.kv.most_similar('kangaroo.n.01', topn=2)
        [(u'kangaroo.n.01', 0.0), (u'marsupial.n.01', 0.26524229460827725)]

        """
        if not restrict_vocab:
            all_distances = self.distances(node_or_vector)
        else:
            nodes_to_use = self.index2word[:restrict_vocab]
            all_distances = self.distances(node_or_vector, nodes_to_use)

        if isinstance(node_or_vector, string_types + (int,)):
            node_index = self.vocab[node_or_vector].index
        else:
            node_index = None
        if not topn:
            closest_indices = matutils.argsort(all_distances)
        else:
            closest_indices = matutils.argsort(all_distances, topn=1 + topn)
        result = [
            (self.index2word[index], float(all_distances[index]))
            for index in closest_indices if (not node_index or index != node_index)  # ignore the input node
        ]
        if topn:
            result = result[:topn]
        return result

    def distances(self, node_or_vector, other_nodes=()):
        """Compute Poincare distances from given `node_or_vector` to all nodes in `other_nodes`.
        If `other_nodes` is empty, return distance between `node_or_vector` and all nodes in vocab.

        Parameters
        ----------
        node_or_vector : {str, int, numpy.array}
            Node key or vector from which distances are to be computed.
        other_nodes : {iterable of str, iterable of int, None}, optional
            For each node in `other_nodes` the distance from `node_or_vector` is computed.
            If None or empty, the distance of `node_or_vector` from all nodes in vocab is computed
            (including itself).

        Returns
        -------
        numpy.array
            Array containing distances to all nodes in `other_nodes` from input `node_or_vector`,
            in the same order as `other_nodes`.

        Examples
        --------
        >>> from gensim.test.utils import datapath
        >>>
        >>> # Read the sample relations file and train the model
        >>> relations = PoincareRelations(file_path=datapath('poincare_hypernyms_large.tsv'))
        >>> model = PoincareModel(train_data=relations)
        >>> model.train(epochs=50)
        >>>
        >>> # Check the distances between a word and a list of other words.
        >>> model.kv.distances('mammal.n.01', ['carnivore.n.01', 'dog.n.01'])
        array([2.97422988, 2.83007402])

        >>> # Check the distances between a word and every other word in the vocab.
        >>> all_distances = model.kv.distances('mammal.n.01')

        Raises
        ------
        KeyError
            If either `node_or_vector` or any node in `other_nodes` is absent from vocab.

        """
        if isinstance(node_or_vector, string_types):
            input_vector = self.word_vec(node_or_vector)
        else:
            input_vector = node_or_vector
        if not other_nodes:
            other_vectors = self.syn0
        else:
            other_indices = [self.vocab[node].index for node in other_nodes]
            other_vectors = self.syn0[other_indices]
        return self.vector_distance_batch(input_vector, other_vectors)

    def norm(self, node_or_vector):
        """Compute absolute position in hierarchy of input node or vector.
        Values range between 0 and 1. A lower value indicates the input node or vector is higher in the hierarchy.

        Parameters
        ----------
        node_or_vector : {str, int, numpy.array}
            Input node key or vector for which position in hierarchy is to be returned.

        Returns
        -------
        float
            Absolute position in the hierarchy of the input vector or node.

        Examples
        --------
        >>> from gensim.test.utils import datapath
        >>>
        >>> # Read the sample relations file and train the model
        >>> relations = PoincareRelations(file_path=datapath('poincare_hypernyms_large.tsv'))
        >>> model = PoincareModel(train_data=relations)
        >>> model.train(epochs=50)
        >>>
        >>> # Get the norm of the embedding of the word `mammal`.
        >>> model.kv.norm('mammal.n.01')
        0.6423008703542398

        Notes
        -----
        The position in hierarchy is based on the norm of the vector for the node.

        """
        if isinstance(node_or_vector, string_types):
            input_vector = self.word_vec(node_or_vector)
        else:
            input_vector = node_or_vector
        return np.linalg.norm(input_vector)

    def difference_in_hierarchy(self, node_or_vector_1, node_or_vector_2):
        """Compute relative position in hierarchy of `node_or_vector_1` relative to `node_or_vector_2`.
        A positive value indicates `node_or_vector_1` is higher in the hierarchy than `node_or_vector_2`.

        Parameters
        ----------
        node_or_vector_1 : {str, int, numpy.array}
            Input node key or vector.
        node_or_vector_2 : {str, int, numpy.array}
            Input node key or vector.

        Returns
        -------
        float
            Relative position in hierarchy of `node_or_vector_1` relative to `node_or_vector_2`.

        Examples
        --------
        >>> from gensim.test.utils import datapath
        >>>
        >>> # Read the sample relations file and train the model
        >>> relations = PoincareRelations(file_path=datapath('poincare_hypernyms_large.tsv'))
        >>> model = PoincareModel(train_data=relations)
        >>> model.train(epochs=50)
        >>>
        >>> model.kv.difference_in_hierarchy('mammal.n.01', 'dog.n.01')
        0.05382517902410999

        >>> model.kv.difference_in_hierarchy('dog.n.01', 'mammal.n.01')
        -0.05382517902410999

        Notes
        -----
        The returned value can be positive or negative, depending on whether `node_or_vector_1` is higher
        or lower in the hierarchy than `node_or_vector_2`.

        """
        return self.norm(node_or_vector_2) - self.norm(node_or_vector_1)


class PoincareRelations(object):
    """Stream relations for `PoincareModel` from a tsv-like file."""

    def __init__(self, file_path, encoding='utf8', delimiter='\t'):
        """Initialize instance from file containing a pair of nodes (a relation) per line.

        Parameters
        ----------
        file_path : str
            Path to file containing a pair of nodes (a relation) per line, separated by `delimiter`.
        encoding : str, optional
            Character encoding of the input file.
        delimiter : str, optional
            Delimiter character for each relation.
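
        Examples
        --------
        Stream relations from a tab-separated file (a small sketch using gensim's bundled test data):

        >>> from gensim.test.utils import datapath
        >>> relations = PoincareRelations(datapath('poincare_hypernyms.tsv'))
        >>> first_relation = next(iter(relations))  # a (node, node) tuple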

        """
        self.file_path = file_path
        self.encoding = encoding
        self.delimiter = delimiter

    def __iter__(self):
        """Stream relations from self.file_path decoded into unicode strings.

        Yields
        ------
        (unicode, unicode)
            Relation from input file.

        """
        with smart_open(self.file_path) as file_obj:
            if sys.version_info[0] < 3:
                lines = file_obj
            else:
                lines = (l.decode(self.encoding) for l in file_obj)
            # csv.reader requires bytestring input in python2, unicode input in python3
            reader = csv.reader(lines, delimiter=self.delimiter)
            for row in reader:
                if sys.version_info[0] < 3:
                    row = [value.decode(self.encoding) for value in row]
                yield tuple(row)


class NegativesBuffer(object):
    """Buffer and return negative samples."""

    def __init__(self, items):
        """Initialize instance from list or numpy array of samples.

        Parameters
        ----------
        items : list/numpy.array
            List or array containing negative samples.
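
        Examples
        --------
        A quick sketch of the buffer's behavior:

        >>> buf = NegativesBuffer([1, 2, 3, 4])
        >>> buf.get_items(2)
        [1, 2]
        >>> buf.num_items()
        2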
|
||
|
|
||
|
"""
|
||
|
self._items = items
|
||
|
self._current_index = 0
|
||
|
|
||
|
def num_items(self):
|
||
|
"""Get the number of items remaining in the buffer.
|
||
|
|
||
|
Returns
|
||
|
-------
|
||
|
int
|
||
|
Number of items in the buffer that haven't been consumed yet.
|
||
|
|
||
|
"""
|
||
|
return len(self._items) - self._current_index
|
||
|
|
||
|
def get_items(self, num_items):
|
||
|
"""Get the next `num_items` from buffer.
|
||
|
|
||
|
Parameters
|
||
|
----------
|
||
|
num_items : int
|
||
|
Number of items to fetch.
|
||
|
|
||
|
Returns
|
||
|
-------
|
||
|
numpy.array or list
|
||
|
Slice containing `num_items` items from the original data.
|
||
|
|
||
|
Notes
|
||
|
-----
|
||
|
No error is raised if less than `num_items` items are remaining,
|
||
|
simply all the remaining items are returned.
|
||
|
|
||
|
"""
|
||
|
start_index = self._current_index
|
||
|
end_index = start_index + num_items
|
||
|
self._current_index += num_items
|
||
|
return self._items[start_index:end_index]
|
||
|
|
||
|
|
||
|
class ReconstructionEvaluation(object):
    """Evaluate reconstruction on given network for given embedding.
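
    Examples
    --------
    A minimal sketch, reusing the bundled test relations file from the module examples;
    the actual metric values depend on the trained embedding:

    >>> from gensim.models.poincare import PoincareModel, PoincareRelations, ReconstructionEvaluation
    >>> from gensim.test.utils import datapath
    >>>
    >>> file_path = datapath('poincare_hypernyms.tsv')
    >>> model = PoincareModel(PoincareRelations(file_path), negative=2)
    >>> model.train(epochs=50)
    >>> evaluation = ReconstructionEvaluation(file_path, model.kv)
    >>> metrics = evaluation.evaluate()  # dict with 'mean_rank' and 'MAP' keys

    """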

    def __init__(self, file_path, embedding):
        """Initialize evaluation instance with tsv file containing relation pairs and embedding to be evaluated.

        Parameters
        ----------
        file_path : str
            Path to tsv file containing relation pairs.
        embedding : :class:`~gensim.models.poincare.PoincareKeyedVectors`
            Embedding to be evaluated.

        """
        items = set()
        embedding_vocab = embedding.vocab
        relations = defaultdict(set)
        with smart_open(file_path, 'r') as f:
            reader = csv.reader(f, delimiter='\t')
            for row in reader:
                assert len(row) == 2, 'Hypernym pair does not have exactly two items'
                item_1_index = embedding_vocab[row[0]].index
                item_2_index = embedding_vocab[row[1]].index
                relations[item_1_index].add(item_2_index)
                items.update([item_1_index, item_2_index])
        self.items = items
        self.relations = relations
        self.embedding = embedding

    @staticmethod
    def get_positive_relation_ranks_and_avg_prec(all_distances, positive_relations):
        """Compute ranks and Average Precision of positive relations.

        Parameters
        ----------
        all_distances : numpy.array of float
            Array of all distances (floats) for a specific item.
        positive_relations : list
            List of indices of positive relations for the item.

        Returns
        -------
        (list of int, float)
            The list contains ranks of positive relations in the same order as `positive_relations`.
            The float is the Average Precision of the ranking, e.g. ([1, 2, 3, 20], 0.610).

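        Examples
        --------
        A small numeric sketch (hypothetical toy distances, not from the module):

        >>> import numpy as np
        >>> all_distances = np.array([0.1, 0.5, 0.3, 0.4])
        >>> evaluate = ReconstructionEvaluation.get_positive_relation_ranks_and_avg_prec
        >>> ranks, avg_prec = evaluate(all_distances, [0, 2])
        >>> [int(rank) for rank in ranks], float(avg_prec)
        ([1, 1], 1.0)
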
"""
|
||
|
positive_relation_distances = all_distances[positive_relations]
|
||
|
negative_relation_distances = np.ma.array(all_distances, mask=False)
|
||
|
negative_relation_distances.mask[positive_relations] = True
|
||
|
# Compute how many negative relation distances are less than each positive relation distance, plus 1 for rank
|
||
|
ranks = (negative_relation_distances < positive_relation_distances[:, np.newaxis]).sum(axis=1) + 1
|
||
|
map_ranks = np.sort(ranks) + np.arange(len(ranks))
|
||
|
avg_precision = ((np.arange(1, len(map_ranks) + 1) / np.sort(map_ranks)).mean())
|
||
|
return list(ranks), avg_precision
|
||
|
|
||
|
    def evaluate(self, max_n=None):
        """Evaluate all defined metrics for the reconstruction task.

        Parameters
        ----------
        max_n : int, optional
            Maximum number of positive relations to evaluate, all if `max_n` is None.

        Returns
        -------
        dict of (str, float)
            (metric_name, metric_value) pairs, e.g. {'mean_rank': 50.3, 'MAP': 0.31}.

        """
        mean_rank, map_ = self.evaluate_mean_rank_and_map(max_n)
        return {'mean_rank': mean_rank, 'MAP': map_}

    def evaluate_mean_rank_and_map(self, max_n=None):
        """Evaluate mean rank and MAP for reconstruction.

        Parameters
        ----------
        max_n : int, optional
            Maximum number of positive relations to evaluate, all if `max_n` is None.

        Returns
        -------
        (float, float)
            (mean_rank, MAP), e.g. (50.3, 0.31).

        """
        ranks = []
        avg_precision_scores = []
        for i, item in enumerate(self.items, start=1):
            if item not in self.relations:
                continue
            item_relations = list(self.relations[item])
            item_term = self.embedding.index2word[item]
            item_distances = self.embedding.distances(item_term)
            positive_relation_ranks, avg_precision = \
                self.get_positive_relation_ranks_and_avg_prec(item_distances, item_relations)
            ranks += positive_relation_ranks
            avg_precision_scores.append(avg_precision)
            if max_n is not None and i > max_n:
                break
        return np.mean(ranks), np.mean(avg_precision_scores)


class LinkPredictionEvaluation(object):
    """Evaluate link prediction on given network for given embedding.
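
    Examples
    --------
    A minimal sketch, assuming hypothetical tab-separated files `train.tsv` and `test.tsv`
    that split the network's relations into known and held-out pairs (every node in
    `test.tsv` must also occur in `train.tsv`, so that it has an embedding):

    >>> from gensim.models.poincare import PoincareModel, PoincareRelations, LinkPredictionEvaluation
    >>>
    >>> model = PoincareModel(PoincareRelations('train.tsv'))
    >>> model.train(epochs=50)
    >>> evaluation = LinkPredictionEvaluation('train.tsv', 'test.tsv', model.kv)
    >>> metrics = evaluation.evaluate()  # dict with 'mean_rank' and 'MAP' keys

    """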

    def __init__(self, train_path, test_path, embedding):
        """Initialize evaluation instance with tsv file containing relation pairs and embedding to be evaluated.

        Parameters
        ----------
        train_path : str
            Path to tsv file containing relation pairs used for training.
        test_path : str
            Path to tsv file containing relation pairs to evaluate.
        embedding : :class:`~gensim.models.poincare.PoincareKeyedVectors`
            Embedding to be evaluated.

        """
        items = set()
        embedding_vocab = embedding.vocab
        relations = {'known': defaultdict(set), 'unknown': defaultdict(set)}
        data_files = {'known': train_path, 'unknown': test_path}
        for relation_type, data_file in data_files.items():
            with smart_open(data_file, 'r') as f:
                reader = csv.reader(f, delimiter='\t')
                for row in reader:
                    assert len(row) == 2, 'Hypernym pair does not have exactly two items'
                    item_1_index = embedding_vocab[row[0]].index
                    item_2_index = embedding_vocab[row[1]].index
                    relations[relation_type][item_1_index].add(item_2_index)
                    items.update([item_1_index, item_2_index])
        self.items = items
        self.relations = relations
        self.embedding = embedding

    @staticmethod
    def get_unknown_relation_ranks_and_avg_prec(all_distances, unknown_relations, known_relations):
        """Compute ranks and Average Precision of unknown positive relations.

        Parameters
        ----------
        all_distances : numpy.array of float
            Array of all distances for a specific item.
        unknown_relations : list of int
            List of indices of unknown positive relations.
        known_relations : list of int
            List of indices of known positive relations.

        Returns
        -------
        tuple (list of int, float)
            The list contains ranks of unknown relations in the same order as `unknown_relations`.
            The float is the Average Precision of the ranking, e.g. ([1, 2, 3, 20], 0.610).

        """
        unknown_relation_distances = all_distances[unknown_relations]
        negative_relation_distances = np.ma.array(all_distances, mask=False)
        negative_relation_distances.mask[unknown_relations] = True
        negative_relation_distances.mask[known_relations] = True
        # Compute how many negative relation distances are less than each unknown relation distance, plus 1 for rank
        ranks = (negative_relation_distances < unknown_relation_distances[:, np.newaxis]).sum(axis=1) + 1
        # Ranks above are computed against negatives only; for Average Precision, offset the
        # i-th best unknown positive by the i better-ranked positives that the masking excluded
        map_ranks = np.sort(ranks) + np.arange(len(ranks))
        avg_precision = ((np.arange(1, len(map_ranks) + 1) / np.sort(map_ranks)).mean())
        return list(ranks), avg_precision

    def evaluate(self, max_n=None):
        """Evaluate all defined metrics for the link prediction task.

        Parameters
        ----------
        max_n : int, optional
            Maximum number of positive relations to evaluate, all if `max_n` is None.

        Returns
        -------
        dict of (str, float)
            (metric_name, metric_value) pairs, e.g. {'mean_rank': 50.3, 'MAP': 0.31}.

        """
        mean_rank, map_ = self.evaluate_mean_rank_and_map(max_n)
        return {'mean_rank': mean_rank, 'MAP': map_}

    def evaluate_mean_rank_and_map(self, max_n=None):
        """Evaluate mean rank and MAP for link prediction.

        Parameters
        ----------
        max_n : int, optional
            Maximum number of positive relations to evaluate, all if `max_n` is None.

        Returns
        -------
        tuple (float, float)
            (mean_rank, MAP), e.g. (50.3, 0.31).

        """
        ranks = []
        avg_precision_scores = []
        for i, item in enumerate(self.items, start=1):
            if item not in self.relations['unknown']:  # No positive relations to predict for this node
                continue
            unknown_relations = list(self.relations['unknown'][item])
            known_relations = list(self.relations['known'][item])
            item_term = self.embedding.index2word[item]
            item_distances = self.embedding.distances(item_term)
            unknown_relation_ranks, avg_precision = \
                self.get_unknown_relation_ranks_and_avg_prec(item_distances, unknown_relations, known_relations)
            ranks += unknown_relation_ranks
            avg_precision_scores.append(avg_precision)
            if max_n is not None and i > max_n:
                break
        return np.mean(ranks), np.mean(avg_precision_scores)


class LexicalEntailmentEvaluation(object):
    """Evaluate lexical entailment on the HyperLex task for a given embedding.
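
    Examples
    --------
    A minimal sketch, assuming the HyperLex scores file has been downloaded to a
    hypothetical local path `hyperlex-all.txt` (space-separated, with `WORD1`, `WORD2`
    and `AVG_SCORE` columns) and that `model` is an already trained
    :class:`~gensim.models.poincare.PoincareModel`:

    >>> from gensim.models.poincare import LexicalEntailmentEvaluation
    >>>
    >>> entailment_evaluation = LexicalEntailmentEvaluation('hyperlex-all.txt')
    >>> correlation = entailment_evaluation.evaluate_spearman(model.kv)

    """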

    def __init__(self, filepath):
        """Initialize evaluation instance with HyperLex text file containing relation pairs.

        Parameters
        ----------
        filepath : str
            Path to HyperLex text file.

        """
        expected_scores = {}
        with smart_open(filepath, 'r') as f:
            reader = csv.DictReader(f, delimiter=' ')
            for row in reader:
                word_1, word_2 = row['WORD1'], row['WORD2']
                expected_scores[(word_1, word_2)] = float(row['AVG_SCORE'])
        self.scores = expected_scores
        self.alpha = 1000

    def score_function(self, embedding, trie, term_1, term_2):
        """Compute predicted score, i.e. the extent to which `term_1` is a type of `term_2`.

        Parameters
        ----------
        embedding : :class:`~gensim.models.poincare.PoincareKeyedVectors`
            Embedding to use for computing predicted score.
        trie : :class:`pygtrie.Trie`
            Trie to use for finding matching vocab terms for input terms.
        term_1 : str
            Input term.
        term_2 : str
            Input term.

        Returns
        -------
        float
            Predicted score (the extent to which `term_1` is a type of `term_2`).

        """
        try:
            word_1_terms = self.find_matching_terms(trie, term_1)
            word_2_terms = self.find_matching_terms(trie, term_2)
        except KeyError:
            raise ValueError("No matching terms found for either %s or %s" % (term_1, term_2))
        min_distance = np.inf
        min_term_1, min_term_2 = None, None
        for candidate_term_1 in word_1_terms:
            for candidate_term_2 in word_2_terms:
                distance = embedding.distance(candidate_term_1, candidate_term_2)
                if distance < min_distance:
                    min_term_1, min_term_2 = candidate_term_1, candidate_term_2
                    min_distance = distance
        assert min_term_1 is not None and min_term_2 is not None
        vector_1, vector_2 = embedding.word_vec(min_term_1), embedding.word_vec(min_term_2)
        norm_1, norm_2 = np.linalg.norm(vector_1), np.linalg.norm(vector_2)
        # Entailment score from the Poincare embeddings paper:
        # score(is-a(u, v)) = -(1 + alpha * (||v|| - ||u||)) * d(u, v)
        return -1 * (1 + self.alpha * (norm_2 - norm_1)) * min_distance

    @staticmethod
    def find_matching_terms(trie, word):
        """Find terms in the `trie` beginning with the `word`.

        Parameters
        ----------
        trie : :class:`pygtrie.Trie`
            Trie to use for finding matching terms.
        word : str
            Input word to use for prefix search.

        Returns
        -------
        list of str
            List of matching terms.

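        Examples
        --------
        A small sketch with hypothetical toy keys (`pygtrie.Trie` stores string keys as
        character sequences, which is why the implementation re-joins them):

        >>> from pygtrie import Trie
        >>> trie = Trie()
        >>> trie['dog.n.01'] = True
        >>> trie['dog.n.02'] = True
        >>> sorted(LexicalEntailmentEvaluation.find_matching_terms(trie, 'dog'))
        ['dog.n.01', 'dog.n.02']
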
"""
|
||
|
matches = trie.items('%s.' % word)
|
||
|
matching_terms = [''.join(key_chars) for key_chars, value in matches]
|
||
|
return matching_terms
|
||
|
|
||
|
    @staticmethod
    def create_vocab_trie(embedding):
        """Create trie with vocab terms of the given embedding to enable quick prefix searches.

        Parameters
        ----------
        embedding : :class:`~gensim.models.poincare.PoincareKeyedVectors`
            Embedding for which trie is to be created.

        Returns
        -------
        :class:`pygtrie.Trie`
            Trie containing vocab terms of the input embedding.

        """
        try:
            from pygtrie import Trie
        except ImportError:
            raise ImportError(
                'pygtrie could not be imported, please install pygtrie in order to use LexicalEntailmentEvaluation')

        vocab_trie = Trie()
        for key in embedding.vocab:
            vocab_trie[key] = True
        return vocab_trie

    def evaluate_spearman(self, embedding):
        """Evaluate Spearman correlation for lexical entailment for given embedding.

        Parameters
        ----------
        embedding : :class:`~gensim.models.poincare.PoincareKeyedVectors`
            Embedding for which evaluation is to be done.

        Returns
        -------
        float
            Spearman correlation score for the task for input embedding.

        """
        predicted_scores = []
        expected_scores = []
        skipped = 0
        count = 0
        vocab_trie = self.create_vocab_trie(embedding)
        for (word_1, word_2), expected_score in self.scores.items():
            try:
                predicted_score = self.score_function(embedding, vocab_trie, word_1, word_2)
            except ValueError:
                skipped += 1
                continue
            count += 1
            predicted_scores.append(predicted_score)
            expected_scores.append(expected_score)
        logger.info('skipped pairs: %d out of %d', skipped, len(self.scores))
        spearman = spearmanr(expected_scores, predicted_scores)
        return spearman.correlation