laywerrobot/lib/python3.6/site-packages/gensim/models/wrappers/varembed.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Copyright (C) 2017 Anmol Gulati <anmol01gulati@gmail.com>
# Copyright (C) 2017 Radim Rehurek <radimrehurek@seznam.cz>
"""Python wrapper around `Varembed model <https://github.com/rguthrie3/MorphologicalPriorsForWordEmbeddings>`_.
Original paper:`"Morphological Priors for Probabilistic Neural Word Embeddings" <http://arxiv.org/pdf/1608.01056.pdf>`_.
Notes
-----
* This module allows ability to obtain word vectors for out-of-vocabulary words, for the Varembed model.
* The wrapped model can not be updated with new documents for online training.
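
Examples
--------
A minimal usage sketch (``varembed_vectors.pkl`` below is a placeholder for your own
VarEmbed output file, not a file shipped with gensim):

.. sourcecode:: pycon

    >>> from gensim.models.wrappers import varembed
    >>>
    >>> model = varembed.VarEmbed.load_varembed_format(vectors="varembed_vectors.pkl")  # placeholder path
    >>> night_vector = model['night']  # vector for a word present in the vocabulary
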
"""

import logging

import numpy as np

from gensim import utils
from gensim.models.keyedvectors import KeyedVectors
from gensim.models.word2vec import Vocab

logger = logging.getLogger(__name__)


class VarEmbed(KeyedVectors):
    """Python wrapper using `Varembed <https://github.com/rguthrie3/MorphologicalPriorsForWordEmbeddings>`_.

    Warnings
    --------
    This is **only** a Python wrapper for `Varembed <https://github.com/rguthrie3/MorphologicalPriorsForWordEmbeddings>`_:
    it only allows loading pre-trained models.

    """
    def __init__(self):
        self.vector_size = 0
        self.vocab_size = 0

    @classmethod
    def load_varembed_format(cls, vectors, morfessor_model=None):
        """Load the word vectors into a matrix from the varembed output vector files.

        Parameters
        ----------
        vectors : str
            Path to the pickle file containing the word vectors.
        morfessor_model : str, optional
            Path to the trained morfessor model.

        Returns
        -------
        :class:`~gensim.models.wrappers.varembed.VarEmbed`
            Ready to use instance.
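
        Examples
        --------
        A sketch of loading the vectors together with a morfessor model (both file
        names below are placeholders for your own files; morpheme embeddings require
        the optional ``morfessor`` package):

        .. sourcecode:: pycon

            >>> from gensim.models.wrappers.varembed import VarEmbed
            >>>
            >>> model = VarEmbed.load_varembed_format(
            ...     vectors="varembed_vectors.pkl", morfessor_model="varembed_morfessor.bin")
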
"""
result = cls()
if vectors is None:
raise Exception("Please provide vectors binary to load varembed model")
d = utils.unpickle(vectors)
word_to_ix = d['word_to_ix']
morpho_to_ix = d['morpho_to_ix']
word_embeddings = d['word_embeddings']
morpho_embeddings = d['morpheme_embeddings']
result.load_word_embeddings(word_embeddings, word_to_ix)
if morfessor_model:
try:
import morfessor
morfessor_model = morfessor.MorfessorIO().read_binary_model_file(morfessor_model)
result.add_morphemes_to_embeddings(morfessor_model, morpho_embeddings, morpho_to_ix)
except ImportError:
# Morfessor Package not found.
logger.error('Could not import morfessor. Not using morpheme embeddings')
raise ImportError('Could not import morfessor.')
logger.info('Loaded varembed model vectors from %s', vectors)
return result
    def load_word_embeddings(self, word_embeddings, word_to_ix):
        """Load the word embeddings.

        Parameters
        ----------
        word_embeddings : numpy.ndarray
            Matrix with word embeddings.
        word_to_ix : dict of (str, int)
            Mapping of word to its index in `word_embeddings`.
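
        Examples
        --------
        A minimal sketch with toy data (the two words and their 2-d vectors below are
        made up purely for illustration):

        .. sourcecode:: pycon

            >>> import numpy as np
            >>> from gensim.models.wrappers.varembed import VarEmbed
            >>>
            >>> model = VarEmbed()
            >>> model.load_word_embeddings(np.array([[0.1, 0.2], [0.3, 0.4]]), {'night': 0, 'day': 1})
            >>> (model.vocab_size, model.vector_size)
            (2, 2)
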
"""
logger.info("Loading the vocabulary")
self.vocab = {}
self.index2word = []
counts = {}
for word in word_to_ix:
counts[word] = counts.get(word, 0) + 1
self.vocab_size = len(counts)
self.vector_size = word_embeddings.shape[1]
self.syn0 = np.zeros((self.vocab_size, self.vector_size))
self.index2word = [None] * self.vocab_size
logger.info("Corpus has %i words", len(self.vocab))
for word_id, word in enumerate(counts):
self.vocab[word] = Vocab(index=word_id, count=counts[word])
self.syn0[word_id] = word_embeddings[word_to_ix[word]]
self.index2word[word_id] = word
assert((len(self.vocab), self.vector_size) == self.syn0.shape)
logger.info("Loaded matrix of %d size and %d dimensions", self.vocab_size, self.vector_size)
    def add_morphemes_to_embeddings(self, morfessor_model, morpho_embeddings, morpho_to_ix):
        """Include morpheme embeddings into the word vectors.

        Parameters
        ----------
        morfessor_model : :class:`morfessor.baseline.BaselineModel`
            Morfessor model.
        morpho_embeddings : numpy.ndarray
            Matrix with morpheme embeddings (unpickled from the varembed output).
        morpho_to_ix : dict of (str, int)
            Mapping of morpheme to its index in `morpho_embeddings`.
        """
        for word in self.vocab:
            morpheme_embedding = np.array(
                [
                    morpho_embeddings[morpho_to_ix.get(m, -1)]
                    for m in morfessor_model.viterbi_segment(word)[0]
                ]
            ).sum(axis=0)
            self.syn0[self.vocab[word].index] += morpheme_embedding
        logger.info("Added morphemes to word vectors")