#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (C) 2010 Radim Rehurek <radimrehurek@seznam.cz>
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
"""
Automated tests for checking transformation algorithms (the models package).
"""
import logging
import unittest
import os
import bz2
import sys
import numpy as np
from gensim import utils
from gensim.models import word2vec, keyedvectors
from gensim.test.utils import datapath, get_tmpfile, common_texts as sentences
from testfixtures import log_capture
try:
from pyemd import emd # noqa:F401
PYEMD_EXT = True
except ImportError:
PYEMD_EXT = False
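# pyemd is an optional dependency; it is needed only for the Word Mover's Distance tests (TestWMD below)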
class LeeCorpus(object):
def __iter__(self):
with open(datapath('lee_background.cor')) as f:
for line in f:
yield utils.simple_preprocess(line)
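# materialize the Lee corpus once, so tests can make multiple passes over it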
list_corpus = list(LeeCorpus())
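# sentences introducing the new words 'artificial' and 'intelligence', used by the online-learning tests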
new_sentences = [
['computer', 'artificial', 'intelligence'],
['artificial', 'trees'],
['human', 'intelligence'],
['artificial', 'graph'],
['intelligence'],
['artificial', 'intelligence', 'system']
]
def _rule(word, count, min_count):
if word == "human":
return utils.RULE_DISCARD # throw out
else:
return utils.RULE_DEFAULT # apply default rule, i.e. min_count
def load_on_instance():
    # save a trained model, then try to load it back via a model *instance* rather than the class
    tmpf = get_tmpfile('gensim_word2vec.tst')
    model = word2vec.Word2Vec(sentences, min_count=1)
    model.save(tmpf)
    model = word2vec.Word2Vec()
    return model.load(tmpf)  # loading on an instance should fail (see testLoadOnClassError)
class TestWord2VecModel(unittest.TestCase):
def testBuildVocabFromFreq(self):
"""Test that the algorithm is able to build vocabulary from given
frequency table"""
freq_dict = {
'minors': 2, 'graph': 3, 'system': 4,
'trees': 3, 'eps': 2, 'computer': 2,
'survey': 2, 'user': 3, 'human': 2,
'time': 2, 'interface': 2, 'response': 2
}
model_hs = word2vec.Word2Vec(size=10, min_count=0, seed=42, hs=1, negative=0)
model_neg = word2vec.Word2Vec(size=10, min_count=0, seed=42, hs=0, negative=5)
model_hs.build_vocab_from_freq(freq_dict)
model_neg.build_vocab_from_freq(freq_dict)
self.assertEqual(len(model_hs.wv.vocab), 12)
self.assertEqual(len(model_neg.wv.vocab), 12)
        # both models should have picked up exactly the counts given in freq_dict
        for model in (model_hs, model_neg):
            for word, expected_count in freq_dict.items():
                self.assertEqual(model.wv.vocab[word].count, expected_count)
new_freq_dict = {
'computer': 1, 'artificial': 4, 'human': 1, 'graph': 1, 'intelligence': 4, 'system': 1, 'trees': 1
}
model_hs.build_vocab_from_freq(new_freq_dict, update=True)
model_neg.build_vocab_from_freq(new_freq_dict, update=True)
self.assertEqual(model_hs.wv.vocab['graph'].count, 4)
self.assertEqual(model_hs.wv.vocab['artificial'].count, 4)
self.assertEqual(len(model_hs.wv.vocab), 14)
self.assertEqual(len(model_neg.wv.vocab), 14)
def testPruneVocab(self):
"""Test Prune vocab while scanning sentences"""
sentences = [
["graph", "system"],
["graph", "system"],
["system", "eps"],
["graph", "system"]
]
model = word2vec.Word2Vec(sentences, size=10, min_count=0, max_vocab_size=2, seed=42, hs=1, negative=0)
self.assertEqual(len(model.wv.vocab), 2)
self.assertEqual(model.wv.vocab['graph'].count, 3)
self.assertEqual(model.wv.vocab['system'].count, 4)
sentences = [
["graph", "system"],
["graph", "system"],
["system", "eps"],
["graph", "system"],
["minors", "survey", "minors", "survey", "minors"]
]
model = word2vec.Word2Vec(sentences, size=10, min_count=0, max_vocab_size=2, seed=42, hs=1, negative=0)
self.assertEqual(len(model.wv.vocab), 3)
self.assertEqual(model.wv.vocab['graph'].count, 3)
self.assertEqual(model.wv.vocab['minors'].count, 3)
self.assertEqual(model.wv.vocab['system'].count, 4)
def testTotalWordCount(self):
model = word2vec.Word2Vec(size=10, min_count=0, seed=42)
total_words = model.vocabulary.scan_vocab(sentences)[0]
self.assertEqual(total_words, 29)
def testMaxFinalVocab(self):
        # Case 1: min_count=4 is already at least as restrictive as max_final_vocab=4,
        # so max_final_vocab has no effect
model = word2vec.Word2Vec(size=10, max_final_vocab=4, min_count=4, sample=0)
model.vocabulary.scan_vocab(sentences)
reported_values = model.vocabulary.prepare_vocab(wv=model.wv, hs=0, negative=0)
self.assertEqual(reported_values['drop_unique'], 11)
self.assertEqual(reported_values['retain_total'], 4)
self.assertEqual(reported_values['num_retained_words'], 1)
self.assertEqual(model.vocabulary.effective_min_count, 4)
        # Case 2: max_final_vocab=4 is more restrictive than the specified min_count=2,
        # so a higher effective min_count (here: 3) is derived
model = word2vec.Word2Vec(size=10, max_final_vocab=4, min_count=2, sample=0)
model.vocabulary.scan_vocab(sentences)
reported_values = model.vocabulary.prepare_vocab(wv=model.wv, hs=0, negative=0)
self.assertEqual(reported_values['drop_unique'], 8)
self.assertEqual(reported_values['retain_total'], 13)
self.assertEqual(reported_values['num_retained_words'], 4)
self.assertEqual(model.vocabulary.effective_min_count, 3)
def testOnlineLearning(self):
"""Test that the algorithm is able to add new words to the
vocabulary and to a trained model when using a sorted vocabulary"""
model_hs = word2vec.Word2Vec(sentences, size=10, min_count=0, seed=42, hs=1, negative=0)
model_neg = word2vec.Word2Vec(sentences, size=10, min_count=0, seed=42, hs=0, negative=5)
        self.assertEqual(len(model_hs.wv.vocab), 12)
        self.assertEqual(model_hs.wv.vocab['graph'].count, 3)
model_hs.build_vocab(new_sentences, update=True)
model_neg.build_vocab(new_sentences, update=True)
        self.assertEqual(model_hs.wv.vocab['graph'].count, 4)
        self.assertEqual(model_hs.wv.vocab['artificial'].count, 4)
self.assertEqual(len(model_hs.wv.vocab), 14)
self.assertEqual(len(model_neg.wv.vocab), 14)
def testOnlineLearningAfterSave(self):
"""Test that the algorithm is able to add new words to the
vocabulary and to a trained model when using a sorted vocabulary"""
tmpf = get_tmpfile('gensim_word2vec.tst')
model_neg = word2vec.Word2Vec(sentences, size=10, min_count=0, seed=42, hs=0, negative=5)
model_neg.save(tmpf)
model_neg = word2vec.Word2Vec.load(tmpf)
        self.assertEqual(len(model_neg.wv.vocab), 12)
model_neg.build_vocab(new_sentences, update=True)
model_neg.train(new_sentences, total_examples=model_neg.corpus_count, epochs=model_neg.iter)
self.assertEqual(len(model_neg.wv.vocab), 14)
def onlineSanity(self, model, trained_model=False):
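        """Sanity-check online training: first train on all documents that do not mention 'terrorism',
        then add the held-out documents via build_vocab(update=True) and continue training."""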
        terro, others = [], []
        for doc in list_corpus:
            if 'terrorism' in doc:
                terro.append(doc)
            else:
                others.append(doc)
        self.assertTrue(all('terrorism' not in doc for doc in others))
model.build_vocab(others, update=trained_model)
model.train(others, total_examples=model.corpus_count, epochs=model.iter)
self.assertFalse('terrorism' in model.wv.vocab)
model.build_vocab(terro, update=True)
self.assertTrue('terrorism' in model.wv.vocab)
orig0 = np.copy(model.wv.syn0)
model.train(terro, total_examples=len(terro), epochs=model.iter)
self.assertFalse(np.allclose(model.wv.syn0, orig0))
sim = model.n_similarity(['war'], ['terrorism'])
self.assertLess(0., sim)
def test_sg_hs_online(self):
"""Test skipgram w/ hierarchical softmax"""
model = word2vec.Word2Vec(sg=1, window=5, hs=1, negative=0, min_count=3, iter=10, seed=42, workers=2)
self.onlineSanity(model)
def test_sg_neg_online(self):
"""Test skipgram w/ negative sampling"""
model = word2vec.Word2Vec(sg=1, window=4, hs=0, negative=15, min_count=3, iter=10, seed=42, workers=2)
self.onlineSanity(model)
def test_cbow_hs_online(self):
"""Test CBOW w/ hierarchical softmax"""
model = word2vec.Word2Vec(
sg=0, cbow_mean=1, alpha=0.05, window=5, hs=1, negative=0,
min_count=3, iter=10, seed=42, workers=2
)
self.onlineSanity(model)
def test_cbow_neg_online(self):
"""Test CBOW w/ negative sampling"""
model = word2vec.Word2Vec(
sg=0, cbow_mean=1, alpha=0.05, window=5, hs=0, negative=15,
min_count=5, iter=10, seed=42, workers=2, sample=0
)
self.onlineSanity(model)
def testPersistence(self):
"""Test storing/loading the entire model."""
tmpf = get_tmpfile('gensim_word2vec.tst')
model = word2vec.Word2Vec(sentences, min_count=1)
model.save(tmpf)
self.models_equal(model, word2vec.Word2Vec.load(tmpf))
# test persistence of the KeyedVectors of a model
wv = model.wv
wv.save(tmpf)
loaded_wv = keyedvectors.KeyedVectors.load(tmpf)
self.assertTrue(np.allclose(wv.syn0, loaded_wv.syn0))
self.assertEqual(len(wv.vocab), len(loaded_wv.vocab))
def testPersistenceWithConstructorRule(self):
"""Test storing/loading the entire model with a vocab trimming rule passed in the constructor."""
tmpf = get_tmpfile('gensim_word2vec.tst')
model = word2vec.Word2Vec(sentences, min_count=1, trim_rule=_rule)
model.save(tmpf)
self.models_equal(model, word2vec.Word2Vec.load(tmpf))
def testRuleWithMinCount(self):
"""Test that returning RULE_DEFAULT from trim_rule triggers min_count."""
model = word2vec.Word2Vec(sentences + [["occurs_only_once"]], min_count=2, trim_rule=_rule)
self.assertTrue("human" not in model.wv.vocab)
self.assertTrue("occurs_only_once" not in model.wv.vocab)
self.assertTrue("interface" in model.wv.vocab)
def testRule(self):
"""Test applying vocab trim_rule to build_vocab instead of constructor."""
model = word2vec.Word2Vec(min_count=1)
model.build_vocab(sentences, trim_rule=_rule)
self.assertTrue("human" not in model.wv.vocab)
def testLambdaRule(self):
"""Test that lambda trim_rule works."""
def rule(word, count, min_count):
return utils.RULE_DISCARD if word == "human" else utils.RULE_DEFAULT
model = word2vec.Word2Vec(sentences, min_count=1, trim_rule=rule)
self.assertTrue("human" not in model.wv.vocab)
def testSyn0NormNotSaved(self):
"""Test syn0norm isn't saved in model file"""
tmpf = get_tmpfile('gensim_word2vec.tst')
model = word2vec.Word2Vec(sentences, min_count=1)
model.init_sims()
model.save(tmpf)
loaded_model = word2vec.Word2Vec.load(tmpf)
self.assertTrue(loaded_model.wv.syn0norm is None)
wv = model.wv
wv.save(tmpf)
loaded_kv = keyedvectors.KeyedVectors.load(tmpf)
self.assertTrue(loaded_kv.syn0norm is None)
def testLoadPreKeyedVectorModel(self):
"""Test loading pre-KeyedVectors word2vec model"""
if sys.version_info[:2] == (3, 4):
model_file_suffix = '_py3_4'
elif sys.version_info < (3,):
model_file_suffix = '_py2'
else:
model_file_suffix = '_py3'
# Model stored in one file
model_file = 'word2vec_pre_kv%s' % model_file_suffix
model = word2vec.Word2Vec.load(datapath(model_file))
self.assertTrue(model.wv.syn0.shape == (len(model.wv.vocab), model.vector_size))
self.assertTrue(model.syn1neg.shape == (len(model.wv.vocab), model.vector_size))
# Model stored in multiple files
model_file = 'word2vec_pre_kv_sep%s' % model_file_suffix
model = word2vec.Word2Vec.load(datapath(model_file))
self.assertTrue(model.wv.syn0.shape == (len(model.wv.vocab), model.vector_size))
self.assertTrue(model.syn1neg.shape == (len(model.wv.vocab), model.vector_size))
def testLoadPreKeyedVectorModelCFormat(self):
"""Test loading pre-KeyedVectors word2vec model saved in word2vec format"""
model = keyedvectors.KeyedVectors.load_word2vec_format(datapath('word2vec_pre_kv_c'))
self.assertTrue(model.syn0.shape[0] == len(model.vocab))
def testPersistenceWord2VecFormat(self):
"""Test storing/loading the entire model in word2vec format."""
tmpf = get_tmpfile('gensim_word2vec.tst')
model = word2vec.Word2Vec(sentences, min_count=1)
model.init_sims()
model.wv.save_word2vec_format(tmpf, binary=True)
binary_model_kv = keyedvectors.KeyedVectors.load_word2vec_format(tmpf, binary=True)
binary_model_kv.init_sims(replace=False)
self.assertTrue(np.allclose(model['human'], binary_model_kv['human']))
norm_only_model = keyedvectors.KeyedVectors.load_word2vec_format(tmpf, binary=True)
norm_only_model.init_sims(replace=True)
self.assertFalse(np.allclose(model['human'], norm_only_model['human']))
self.assertTrue(np.allclose(model.wv.syn0norm[model.wv.vocab['human'].index], norm_only_model['human']))
limited_model_kv = keyedvectors.KeyedVectors.load_word2vec_format(tmpf, binary=True, limit=3)
self.assertEqual(len(limited_model_kv.syn0), 3)
half_precision_model_kv = keyedvectors.KeyedVectors.load_word2vec_format(
tmpf, binary=True, datatype=np.float16
)
self.assertEqual(binary_model_kv.syn0.nbytes, half_precision_model_kv.syn0.nbytes * 2)
def testNoTrainingCFormat(self):
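        """Test that a model whose word vectors were loaded from word2vec (C) format cannot be
        trained further, since the full model state is missing (raises ValueError)."""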
tmpf = get_tmpfile('gensim_word2vec.tst')
model = word2vec.Word2Vec(sentences, min_count=1)
model.init_sims()
model.wv.save_word2vec_format(tmpf, binary=True)
kv = keyedvectors.KeyedVectors.load_word2vec_format(tmpf, binary=True)
binary_model = word2vec.Word2Vec()
binary_model.wv = kv
self.assertRaises(ValueError, binary_model.train, sentences)
def testTooShortBinaryWord2VecFormat(self):
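        """Test that a binary word2vec file whose header claims more vectors than are present raises EOFError."""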
tfile = get_tmpfile('gensim_word2vec.tst')
model = word2vec.Word2Vec(sentences, min_count=1)
model.init_sims()
model.wv.save_word2vec_format(tfile, binary=True)
        with open(tfile, 'r+b') as f:
            f.write(b'13')  # write a wrong (too-long) vector count into the header
self.assertRaises(EOFError, keyedvectors.KeyedVectors.load_word2vec_format, tfile, binary=True)
def testTooShortTextWord2VecFormat(self):
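        """Test that a text word2vec file whose header claims more vectors than are present raises EOFError."""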
tfile = get_tmpfile('gensim_word2vec.tst')
model = word2vec.Word2Vec(sentences, min_count=1)
model.init_sims()
model.wv.save_word2vec_format(tfile, binary=False)
        with open(tfile, 'r+b') as f:
            f.write(b'13')  # write a wrong (too-long) vector count into the header
self.assertRaises(EOFError, keyedvectors.KeyedVectors.load_word2vec_format, tfile, binary=False)
def testPersistenceWord2VecFormatNonBinary(self):
"""Test storing/loading the entire model in word2vec non-binary format."""
tmpf = get_tmpfile('gensim_word2vec.tst')
model = word2vec.Word2Vec(sentences, min_count=1)
model.init_sims()
model.wv.save_word2vec_format(tmpf, binary=False)
text_model = keyedvectors.KeyedVectors.load_word2vec_format(tmpf, binary=False)
        text_model.init_sims(replace=False)
self.assertTrue(np.allclose(model['human'], text_model['human'], atol=1e-6))
norm_only_model = keyedvectors.KeyedVectors.load_word2vec_format(tmpf, binary=False)
        norm_only_model.init_sims(replace=True)
self.assertFalse(np.allclose(model['human'], norm_only_model['human'], atol=1e-6))
self.assertTrue(np.allclose(
model.wv.syn0norm[model.wv.vocab['human'].index], norm_only_model['human'], atol=1e-4
))
def testPersistenceWord2VecFormatWithVocab(self):
"""Test storing/loading the entire model and vocabulary in word2vec format."""
tmpf = get_tmpfile('gensim_word2vec.tst')
model = word2vec.Word2Vec(sentences, min_count=1)
model.init_sims()
testvocab = get_tmpfile('gensim_word2vec.vocab')
model.wv.save_word2vec_format(tmpf, testvocab, binary=True)
binary_model_with_vocab_kv = keyedvectors.KeyedVectors.load_word2vec_format(tmpf, testvocab, binary=True)
self.assertEqual(model.wv.vocab['human'].count, binary_model_with_vocab_kv.vocab['human'].count)
def testPersistenceKeyedVectorsFormatWithVocab(self):
"""Test storing/loading the entire model and vocabulary in word2vec format."""
tmpf = get_tmpfile('gensim_word2vec.tst')
model = word2vec.Word2Vec(sentences, min_count=1)
model.init_sims()
testvocab = get_tmpfile('gensim_word2vec.vocab')
model.wv.save_word2vec_format(tmpf, testvocab, binary=True)
kv_binary_model_with_vocab = keyedvectors.KeyedVectors.load_word2vec_format(tmpf, testvocab, binary=True)
self.assertEqual(model.wv.vocab['human'].count, kv_binary_model_with_vocab.vocab['human'].count)
def testPersistenceWord2VecFormatCombinationWithStandardPersistence(self):
"""Test storing/loading the entire model and vocabulary in word2vec format chained with
saving and loading via `save` and `load` methods`.
It was possible prior to 1.0.0 release, now raises Exception"""
tmpf = get_tmpfile('gensim_word2vec.tst')
model = word2vec.Word2Vec(sentences, min_count=1)
model.init_sims()
testvocab = get_tmpfile('gensim_word2vec.vocab')
model.wv.save_word2vec_format(tmpf, testvocab, binary=True)
binary_model_with_vocab_kv = keyedvectors.KeyedVectors.load_word2vec_format(tmpf, testvocab, binary=True)
binary_model_with_vocab_kv.save(tmpf)
self.assertRaises(AttributeError, word2vec.Word2Vec.load, tmpf)
def testLargeMmap(self):
"""Test storing/loading the entire model."""
tmpf = get_tmpfile('gensim_word2vec.tst')
model = word2vec.Word2Vec(sentences, min_count=1)
# test storing the internal arrays into separate files
model.save(tmpf, sep_limit=0)
self.models_equal(model, word2vec.Word2Vec.load(tmpf))
        # make sure mmapping the arrays back works, too
self.models_equal(model, word2vec.Word2Vec.load(tmpf, mmap='r'))
def testVocab(self):
"""Test word2vec vocabulary building."""
corpus = LeeCorpus()
total_words = sum(len(sentence) for sentence in corpus)
# try vocab building explicitly, using all words
model = word2vec.Word2Vec(min_count=1, hs=1, negative=0)
model.build_vocab(corpus)
self.assertTrue(len(model.wv.vocab) == 6981)
# with min_count=1, we're not throwing away anything,
# so make sure the word counts add up to be the entire corpus
self.assertEqual(sum(v.count for v in model.wv.vocab.values()), total_words)
        # make sure the binary (Huffman) codes are correct
        self.assertTrue(np.allclose(model.wv.vocab['the'].code, [1, 1, 0, 0]))
# test building vocab with default params
model = word2vec.Word2Vec(hs=1, negative=0)
model.build_vocab(corpus)
self.assertTrue(len(model.wv.vocab) == 1750)
        self.assertTrue(np.allclose(model.wv.vocab['the'].code, [1, 1, 1, 0]))
# no input => "RuntimeError: you must first build vocabulary before training the model"
self.assertRaises(RuntimeError, word2vec.Word2Vec, [])
# input not empty, but rather completely filtered out
self.assertRaises(RuntimeError, word2vec.Word2Vec, corpus, min_count=total_words + 1)
def testTraining(self):
"""Test word2vec training."""
# build vocabulary, don't train yet
model = word2vec.Word2Vec(size=2, min_count=1, hs=1, negative=0)
model.build_vocab(sentences)
self.assertTrue(model.wv.syn0.shape == (len(model.wv.vocab), 2))
self.assertTrue(model.syn1.shape == (len(model.wv.vocab), 2))
model.train(sentences, total_examples=model.corpus_count, epochs=model.iter)
sims = model.most_similar('graph', topn=10)
# self.assertTrue(sims[0][0] == 'trees', sims) # most similar
# test querying for "most similar" by vector
graph_vector = model.wv.syn0norm[model.wv.vocab['graph'].index]
sims2 = model.most_similar(positive=[graph_vector], topn=11)
sims2 = [(w, sim) for w, sim in sims2 if w != 'graph'] # ignore 'graph' itself
self.assertEqual(sims, sims2)
# build vocab and train in one step; must be the same as above
model2 = word2vec.Word2Vec(sentences, size=2, min_count=1, hs=1, negative=0)
self.models_equal(model, model2)
def testScoring(self):
"""Test word2vec scoring."""
model = word2vec.Word2Vec(sentences, size=2, min_count=1, hs=1, negative=0)
# just score and make sure they exist
scores = model.score(sentences, len(sentences))
self.assertEqual(len(scores), len(sentences))
def testLocking(self):
"""Test word2vec training doesn't change locked vectors."""
corpus = LeeCorpus()
# build vocabulary, don't train yet
for sg in range(2): # test both cbow and sg
model = word2vec.Word2Vec(size=4, hs=1, negative=5, min_count=1, sg=sg, window=5)
model.build_vocab(corpus)
# remember two vectors
locked0 = np.copy(model.wv.syn0[0])
unlocked1 = np.copy(model.wv.syn0[1])
# lock the vector in slot 0 against change
model.syn0_lockf[0] = 0.0
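            # (each syn0_lockf entry scales that word's gradient updates; 0.0 freezes the vector entirely)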
model.train(corpus, total_examples=model.corpus_count, epochs=model.iter)
self.assertFalse((unlocked1 == model.wv.syn0[1]).all()) # unlocked vector should vary
self.assertTrue((locked0 == model.wv.syn0[0]).all()) # locked vector should not vary
def testAccuracy(self):
"""Test Word2Vec accuracy and KeyedVectors accuracy give the same result"""
model = word2vec.Word2Vec(LeeCorpus())
w2v_accuracy = model.accuracy(datapath('questions-words.txt'))
kv_accuracy = model.wv.accuracy(datapath('questions-words.txt'))
self.assertEqual(w2v_accuracy, kv_accuracy)
def testEvaluateWordPairs(self):
"""Test Spearman and Pearson correlation coefficients give sane results on similarity datasets"""
corpus = word2vec.LineSentence(datapath('head500.noblanks.cor.bz2'))
model = word2vec.Word2Vec(corpus, min_count=3, iter=10)
correlation = model.evaluate_word_pairs(datapath('wordsim353.tsv'))
pearson = correlation[0][0]
spearman = correlation[1][0]
oov = correlation[2]
self.assertTrue(0.1 < pearson < 1.0)
self.assertTrue(0.1 < spearman < 1.0)
self.assertTrue(0.0 <= oov < 90.0)
def model_sanity(self, model, train=True):
"""Even tiny models trained on LeeCorpus should pass these sanity checks"""
# run extra before/after training tests if train=True
if train:
model.build_vocab(list_corpus)
orig0 = np.copy(model.wv.syn0[0])
model.train(list_corpus, total_examples=model.corpus_count, epochs=model.iter)
self.assertFalse((orig0 == model.wv.syn0[1]).all()) # vector should vary after training
sims = model.most_similar('war', topn=len(model.wv.index2word))
t_rank = [word for word, score in sims].index('terrorism')
        # in >200 calibration runs with these parameters, 'terrorism' appeared among the 50 words most similar to 'war'
self.assertLess(t_rank, 50)
war_vec = model['war']
sims2 = model.most_similar([war_vec], topn=51)
self.assertTrue('war' in [word for word, score in sims2])
self.assertTrue('terrorism' in [word for word, score in sims2])
def test_sg_hs(self):
"""Test skipgram w/ hierarchical softmax"""
model = word2vec.Word2Vec(sg=1, window=4, hs=1, negative=0, min_count=5, iter=10, workers=2)
self.model_sanity(model)
def test_sg_neg(self):
"""Test skipgram w/ negative sampling"""
model = word2vec.Word2Vec(sg=1, window=4, hs=0, negative=15, min_count=5, iter=10, workers=2)
self.model_sanity(model)
def test_cbow_hs(self):
"""Test CBOW w/ hierarchical softmax"""
model = word2vec.Word2Vec(
sg=0, cbow_mean=1, alpha=0.05, window=8, hs=1, negative=0,
min_count=5, iter=10, workers=2, batch_words=1000
)
self.model_sanity(model)
def test_cbow_neg(self):
"""Test CBOW w/ negative sampling"""
model = word2vec.Word2Vec(
sg=0, cbow_mean=1, alpha=0.05, window=5, hs=0, negative=15,
min_count=5, iter=10, workers=2, sample=0
)
self.model_sanity(model)
def test_cosmul(self):
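        """Test that most_similar_cosmul gives matching results for word and vector queries."""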
model = word2vec.Word2Vec(sentences, size=2, min_count=1, hs=1, negative=0)
sims = model.most_similar_cosmul('graph', topn=10)
# self.assertTrue(sims[0][0] == 'trees', sims) # most similar
# test querying for "most similar" by vector
graph_vector = model.wv.syn0norm[model.wv.vocab['graph'].index]
sims2 = model.most_similar_cosmul(positive=[graph_vector], topn=11)
sims2 = [(w, sim) for w, sim in sims2 if w != 'graph'] # ignore 'graph' itself
self.assertEqual(sims, sims2)
def testTrainingCbow(self):
"""Test CBOW word2vec training."""
        # build vocabulary, don't train yet
model = word2vec.Word2Vec(size=2, min_count=1, sg=0, hs=1, negative=0)
model.build_vocab(sentences)
self.assertTrue(model.wv.syn0.shape == (len(model.wv.vocab), 2))
self.assertTrue(model.syn1.shape == (len(model.wv.vocab), 2))
model.train(sentences, total_examples=model.corpus_count, epochs=model.iter)
sims = model.most_similar('graph', topn=10)
# self.assertTrue(sims[0][0] == 'trees', sims) # most similar
# test querying for "most similar" by vector
graph_vector = model.wv.syn0norm[model.wv.vocab['graph'].index]
sims2 = model.most_similar(positive=[graph_vector], topn=11)
sims2 = [(w, sim) for w, sim in sims2 if w != 'graph'] # ignore 'graph' itself
self.assertEqual(sims, sims2)
# build vocab and train in one step; must be the same as above
model2 = word2vec.Word2Vec(sentences, size=2, min_count=1, sg=0, hs=1, negative=0)
self.models_equal(model, model2)
def testTrainingSgNegative(self):
"""Test skip-gram (negative sampling) word2vec training."""
        # build vocabulary, don't train yet
model = word2vec.Word2Vec(size=2, min_count=1, sg=1, hs=0, negative=2)
model.build_vocab(sentences)
self.assertTrue(model.wv.syn0.shape == (len(model.wv.vocab), 2))
self.assertTrue(model.syn1neg.shape == (len(model.wv.vocab), 2))
model.train(sentences, total_examples=model.corpus_count, epochs=model.iter)
sims = model.most_similar('graph', topn=10)
# self.assertTrue(sims[0][0] == 'trees', sims) # most similar
# test querying for "most similar" by vector
graph_vector = model.wv.syn0norm[model.wv.vocab['graph'].index]
sims2 = model.most_similar(positive=[graph_vector], topn=11)
sims2 = [(w, sim) for w, sim in sims2 if w != 'graph'] # ignore 'graph' itself
self.assertEqual(sims, sims2)
# build vocab and train in one step; must be the same as above
model2 = word2vec.Word2Vec(sentences, size=2, min_count=1, sg=1, hs=0, negative=2)
self.models_equal(model, model2)
def testTrainingCbowNegative(self):
"""Test CBOW (negative sampling) word2vec training."""
        # build vocabulary, don't train yet
model = word2vec.Word2Vec(size=2, min_count=1, sg=0, hs=0, negative=2)
model.build_vocab(sentences)
self.assertTrue(model.wv.syn0.shape == (len(model.wv.vocab), 2))
self.assertTrue(model.syn1neg.shape == (len(model.wv.vocab), 2))
model.train(sentences, total_examples=model.corpus_count, epochs=model.iter)
sims = model.most_similar('graph', topn=10)
# self.assertTrue(sims[0][0] == 'trees', sims) # most similar
# test querying for "most similar" by vector
graph_vector = model.wv.syn0norm[model.wv.vocab['graph'].index]
sims2 = model.most_similar(positive=[graph_vector], topn=11)
sims2 = [(w, sim) for w, sim in sims2 if w != 'graph'] # ignore 'graph' itself
self.assertEqual(sims, sims2)
# build vocab and train in one step; must be the same as above
model2 = word2vec.Word2Vec(sentences, size=2, min_count=1, sg=0, hs=0, negative=2)
self.models_equal(model, model2)
def testSimilarities(self):
"""Test similarity and n_similarity methods."""
# The model is trained using CBOW
model = word2vec.Word2Vec(size=2, min_count=1, sg=0, hs=0, negative=2)
model.build_vocab(sentences)
model.train(sentences, total_examples=model.corpus_count, epochs=model.iter)
self.assertTrue(model.n_similarity(['graph', 'trees'], ['trees', 'graph']))
self.assertTrue(model.n_similarity(['graph'], ['trees']) == model.similarity('graph', 'trees'))
self.assertRaises(ZeroDivisionError, model.n_similarity, ['graph', 'trees'], [])
self.assertRaises(ZeroDivisionError, model.n_similarity, [], ['graph', 'trees'])
self.assertRaises(ZeroDivisionError, model.n_similarity, [], [])
def testSimilarBy(self):
"""Test word2vec similar_by_word and similar_by_vector."""
model = word2vec.Word2Vec(sentences, size=2, min_count=1, hs=1, negative=0)
wordsims = model.similar_by_word('graph', topn=10)
wordsims2 = model.most_similar(positive='graph', topn=10)
vectorsims = model.similar_by_vector(model['graph'], topn=10)
vectorsims2 = model.most_similar([model['graph']], topn=10)
self.assertEqual(wordsims, wordsims2)
self.assertEqual(vectorsims, vectorsims2)
def testParallel(self):
"""Test word2vec parallel training."""
        if word2vec.FAST_VERSION < 0:  # don't test the plain np version for parallelism (too slow)
            self.skipTest('parallel training requires the optimized (C) version of word2vec')
corpus = utils.RepeatCorpus(LeeCorpus(), 10000)
for workers in [2, 4]:
model = word2vec.Word2Vec(corpus, workers=workers)
sims = model.most_similar('israeli') # noqa:F841
# the exact vectors and therefore similarities may differ, due to different thread collisions/randomization
# so let's test only for top3
# TODO: commented out for now; find a more robust way to compare against "gold standard"
# self.assertTrue('palestinian' in [sims[i][0] for i in range(3)])
def testRNG(self):
"""Test word2vec results identical with identical RNG seed."""
model = word2vec.Word2Vec(sentences, min_count=2, seed=42, workers=1)
model2 = word2vec.Word2Vec(sentences, min_count=2, seed=42, workers=1)
self.models_equal(model, model2)
def models_equal(self, model, model2):
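        """Assert that two models have the same vocabulary, weights, and vector for their most frequent word."""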
self.assertEqual(len(model.wv.vocab), len(model2.wv.vocab))
self.assertTrue(np.allclose(model.wv.syn0, model2.wv.syn0))
if model.hs:
self.assertTrue(np.allclose(model.syn1, model2.syn1))
if model.negative:
self.assertTrue(np.allclose(model.syn1neg, model2.syn1neg))
most_common_word = max(model.wv.vocab.items(), key=lambda item: item[1].count)[0]
self.assertTrue(np.allclose(model[most_common_word], model2[most_common_word]))
def testDeleteTemporaryTrainingData(self):
"""Test word2vec model after delete_temporary_training_data"""
for i in [0, 1]:
for j in [0, 1]:
model = word2vec.Word2Vec(sentences, size=10, min_count=0, seed=42, hs=i, negative=j)
if i:
self.assertTrue(hasattr(model, 'syn1'))
if j:
self.assertTrue(hasattr(model, 'syn1neg'))
self.assertTrue(hasattr(model, 'syn0_lockf'))
model.delete_temporary_training_data(replace_word_vectors_with_normalized=True)
                self.assertEqual(len(model['human']), 10)
                self.assertEqual(len(model.wv.vocab), 12)
                self.assertEqual(model.wv.vocab['graph'].count, 3)
self.assertTrue(not hasattr(model, 'syn1'))
self.assertTrue(not hasattr(model, 'syn1neg'))
self.assertTrue(not hasattr(model, 'syn0_lockf'))
def testNormalizeAfterTrainingData(self):
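        """Test that delete_temporary_training_data(replace_word_vectors_with_normalized=True)
        replaces the raw word vectors with their normalized versions."""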
tmpf = get_tmpfile('gensim_word2vec.tst')
model = word2vec.Word2Vec(sentences, min_count=1)
model.save(tmpf)
norm_only_model = word2vec.Word2Vec.load(tmpf)
norm_only_model.delete_temporary_training_data(replace_word_vectors_with_normalized=True)
self.assertFalse(np.allclose(model['human'], norm_only_model['human']))
def testPredictOutputWord(self):
'''Test word2vec predict_output_word method handling for negative sampling scheme'''
# under normal circumstances
model_with_neg = word2vec.Word2Vec(sentences, min_count=1)
predictions_with_neg = model_with_neg.predict_output_word(['system', 'human'], topn=5)
self.assertTrue(len(predictions_with_neg) == 5)
        # out-of-vocabulary scenario
predictions_out_of_vocab = model_with_neg.predict_output_word(['some', 'random', 'words'], topn=5)
self.assertEqual(predictions_out_of_vocab, None)
# when required model parameters have been deleted
tmpf = get_tmpfile('gensim_word2vec.tst')
model_with_neg.init_sims()
model_with_neg.wv.save_word2vec_format(tmpf, binary=True)
kv_model_with_neg = keyedvectors.KeyedVectors.load_word2vec_format(tmpf, binary=True)
binary_model_with_neg = word2vec.Word2Vec()
binary_model_with_neg.wv = kv_model_with_neg
self.assertRaises(RuntimeError, binary_model_with_neg.predict_output_word, ['system', 'human'])
# negative sampling scheme not used
model_without_neg = word2vec.Word2Vec(sentences, min_count=1, negative=0)
self.assertRaises(RuntimeError, model_without_neg.predict_output_word, ['system', 'human'])
def testLoadOldModel(self):
"""Test loading word2vec models from previous version"""
model_file = 'word2vec_old'
model = word2vec.Word2Vec.load(datapath(model_file))
self.assertTrue(model.wv.vectors.shape == (12, 100))
self.assertTrue(len(model.wv.vocab) == 12)
self.assertTrue(len(model.wv.index2word) == 12)
self.assertTrue(model.syn1neg.shape == (len(model.wv.vocab), model.wv.vector_size))
self.assertTrue(model.trainables.vectors_lockf.shape == (12,))
self.assertTrue(model.vocabulary.cum_table.shape == (12,))
self.onlineSanity(model, trained_model=True)
# Model stored in multiple files
model_file = 'word2vec_old_sep'
model = word2vec.Word2Vec.load(datapath(model_file))
self.assertTrue(model.wv.vectors.shape == (12, 100))
self.assertTrue(len(model.wv.vocab) == 12)
self.assertTrue(len(model.wv.index2word) == 12)
self.assertTrue(model.syn1neg.shape == (len(model.wv.vocab), model.wv.vector_size))
self.assertTrue(model.trainables.vectors_lockf.shape == (12,))
self.assertTrue(model.vocabulary.cum_table.shape == (12,))
self.onlineSanity(model, trained_model=True)
# load really old model
model_file = 'w2v-lee-v0.12.0'
model = word2vec.Word2Vec.load(datapath(model_file))
self.onlineSanity(model, trained_model=True)
# test for max_final_vocab for model saved in 3.3
model_file = 'word2vec_3.3'
model = word2vec.Word2Vec.load(datapath(model_file))
self.assertEqual(model.max_final_vocab, None)
self.assertEqual(model.vocabulary.max_final_vocab, None)
# Test loading word2vec models from all previous versions
old_versions = [
'0.12.0', '0.12.1', '0.12.2', '0.12.3', '0.12.4',
'0.13.0', '0.13.1', '0.13.2', '0.13.3', '0.13.4',
'1.0.0', '1.0.1', '2.0.0', '2.1.0', '2.2.0', '2.3.0',
'3.0.0', '3.1.0', '3.2.0', '3.3.0', '3.4.0'
]
saved_models_dir = datapath('old_w2v_models/w2v_{}.mdl')
for old_version in old_versions:
model = word2vec.Word2Vec.load(saved_models_dir.format(old_version))
self.assertTrue(len(model.wv.vocab) == 3)
self.assertTrue(model.wv.vectors.shape == (3, 4))
# check if similarity search and online training works.
self.assertTrue(len(model.wv.most_similar('sentence')) == 2)
model.build_vocab(list_corpus, update=True)
model.train(list_corpus, total_examples=model.corpus_count, epochs=model.iter)
# check if similarity search and online training works after saving and loading back the model.
tmpf = get_tmpfile('gensim_word2vec.tst')
model.save(tmpf)
loaded_model = word2vec.Word2Vec.load(tmpf)
loaded_model.build_vocab(list_corpus, update=True)
loaded_model.train(list_corpus, total_examples=model.corpus_count, epochs=model.iter)
@log_capture()
def testBuildVocabWarning(self, l):
"""Test if warning is raised on non-ideal input to a word2vec model"""
sentences = ['human', 'machine']
model = word2vec.Word2Vec()
model.build_vocab(sentences)
warning = "Each 'sentences' item should be a list of words (usually unicode strings)."
self.assertTrue(warning in str(l))
@log_capture()
def testTrainWarning(self, l):
"""Test if warning is raised if alpha rises during subsequent calls to train()"""
sentences = [
['human'],
['graph', 'trees']
]
model = word2vec.Word2Vec(min_count=1)
model.build_vocab(sentences)
for epoch in range(10):
model.train(sentences, total_examples=model.corpus_count, epochs=model.iter)
model.alpha -= 0.002
model.min_alpha = model.alpha
if epoch == 5:
model.alpha += 0.05
warning = "Effective 'alpha' higher than previous training cycles"
self.assertTrue(warning in str(l))
def test_train_with_explicit_param(self):
model = word2vec.Word2Vec(size=2, min_count=1, hs=1, negative=0)
model.build_vocab(sentences)
with self.assertRaises(ValueError):
model.train(sentences, total_examples=model.corpus_count)
with self.assertRaises(ValueError):
model.train(sentences, epochs=model.iter)
with self.assertRaises(ValueError):
model.train(sentences)
def test_sentences_should_not_be_a_generator(self):
"""
Is sentences a generator object?
"""
gen = (s for s in sentences)
self.assertRaises(TypeError, word2vec.Word2Vec, (gen,))
def testLoadOnClassError(self):
"""Test if exception is raised when loading word2vec model on instance"""
self.assertRaises(AttributeError, load_on_instance)
def test_reset_from(self):
"""Test if reset_from() uses pre-built structures from other model"""
model = word2vec.Word2Vec(sentences, min_count=1)
other_model = word2vec.Word2Vec(new_sentences, min_count=1)
other_vocab = other_model.wv.vocab
model.reset_from(other_model)
self.assertEqual(model.wv.vocab, other_vocab)
def test_compute_training_loss(self):
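        """Test that training with compute_loss=True accumulates a positive running loss."""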
model = word2vec.Word2Vec(min_count=1, sg=1, negative=5, hs=1)
model.build_vocab(sentences)
model.train(sentences, compute_loss=True, total_examples=model.corpus_count, epochs=model.iter)
training_loss_val = model.get_latest_training_loss()
self.assertTrue(training_loss_val > 0.0)
# endclass TestWord2VecModel
class TestWMD(unittest.TestCase):
def testNonzero(self):
'''Test basic functionality with a test sentence.'''
        if not PYEMD_EXT:
            self.skipTest('pyemd not installed')
model = word2vec.Word2Vec(sentences, min_count=2, seed=42, workers=1)
sentence1 = ['human', 'interface', 'computer']
sentence2 = ['survey', 'user', 'computer', 'system', 'response', 'time']
distance = model.wmdistance(sentence1, sentence2)
# Check that distance is non-zero.
self.assertFalse(distance == 0.0)
def testSymmetry(self):
'''Check that distance is symmetric.'''
        if not PYEMD_EXT:
            self.skipTest('pyemd not installed')
model = word2vec.Word2Vec(sentences, min_count=2, seed=42, workers=1)
sentence1 = ['human', 'interface', 'computer']
sentence2 = ['survey', 'user', 'computer', 'system', 'response', 'time']
distance1 = model.wmdistance(sentence1, sentence2)
distance2 = model.wmdistance(sentence2, sentence1)
self.assertTrue(np.allclose(distance1, distance2))
def testIdenticalSentences(self):
'''Check that the distance from a sentence to itself is zero.'''
        if not PYEMD_EXT:
            self.skipTest('pyemd not installed')
model = word2vec.Word2Vec(sentences, min_count=1)
sentence = ['survey', 'user', 'computer', 'system', 'response', 'time']
distance = model.wmdistance(sentence, sentence)
self.assertEqual(0.0, distance)
class TestWord2VecSentenceIterators(unittest.TestCase):
def testLineSentenceWorksWithFilename(self):
"""Does LineSentence work with a filename argument?"""
with utils.smart_open(datapath('lee_background.cor')) as orig:
sentences = word2vec.LineSentence(datapath('lee_background.cor'))
for words in sentences:
self.assertEqual(words, utils.to_unicode(orig.readline()).split())
def testLineSentenceWorksWithCompressedFile(self):
"""Does LineSentence work with a compressed file object argument?"""
with utils.smart_open(datapath('head500.noblanks.cor')) as orig:
sentences = word2vec.LineSentence(bz2.BZ2File(datapath('head500.noblanks.cor.bz2')))
for words in sentences:
self.assertEqual(words, utils.to_unicode(orig.readline()).split())
def testLineSentenceWorksWithNormalFile(self):
"""Does LineSentence work with a file object argument, rather than filename?"""
with utils.smart_open(datapath('head500.noblanks.cor')) as orig:
with utils.smart_open(datapath('head500.noblanks.cor')) as fin:
sentences = word2vec.LineSentence(fin)
for words in sentences:
self.assertEqual(words, utils.to_unicode(orig.readline()).split())
def testPathLineSentences(self):
"""Does PathLineSentences work with a path argument?"""
with utils.smart_open(os.path.join(datapath('PathLineSentences'), '1.txt')) as orig1,\
utils.smart_open(os.path.join(datapath('PathLineSentences'), '2.txt.bz2')) as orig2:
sentences = word2vec.PathLineSentences(datapath('PathLineSentences'))
orig = orig1.readlines() + orig2.readlines()
orig_counter = 0 # to go through orig while matching PathLineSentences
for words in sentences:
self.assertEqual(words, utils.to_unicode(orig[orig_counter]).split())
orig_counter += 1
def testPathLineSentencesOneFile(self):
"""Does PathLineSentences work with a single file argument?"""
test_file = os.path.join(datapath('PathLineSentences'), '1.txt')
with utils.smart_open(test_file) as orig:
sentences = word2vec.PathLineSentences(test_file)
for words in sentences:
self.assertEqual(words, utils.to_unicode(orig.readline()).split())
# endclass TestWord2VecSentenceIterators
# TODO: get correct path to Python binary
# class TestWord2VecScripts(unittest.TestCase):
# def testWord2VecStandAloneScript(self):
# """Does Word2Vec script launch standalone?"""
# cmd = 'python -m gensim.scripts.word2vec_standalone -train ' + datapath('testcorpus.txt') + \
# ' -output vec.txt -size 200 -sample 1e-4 -binary 0 -iter 3 -min_count 1'
# output = check_output(cmd, stderr=PIPE)
# self.assertEqual(output, '0')
# #endclass TestWord2VecScripts
if not hasattr(TestWord2VecModel, 'assertLess'):
# workaround for python 2.6
def assertLess(self, a, b, msg=None):
self.assertTrue(a < b, msg="%s is not less than %s" % (a, b))
setattr(TestWord2VecModel, 'assertLess', assertLess)
if __name__ == '__main__':
logging.basicConfig(
format='%(asctime)s : %(threadName)s : %(levelname)s : %(message)s',
level=logging.DEBUG
)
logging.info("using optimization %s", word2vec.FAST_VERSION)
unittest.main()