#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (C) 2010 Radim Rehurek <radimrehurek@seznam.cz>
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html

"""
Automated tests for checking transformation algorithms (the models package).
"""


import logging
import unittest
import os
import bz2
import sys

import numpy as np

from gensim import utils
from gensim.models import word2vec, keyedvectors
from gensim.test.utils import datapath, get_tmpfile, common_texts as sentences
from testfixtures import log_capture

try:
    from pyemd import emd  # noqa:F401
    PYEMD_EXT = True
except ImportError:
    PYEMD_EXT = False
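
# The WMD tests in TestWMD below skip themselves when pyemd is not installed.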


class LeeCorpus(object):
    def __iter__(self):
        with open(datapath('lee_background.cor')) as f:
            for line in f:
                yield utils.simple_preprocess(line)


list_corpus = list(LeeCorpus())
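# LeeCorpus streams the bundled Lee background corpus one preprocessed
# sentence at a time; list_corpus materializes it up front so tests can
# make multiple passes over it.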

new_sentences = [
    ['computer', 'artificial', 'intelligence'],
    ['artificial', 'trees'],
    ['human', 'intelligence'],
    ['artificial', 'graph'],
    ['intelligence'],
    ['artificial', 'intelligence', 'system']
]


def _rule(word, count, min_count):
    if word == "human":
        return utils.RULE_DISCARD  # throw out
    else:
        return utils.RULE_DEFAULT  # apply default rule, i.e. min_count
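
# Note: besides RULE_DISCARD and RULE_DEFAULT, a trim_rule may also return
# utils.RULE_KEEP to retain a word regardless of its frequency.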


def load_on_instance():
    # Save a Word2Vec model, then try to load it back on an instance
    tmpf = get_tmpfile('gensim_word2vec.tst')
    model = word2vec.Word2Vec(sentences, min_count=1)
    model.save(tmpf)
    model = word2vec.Word2Vec()  # plain instance; load() must be called on the class
    return model.load(tmpf)  # expected to raise AttributeError (see testLoadOnClassError)


class TestWord2VecModel(unittest.TestCase):
    def testBuildVocabFromFreq(self):
        """Test that the algorithm is able to build a vocabulary from a given
        frequency table."""
        freq_dict = {
            'minors': 2, 'graph': 3, 'system': 4,
            'trees': 3, 'eps': 2, 'computer': 2,
            'survey': 2, 'user': 3, 'human': 2,
            'time': 2, 'interface': 2, 'response': 2
        }
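        # these frequencies mirror common_texts exactly: 12 unique words,
        # 29 tokens in total (cf. testTotalWordCount below)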
        model_hs = word2vec.Word2Vec(size=10, min_count=0, seed=42, hs=1, negative=0)
        model_neg = word2vec.Word2Vec(size=10, min_count=0, seed=42, hs=0, negative=5)
        model_hs.build_vocab_from_freq(freq_dict)
        model_neg.build_vocab_from_freq(freq_dict)
        self.assertEqual(len(model_hs.wv.vocab), 12)
        self.assertEqual(len(model_neg.wv.vocab), 12)
        # counts must match the supplied frequency table exactly, in both models
        for word, count in freq_dict.items():
            self.assertEqual(model_hs.wv.vocab[word].count, count)
            self.assertEqual(model_neg.wv.vocab[word].count, count)
        new_freq_dict = {
            'computer': 1, 'artificial': 4, 'human': 1, 'graph': 1, 'intelligence': 4, 'system': 1, 'trees': 1
        }
        model_hs.build_vocab_from_freq(new_freq_dict, update=True)
        model_neg.build_vocab_from_freq(new_freq_dict, update=True)
        self.assertEqual(model_hs.wv.vocab['graph'].count, 4)
        self.assertEqual(model_hs.wv.vocab['artificial'].count, 4)
        self.assertEqual(len(model_hs.wv.vocab), 14)
        self.assertEqual(len(model_neg.wv.vocab), 14)

    def testPruneVocab(self):
        """Test pruning of the vocabulary while scanning sentences."""
        sentences = [
            ["graph", "system"],
            ["graph", "system"],
            ["system", "eps"],
            ["graph", "system"]
        ]
        model = word2vec.Word2Vec(sentences, size=10, min_count=0, max_vocab_size=2, seed=42, hs=1, negative=0)
        self.assertEqual(len(model.wv.vocab), 2)
        self.assertEqual(model.wv.vocab['graph'].count, 3)
        self.assertEqual(model.wv.vocab['system'].count, 4)

        sentences = [
            ["graph", "system"],
            ["graph", "system"],
            ["system", "eps"],
            ["graph", "system"],
            ["minors", "survey", "minors", "survey", "minors"]
        ]
        model = word2vec.Word2Vec(sentences, size=10, min_count=0, max_vocab_size=2, seed=42, hs=1, negative=0)
        self.assertEqual(len(model.wv.vocab), 3)
        self.assertEqual(model.wv.vocab['graph'].count, 3)
        self.assertEqual(model.wv.vocab['minors'].count, 3)
        self.assertEqual(model.wv.vocab['system'].count, 4)
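        # max_vocab_size caps memory during the vocabulary scan by pruning the
        # rarest words whenever the running dictionary outgrows the cap, so the
        # final vocab can end up slightly larger than the cap (3 > 2 here)
        # while the most frequent words are preserved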

    def testTotalWordCount(self):
        model = word2vec.Word2Vec(size=10, min_count=0, seed=42)
        total_words = model.vocabulary.scan_vocab(sentences)[0]
        self.assertEqual(total_words, 29)

    def testMaxFinalVocab(self):
        # Test the less restrictive case: max_final_vocab is specified but has
        # no effect, because min_count alone already prunes harder
        model = word2vec.Word2Vec(size=10, max_final_vocab=4, min_count=4, sample=0)
        model.vocabulary.scan_vocab(sentences)
        reported_values = model.vocabulary.prepare_vocab(wv=model.wv, hs=0, negative=0)
        self.assertEqual(reported_values['drop_unique'], 11)
        self.assertEqual(reported_values['retain_total'], 4)
        self.assertEqual(reported_values['num_retained_words'], 1)
        self.assertEqual(model.vocabulary.effective_min_count, 4)

        # Test the more restrictive case: honoring max_final_vocab requires an
        # effective min_count stricter than the specified min_count
        model = word2vec.Word2Vec(size=10, max_final_vocab=4, min_count=2, sample=0)
        model.vocabulary.scan_vocab(sentences)
        reported_values = model.vocabulary.prepare_vocab(wv=model.wv, hs=0, negative=0)
        self.assertEqual(reported_values['drop_unique'], 8)
        self.assertEqual(reported_values['retain_total'], 13)
        self.assertEqual(reported_values['num_retained_words'], 4)
        self.assertEqual(model.vocabulary.effective_min_count, 3)
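        # in common_texts only 'system' occurs 4 times, hence min_count=4 keeps
        # 1 of 12 unique words; with min_count=2 all 12 would survive, so
        # max_final_vocab=4 raises the effective min_count to 3, keeping
        # 'system' (4) plus 'user', 'trees', 'graph' (3 each) = 13 tokens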

    def testOnlineLearning(self):
        """Test that the algorithm is able to add new words to the
        vocabulary and to a trained model when using a sorted vocabulary."""
        model_hs = word2vec.Word2Vec(sentences, size=10, min_count=0, seed=42, hs=1, negative=0)
        model_neg = word2vec.Word2Vec(sentences, size=10, min_count=0, seed=42, hs=0, negative=5)
        self.assertEqual(len(model_hs.wv.vocab), 12)
        self.assertEqual(model_hs.wv.vocab['graph'].count, 3)
        model_hs.build_vocab(new_sentences, update=True)
        model_neg.build_vocab(new_sentences, update=True)
        self.assertEqual(model_hs.wv.vocab['graph'].count, 4)
        self.assertEqual(model_hs.wv.vocab['artificial'].count, 4)
        self.assertEqual(len(model_hs.wv.vocab), 14)
        self.assertEqual(len(model_neg.wv.vocab), 14)

    def testOnlineLearningAfterSave(self):
        """Test that the algorithm is able to add new words to the
        vocabulary and to a trained model after saving and loading it back."""
        tmpf = get_tmpfile('gensim_word2vec.tst')
        model_neg = word2vec.Word2Vec(sentences, size=10, min_count=0, seed=42, hs=0, negative=5)
        model_neg.save(tmpf)
        model_neg = word2vec.Word2Vec.load(tmpf)
        self.assertEqual(len(model_neg.wv.vocab), 12)
        model_neg.build_vocab(new_sentences, update=True)
        model_neg.train(new_sentences, total_examples=model_neg.corpus_count, epochs=model_neg.iter)
        self.assertEqual(len(model_neg.wv.vocab), 14)

    def onlineSanity(self, model, trained_model=False):
        terro, others = [], []
        for l in list_corpus:
            if 'terrorism' in l:
                terro.append(l)
            else:
                others.append(l)
        self.assertTrue(all('terrorism' not in l for l in others))
        model.build_vocab(others, update=trained_model)
        model.train(others, total_examples=model.corpus_count, epochs=model.iter)
        self.assertFalse('terrorism' in model.wv.vocab)
        model.build_vocab(terro, update=True)
        self.assertTrue('terrorism' in model.wv.vocab)
        orig0 = np.copy(model.wv.syn0)
        model.train(terro, total_examples=len(terro), epochs=model.iter)
        self.assertFalse(np.allclose(model.wv.syn0, orig0))
        sim = model.n_similarity(['war'], ['terrorism'])
        self.assertLess(0., sim)
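
    # The four *_online tests below exercise onlineSanity under each combination
    # of architecture (skip-gram vs. CBOW) and loss (hierarchical softmax vs.
    # negative sampling).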

    def test_sg_hs_online(self):
        """Test skipgram w/ hierarchical softmax"""
        model = word2vec.Word2Vec(sg=1, window=5, hs=1, negative=0, min_count=3, iter=10, seed=42, workers=2)
        self.onlineSanity(model)

    def test_sg_neg_online(self):
        """Test skipgram w/ negative sampling"""
        model = word2vec.Word2Vec(sg=1, window=4, hs=0, negative=15, min_count=3, iter=10, seed=42, workers=2)
        self.onlineSanity(model)

    def test_cbow_hs_online(self):
        """Test CBOW w/ hierarchical softmax"""
        model = word2vec.Word2Vec(
            sg=0, cbow_mean=1, alpha=0.05, window=5, hs=1, negative=0,
            min_count=3, iter=10, seed=42, workers=2
        )
        self.onlineSanity(model)

    def test_cbow_neg_online(self):
        """Test CBOW w/ negative sampling"""
        model = word2vec.Word2Vec(
            sg=0, cbow_mean=1, alpha=0.05, window=5, hs=0, negative=15,
            min_count=5, iter=10, seed=42, workers=2, sample=0
        )
        self.onlineSanity(model)

    def testPersistence(self):
        """Test storing/loading the entire model."""
        tmpf = get_tmpfile('gensim_word2vec.tst')
        model = word2vec.Word2Vec(sentences, min_count=1)
        model.save(tmpf)
        self.models_equal(model, word2vec.Word2Vec.load(tmpf))
        # test persistence of the KeyedVectors of a model
        wv = model.wv
        wv.save(tmpf)
        loaded_wv = keyedvectors.KeyedVectors.load(tmpf)
        self.assertTrue(np.allclose(wv.syn0, loaded_wv.syn0))
        self.assertEqual(len(wv.vocab), len(loaded_wv.vocab))

    def testPersistenceWithConstructorRule(self):
        """Test storing/loading the entire model with a vocab trimming rule passed in the constructor."""
        tmpf = get_tmpfile('gensim_word2vec.tst')
        model = word2vec.Word2Vec(sentences, min_count=1, trim_rule=_rule)
        model.save(tmpf)
        self.models_equal(model, word2vec.Word2Vec.load(tmpf))

    def testRuleWithMinCount(self):
        """Test that returning RULE_DEFAULT from trim_rule triggers min_count."""
        model = word2vec.Word2Vec(sentences + [["occurs_only_once"]], min_count=2, trim_rule=_rule)
        self.assertTrue("human" not in model.wv.vocab)
        self.assertTrue("occurs_only_once" not in model.wv.vocab)
        self.assertTrue("interface" in model.wv.vocab)

    def testRule(self):
        """Test applying vocab trim_rule to build_vocab instead of constructor."""
        model = word2vec.Word2Vec(min_count=1)
        model.build_vocab(sentences, trim_rule=_rule)
        self.assertTrue("human" not in model.wv.vocab)

    def testLambdaRule(self):
        """Test that a locally defined trim_rule works."""
        def rule(word, count, min_count):
            return utils.RULE_DISCARD if word == "human" else utils.RULE_DEFAULT

        model = word2vec.Word2Vec(sentences, min_count=1, trim_rule=rule)
        self.assertTrue("human" not in model.wv.vocab)

    def testSyn0NormNotSaved(self):
        """Test syn0norm isn't saved in model file"""
        tmpf = get_tmpfile('gensim_word2vec.tst')
        model = word2vec.Word2Vec(sentences, min_count=1)
        model.init_sims()
        model.save(tmpf)
        loaded_model = word2vec.Word2Vec.load(tmpf)
        self.assertTrue(loaded_model.wv.syn0norm is None)

        wv = model.wv
        wv.save(tmpf)
        loaded_kv = keyedvectors.KeyedVectors.load(tmpf)
        self.assertTrue(loaded_kv.syn0norm is None)

    def testLoadPreKeyedVectorModel(self):
        """Test loading pre-KeyedVectors word2vec model"""

        if sys.version_info[:2] == (3, 4):
            model_file_suffix = '_py3_4'
        elif sys.version_info < (3,):
            model_file_suffix = '_py2'
        else:
            model_file_suffix = '_py3'

        # Model stored in one file
        model_file = 'word2vec_pre_kv%s' % model_file_suffix
        model = word2vec.Word2Vec.load(datapath(model_file))
        self.assertTrue(model.wv.syn0.shape == (len(model.wv.vocab), model.vector_size))
        self.assertTrue(model.syn1neg.shape == (len(model.wv.vocab), model.vector_size))

        # Model stored in multiple files
        model_file = 'word2vec_pre_kv_sep%s' % model_file_suffix
        model = word2vec.Word2Vec.load(datapath(model_file))
        self.assertTrue(model.wv.syn0.shape == (len(model.wv.vocab), model.vector_size))
        self.assertTrue(model.syn1neg.shape == (len(model.wv.vocab), model.vector_size))

    def testLoadPreKeyedVectorModelCFormat(self):
        """Test loading pre-KeyedVectors word2vec model saved in word2vec format"""
        model = keyedvectors.KeyedVectors.load_word2vec_format(datapath('word2vec_pre_kv_c'))
        self.assertTrue(model.syn0.shape[0] == len(model.vocab))

    def testPersistenceWord2VecFormat(self):
        """Test storing/loading the entire model in word2vec format."""
        tmpf = get_tmpfile('gensim_word2vec.tst')
        model = word2vec.Word2Vec(sentences, min_count=1)
        model.init_sims()
        model.wv.save_word2vec_format(tmpf, binary=True)
        binary_model_kv = keyedvectors.KeyedVectors.load_word2vec_format(tmpf, binary=True)
        binary_model_kv.init_sims(replace=False)
        self.assertTrue(np.allclose(model['human'], binary_model_kv['human']))
        norm_only_model = keyedvectors.KeyedVectors.load_word2vec_format(tmpf, binary=True)
        norm_only_model.init_sims(replace=True)
        self.assertFalse(np.allclose(model['human'], norm_only_model['human']))
        self.assertTrue(np.allclose(model.wv.syn0norm[model.wv.vocab['human'].index], norm_only_model['human']))
        limited_model_kv = keyedvectors.KeyedVectors.load_word2vec_format(tmpf, binary=True, limit=3)
        self.assertEqual(len(limited_model_kv.syn0), 3)
        half_precision_model_kv = keyedvectors.KeyedVectors.load_word2vec_format(
            tmpf, binary=True, datatype=np.float16
        )
        self.assertEqual(binary_model_kv.syn0.nbytes, half_precision_model_kv.syn0.nbytes * 2)

    def testNoTrainingCFormat(self):
        tmpf = get_tmpfile('gensim_word2vec.tst')
        model = word2vec.Word2Vec(sentences, min_count=1)
        model.init_sims()
        model.wv.save_word2vec_format(tmpf, binary=True)
        kv = keyedvectors.KeyedVectors.load_word2vec_format(tmpf, binary=True)
        binary_model = word2vec.Word2Vec()
        binary_model.wv = kv
        self.assertRaises(ValueError, binary_model.train, sentences)
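        # vectors loaded from the C word2vec format carry no hidden-layer
        # weights (syn1/syn1neg) or corpus statistics, so resuming training
        # from them is rejected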

    def testTooShortBinaryWord2VecFormat(self):
        tfile = get_tmpfile('gensim_word2vec.tst')
        model = word2vec.Word2Vec(sentences, min_count=1)
        model.init_sims()
        model.wv.save_word2vec_format(tfile, binary=True)
        f = open(tfile, 'r+b')
        f.write(b'13')  # write wrong (too-long) vector count
        f.close()
        self.assertRaises(EOFError, keyedvectors.KeyedVectors.load_word2vec_format, tfile, binary=True)
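        # overwriting the first bytes of the header inflates the declared
        # vector count beyond what the file actually contains, so the loader
        # hits end-of-file mid-read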

    def testTooShortTextWord2VecFormat(self):
        tfile = get_tmpfile('gensim_word2vec.tst')
        model = word2vec.Word2Vec(sentences, min_count=1)
        model.init_sims()
        model.wv.save_word2vec_format(tfile, binary=False)
        f = open(tfile, 'r+b')
        f.write(b'13')  # write wrong (too-long) vector count
        f.close()
        self.assertRaises(EOFError, keyedvectors.KeyedVectors.load_word2vec_format, tfile, binary=False)

    def testPersistenceWord2VecFormatNonBinary(self):
        """Test storing/loading the entire model in word2vec non-binary format."""
        tmpf = get_tmpfile('gensim_word2vec.tst')
        model = word2vec.Word2Vec(sentences, min_count=1)
        model.init_sims()
        model.wv.save_word2vec_format(tmpf, binary=False)
        text_model = keyedvectors.KeyedVectors.load_word2vec_format(tmpf, binary=False)
        text_model.init_sims(False)
        self.assertTrue(np.allclose(model['human'], text_model['human'], atol=1e-6))
        norm_only_model = keyedvectors.KeyedVectors.load_word2vec_format(tmpf, binary=False)
        norm_only_model.init_sims(True)
        self.assertFalse(np.allclose(model['human'], norm_only_model['human'], atol=1e-6))
        self.assertTrue(np.allclose(
            model.wv.syn0norm[model.wv.vocab['human'].index], norm_only_model['human'], atol=1e-4
        ))

    def testPersistenceWord2VecFormatWithVocab(self):
        """Test storing/loading the entire model and vocabulary in word2vec format."""
        tmpf = get_tmpfile('gensim_word2vec.tst')
        model = word2vec.Word2Vec(sentences, min_count=1)
        model.init_sims()
        testvocab = get_tmpfile('gensim_word2vec.vocab')
        model.wv.save_word2vec_format(tmpf, testvocab, binary=True)
        binary_model_with_vocab_kv = keyedvectors.KeyedVectors.load_word2vec_format(tmpf, testvocab, binary=True)
        self.assertEqual(model.wv.vocab['human'].count, binary_model_with_vocab_kv.vocab['human'].count)

    def testPersistenceKeyedVectorsFormatWithVocab(self):
        """Test storing/loading the KeyedVectors and vocabulary in word2vec format."""
        tmpf = get_tmpfile('gensim_word2vec.tst')
        model = word2vec.Word2Vec(sentences, min_count=1)
        model.init_sims()
        testvocab = get_tmpfile('gensim_word2vec.vocab')
        model.wv.save_word2vec_format(tmpf, testvocab, binary=True)
        kv_binary_model_with_vocab = keyedvectors.KeyedVectors.load_word2vec_format(tmpf, testvocab, binary=True)
        self.assertEqual(model.wv.vocab['human'].count, kv_binary_model_with_vocab.vocab['human'].count)

    def testPersistenceWord2VecFormatCombinationWithStandardPersistence(self):
        """Test storing/loading the entire model and vocabulary in word2vec format, chained with
        saving and loading via the `save` and `load` methods.
        This was possible prior to the 1.0.0 release; it now raises an exception."""
        tmpf = get_tmpfile('gensim_word2vec.tst')
        model = word2vec.Word2Vec(sentences, min_count=1)
        model.init_sims()
        testvocab = get_tmpfile('gensim_word2vec.vocab')
        model.wv.save_word2vec_format(tmpf, testvocab, binary=True)
        binary_model_with_vocab_kv = keyedvectors.KeyedVectors.load_word2vec_format(tmpf, testvocab, binary=True)
        binary_model_with_vocab_kv.save(tmpf)
        self.assertRaises(AttributeError, word2vec.Word2Vec.load, tmpf)

    def testLargeMmap(self):
        """Test storing/loading the entire model."""
        tmpf = get_tmpfile('gensim_word2vec.tst')
        model = word2vec.Word2Vec(sentences, min_count=1)

        # test storing the internal arrays into separate files
        model.save(tmpf, sep_limit=0)
        self.models_equal(model, word2vec.Word2Vec.load(tmpf))

        # make sure mmapping the arrays back works, too
        self.models_equal(model, word2vec.Word2Vec.load(tmpf, mmap='r'))

    def testVocab(self):
        """Test word2vec vocabulary building."""
        corpus = LeeCorpus()
        total_words = sum(len(sentence) for sentence in corpus)

        # try vocab building explicitly, using all words
        model = word2vec.Word2Vec(min_count=1, hs=1, negative=0)
        model.build_vocab(corpus)
        self.assertTrue(len(model.wv.vocab) == 6981)
        # with min_count=1, we're not throwing away anything,
        # so make sure the word counts add up to be the entire corpus
        self.assertEqual(sum(v.count for v in model.wv.vocab.values()), total_words)
        # make sure the binary codes are correct
        self.assertTrue(np.allclose(model.wv.vocab['the'].code, [1, 1, 0, 0]))

        # test building vocab with default params
        model = word2vec.Word2Vec(hs=1, negative=0)
        model.build_vocab(corpus)
        self.assertTrue(len(model.wv.vocab) == 1750)
        self.assertTrue(np.allclose(model.wv.vocab['the'].code, [1, 1, 1, 0]))

        # no input => "RuntimeError: you must first build vocabulary before training the model"
        self.assertRaises(RuntimeError, word2vec.Word2Vec, [])

        # input not empty, but rather completely filtered out
        self.assertRaises(RuntimeError, word2vec.Word2Vec, corpus, min_count=total_words + 1)

    def testTraining(self):
        """Test word2vec training."""
        # build vocabulary, don't train yet
        model = word2vec.Word2Vec(size=2, min_count=1, hs=1, negative=0)
        model.build_vocab(sentences)

        self.assertTrue(model.wv.syn0.shape == (len(model.wv.vocab), 2))
        self.assertTrue(model.syn1.shape == (len(model.wv.vocab), 2))

        model.train(sentences, total_examples=model.corpus_count, epochs=model.iter)
        sims = model.most_similar('graph', topn=10)
        # self.assertTrue(sims[0][0] == 'trees', sims)  # most similar

        # test querying for "most similar" by vector
        graph_vector = model.wv.syn0norm[model.wv.vocab['graph'].index]
        sims2 = model.most_similar(positive=[graph_vector], topn=11)
        sims2 = [(w, sim) for w, sim in sims2 if w != 'graph']  # ignore 'graph' itself
        self.assertEqual(sims, sims2)

        # build vocab and train in one step; must be the same as above
        model2 = word2vec.Word2Vec(sentences, size=2, min_count=1, hs=1, negative=0)
        self.models_equal(model, model2)

    def testScoring(self):
        """Test word2vec scoring."""
        model = word2vec.Word2Vec(sentences, size=2, min_count=1, hs=1, negative=0)

        # just score and make sure they exist
        scores = model.score(sentences, len(sentences))
        self.assertEqual(len(scores), len(sentences))

    def testLocking(self):
        """Test word2vec training doesn't change locked vectors."""
        corpus = LeeCorpus()
        # build vocabulary, don't train yet
        for sg in range(2):  # test both cbow and sg
            model = word2vec.Word2Vec(size=4, hs=1, negative=5, min_count=1, sg=sg, window=5)
            model.build_vocab(corpus)

            # remember two vectors
            locked0 = np.copy(model.wv.syn0[0])
            unlocked1 = np.copy(model.wv.syn0[1])
            # lock the vector in slot 0 against change
            model.syn0_lockf[0] = 0.0

            model.train(corpus, total_examples=model.corpus_count, epochs=model.iter)
            self.assertFalse((unlocked1 == model.wv.syn0[1]).all())  # unlocked vector should vary
            self.assertTrue((locked0 == model.wv.syn0[0]).all())  # locked vector should not vary

    def testAccuracy(self):
        """Test Word2Vec accuracy and KeyedVectors accuracy give the same result"""
        model = word2vec.Word2Vec(LeeCorpus())
        w2v_accuracy = model.accuracy(datapath('questions-words.txt'))
        kv_accuracy = model.wv.accuracy(datapath('questions-words.txt'))
        self.assertEqual(w2v_accuracy, kv_accuracy)

    def testEvaluateWordPairs(self):
        """Test Spearman and Pearson correlation coefficients give sane results on similarity datasets"""
        corpus = word2vec.LineSentence(datapath('head500.noblanks.cor.bz2'))
        model = word2vec.Word2Vec(corpus, min_count=3, iter=10)
        correlation = model.evaluate_word_pairs(datapath('wordsim353.tsv'))
        pearson = correlation[0][0]
        spearman = correlation[1][0]
        oov = correlation[2]
        self.assertTrue(0.1 < pearson < 1.0)
        self.assertTrue(0.1 < spearman < 1.0)
        self.assertTrue(0.0 <= oov < 90.0)
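        # evaluate_word_pairs returns ((pearson, p-value), (spearman, p-value),
        # oov_ratio); only the coefficients and the OOV percentage are checked here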

    def model_sanity(self, model, train=True):
        """Even tiny models trained on LeeCorpus should pass these sanity checks"""
        # run extra before/after training tests if train=True
        if train:
            model.build_vocab(list_corpus)
            orig0 = np.copy(model.wv.syn0[0])
            model.train(list_corpus, total_examples=model.corpus_count, epochs=model.iter)
            self.assertFalse((orig0 == model.wv.syn0[0]).all())  # vector should vary after training
        sims = model.most_similar('war', topn=len(model.wv.index2word))
        t_rank = [word for word, score in sims].index('terrorism')
        # in >200 calibration runs w/ calling parameters, 'terrorism' in 50-most_sim for 'war'
        self.assertLess(t_rank, 50)
        war_vec = model['war']
        sims2 = model.most_similar([war_vec], topn=51)
        self.assertTrue('war' in [word for word, score in sims2])
        self.assertTrue('terrorism' in [word for word, score in sims2])

    def test_sg_hs(self):
        """Test skipgram w/ hierarchical softmax"""
        model = word2vec.Word2Vec(sg=1, window=4, hs=1, negative=0, min_count=5, iter=10, workers=2)
        self.model_sanity(model)

    def test_sg_neg(self):
        """Test skipgram w/ negative sampling"""
        model = word2vec.Word2Vec(sg=1, window=4, hs=0, negative=15, min_count=5, iter=10, workers=2)
        self.model_sanity(model)

    def test_cbow_hs(self):
        """Test CBOW w/ hierarchical softmax"""
        model = word2vec.Word2Vec(
            sg=0, cbow_mean=1, alpha=0.05, window=8, hs=1, negative=0,
            min_count=5, iter=10, workers=2, batch_words=1000
        )
        self.model_sanity(model)

    def test_cbow_neg(self):
        """Test CBOW w/ negative sampling"""
        model = word2vec.Word2Vec(
            sg=0, cbow_mean=1, alpha=0.05, window=5, hs=0, negative=15,
            min_count=5, iter=10, workers=2, sample=0
        )
        self.model_sanity(model)

    def test_cosmul(self):
        model = word2vec.Word2Vec(sentences, size=2, min_count=1, hs=1, negative=0)
        sims = model.most_similar_cosmul('graph', topn=10)
        # self.assertTrue(sims[0][0] == 'trees', sims)  # most similar

        # test querying for "most similar" by vector
        graph_vector = model.wv.syn0norm[model.wv.vocab['graph'].index]
        sims2 = model.most_similar_cosmul(positive=[graph_vector], topn=11)
        sims2 = [(w, sim) for w, sim in sims2 if w != 'graph']  # ignore 'graph' itself
        self.assertEqual(sims, sims2)

    def testTrainingCbow(self):
        """Test CBOW word2vec training."""
        # build vocabulary, don't train yet
        model = word2vec.Word2Vec(size=2, min_count=1, sg=0, hs=1, negative=0)
        model.build_vocab(sentences)
        self.assertTrue(model.wv.syn0.shape == (len(model.wv.vocab), 2))
        self.assertTrue(model.syn1.shape == (len(model.wv.vocab), 2))

        model.train(sentences, total_examples=model.corpus_count, epochs=model.iter)
        sims = model.most_similar('graph', topn=10)
        # self.assertTrue(sims[0][0] == 'trees', sims)  # most similar

        # test querying for "most similar" by vector
        graph_vector = model.wv.syn0norm[model.wv.vocab['graph'].index]
        sims2 = model.most_similar(positive=[graph_vector], topn=11)
        sims2 = [(w, sim) for w, sim in sims2 if w != 'graph']  # ignore 'graph' itself
        self.assertEqual(sims, sims2)

        # build vocab and train in one step; must be the same as above
        model2 = word2vec.Word2Vec(sentences, size=2, min_count=1, sg=0, hs=1, negative=0)
        self.models_equal(model, model2)

    def testTrainingSgNegative(self):
        """Test skip-gram (negative sampling) word2vec training."""
        # build vocabulary, don't train yet
        model = word2vec.Word2Vec(size=2, min_count=1, sg=1, hs=0, negative=2)
        model.build_vocab(sentences)
        self.assertTrue(model.wv.syn0.shape == (len(model.wv.vocab), 2))
        self.assertTrue(model.syn1neg.shape == (len(model.wv.vocab), 2))

        model.train(sentences, total_examples=model.corpus_count, epochs=model.iter)
        sims = model.most_similar('graph', topn=10)
        # self.assertTrue(sims[0][0] == 'trees', sims)  # most similar

        # test querying for "most similar" by vector
        graph_vector = model.wv.syn0norm[model.wv.vocab['graph'].index]
        sims2 = model.most_similar(positive=[graph_vector], topn=11)
        sims2 = [(w, sim) for w, sim in sims2 if w != 'graph']  # ignore 'graph' itself
        self.assertEqual(sims, sims2)

        # build vocab and train in one step; must be the same as above
        model2 = word2vec.Word2Vec(sentences, size=2, min_count=1, sg=1, hs=0, negative=2)
        self.models_equal(model, model2)

    def testTrainingCbowNegative(self):
        """Test CBOW (negative sampling) word2vec training."""
        # build vocabulary, don't train yet
        model = word2vec.Word2Vec(size=2, min_count=1, sg=0, hs=0, negative=2)
        model.build_vocab(sentences)
        self.assertTrue(model.wv.syn0.shape == (len(model.wv.vocab), 2))
        self.assertTrue(model.syn1neg.shape == (len(model.wv.vocab), 2))

        model.train(sentences, total_examples=model.corpus_count, epochs=model.iter)
        sims = model.most_similar('graph', topn=10)
        # self.assertTrue(sims[0][0] == 'trees', sims)  # most similar

        # test querying for "most similar" by vector
        graph_vector = model.wv.syn0norm[model.wv.vocab['graph'].index]
        sims2 = model.most_similar(positive=[graph_vector], topn=11)
        sims2 = [(w, sim) for w, sim in sims2 if w != 'graph']  # ignore 'graph' itself
        self.assertEqual(sims, sims2)

        # build vocab and train in one step; must be the same as above
        model2 = word2vec.Word2Vec(sentences, size=2, min_count=1, sg=0, hs=0, negative=2)
        self.models_equal(model, model2)

    def testSimilarities(self):
        """Test similarity and n_similarity methods."""
        # The model is trained using CBOW
        model = word2vec.Word2Vec(size=2, min_count=1, sg=0, hs=0, negative=2)
        model.build_vocab(sentences)
        model.train(sentences, total_examples=model.corpus_count, epochs=model.iter)

        self.assertTrue(model.n_similarity(['graph', 'trees'], ['trees', 'graph']))
        self.assertTrue(model.n_similarity(['graph'], ['trees']) == model.similarity('graph', 'trees'))
        self.assertRaises(ZeroDivisionError, model.n_similarity, ['graph', 'trees'], [])
        self.assertRaises(ZeroDivisionError, model.n_similarity, [], ['graph', 'trees'])
        self.assertRaises(ZeroDivisionError, model.n_similarity, [], [])

    def testSimilarBy(self):
        """Test word2vec similar_by_word and similar_by_vector."""
        model = word2vec.Word2Vec(sentences, size=2, min_count=1, hs=1, negative=0)
        wordsims = model.similar_by_word('graph', topn=10)
        wordsims2 = model.most_similar(positive='graph', topn=10)
        vectorsims = model.similar_by_vector(model['graph'], topn=10)
        vectorsims2 = model.most_similar([model['graph']], topn=10)
        self.assertEqual(wordsims, wordsims2)
        self.assertEqual(vectorsims, vectorsims2)

    def testParallel(self):
        """Test word2vec parallel training."""
        if word2vec.FAST_VERSION < 0:  # don't test the plain np version for parallelism (too slow)
            return

        corpus = utils.RepeatCorpus(LeeCorpus(), 10000)

        for workers in [2, 4]:
            model = word2vec.Word2Vec(corpus, workers=workers)
            sims = model.most_similar('israeli')  # noqa:F841
            # the exact vectors and therefore similarities may differ, due to different thread collisions/randomization
            # so let's test only for top3
            # TODO: commented out for now; find a more robust way to compare against "gold standard"
            # self.assertTrue('palestinian' in [sims[i][0] for i in range(3)])

    def testRNG(self):
        """Test word2vec results identical with identical RNG seed."""
        model = word2vec.Word2Vec(sentences, min_count=2, seed=42, workers=1)
        model2 = word2vec.Word2Vec(sentences, min_count=2, seed=42, workers=1)
        self.models_equal(model, model2)

    def models_equal(self, model, model2):
        self.assertEqual(len(model.wv.vocab), len(model2.wv.vocab))
        self.assertTrue(np.allclose(model.wv.syn0, model2.wv.syn0))
        if model.hs:
            self.assertTrue(np.allclose(model.syn1, model2.syn1))
        if model.negative:
            self.assertTrue(np.allclose(model.syn1neg, model2.syn1neg))
        most_common_word = max(model.wv.vocab.items(), key=lambda item: item[1].count)[0]
        self.assertTrue(np.allclose(model[most_common_word], model2[most_common_word]))

    def testDeleteTemporaryTrainingData(self):
        """Test word2vec model after delete_temporary_training_data"""
        for i in [0, 1]:
            for j in [0, 1]:
                model = word2vec.Word2Vec(sentences, size=10, min_count=0, seed=42, hs=i, negative=j)
                if i:
                    self.assertTrue(hasattr(model, 'syn1'))
                if j:
                    self.assertTrue(hasattr(model, 'syn1neg'))
                self.assertTrue(hasattr(model, 'syn0_lockf'))
                model.delete_temporary_training_data(replace_word_vectors_with_normalized=True)
                self.assertEqual(len(model['human']), 10)
                self.assertEqual(len(model.wv.vocab), 12)
                self.assertEqual(model.wv.vocab['graph'].count, 3)
                self.assertTrue(not hasattr(model, 'syn1'))
                self.assertTrue(not hasattr(model, 'syn1neg'))
                self.assertTrue(not hasattr(model, 'syn0_lockf'))
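                # syn1/syn1neg (hidden-layer weights) and syn0_lockf (per-vector
                # training locks) are only needed for further training; dropping
                # them leaves a smaller, query-only model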

    def testNormalizeAfterTrainingData(self):
        tmpf = get_tmpfile('gensim_word2vec.tst')
        model = word2vec.Word2Vec(sentences, min_count=1)
        model.save(tmpf)
        norm_only_model = word2vec.Word2Vec.load(tmpf)
        norm_only_model.delete_temporary_training_data(replace_word_vectors_with_normalized=True)
        self.assertFalse(np.allclose(model['human'], norm_only_model['human']))

    def testPredictOutputWord(self):
        '''Test word2vec predict_output_word method handling for negative sampling scheme'''
        # under normal circumstances
        model_with_neg = word2vec.Word2Vec(sentences, min_count=1)
        predictions_with_neg = model_with_neg.predict_output_word(['system', 'human'], topn=5)
        self.assertTrue(len(predictions_with_neg) == 5)

        # out-of-vocabulary scenario
        predictions_out_of_vocab = model_with_neg.predict_output_word(['some', 'random', 'words'], topn=5)
        self.assertEqual(predictions_out_of_vocab, None)
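        # predict_output_word returns None rather than raising when none of the
        # given context words are in the vocabulary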

        # when required model parameters have been deleted
        tmpf = get_tmpfile('gensim_word2vec.tst')
        model_with_neg.init_sims()
        model_with_neg.wv.save_word2vec_format(tmpf, binary=True)
        kv_model_with_neg = keyedvectors.KeyedVectors.load_word2vec_format(tmpf, binary=True)
        binary_model_with_neg = word2vec.Word2Vec()
        binary_model_with_neg.wv = kv_model_with_neg
        self.assertRaises(RuntimeError, binary_model_with_neg.predict_output_word, ['system', 'human'])

        # negative sampling scheme not used
        model_without_neg = word2vec.Word2Vec(sentences, min_count=1, negative=0)
        self.assertRaises(RuntimeError, model_without_neg.predict_output_word, ['system', 'human'])

    def testLoadOldModel(self):
        """Test loading word2vec models from previous versions."""

        model_file = 'word2vec_old'
        model = word2vec.Word2Vec.load(datapath(model_file))
        self.assertTrue(model.wv.vectors.shape == (12, 100))
        self.assertTrue(len(model.wv.vocab) == 12)
        self.assertTrue(len(model.wv.index2word) == 12)
        self.assertTrue(model.syn1neg.shape == (len(model.wv.vocab), model.wv.vector_size))
        self.assertTrue(model.trainables.vectors_lockf.shape == (12,))
        self.assertTrue(model.vocabulary.cum_table.shape == (12,))

        self.onlineSanity(model, trained_model=True)

        # Model stored in multiple files
        model_file = 'word2vec_old_sep'
        model = word2vec.Word2Vec.load(datapath(model_file))
        self.assertTrue(model.wv.vectors.shape == (12, 100))
        self.assertTrue(len(model.wv.vocab) == 12)
        self.assertTrue(len(model.wv.index2word) == 12)
        self.assertTrue(model.syn1neg.shape == (len(model.wv.vocab), model.wv.vector_size))
        self.assertTrue(model.trainables.vectors_lockf.shape == (12,))
        self.assertTrue(model.vocabulary.cum_table.shape == (12,))

        self.onlineSanity(model, trained_model=True)

        # load a really old model
        model_file = 'w2v-lee-v0.12.0'
        model = word2vec.Word2Vec.load(datapath(model_file))
        self.onlineSanity(model, trained_model=True)

        # test max_final_vocab for a model saved in 3.3
        model_file = 'word2vec_3.3'
        model = word2vec.Word2Vec.load(datapath(model_file))
        self.assertEqual(model.max_final_vocab, None)
        self.assertEqual(model.vocabulary.max_final_vocab, None)

        # Test loading word2vec models from all previous versions
        old_versions = [
            '0.12.0', '0.12.1', '0.12.2', '0.12.3', '0.12.4',
            '0.13.0', '0.13.1', '0.13.2', '0.13.3', '0.13.4',
            '1.0.0', '1.0.1', '2.0.0', '2.1.0', '2.2.0', '2.3.0',
            '3.0.0', '3.1.0', '3.2.0', '3.3.0', '3.4.0'
        ]

        saved_models_dir = datapath('old_w2v_models/w2v_{}.mdl')
        for old_version in old_versions:
            model = word2vec.Word2Vec.load(saved_models_dir.format(old_version))
            self.assertTrue(len(model.wv.vocab) == 3)
            self.assertTrue(model.wv.vectors.shape == (3, 4))
            # check that similarity search and online training work
            self.assertTrue(len(model.wv.most_similar('sentence')) == 2)
            model.build_vocab(list_corpus, update=True)
            model.train(list_corpus, total_examples=model.corpus_count, epochs=model.iter)
            # check that similarity search and online training still work after saving and loading the model back
            tmpf = get_tmpfile('gensim_word2vec.tst')
            model.save(tmpf)
            loaded_model = word2vec.Word2Vec.load(tmpf)
            loaded_model.build_vocab(list_corpus, update=True)
            loaded_model.train(list_corpus, total_examples=model.corpus_count, epochs=model.iter)

    @log_capture()
    def testBuildVocabWarning(self, l):
        """Test if warning is raised on non-ideal input to a word2vec model"""
        sentences = ['human', 'machine']
        model = word2vec.Word2Vec()
        model.build_vocab(sentences)
        warning = "Each 'sentences' item should be a list of words (usually unicode strings)."
        self.assertTrue(warning in str(l))

    @log_capture()
    def testTrainWarning(self, l):
        """Test if warning is raised if alpha rises during subsequent calls to train()"""
        sentences = [
            ['human'],
            ['graph', 'trees']
        ]
        model = word2vec.Word2Vec(min_count=1)
        model.build_vocab(sentences)
        for epoch in range(10):
            model.train(sentences, total_examples=model.corpus_count, epochs=model.iter)
            model.alpha -= 0.002
            model.min_alpha = model.alpha
            if epoch == 5:
                model.alpha += 0.05
        warning = "Effective 'alpha' higher than previous training cycles"
        self.assertTrue(warning in str(l))

    def test_train_with_explicit_param(self):
        model = word2vec.Word2Vec(size=2, min_count=1, hs=1, negative=0)
        model.build_vocab(sentences)
        with self.assertRaises(ValueError):
            model.train(sentences, total_examples=model.corpus_count)

        with self.assertRaises(ValueError):
            model.train(sentences, epochs=model.iter)

        with self.assertRaises(ValueError):
            model.train(sentences)

    def test_sentences_should_not_be_a_generator(self):
        """Test that passing a generator as `sentences` raises TypeError."""
        gen = (s for s in sentences)
        self.assertRaises(TypeError, word2vec.Word2Vec, (gen,))
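        # Word2Vec needs several passes over the corpus (one vocabulary scan
        # plus one per training epoch), so a single-pass generator is rejected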

    def testLoadOnClassError(self):
        """Test that an exception is raised when load() is called on an instance instead of the class."""
        self.assertRaises(AttributeError, load_on_instance)

    def test_reset_from(self):
        """Test that reset_from() reuses the pre-built structures of another model."""
        model = word2vec.Word2Vec(sentences, min_count=1)
        other_model = word2vec.Word2Vec(new_sentences, min_count=1)
        other_vocab = other_model.wv.vocab
        model.reset_from(other_model)
        self.assertEqual(model.wv.vocab, other_vocab)

    def test_compute_training_loss(self):
        model = word2vec.Word2Vec(min_count=1, sg=1, negative=5, hs=1)
        model.build_vocab(sentences)
        model.train(sentences, compute_loss=True, total_examples=model.corpus_count, epochs=model.iter)
        training_loss_val = model.get_latest_training_loss()
        self.assertTrue(training_loss_val > 0.0)


# endclass TestWord2VecModel


class TestWMD(unittest.TestCase):
    def testNonzero(self):
        '''Test basic functionality with a test sentence.'''
        if not PYEMD_EXT:
            self.skipTest("pyemd not installed")

        model = word2vec.Word2Vec(sentences, min_count=2, seed=42, workers=1)
        sentence1 = ['human', 'interface', 'computer']
        sentence2 = ['survey', 'user', 'computer', 'system', 'response', 'time']
        distance = model.wmdistance(sentence1, sentence2)

        # Check that distance is non-zero.
        self.assertFalse(distance == 0.0)

    def testSymmetry(self):
        '''Check that distance is symmetric.'''
        if not PYEMD_EXT:
            self.skipTest("pyemd not installed")

        model = word2vec.Word2Vec(sentences, min_count=2, seed=42, workers=1)
        sentence1 = ['human', 'interface', 'computer']
        sentence2 = ['survey', 'user', 'computer', 'system', 'response', 'time']
        distance1 = model.wmdistance(sentence1, sentence2)
        distance2 = model.wmdistance(sentence2, sentence1)
        self.assertTrue(np.allclose(distance1, distance2))

    def testIdenticalSentences(self):
        '''Check that the distance from a sentence to itself is zero.'''
        if not PYEMD_EXT:
            self.skipTest("pyemd not installed")

        model = word2vec.Word2Vec(sentences, min_count=1)
        sentence = ['survey', 'user', 'computer', 'system', 'response', 'time']
        distance = model.wmdistance(sentence, sentence)
        self.assertEqual(0.0, distance)


class TestWord2VecSentenceIterators(unittest.TestCase):
    def testLineSentenceWorksWithFilename(self):
        """Does LineSentence work with a filename argument?"""
        with utils.smart_open(datapath('lee_background.cor')) as orig:
            sentences = word2vec.LineSentence(datapath('lee_background.cor'))
            for words in sentences:
                self.assertEqual(words, utils.to_unicode(orig.readline()).split())

    def testLineSentenceWorksWithCompressedFile(self):
        """Does LineSentence work with a compressed file object argument?"""
        with utils.smart_open(datapath('head500.noblanks.cor')) as orig:
            sentences = word2vec.LineSentence(bz2.BZ2File(datapath('head500.noblanks.cor.bz2')))
            for words in sentences:
                self.assertEqual(words, utils.to_unicode(orig.readline()).split())

    def testLineSentenceWorksWithNormalFile(self):
        """Does LineSentence work with a file object argument, rather than a filename?"""
        with utils.smart_open(datapath('head500.noblanks.cor')) as orig:
            with utils.smart_open(datapath('head500.noblanks.cor')) as fin:
                sentences = word2vec.LineSentence(fin)
                for words in sentences:
                    self.assertEqual(words, utils.to_unicode(orig.readline()).split())

    def testPathLineSentences(self):
        """Does PathLineSentences work with a path argument?"""
        with utils.smart_open(os.path.join(datapath('PathLineSentences'), '1.txt')) as orig1,\
                utils.smart_open(os.path.join(datapath('PathLineSentences'), '2.txt.bz2')) as orig2:
            sentences = word2vec.PathLineSentences(datapath('PathLineSentences'))
            orig = orig1.readlines() + orig2.readlines()
            # step through orig in lockstep with PathLineSentences
            for i, words in enumerate(sentences):
                self.assertEqual(words, utils.to_unicode(orig[i]).split())

    def testPathLineSentencesOneFile(self):
        """Does PathLineSentences work with a single file argument?"""
        test_file = os.path.join(datapath('PathLineSentences'), '1.txt')
        with utils.smart_open(test_file) as orig:
            sentences = word2vec.PathLineSentences(test_file)
            for words in sentences:
                self.assertEqual(words, utils.to_unicode(orig.readline()).split())


# endclass TestWord2VecSentenceIterators


# TODO: get correct path to Python binary
# class TestWord2VecScripts(unittest.TestCase):
#     def testWord2VecStandAloneScript(self):
#         """Does Word2Vec script launch standalone?"""
#         cmd = 'python -m gensim.scripts.word2vec_standalone -train ' + datapath('testcorpus.txt') + \
#             ' -output vec.txt -size 200 -sample 1e-4 -binary 0 -iter 3 -min_count 1'
#         output = check_output(cmd, stderr=PIPE)
#         self.assertEqual(output, '0')
# #endclass TestWord2VecScripts


if not hasattr(TestWord2VecModel, 'assertLess'):
    # workaround for python 2.6
    def assertLess(self, a, b, msg=None):
        self.assertTrue(a < b, msg="%s is not less than %s" % (a, b))

    setattr(TestWord2VecModel, 'assertLess', assertLess)


if __name__ == '__main__':
    logging.basicConfig(
        format='%(asctime)s : %(threadName)s : %(levelname)s : %(message)s',
        level=logging.DEBUG
    )
    logging.info("using optimization %s", word2vec.FAST_VERSION)
    unittest.main()