#!/usr/bin/env python # -*- coding: utf-8 -*- # # Copyright (C) 2010 Radim Rehurek # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html """ Automated tests for checking transformation algorithms (the models package). """ import logging import unittest import os import bz2 import sys import numpy as np from gensim import utils from gensim.models import word2vec, keyedvectors from gensim.test.utils import datapath, get_tmpfile, common_texts as sentences from testfixtures import log_capture try: from pyemd import emd # noqa:F401 PYEMD_EXT = True except ImportError: PYEMD_EXT = False class LeeCorpus(object): def __iter__(self): with open(datapath('lee_background.cor')) as f: for line in f: yield utils.simple_preprocess(line) list_corpus = list(LeeCorpus()) new_sentences = [ ['computer', 'artificial', 'intelligence'], ['artificial', 'trees'], ['human', 'intelligence'], ['artificial', 'graph'], ['intelligence'], ['artificial', 'intelligence', 'system'] ] def _rule(word, count, min_count): if word == "human": return utils.RULE_DISCARD # throw out else: return utils.RULE_DEFAULT # apply default rule, i.e. min_count def load_on_instance(): # Save and load a Word2Vec Model on instance for test tmpf = get_tmpfile('gensim_word2vec.tst') model = word2vec.Word2Vec(sentences, min_count=1) model.save(tmpf) model = word2vec.Word2Vec() # should fail at this point return model.load(tmpf) class TestWord2VecModel(unittest.TestCase): def testBuildVocabFromFreq(self): """Test that the algorithm is able to build vocabulary from given frequency table""" freq_dict = { 'minors': 2, 'graph': 3, 'system': 4, 'trees': 3, 'eps': 2, 'computer': 2, 'survey': 2, 'user': 3, 'human': 2, 'time': 2, 'interface': 2, 'response': 2 } model_hs = word2vec.Word2Vec(size=10, min_count=0, seed=42, hs=1, negative=0) model_neg = word2vec.Word2Vec(size=10, min_count=0, seed=42, hs=0, negative=5) model_hs.build_vocab_from_freq(freq_dict) model_neg.build_vocab_from_freq(freq_dict) self.assertEqual(len(model_hs.wv.vocab), 12) self.assertEqual(len(model_neg.wv.vocab), 12) self.assertEqual(model_hs.wv.vocab['minors'].count, 2) self.assertEqual(model_hs.wv.vocab['graph'].count, 3) self.assertEqual(model_hs.wv.vocab['system'].count, 4) self.assertEqual(model_hs.wv.vocab['trees'].count, 3) self.assertEqual(model_hs.wv.vocab['eps'].count, 2) self.assertEqual(model_hs.wv.vocab['computer'].count, 2) self.assertEqual(model_hs.wv.vocab['survey'].count, 2) self.assertEqual(model_hs.wv.vocab['user'].count, 3) self.assertEqual(model_hs.wv.vocab['human'].count, 2) self.assertEqual(model_hs.wv.vocab['time'].count, 2) self.assertEqual(model_hs.wv.vocab['interface'].count, 2) self.assertEqual(model_hs.wv.vocab['response'].count, 2) self.assertEqual(model_neg.wv.vocab['minors'].count, 2) self.assertEqual(model_neg.wv.vocab['graph'].count, 3) self.assertEqual(model_neg.wv.vocab['system'].count, 4) self.assertEqual(model_neg.wv.vocab['trees'].count, 3) self.assertEqual(model_neg.wv.vocab['eps'].count, 2) self.assertEqual(model_neg.wv.vocab['computer'].count, 2) self.assertEqual(model_neg.wv.vocab['survey'].count, 2) self.assertEqual(model_neg.wv.vocab['user'].count, 3) self.assertEqual(model_neg.wv.vocab['human'].count, 2) self.assertEqual(model_neg.wv.vocab['time'].count, 2) self.assertEqual(model_neg.wv.vocab['interface'].count, 2) self.assertEqual(model_neg.wv.vocab['response'].count, 2) new_freq_dict = { 'computer': 1, 'artificial': 4, 'human': 1, 'graph': 1, 'intelligence': 4, 'system': 1, 'trees': 1 } model_hs.build_vocab_from_freq(new_freq_dict, update=True) model_neg.build_vocab_from_freq(new_freq_dict, update=True) self.assertEqual(model_hs.wv.vocab['graph'].count, 4) self.assertEqual(model_hs.wv.vocab['artificial'].count, 4) self.assertEqual(len(model_hs.wv.vocab), 14) self.assertEqual(len(model_neg.wv.vocab), 14) def testPruneVocab(self): """Test Prune vocab while scanning sentences""" sentences = [ ["graph", "system"], ["graph", "system"], ["system", "eps"], ["graph", "system"] ] model = word2vec.Word2Vec(sentences, size=10, min_count=0, max_vocab_size=2, seed=42, hs=1, negative=0) self.assertEqual(len(model.wv.vocab), 2) self.assertEqual(model.wv.vocab['graph'].count, 3) self.assertEqual(model.wv.vocab['system'].count, 4) sentences = [ ["graph", "system"], ["graph", "system"], ["system", "eps"], ["graph", "system"], ["minors", "survey", "minors", "survey", "minors"] ] model = word2vec.Word2Vec(sentences, size=10, min_count=0, max_vocab_size=2, seed=42, hs=1, negative=0) self.assertEqual(len(model.wv.vocab), 3) self.assertEqual(model.wv.vocab['graph'].count, 3) self.assertEqual(model.wv.vocab['minors'].count, 3) self.assertEqual(model.wv.vocab['system'].count, 4) def testTotalWordCount(self): model = word2vec.Word2Vec(size=10, min_count=0, seed=42) total_words = model.vocabulary.scan_vocab(sentences)[0] self.assertEqual(total_words, 29) def testMaxFinalVocab(self): # Test for less restricting effect of max_final_vocab # max_final_vocab is specified but has no effect model = word2vec.Word2Vec(size=10, max_final_vocab=4, min_count=4, sample=0) model.vocabulary.scan_vocab(sentences) reported_values = model.vocabulary.prepare_vocab(wv=model.wv, hs=0, negative=0) self.assertEqual(reported_values['drop_unique'], 11) self.assertEqual(reported_values['retain_total'], 4) self.assertEqual(reported_values['num_retained_words'], 1) self.assertEqual(model.vocabulary.effective_min_count, 4) # Test for more restricting effect of max_final_vocab # results in setting a min_count more restricting than specified min_count model = word2vec.Word2Vec(size=10, max_final_vocab=4, min_count=2, sample=0) model.vocabulary.scan_vocab(sentences) reported_values = model.vocabulary.prepare_vocab(wv=model.wv, hs=0, negative=0) self.assertEqual(reported_values['drop_unique'], 8) self.assertEqual(reported_values['retain_total'], 13) self.assertEqual(reported_values['num_retained_words'], 4) self.assertEqual(model.vocabulary.effective_min_count, 3) def testOnlineLearning(self): """Test that the algorithm is able to add new words to the vocabulary and to a trained model when using a sorted vocabulary""" model_hs = word2vec.Word2Vec(sentences, size=10, min_count=0, seed=42, hs=1, negative=0) model_neg = word2vec.Word2Vec(sentences, size=10, min_count=0, seed=42, hs=0, negative=5) self.assertTrue(len(model_hs.wv.vocab), 12) self.assertTrue(model_hs.wv.vocab['graph'].count, 3) model_hs.build_vocab(new_sentences, update=True) model_neg.build_vocab(new_sentences, update=True) self.assertTrue(model_hs.wv.vocab['graph'].count, 4) self.assertTrue(model_hs.wv.vocab['artificial'].count, 4) self.assertEqual(len(model_hs.wv.vocab), 14) self.assertEqual(len(model_neg.wv.vocab), 14) def testOnlineLearningAfterSave(self): """Test that the algorithm is able to add new words to the vocabulary and to a trained model when using a sorted vocabulary""" tmpf = get_tmpfile('gensim_word2vec.tst') model_neg = word2vec.Word2Vec(sentences, size=10, min_count=0, seed=42, hs=0, negative=5) model_neg.save(tmpf) model_neg = word2vec.Word2Vec.load(tmpf) self.assertTrue(len(model_neg.wv.vocab), 12) model_neg.build_vocab(new_sentences, update=True) model_neg.train(new_sentences, total_examples=model_neg.corpus_count, epochs=model_neg.iter) self.assertEqual(len(model_neg.wv.vocab), 14) def onlineSanity(self, model, trained_model=False): terro, others = [], [] for l in list_corpus: if 'terrorism' in l: terro.append(l) else: others.append(l) self.assertTrue(all(['terrorism' not in l for l in others])) model.build_vocab(others, update=trained_model) model.train(others, total_examples=model.corpus_count, epochs=model.iter) self.assertFalse('terrorism' in model.wv.vocab) model.build_vocab(terro, update=True) self.assertTrue('terrorism' in model.wv.vocab) orig0 = np.copy(model.wv.syn0) model.train(terro, total_examples=len(terro), epochs=model.iter) self.assertFalse(np.allclose(model.wv.syn0, orig0)) sim = model.n_similarity(['war'], ['terrorism']) self.assertLess(0., sim) def test_sg_hs_online(self): """Test skipgram w/ hierarchical softmax""" model = word2vec.Word2Vec(sg=1, window=5, hs=1, negative=0, min_count=3, iter=10, seed=42, workers=2) self.onlineSanity(model) def test_sg_neg_online(self): """Test skipgram w/ negative sampling""" model = word2vec.Word2Vec(sg=1, window=4, hs=0, negative=15, min_count=3, iter=10, seed=42, workers=2) self.onlineSanity(model) def test_cbow_hs_online(self): """Test CBOW w/ hierarchical softmax""" model = word2vec.Word2Vec( sg=0, cbow_mean=1, alpha=0.05, window=5, hs=1, negative=0, min_count=3, iter=10, seed=42, workers=2 ) self.onlineSanity(model) def test_cbow_neg_online(self): """Test CBOW w/ negative sampling""" model = word2vec.Word2Vec( sg=0, cbow_mean=1, alpha=0.05, window=5, hs=0, negative=15, min_count=5, iter=10, seed=42, workers=2, sample=0 ) self.onlineSanity(model) def testPersistence(self): """Test storing/loading the entire model.""" tmpf = get_tmpfile('gensim_word2vec.tst') model = word2vec.Word2Vec(sentences, min_count=1) model.save(tmpf) self.models_equal(model, word2vec.Word2Vec.load(tmpf)) # test persistence of the KeyedVectors of a model wv = model.wv wv.save(tmpf) loaded_wv = keyedvectors.KeyedVectors.load(tmpf) self.assertTrue(np.allclose(wv.syn0, loaded_wv.syn0)) self.assertEqual(len(wv.vocab), len(loaded_wv.vocab)) def testPersistenceWithConstructorRule(self): """Test storing/loading the entire model with a vocab trimming rule passed in the constructor.""" tmpf = get_tmpfile('gensim_word2vec.tst') model = word2vec.Word2Vec(sentences, min_count=1, trim_rule=_rule) model.save(tmpf) self.models_equal(model, word2vec.Word2Vec.load(tmpf)) def testRuleWithMinCount(self): """Test that returning RULE_DEFAULT from trim_rule triggers min_count.""" model = word2vec.Word2Vec(sentences + [["occurs_only_once"]], min_count=2, trim_rule=_rule) self.assertTrue("human" not in model.wv.vocab) self.assertTrue("occurs_only_once" not in model.wv.vocab) self.assertTrue("interface" in model.wv.vocab) def testRule(self): """Test applying vocab trim_rule to build_vocab instead of constructor.""" model = word2vec.Word2Vec(min_count=1) model.build_vocab(sentences, trim_rule=_rule) self.assertTrue("human" not in model.wv.vocab) def testLambdaRule(self): """Test that lambda trim_rule works.""" def rule(word, count, min_count): return utils.RULE_DISCARD if word == "human" else utils.RULE_DEFAULT model = word2vec.Word2Vec(sentences, min_count=1, trim_rule=rule) self.assertTrue("human" not in model.wv.vocab) def testSyn0NormNotSaved(self): """Test syn0norm isn't saved in model file""" tmpf = get_tmpfile('gensim_word2vec.tst') model = word2vec.Word2Vec(sentences, min_count=1) model.init_sims() model.save(tmpf) loaded_model = word2vec.Word2Vec.load(tmpf) self.assertTrue(loaded_model.wv.syn0norm is None) wv = model.wv wv.save(tmpf) loaded_kv = keyedvectors.KeyedVectors.load(tmpf) self.assertTrue(loaded_kv.syn0norm is None) def testLoadPreKeyedVectorModel(self): """Test loading pre-KeyedVectors word2vec model""" if sys.version_info[:2] == (3, 4): model_file_suffix = '_py3_4' elif sys.version_info < (3,): model_file_suffix = '_py2' else: model_file_suffix = '_py3' # Model stored in one file model_file = 'word2vec_pre_kv%s' % model_file_suffix model = word2vec.Word2Vec.load(datapath(model_file)) self.assertTrue(model.wv.syn0.shape == (len(model.wv.vocab), model.vector_size)) self.assertTrue(model.syn1neg.shape == (len(model.wv.vocab), model.vector_size)) # Model stored in multiple files model_file = 'word2vec_pre_kv_sep%s' % model_file_suffix model = word2vec.Word2Vec.load(datapath(model_file)) self.assertTrue(model.wv.syn0.shape == (len(model.wv.vocab), model.vector_size)) self.assertTrue(model.syn1neg.shape == (len(model.wv.vocab), model.vector_size)) def testLoadPreKeyedVectorModelCFormat(self): """Test loading pre-KeyedVectors word2vec model saved in word2vec format""" model = keyedvectors.KeyedVectors.load_word2vec_format(datapath('word2vec_pre_kv_c')) self.assertTrue(model.syn0.shape[0] == len(model.vocab)) def testPersistenceWord2VecFormat(self): """Test storing/loading the entire model in word2vec format.""" tmpf = get_tmpfile('gensim_word2vec.tst') model = word2vec.Word2Vec(sentences, min_count=1) model.init_sims() model.wv.save_word2vec_format(tmpf, binary=True) binary_model_kv = keyedvectors.KeyedVectors.load_word2vec_format(tmpf, binary=True) binary_model_kv.init_sims(replace=False) self.assertTrue(np.allclose(model['human'], binary_model_kv['human'])) norm_only_model = keyedvectors.KeyedVectors.load_word2vec_format(tmpf, binary=True) norm_only_model.init_sims(replace=True) self.assertFalse(np.allclose(model['human'], norm_only_model['human'])) self.assertTrue(np.allclose(model.wv.syn0norm[model.wv.vocab['human'].index], norm_only_model['human'])) limited_model_kv = keyedvectors.KeyedVectors.load_word2vec_format(tmpf, binary=True, limit=3) self.assertEqual(len(limited_model_kv.syn0), 3) half_precision_model_kv = keyedvectors.KeyedVectors.load_word2vec_format( tmpf, binary=True, datatype=np.float16 ) self.assertEqual(binary_model_kv.syn0.nbytes, half_precision_model_kv.syn0.nbytes * 2) def testNoTrainingCFormat(self): tmpf = get_tmpfile('gensim_word2vec.tst') model = word2vec.Word2Vec(sentences, min_count=1) model.init_sims() model.wv.save_word2vec_format(tmpf, binary=True) kv = keyedvectors.KeyedVectors.load_word2vec_format(tmpf, binary=True) binary_model = word2vec.Word2Vec() binary_model.wv = kv self.assertRaises(ValueError, binary_model.train, sentences) def testTooShortBinaryWord2VecFormat(self): tfile = get_tmpfile('gensim_word2vec.tst') model = word2vec.Word2Vec(sentences, min_count=1) model.init_sims() model.wv.save_word2vec_format(tfile, binary=True) f = open(tfile, 'r+b') f.write(b'13') # write wrong (too-long) vector count f.close() self.assertRaises(EOFError, keyedvectors.KeyedVectors.load_word2vec_format, tfile, binary=True) def testTooShortTextWord2VecFormat(self): tfile = get_tmpfile('gensim_word2vec.tst') model = word2vec.Word2Vec(sentences, min_count=1) model.init_sims() model.wv.save_word2vec_format(tfile, binary=False) f = open(tfile, 'r+b') f.write(b'13') # write wrong (too-long) vector count f.close() self.assertRaises(EOFError, keyedvectors.KeyedVectors.load_word2vec_format, tfile, binary=False) def testPersistenceWord2VecFormatNonBinary(self): """Test storing/loading the entire model in word2vec non-binary format.""" tmpf = get_tmpfile('gensim_word2vec.tst') model = word2vec.Word2Vec(sentences, min_count=1) model.init_sims() model.wv.save_word2vec_format(tmpf, binary=False) text_model = keyedvectors.KeyedVectors.load_word2vec_format(tmpf, binary=False) text_model.init_sims(False) self.assertTrue(np.allclose(model['human'], text_model['human'], atol=1e-6)) norm_only_model = keyedvectors.KeyedVectors.load_word2vec_format(tmpf, binary=False) norm_only_model.init_sims(True) self.assertFalse(np.allclose(model['human'], norm_only_model['human'], atol=1e-6)) self.assertTrue(np.allclose( model.wv.syn0norm[model.wv.vocab['human'].index], norm_only_model['human'], atol=1e-4 )) def testPersistenceWord2VecFormatWithVocab(self): """Test storing/loading the entire model and vocabulary in word2vec format.""" tmpf = get_tmpfile('gensim_word2vec.tst') model = word2vec.Word2Vec(sentences, min_count=1) model.init_sims() testvocab = get_tmpfile('gensim_word2vec.vocab') model.wv.save_word2vec_format(tmpf, testvocab, binary=True) binary_model_with_vocab_kv = keyedvectors.KeyedVectors.load_word2vec_format(tmpf, testvocab, binary=True) self.assertEqual(model.wv.vocab['human'].count, binary_model_with_vocab_kv.vocab['human'].count) def testPersistenceKeyedVectorsFormatWithVocab(self): """Test storing/loading the entire model and vocabulary in word2vec format.""" tmpf = get_tmpfile('gensim_word2vec.tst') model = word2vec.Word2Vec(sentences, min_count=1) model.init_sims() testvocab = get_tmpfile('gensim_word2vec.vocab') model.wv.save_word2vec_format(tmpf, testvocab, binary=True) kv_binary_model_with_vocab = keyedvectors.KeyedVectors.load_word2vec_format(tmpf, testvocab, binary=True) self.assertEqual(model.wv.vocab['human'].count, kv_binary_model_with_vocab.vocab['human'].count) def testPersistenceWord2VecFormatCombinationWithStandardPersistence(self): """Test storing/loading the entire model and vocabulary in word2vec format chained with saving and loading via `save` and `load` methods`. It was possible prior to 1.0.0 release, now raises Exception""" tmpf = get_tmpfile('gensim_word2vec.tst') model = word2vec.Word2Vec(sentences, min_count=1) model.init_sims() testvocab = get_tmpfile('gensim_word2vec.vocab') model.wv.save_word2vec_format(tmpf, testvocab, binary=True) binary_model_with_vocab_kv = keyedvectors.KeyedVectors.load_word2vec_format(tmpf, testvocab, binary=True) binary_model_with_vocab_kv.save(tmpf) self.assertRaises(AttributeError, word2vec.Word2Vec.load, tmpf) def testLargeMmap(self): """Test storing/loading the entire model.""" tmpf = get_tmpfile('gensim_word2vec.tst') model = word2vec.Word2Vec(sentences, min_count=1) # test storing the internal arrays into separate files model.save(tmpf, sep_limit=0) self.models_equal(model, word2vec.Word2Vec.load(tmpf)) # make sure mmaping the arrays back works, too self.models_equal(model, word2vec.Word2Vec.load(tmpf, mmap='r')) def testVocab(self): """Test word2vec vocabulary building.""" corpus = LeeCorpus() total_words = sum(len(sentence) for sentence in corpus) # try vocab building explicitly, using all words model = word2vec.Word2Vec(min_count=1, hs=1, negative=0) model.build_vocab(corpus) self.assertTrue(len(model.wv.vocab) == 6981) # with min_count=1, we're not throwing away anything, # so make sure the word counts add up to be the entire corpus self.assertEqual(sum(v.count for v in model.wv.vocab.values()), total_words) # make sure the binary codes are correct np.allclose(model.wv.vocab['the'].code, [1, 1, 0, 0]) # test building vocab with default params model = word2vec.Word2Vec(hs=1, negative=0) model.build_vocab(corpus) self.assertTrue(len(model.wv.vocab) == 1750) np.allclose(model.wv.vocab['the'].code, [1, 1, 1, 0]) # no input => "RuntimeError: you must first build vocabulary before training the model" self.assertRaises(RuntimeError, word2vec.Word2Vec, []) # input not empty, but rather completely filtered out self.assertRaises(RuntimeError, word2vec.Word2Vec, corpus, min_count=total_words + 1) def testTraining(self): """Test word2vec training.""" # build vocabulary, don't train yet model = word2vec.Word2Vec(size=2, min_count=1, hs=1, negative=0) model.build_vocab(sentences) self.assertTrue(model.wv.syn0.shape == (len(model.wv.vocab), 2)) self.assertTrue(model.syn1.shape == (len(model.wv.vocab), 2)) model.train(sentences, total_examples=model.corpus_count, epochs=model.iter) sims = model.most_similar('graph', topn=10) # self.assertTrue(sims[0][0] == 'trees', sims) # most similar # test querying for "most similar" by vector graph_vector = model.wv.syn0norm[model.wv.vocab['graph'].index] sims2 = model.most_similar(positive=[graph_vector], topn=11) sims2 = [(w, sim) for w, sim in sims2 if w != 'graph'] # ignore 'graph' itself self.assertEqual(sims, sims2) # build vocab and train in one step; must be the same as above model2 = word2vec.Word2Vec(sentences, size=2, min_count=1, hs=1, negative=0) self.models_equal(model, model2) def testScoring(self): """Test word2vec scoring.""" model = word2vec.Word2Vec(sentences, size=2, min_count=1, hs=1, negative=0) # just score and make sure they exist scores = model.score(sentences, len(sentences)) self.assertEqual(len(scores), len(sentences)) def testLocking(self): """Test word2vec training doesn't change locked vectors.""" corpus = LeeCorpus() # build vocabulary, don't train yet for sg in range(2): # test both cbow and sg model = word2vec.Word2Vec(size=4, hs=1, negative=5, min_count=1, sg=sg, window=5) model.build_vocab(corpus) # remember two vectors locked0 = np.copy(model.wv.syn0[0]) unlocked1 = np.copy(model.wv.syn0[1]) # lock the vector in slot 0 against change model.syn0_lockf[0] = 0.0 model.train(corpus, total_examples=model.corpus_count, epochs=model.iter) self.assertFalse((unlocked1 == model.wv.syn0[1]).all()) # unlocked vector should vary self.assertTrue((locked0 == model.wv.syn0[0]).all()) # locked vector should not vary def testAccuracy(self): """Test Word2Vec accuracy and KeyedVectors accuracy give the same result""" model = word2vec.Word2Vec(LeeCorpus()) w2v_accuracy = model.accuracy(datapath('questions-words.txt')) kv_accuracy = model.wv.accuracy(datapath('questions-words.txt')) self.assertEqual(w2v_accuracy, kv_accuracy) def testEvaluateWordPairs(self): """Test Spearman and Pearson correlation coefficients give sane results on similarity datasets""" corpus = word2vec.LineSentence(datapath('head500.noblanks.cor.bz2')) model = word2vec.Word2Vec(corpus, min_count=3, iter=10) correlation = model.evaluate_word_pairs(datapath('wordsim353.tsv')) pearson = correlation[0][0] spearman = correlation[1][0] oov = correlation[2] self.assertTrue(0.1 < pearson < 1.0) self.assertTrue(0.1 < spearman < 1.0) self.assertTrue(0.0 <= oov < 90.0) def model_sanity(self, model, train=True): """Even tiny models trained on LeeCorpus should pass these sanity checks""" # run extra before/after training tests if train=True if train: model.build_vocab(list_corpus) orig0 = np.copy(model.wv.syn0[0]) model.train(list_corpus, total_examples=model.corpus_count, epochs=model.iter) self.assertFalse((orig0 == model.wv.syn0[1]).all()) # vector should vary after training sims = model.most_similar('war', topn=len(model.wv.index2word)) t_rank = [word for word, score in sims].index('terrorism') # in >200 calibration runs w/ calling parameters, 'terrorism' in 50-most_sim for 'war' self.assertLess(t_rank, 50) war_vec = model['war'] sims2 = model.most_similar([war_vec], topn=51) self.assertTrue('war' in [word for word, score in sims2]) self.assertTrue('terrorism' in [word for word, score in sims2]) def test_sg_hs(self): """Test skipgram w/ hierarchical softmax""" model = word2vec.Word2Vec(sg=1, window=4, hs=1, negative=0, min_count=5, iter=10, workers=2) self.model_sanity(model) def test_sg_neg(self): """Test skipgram w/ negative sampling""" model = word2vec.Word2Vec(sg=1, window=4, hs=0, negative=15, min_count=5, iter=10, workers=2) self.model_sanity(model) def test_cbow_hs(self): """Test CBOW w/ hierarchical softmax""" model = word2vec.Word2Vec( sg=0, cbow_mean=1, alpha=0.05, window=8, hs=1, negative=0, min_count=5, iter=10, workers=2, batch_words=1000 ) self.model_sanity(model) def test_cbow_neg(self): """Test CBOW w/ negative sampling""" model = word2vec.Word2Vec( sg=0, cbow_mean=1, alpha=0.05, window=5, hs=0, negative=15, min_count=5, iter=10, workers=2, sample=0 ) self.model_sanity(model) def test_cosmul(self): model = word2vec.Word2Vec(sentences, size=2, min_count=1, hs=1, negative=0) sims = model.most_similar_cosmul('graph', topn=10) # self.assertTrue(sims[0][0] == 'trees', sims) # most similar # test querying for "most similar" by vector graph_vector = model.wv.syn0norm[model.wv.vocab['graph'].index] sims2 = model.most_similar_cosmul(positive=[graph_vector], topn=11) sims2 = [(w, sim) for w, sim in sims2 if w != 'graph'] # ignore 'graph' itself self.assertEqual(sims, sims2) def testTrainingCbow(self): """Test CBOW word2vec training.""" # to test training, make the corpus larger by repeating its sentences over and over # build vocabulary, don't train yet model = word2vec.Word2Vec(size=2, min_count=1, sg=0, hs=1, negative=0) model.build_vocab(sentences) self.assertTrue(model.wv.syn0.shape == (len(model.wv.vocab), 2)) self.assertTrue(model.syn1.shape == (len(model.wv.vocab), 2)) model.train(sentences, total_examples=model.corpus_count, epochs=model.iter) sims = model.most_similar('graph', topn=10) # self.assertTrue(sims[0][0] == 'trees', sims) # most similar # test querying for "most similar" by vector graph_vector = model.wv.syn0norm[model.wv.vocab['graph'].index] sims2 = model.most_similar(positive=[graph_vector], topn=11) sims2 = [(w, sim) for w, sim in sims2 if w != 'graph'] # ignore 'graph' itself self.assertEqual(sims, sims2) # build vocab and train in one step; must be the same as above model2 = word2vec.Word2Vec(sentences, size=2, min_count=1, sg=0, hs=1, negative=0) self.models_equal(model, model2) def testTrainingSgNegative(self): """Test skip-gram (negative sampling) word2vec training.""" # to test training, make the corpus larger by repeating its sentences over and over # build vocabulary, don't train yet model = word2vec.Word2Vec(size=2, min_count=1, sg=1, hs=0, negative=2) model.build_vocab(sentences) self.assertTrue(model.wv.syn0.shape == (len(model.wv.vocab), 2)) self.assertTrue(model.syn1neg.shape == (len(model.wv.vocab), 2)) model.train(sentences, total_examples=model.corpus_count, epochs=model.iter) sims = model.most_similar('graph', topn=10) # self.assertTrue(sims[0][0] == 'trees', sims) # most similar # test querying for "most similar" by vector graph_vector = model.wv.syn0norm[model.wv.vocab['graph'].index] sims2 = model.most_similar(positive=[graph_vector], topn=11) sims2 = [(w, sim) for w, sim in sims2 if w != 'graph'] # ignore 'graph' itself self.assertEqual(sims, sims2) # build vocab and train in one step; must be the same as above model2 = word2vec.Word2Vec(sentences, size=2, min_count=1, sg=1, hs=0, negative=2) self.models_equal(model, model2) def testTrainingCbowNegative(self): """Test CBOW (negative sampling) word2vec training.""" # to test training, make the corpus larger by repeating its sentences over and over # build vocabulary, don't train yet model = word2vec.Word2Vec(size=2, min_count=1, sg=0, hs=0, negative=2) model.build_vocab(sentences) self.assertTrue(model.wv.syn0.shape == (len(model.wv.vocab), 2)) self.assertTrue(model.syn1neg.shape == (len(model.wv.vocab), 2)) model.train(sentences, total_examples=model.corpus_count, epochs=model.iter) sims = model.most_similar('graph', topn=10) # self.assertTrue(sims[0][0] == 'trees', sims) # most similar # test querying for "most similar" by vector graph_vector = model.wv.syn0norm[model.wv.vocab['graph'].index] sims2 = model.most_similar(positive=[graph_vector], topn=11) sims2 = [(w, sim) for w, sim in sims2 if w != 'graph'] # ignore 'graph' itself self.assertEqual(sims, sims2) # build vocab and train in one step; must be the same as above model2 = word2vec.Word2Vec(sentences, size=2, min_count=1, sg=0, hs=0, negative=2) self.models_equal(model, model2) def testSimilarities(self): """Test similarity and n_similarity methods.""" # The model is trained using CBOW model = word2vec.Word2Vec(size=2, min_count=1, sg=0, hs=0, negative=2) model.build_vocab(sentences) model.train(sentences, total_examples=model.corpus_count, epochs=model.iter) self.assertTrue(model.n_similarity(['graph', 'trees'], ['trees', 'graph'])) self.assertTrue(model.n_similarity(['graph'], ['trees']) == model.similarity('graph', 'trees')) self.assertRaises(ZeroDivisionError, model.n_similarity, ['graph', 'trees'], []) self.assertRaises(ZeroDivisionError, model.n_similarity, [], ['graph', 'trees']) self.assertRaises(ZeroDivisionError, model.n_similarity, [], []) def testSimilarBy(self): """Test word2vec similar_by_word and similar_by_vector.""" model = word2vec.Word2Vec(sentences, size=2, min_count=1, hs=1, negative=0) wordsims = model.similar_by_word('graph', topn=10) wordsims2 = model.most_similar(positive='graph', topn=10) vectorsims = model.similar_by_vector(model['graph'], topn=10) vectorsims2 = model.most_similar([model['graph']], topn=10) self.assertEqual(wordsims, wordsims2) self.assertEqual(vectorsims, vectorsims2) def testParallel(self): """Test word2vec parallel training.""" if word2vec.FAST_VERSION < 0: # don't test the plain np version for parallelism (too slow) return corpus = utils.RepeatCorpus(LeeCorpus(), 10000) for workers in [2, 4]: model = word2vec.Word2Vec(corpus, workers=workers) sims = model.most_similar('israeli') # noqa:F841 # the exact vectors and therefore similarities may differ, due to different thread collisions/randomization # so let's test only for top3 # TODO: commented out for now; find a more robust way to compare against "gold standard" # self.assertTrue('palestinian' in [sims[i][0] for i in range(3)]) def testRNG(self): """Test word2vec results identical with identical RNG seed.""" model = word2vec.Word2Vec(sentences, min_count=2, seed=42, workers=1) model2 = word2vec.Word2Vec(sentences, min_count=2, seed=42, workers=1) self.models_equal(model, model2) def models_equal(self, model, model2): self.assertEqual(len(model.wv.vocab), len(model2.wv.vocab)) self.assertTrue(np.allclose(model.wv.syn0, model2.wv.syn0)) if model.hs: self.assertTrue(np.allclose(model.syn1, model2.syn1)) if model.negative: self.assertTrue(np.allclose(model.syn1neg, model2.syn1neg)) most_common_word = max(model.wv.vocab.items(), key=lambda item: item[1].count)[0] self.assertTrue(np.allclose(model[most_common_word], model2[most_common_word])) def testDeleteTemporaryTrainingData(self): """Test word2vec model after delete_temporary_training_data""" for i in [0, 1]: for j in [0, 1]: model = word2vec.Word2Vec(sentences, size=10, min_count=0, seed=42, hs=i, negative=j) if i: self.assertTrue(hasattr(model, 'syn1')) if j: self.assertTrue(hasattr(model, 'syn1neg')) self.assertTrue(hasattr(model, 'syn0_lockf')) model.delete_temporary_training_data(replace_word_vectors_with_normalized=True) self.assertTrue(len(model['human']), 10) self.assertTrue(len(model.wv.vocab), 12) self.assertTrue(model.wv.vocab['graph'].count, 3) self.assertTrue(not hasattr(model, 'syn1')) self.assertTrue(not hasattr(model, 'syn1neg')) self.assertTrue(not hasattr(model, 'syn0_lockf')) def testNormalizeAfterTrainingData(self): tmpf = get_tmpfile('gensim_word2vec.tst') model = word2vec.Word2Vec(sentences, min_count=1) model.save(tmpf) norm_only_model = word2vec.Word2Vec.load(tmpf) norm_only_model.delete_temporary_training_data(replace_word_vectors_with_normalized=True) self.assertFalse(np.allclose(model['human'], norm_only_model['human'])) def testPredictOutputWord(self): '''Test word2vec predict_output_word method handling for negative sampling scheme''' # under normal circumstances model_with_neg = word2vec.Word2Vec(sentences, min_count=1) predictions_with_neg = model_with_neg.predict_output_word(['system', 'human'], topn=5) self.assertTrue(len(predictions_with_neg) == 5) # out-of-vobaculary scenario predictions_out_of_vocab = model_with_neg.predict_output_word(['some', 'random', 'words'], topn=5) self.assertEqual(predictions_out_of_vocab, None) # when required model parameters have been deleted tmpf = get_tmpfile('gensim_word2vec.tst') model_with_neg.init_sims() model_with_neg.wv.save_word2vec_format(tmpf, binary=True) kv_model_with_neg = keyedvectors.KeyedVectors.load_word2vec_format(tmpf, binary=True) binary_model_with_neg = word2vec.Word2Vec() binary_model_with_neg.wv = kv_model_with_neg self.assertRaises(RuntimeError, binary_model_with_neg.predict_output_word, ['system', 'human']) # negative sampling scheme not used model_without_neg = word2vec.Word2Vec(sentences, min_count=1, negative=0) self.assertRaises(RuntimeError, model_without_neg.predict_output_word, ['system', 'human']) def testLoadOldModel(self): """Test loading word2vec models from previous version""" model_file = 'word2vec_old' model = word2vec.Word2Vec.load(datapath(model_file)) self.assertTrue(model.wv.vectors.shape == (12, 100)) self.assertTrue(len(model.wv.vocab) == 12) self.assertTrue(len(model.wv.index2word) == 12) self.assertTrue(model.syn1neg.shape == (len(model.wv.vocab), model.wv.vector_size)) self.assertTrue(model.trainables.vectors_lockf.shape == (12,)) self.assertTrue(model.vocabulary.cum_table.shape == (12,)) self.onlineSanity(model, trained_model=True) # Model stored in multiple files model_file = 'word2vec_old_sep' model = word2vec.Word2Vec.load(datapath(model_file)) self.assertTrue(model.wv.vectors.shape == (12, 100)) self.assertTrue(len(model.wv.vocab) == 12) self.assertTrue(len(model.wv.index2word) == 12) self.assertTrue(model.syn1neg.shape == (len(model.wv.vocab), model.wv.vector_size)) self.assertTrue(model.trainables.vectors_lockf.shape == (12,)) self.assertTrue(model.vocabulary.cum_table.shape == (12,)) self.onlineSanity(model, trained_model=True) # load really old model model_file = 'w2v-lee-v0.12.0' model = word2vec.Word2Vec.load(datapath(model_file)) self.onlineSanity(model, trained_model=True) # test for max_final_vocab for model saved in 3.3 model_file = 'word2vec_3.3' model = word2vec.Word2Vec.load(datapath(model_file)) self.assertEqual(model.max_final_vocab, None) self.assertEqual(model.vocabulary.max_final_vocab, None) # Test loading word2vec models from all previous versions old_versions = [ '0.12.0', '0.12.1', '0.12.2', '0.12.3', '0.12.4', '0.13.0', '0.13.1', '0.13.2', '0.13.3', '0.13.4', '1.0.0', '1.0.1', '2.0.0', '2.1.0', '2.2.0', '2.3.0', '3.0.0', '3.1.0', '3.2.0', '3.3.0', '3.4.0' ] saved_models_dir = datapath('old_w2v_models/w2v_{}.mdl') for old_version in old_versions: model = word2vec.Word2Vec.load(saved_models_dir.format(old_version)) self.assertTrue(len(model.wv.vocab) == 3) self.assertTrue(model.wv.vectors.shape == (3, 4)) # check if similarity search and online training works. self.assertTrue(len(model.wv.most_similar('sentence')) == 2) model.build_vocab(list_corpus, update=True) model.train(list_corpus, total_examples=model.corpus_count, epochs=model.iter) # check if similarity search and online training works after saving and loading back the model. tmpf = get_tmpfile('gensim_word2vec.tst') model.save(tmpf) loaded_model = word2vec.Word2Vec.load(tmpf) loaded_model.build_vocab(list_corpus, update=True) loaded_model.train(list_corpus, total_examples=model.corpus_count, epochs=model.iter) @log_capture() def testBuildVocabWarning(self, l): """Test if warning is raised on non-ideal input to a word2vec model""" sentences = ['human', 'machine'] model = word2vec.Word2Vec() model.build_vocab(sentences) warning = "Each 'sentences' item should be a list of words (usually unicode strings)." self.assertTrue(warning in str(l)) @log_capture() def testTrainWarning(self, l): """Test if warning is raised if alpha rises during subsequent calls to train()""" sentences = [ ['human'], ['graph', 'trees'] ] model = word2vec.Word2Vec(min_count=1) model.build_vocab(sentences) for epoch in range(10): model.train(sentences, total_examples=model.corpus_count, epochs=model.iter) model.alpha -= 0.002 model.min_alpha = model.alpha if epoch == 5: model.alpha += 0.05 warning = "Effective 'alpha' higher than previous training cycles" self.assertTrue(warning in str(l)) def test_train_with_explicit_param(self): model = word2vec.Word2Vec(size=2, min_count=1, hs=1, negative=0) model.build_vocab(sentences) with self.assertRaises(ValueError): model.train(sentences, total_examples=model.corpus_count) with self.assertRaises(ValueError): model.train(sentences, epochs=model.iter) with self.assertRaises(ValueError): model.train(sentences) def test_sentences_should_not_be_a_generator(self): """ Is sentences a generator object? """ gen = (s for s in sentences) self.assertRaises(TypeError, word2vec.Word2Vec, (gen,)) def testLoadOnClassError(self): """Test if exception is raised when loading word2vec model on instance""" self.assertRaises(AttributeError, load_on_instance) def test_reset_from(self): """Test if reset_from() uses pre-built structures from other model""" model = word2vec.Word2Vec(sentences, min_count=1) other_model = word2vec.Word2Vec(new_sentences, min_count=1) other_vocab = other_model.wv.vocab model.reset_from(other_model) self.assertEqual(model.wv.vocab, other_vocab) def test_compute_training_loss(self): model = word2vec.Word2Vec(min_count=1, sg=1, negative=5, hs=1) model.build_vocab(sentences) model.train(sentences, compute_loss=True, total_examples=model.corpus_count, epochs=model.iter) training_loss_val = model.get_latest_training_loss() self.assertTrue(training_loss_val > 0.0) # endclass TestWord2VecModel class TestWMD(unittest.TestCase): def testNonzero(self): '''Test basic functionality with a test sentence.''' if not PYEMD_EXT: return model = word2vec.Word2Vec(sentences, min_count=2, seed=42, workers=1) sentence1 = ['human', 'interface', 'computer'] sentence2 = ['survey', 'user', 'computer', 'system', 'response', 'time'] distance = model.wmdistance(sentence1, sentence2) # Check that distance is non-zero. self.assertFalse(distance == 0.0) def testSymmetry(self): '''Check that distance is symmetric.''' if not PYEMD_EXT: return model = word2vec.Word2Vec(sentences, min_count=2, seed=42, workers=1) sentence1 = ['human', 'interface', 'computer'] sentence2 = ['survey', 'user', 'computer', 'system', 'response', 'time'] distance1 = model.wmdistance(sentence1, sentence2) distance2 = model.wmdistance(sentence2, sentence1) self.assertTrue(np.allclose(distance1, distance2)) def testIdenticalSentences(self): '''Check that the distance from a sentence to itself is zero.''' if not PYEMD_EXT: return model = word2vec.Word2Vec(sentences, min_count=1) sentence = ['survey', 'user', 'computer', 'system', 'response', 'time'] distance = model.wmdistance(sentence, sentence) self.assertEqual(0.0, distance) class TestWord2VecSentenceIterators(unittest.TestCase): def testLineSentenceWorksWithFilename(self): """Does LineSentence work with a filename argument?""" with utils.smart_open(datapath('lee_background.cor')) as orig: sentences = word2vec.LineSentence(datapath('lee_background.cor')) for words in sentences: self.assertEqual(words, utils.to_unicode(orig.readline()).split()) def testLineSentenceWorksWithCompressedFile(self): """Does LineSentence work with a compressed file object argument?""" with utils.smart_open(datapath('head500.noblanks.cor')) as orig: sentences = word2vec.LineSentence(bz2.BZ2File(datapath('head500.noblanks.cor.bz2'))) for words in sentences: self.assertEqual(words, utils.to_unicode(orig.readline()).split()) def testLineSentenceWorksWithNormalFile(self): """Does LineSentence work with a file object argument, rather than filename?""" with utils.smart_open(datapath('head500.noblanks.cor')) as orig: with utils.smart_open(datapath('head500.noblanks.cor')) as fin: sentences = word2vec.LineSentence(fin) for words in sentences: self.assertEqual(words, utils.to_unicode(orig.readline()).split()) def testPathLineSentences(self): """Does PathLineSentences work with a path argument?""" with utils.smart_open(os.path.join(datapath('PathLineSentences'), '1.txt')) as orig1,\ utils.smart_open(os.path.join(datapath('PathLineSentences'), '2.txt.bz2')) as orig2: sentences = word2vec.PathLineSentences(datapath('PathLineSentences')) orig = orig1.readlines() + orig2.readlines() orig_counter = 0 # to go through orig while matching PathLineSentences for words in sentences: self.assertEqual(words, utils.to_unicode(orig[orig_counter]).split()) orig_counter += 1 def testPathLineSentencesOneFile(self): """Does PathLineSentences work with a single file argument?""" test_file = os.path.join(datapath('PathLineSentences'), '1.txt') with utils.smart_open(test_file) as orig: sentences = word2vec.PathLineSentences(test_file) for words in sentences: self.assertEqual(words, utils.to_unicode(orig.readline()).split()) # endclass TestWord2VecSentenceIterators # TODO: get correct path to Python binary # class TestWord2VecScripts(unittest.TestCase): # def testWord2VecStandAloneScript(self): # """Does Word2Vec script launch standalone?""" # cmd = 'python -m gensim.scripts.word2vec_standalone -train ' + datapath('testcorpus.txt') + \ # ' -output vec.txt -size 200 -sample 1e-4 -binary 0 -iter 3 -min_count 1' # output = check_output(cmd, stderr=PIPE) # self.assertEqual(output, '0') # #endclass TestWord2VecScripts if not hasattr(TestWord2VecModel, 'assertLess'): # workaround for python 2.6 def assertLess(self, a, b, msg=None): self.assertTrue(a < b, msg="%s is not less than %s" % (a, b)) setattr(TestWord2VecModel, 'assertLess', assertLess) if __name__ == '__main__': logging.basicConfig( format='%(asctime)s : %(threadName)s : %(levelname)s : %(message)s', level=logging.DEBUG ) logging.info("using optimization %s", word2vec.FAST_VERSION) unittest.main()