laywerrobot/lib/python3.6/site-packages/gensim/test/test_similarity_metrics.py
2020-08-27 21:55:39 +02:00

275 lines
9.6 KiB
Python

#!/usr/bin/env python
# encoding: utf-8
#
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
"""
Automated tests for the similarity and distance functions and the `isbow` function in `gensim.matutils`.
"""
import logging
import unittest
from gensim import matutils
from scipy.sparse import csr_matrix, csc_matrix
import numpy as np
import math
from gensim.corpora.mmcorpus import MmCorpus
from gensim.models import ldamodel
from gensim.test.utils import datapath, common_dictionary, common_corpus
class TestIsBow(unittest.TestCase):
    """Exercise `matutils.isbow` with valid and invalid bag-of-words inputs."""

    def _check(self, candidate, expected):
        """Assert that `matutils.isbow(candidate)` equals `expected`."""
        self.assertEqual(expected, matutils.isbow(candidate))

    def test_None(self):
        # None is not a bag of words
        self._check(None, False)

    def test_bow(self):
        # a single (id, weight) pair
        self._check([(0, 0.4)], True)

        # several (id, weight) pairs
        self._check([(0, 4.), (1, 2.), (2, 5.), (3, 8.)], True)

        # an empty list is accepted
        self._check([], True)

        # a corpus (list of documents) is rejected
        self._check([[(2, 1), (3, 1), (4, 1), (5, 1), (1, 1), (7, 1)]], False)

        # 3-tuples are not (id, weight) pairs, so this is rejected
        self._check([(1, 3, 6)], False)

        # bag of words given as a scipy sparse matrix
        self._check(csr_matrix([[1, 0.4], [0, 0.3], [2, 0.1]]), True)

        # bag of words given as a numpy array
        self._check(np.array([[1, 0.4], [0, 0.2], [2, 0.2]]), True)
class TestHellinger(unittest.TestCase):
    """Tests for `matutils.hellinger` with several input formats."""

    def setUp(self):
        """Load the test corpus and remember the model class under test.

        No model is trained here: the only test that needs one builds it
        itself after seeding NumPy's RNG, so training in `setUp` would just
        repeat unused work before every test method.
        """
        self.corpus = MmCorpus(datapath('testcorpus.mm'))
        self.class_ = ldamodel.LdaModel

    def test_inputs(self):
        """Empty inputs of any supported container type give distance 0.0."""
        # checking empty inputs
        vec_1 = []
        vec_2 = []
        result = matutils.hellinger(vec_1, vec_2)
        expected = 0.0
        self.assertEqual(expected, result)

        # checking np array and list input
        vec_1 = np.array([])
        vec_2 = []
        result = matutils.hellinger(vec_1, vec_2)
        expected = 0.0
        self.assertEqual(expected, result)

        # checking scipy csr matrix and list input
        vec_1 = csr_matrix([])
        vec_2 = []
        result = matutils.hellinger(vec_1, vec_2)
        expected = 0.0
        self.assertEqual(expected, result)

    def test_distributions(self):
        """Compare computed distances against precomputed reference values."""
        # checking different length bag of words as inputs
        vec_1 = [(2, 0.1), (3, 0.4), (4, 0.1), (5, 0.1), (1, 0.1), (7, 0.2)]
        vec_2 = [(1, 0.1), (3, 0.8), (4, 0.1)]
        result = matutils.hellinger(vec_1, vec_2)
        expected = 0.484060507634
        self.assertAlmostEqual(expected, result)

        # checking symmetrical bag of words inputs return same distance
        vec_1 = [(2, 0.1), (3, 0.4), (4, 0.1), (5, 0.1), (1, 0.1), (7, 0.2)]
        vec_2 = [(1, 0.1), (3, 0.8), (4, 0.1), (8, 0.1), (10, 0.8), (9, 0.1)]
        result = matutils.hellinger(vec_1, vec_2)
        result_symmetric = matutils.hellinger(vec_2, vec_1)
        expected = 0.856921568786
        self.assertAlmostEqual(expected, result)
        self.assertAlmostEqual(expected, result_symmetric)

        # checking ndarray, csr_matrix as inputs
        vec_1 = np.array([[1, 0.3], [0, 0.4], [2, 0.3]])
        vec_2 = csr_matrix([[1, 0.4], [0, 0.2], [2, 0.2]])
        result = matutils.hellinger(vec_1, vec_2)
        expected = 0.160618030536
        self.assertAlmostEqual(expected, result)

        # checking ndarray, list as inputs
        vec_1 = np.array([0.6, 0.1, 0.1, 0.2])
        vec_2 = [0.2, 0.2, 0.1, 0.5]
        result = matutils.hellinger(vec_1, vec_2)
        expected = 0.309742984153
        self.assertAlmostEqual(expected, result)

        # testing LDA distribution vectors; the RNG is seeded right before
        # training so the reference value below stays reproducible
        np.random.seed(0)
        model = self.class_(self.corpus, id2word=common_dictionary, num_topics=2, passes=100)
        lda_vec1 = model[[(1, 2), (2, 3)]]
        lda_vec2 = model[[(2, 2), (1, 3)]]
        result = matutils.hellinger(lda_vec1, lda_vec2)
        expected = 1.0406845281146034e-06
        self.assertAlmostEqual(expected, result)
class TestKL(unittest.TestCase):
    """Tests for `matutils.kullback_leibler` with several input formats."""

    def setUp(self):
        """Load the test corpus and remember the model class under test.

        No model is trained here: the only test that needs one builds it
        itself after seeding NumPy's RNG, so training in `setUp` would just
        repeat unused work before every test method.
        """
        self.corpus = MmCorpus(datapath('testcorpus.mm'))
        self.class_ = ldamodel.LdaModel

    def test_inputs(self):
        """Empty inputs of any supported container type give divergence 0.0."""
        # checking empty inputs
        vec_1 = []
        vec_2 = []
        result = matutils.kullback_leibler(vec_1, vec_2)
        expected = 0.0
        self.assertEqual(expected, result)

        # checking np array and list input
        vec_1 = np.array([])
        vec_2 = []
        result = matutils.kullback_leibler(vec_1, vec_2)
        expected = 0.0
        self.assertEqual(expected, result)

        # checking scipy csr matrix and list input
        vec_1 = csr_matrix([])
        vec_2 = []
        result = matutils.kullback_leibler(vec_1, vec_2)
        expected = 0.0
        self.assertEqual(expected, result)

    def test_distributions(self):
        """Compare computed divergences against precomputed reference values."""
        # checking bag of words as inputs
        vec_1 = [(2, 0.1), (3, 0.4), (4, 0.1), (5, 0.1), (1, 0.1), (7, 0.2)]
        vec_2 = [(1, 0.1), (3, 0.8), (4, 0.1)]
        result = matutils.kullback_leibler(vec_2, vec_1, 8)
        expected = 0.55451775
        self.assertAlmostEqual(expected, result, places=5)

        # KL is not symmetric; vec1 compared with vec2 will contain log of zeros and return infinity
        vec_1 = [(2, 0.1), (3, 0.4), (4, 0.1), (5, 0.1), (1, 0.1), (7, 0.2)]
        vec_2 = [(1, 0.1), (3, 0.8), (4, 0.1)]
        result = matutils.kullback_leibler(vec_1, vec_2, 8)
        self.assertTrue(math.isinf(result))

        # checking ndarray, csr_matrix as inputs
        vec_1 = np.array([[1, 0.3], [0, 0.4], [2, 0.3]])
        vec_2 = csr_matrix([[1, 0.4], [0, 0.2], [2, 0.2]])
        result = matutils.kullback_leibler(vec_1, vec_2, 3)
        expected = 0.0894502
        self.assertAlmostEqual(expected, result, places=5)

        # checking ndarray, list as inputs
        vec_1 = np.array([0.6, 0.1, 0.1, 0.2])
        vec_2 = [0.2, 0.2, 0.1, 0.5]
        result = matutils.kullback_leibler(vec_1, vec_2)
        expected = 0.40659450877
        self.assertAlmostEqual(expected, result, places=5)

        # testing LDA distribution vectors; the RNG is seeded right before
        # training so the reference value below stays reproducible
        np.random.seed(0)
        model = self.class_(self.corpus, id2word=common_dictionary, num_topics=2, passes=100)
        lda_vec1 = model[[(1, 2), (2, 3)]]
        lda_vec2 = model[[(2, 2), (1, 3)]]
        result = matutils.kullback_leibler(lda_vec1, lda_vec2)
        expected = 4.283407e-12
        self.assertAlmostEqual(expected, result, places=5)
class TestJaccard(unittest.TestCase):
    """Tests for `matutils.jaccard` with several input formats."""

    def test_inputs(self):
        # two empty inputs are expected to raise a division-by-zero error
        with self.assertRaises(ZeroDivisionError):
            matutils.jaccard([], [])

    def test_distributions(self):
        # bag-of-words inputs
        bow_long = [(2, 1), (3, 4), (4, 1), (5, 1), (1, 1), (7, 2)]
        bow_short = [(1, 1), (3, 8), (4, 1)]
        self.assertAlmostEqual(1 - 0.3, matutils.jaccard(bow_short, bow_long))

        # ndarray against csr_matrix
        dense = np.array([[1, 3], [0, 4], [2, 3]])
        sparse = csr_matrix([[1, 4], [0, 2], [2, 2]])
        self.assertAlmostEqual(1 - 0.388888888889, matutils.jaccard(dense, sparse))

        # ndarray against plain list
        array_vec = np.array([6, 1, 2, 3])
        list_vec = [4, 3, 2, 5]
        self.assertAlmostEqual(1 - 0.333333333333, matutils.jaccard(array_vec, list_vec))
class TestSoftCosineSimilarity(unittest.TestCase):
    """Tests for `matutils.softcossim` inputs and a small worked example."""

    def test_inputs(self):
        empty_1 = []
        empty_2 = []

        # empty vectors with an empty term similarity matrix, first in CSC
        # and then in CSR format, both give 0.0
        for make_sparse in (csc_matrix, csr_matrix):
            term_sims = make_sparse((0, 0))
            self.assertEqual(0.0, matutils.softcossim(empty_1, empty_2, term_sims))

        # an unknown term similarity matrix format must be rejected
        with self.assertRaises(ValueError):
            matutils.softcossim(empty_1, empty_2, np.matrix([]))

    def test_distributions(self):
        # bag-of-words inputs
        hello_world = [(0, 1.0), (2, 1.0)]  # hello world
        hi_world = [(1, 1.0), (2, 1.0)]  # hi world
        term_sims = csc_matrix([[1, 0.5, 0], [0.5, 1, 0], [0, 0, 1]])
        self.assertAlmostEqual(0.75, matutils.softcossim(hello_world, hi_world, term_sims))
if __name__ == '__main__':
    # When run as a script, log at DEBUG level with timestamps and then
    # hand control to the unittest runner.
    logging.basicConfig(
        level=logging.DEBUG,
        format='%(asctime)s : %(levelname)s : %(message)s',
    )
    unittest.main()