221 lines
7.6 KiB
Python
221 lines
7.6 KiB
Python
|
#!/usr/bin/env python
|
||
|
# -*- coding: utf-8 -*-
|
||
|
#
|
||
|
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
|
||
|
|
||
|
"""
|
||
|
Automated tests for checking various utils functions.
|
||
|
"""
|
||
|
|
||
|
|
||
|
import logging
|
||
|
import unittest
|
||
|
|
||
|
import numpy as np
|
||
|
from six import iteritems
|
||
|
|
||
|
from gensim import utils
|
||
|
from gensim.test.utils import datapath
|
||
|
|
||
|
|
||
|
class TestIsCorpus(unittest.TestCase):
    """Checks of ``utils.is_corpus`` against corpus-like and non-corpus inputs.

    Every test compares the returned pair with ``(flag, original_input)``.
    """

    def test_None(self):
        """``None`` is expected to yield ``(False, None)``."""
        self.assertEqual((False, None), utils.is_corpus(None))

    def test_simple_lists_of_tuples(self):
        """Documents made of ``(int, float)`` 2-tuples are expected to pass."""
        candidates = [
            # one document, one word
            [[(0, 4.)]],
            # one document, several words
            [[(0, 4.), (1, 2.)]],
            [[(0, 4.), (1, 2.), (2, 5.), (3, 8.)]],
            # several documents, one word
            [[(0, 4.)], [(1, 2.)]],
            [[(0, 4.)], [(1, 2.)], [(2, 5.)], [(3, 8.)]],
        ]
        for candidate in candidates:
            self.assertEqual((True, candidate), utils.is_corpus(candidate))

    def test_int_tuples(self):
        """An integer weight in the 2-tuple is expected to be accepted too."""
        candidate = [[(0, 4)]]
        self.assertEqual((True, candidate), utils.is_corpus(candidate))

    def test_invalid_formats(self):
        """Inputs that are not made of 2-tuples are expected to be rejected."""
        # these are no corpus, because they do not consist of 2-tuples of
        # the form (int, float).
        non_corpora = [
            ["human"],
            "human",
            ["human", "star"],
            [1, 2, 3, 4, 5, 5],
            [[(0, 'string')]],
        ]
        for candidate in non_corpora:
            self.assertEqual((False, candidate), utils.is_corpus(candidate))
|
||
|
|
||
|
|
||
|
class TestUtils(unittest.TestCase):
    """Tests for ``utils.decode_htmlentities`` and ``utils.open_file``."""

    def test_decode_entities(self):
        """Numeric HTML entities are decoded, including astral code points."""
        # create a string that fails to decode with unichr on narrow python builds.
        # The input must carry the *encoded* numeric character references
        # (146 == 0x92, 128588 == 0x1f64c, 128175 == 0x1f4af); a literal with
        # the characters already decoded would give the function nothing to do.
        body = u'It&#146;s the Year of the Horse. YES VIN DIESEL &#128588; &#128175;'
        expected = u'It\x92s the Year of the Horse. YES VIN DIESEL \U0001f64c \U0001f4af'
        self.assertEqual(utils.decode_htmlentities(body), expected)

    def test_open_file_existent_file(self):
        """Opening by path yields an iterable with the expected line count."""
        number_of_lines_in_file = 30
        with utils.open_file(datapath('testcorpus.mm')) as infile:
            self.assertEqual(sum(1 for _ in infile), number_of_lines_in_file)

    def test_open_file_non_existent_file(self):
        """Opening a path that does not exist raises."""
        with self.assertRaises(Exception):
            with utils.open_file('non_existent_file.txt'):
                pass

    def test_open_file_existent_file_object(self):
        """An already-open file object is accepted and can be iterated."""
        number_of_lines_in_file = 30
        # This test opened the handle, so it closes it here instead of
        # depending on open_file to do so for caller-supplied objects.
        with open(datapath('testcorpus.mm')) as file_obj:
            with utils.open_file(file_obj) as infile:
                self.assertEqual(sum(1 for _ in infile), number_of_lines_in_file)

    def test_open_file_non_existent_file_object(self):
        """Passing ``None`` instead of a file or path raises."""
        file_obj = None
        with self.assertRaises(Exception):
            with utils.open_file(file_obj):
                pass
|
||
|
|
||
|
|
||
|
class TestSampleDict(unittest.TestCase):
    """Tests for ``utils.sample_dict``."""

    def test_sample_dict(self):
        """Check both the non-random and the random sampling modes."""
        d = {1: 2, 2: 3, 3: 4, 4: 5}
        all_items = list(iteritems(d))

        # Non-random mode: expect the first two items in iteration order.
        expected_dict = [(1, 2), (2, 3)]
        sampled_dict = utils.sample_dict(d, 2, False)
        self.assertEqual(sampled_dict, expected_dict)

        # Random mode: the exact pairs are unpredictable, so verify the
        # sample size and that every sampled pair really comes from ``d``.
        sampled_dict_random = utils.sample_dict(d, 2)
        self.assertEqual(len(sampled_dict_random), 2)
        for item in sampled_dict_random:
            self.assertIn(item, all_items)
|
||
|
|
||
|
|
||
|
class TestWindowing(unittest.TestCase):
    """Tests for ``utils.strided_windows``, ``utils.iter_windows`` and ``utils.flatten``."""

    # Expected length-5 windows over 0..9: row i holds i, i + 1, ..., i + 4.
    arr10_5 = np.arange(5) + np.arange(6).reshape(-1, 1)

    def _assert_arrays_equal(self, expected, actual):
        """Assert that both arrays have the same shape and equal elements."""
        self.assertEqual(expected.shape, actual.shape)
        self.assertTrue(np.all(actual == expected))

    def test_strided_windows1(self):
        """Size-2 windows over ``range(5)`` are the consecutive pairs."""
        windows = utils.strided_windows(range(5), 2)
        expected = np.array([[start, start + 1] for start in range(4)])
        self._assert_arrays_equal(expected, windows)

    def test_strided_windows2(self):
        """Size-5 windows over ``arange(10)`` match the fixture and alias the input."""
        source = np.arange(10)
        windows = utils.strided_windows(source, 5)
        self._assert_arrays_equal(self.arr10_5.copy(), windows)

        # Writing through the result must be visible in the source array.
        windows[0, 0] = 10
        self.assertEqual(10, source[0], "should make view rather than copy")

    def test_strided_windows_window_size_exceeds_size(self):
        """A window longer than the input is expected to give a (0, 0) array."""
        tokens = np.array(['this', 'is', 'test'], dtype='object')
        windows = utils.strided_windows(tokens, 4)
        self._assert_arrays_equal(np.empty((0, 0)), windows)

    def test_strided_windows_window_size_equals_size(self):
        """A window as long as the input is expected to give one full row."""
        tokens = np.array(['this', 'is', 'test'], dtype='object')
        windows = utils.strided_windows(tokens, 3)
        self._assert_arrays_equal(np.array([tokens.copy()]), windows)

    def test_iter_windows_include_below_window_size(self):
        """Short texts are kept only when ``ignore_below_size=False``."""
        texts = [['this', 'is', 'a'], ['test', 'document']]

        kept_all = [list(window) for window in utils.iter_windows(texts, 3, ignore_below_size=False)]
        self.assertEqual(texts, kept_all)

        # With the default setting the two-token text is expected to be dropped.
        kept_default = [list(window) for window in utils.iter_windows(texts, 3)]
        self.assertEqual([texts[0]], kept_default)

    def test_iter_windows_list_texts(self):
        """Plain list texts are windowed text by text."""
        texts = [['this', 'is', 'a'], ['test', 'document']]
        produced = [list(window) for window in list(utils.iter_windows(texts, 2))]
        self.assertListEqual(produced, [['this', 'is'], ['is', 'a'], ['test', 'document']])

    def test_iter_windows_uses_views(self):
        """For an ndarray text, the yielded windows alias the original data."""
        texts = [np.array(['this', 'is', 'a'], dtype='object'), ['test', 'document']]
        windows = list(utils.iter_windows(texts, 2))
        produced = [list(window) for window in windows]
        self.assertListEqual(produced, [['this', 'is'], ['is', 'a'], ['test', 'document']])

        # Mutating the first window must show up in the first text.
        windows[0][0] = 'modified'
        self.assertEqual('modified', texts[0][0])

    def test_iter_windows_with_copy(self):
        """With ``copy=True``, mutating a window leaves the texts untouched."""
        first_text = np.array(['this', 'is', 'a'], dtype='object')
        second_text = np.array(['test', 'document'], dtype='object')
        texts = [first_text, second_text]
        windows = list(utils.iter_windows(texts, 2, copy=True))

        windows[0][0] = 'modified'
        self.assertEqual('this', texts[0][0])

        windows[2][0] = 'modified'
        self.assertEqual('test', texts[1][0])

    def test_flatten_nested(self):
        """Nested lists are flattened into a single flat list."""
        nested = [[[1, 2, 3], [4, 5]], 6]
        self.assertEqual(utils.flatten(nested), [1, 2, 3, 4, 5, 6])

    def test_flatten_not_nested(self):
        """An already flat list comes back with the same elements."""
        flat = [1, 2, 3, 4, 5, 6]
        self.assertEqual(utils.flatten(flat), [1, 2, 3, 4, 5, 6])
|
||
|
|
||
|
|
||
|
if __name__ == '__main__':
    # Limit root-logger output to warnings and above while the tests run.
    logging.getLogger().setLevel(logging.WARNING)
    unittest.main()
|