220 lines
7.6 KiB
Python
220 lines
7.6 KiB
Python
#!/usr/bin/env python
|
|
# -*- coding: utf-8 -*-
|
|
#
|
|
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
|
|
|
|
"""
|
|
Automated tests for checking various utils functions.
|
|
"""
|
|
|
|
|
|
import logging
|
|
import unittest
|
|
|
|
import numpy as np
|
|
from six import iteritems
|
|
|
|
from gensim import utils
|
|
from gensim.test.utils import datapath
|
|
|
|
|
|
class TestIsCorpus(unittest.TestCase):
    """Checks the (flag, object) pair returned by `utils.is_corpus`."""

    def _assert_is_corpus_result(self, candidate, flag):
        """Assert that `utils.is_corpus(candidate)` equals `(flag, candidate)`."""
        outcome = utils.is_corpus(candidate)
        self.assertEqual((flag, candidate), outcome)

    def test_None(self):
        # None is not a corpus and is handed back unchanged
        self._assert_is_corpus_result(None, False)

    def test_simple_lists_of_tuples(self):
        # documents given as lists of (int, float) 2-tuples
        accepted = (
            # one document, one word
            [[(0, 4.)]],
            # one document, several words
            [[(0, 4.), (1, 2.)]],
            [[(0, 4.), (1, 2.), (2, 5.), (3, 8.)]],
            # several documents, one word
            [[(0, 4.)], [(1, 2.)]],
            [[(0, 4.)], [(1, 2.)], [(2, 5.)], [(3, 8.)]],
        )
        for candidate in accepted:
            self._assert_is_corpus_result(candidate, True)

    def test_int_tuples(self):
        # an integer second element is accepted as well
        self._assert_is_corpus_result([[(0, 4)]], True)

    def test_invalid_formats(self):
        # test invalid formats
        # these are not corpora, because they do not consist of 2-tuples of
        # the form (int, float).
        rejected = [
            ["human"],
            "human",
            ["human", "star"],
            [1, 2, 3, 4, 5, 5],
            [[(0, 'string')]],
        ]
        for candidate in rejected:
            self._assert_is_corpus_result(candidate, False)
|
|
|
|
|
|
class TestUtils(unittest.TestCase):
    """Tests for `utils.decode_htmlentities` and `utils.open_file`."""

    def test_decode_entities(self):
        # create a string that fails to decode with unichr on narrow python builds;
        # the input has to contain the numeric HTML entities themselves (146,
        # 128588 and 128175 are the decimal forms of the code points expected
        # below), otherwise there is nothing for decode_htmlentities to decode.
        body = u'It&#146;s the Year of the Horse. YES VIN DIESEL &#128588; &#128175;'
        expected = u'It\x92s the Year of the Horse. YES VIN DIESEL \U0001f64c \U0001f4af'
        self.assertEqual(utils.decode_htmlentities(body), expected)

    def test_open_file_existent_file(self):
        # opening by path: the yielded object is iterated line by line
        number_of_lines_in_file = 30
        with utils.open_file(datapath('testcorpus.mm')) as infile:
            self.assertEqual(sum(1 for _ in infile), number_of_lines_in_file)

    def test_open_file_non_existent_file(self):
        # a path that does not exist must raise when the context is entered
        with self.assertRaises(Exception):
            with utils.open_file('non_existent_file.txt'):
                pass

    def test_open_file_existent_file_object(self):
        # opening from an already open file object
        number_of_lines_in_file = 30
        # The test created this handle, so it closes it itself instead of
        # relying on open_file to do so.
        with open(datapath('testcorpus.mm')) as file_obj:
            with utils.open_file(file_obj) as infile:
                self.assertEqual(sum(1 for _ in infile), number_of_lines_in_file)

    def test_open_file_non_existent_file_object(self):
        # None is neither a path nor a file object and must raise
        file_obj = None
        with self.assertRaises(Exception):
            with utils.open_file(file_obj):
                pass
|
|
|
|
|
|
class TestSampleDict(unittest.TestCase):
    """Tests for `utils.sample_dict` with random sampling switched off and on."""

    def test_sample_dict(self):
        d = {1: 2, 2: 3, 3: 4, 4: 5}
        all_items = list(iteritems(d))

        # With the third argument False the result is expected to be exactly
        # these two (key, value) pairs.
        expected_dict = [(1, 2), (2, 3)]
        sampled_dict = utils.sample_dict(d, 2, False)
        self.assertEqual(sampled_dict, expected_dict)

        # With the default (random) mode the concrete pairs are unknown, so
        # check what must hold for any draw: the requested number of pairs
        # comes back and each of them is an item of the source dict.
        sampled_dict_random = utils.sample_dict(d, 2)
        self.assertEqual(len(sampled_dict_random), 2)
        for item in sampled_dict_random:
            self.assertIn(item, all_items)
|
|
|
|
|
|
class TestWindowing(unittest.TestCase):
    """Tests for `utils.strided_windows`, `utils.iter_windows` and `utils.flatten`."""

    # Six rows of five consecutive integers: row k holds k, k + 1, ..., k + 4.
    arr10_5 = np.array([list(range(start, start + 5)) for start in range(6)])

    def _assert_arrays_equal(self, expected, actual):
        """Assert that both arrays share a shape and compare equal element-wise."""
        self.assertEqual(expected.shape, actual.shape)
        elementwise_match = actual == expected
        self.assertTrue(elementwise_match.all())

    def test_strided_windows1(self):
        # windows of size 2 over 0..4 are the neighbouring pairs [i, i + 1]
        windows = utils.strided_windows(range(5), 2)
        neighbour_pairs = np.array([[i, i + 1] for i in range(4)])
        self._assert_arrays_equal(neighbour_pairs, windows)

    def test_strided_windows2(self):
        source = np.arange(10)
        windows = utils.strided_windows(source, 5)
        self._assert_arrays_equal(self.arr10_5.copy(), windows)

        # a write through the result has to show up in the input array
        windows[0, 0] = 10
        self.assertEqual(10, source[0], "should make view rather than copy")

    def test_strided_windows_window_size_exceeds_size(self):
        # a window larger than the input is expected to give a (0, 0) array
        words = np.array(['this', 'is', 'test'], dtype='object')
        windows = utils.strided_windows(words, 4)
        self._assert_arrays_equal(np.ndarray((0, 0)), windows)

    def test_strided_windows_window_size_equals_size(self):
        # a window as large as the input gives one row holding the whole input
        words = np.array(['this', 'is', 'test'], dtype='object')
        windows = utils.strided_windows(words, 3)
        self._assert_arrays_equal(np.array([words.copy()]), windows)

    def test_iter_windows_include_below_window_size(self):
        texts = [['this', 'is', 'a'], ['test', 'document']]

        # with ignore_below_size=False the two-word text is yielded as well
        kept = list(map(list, utils.iter_windows(texts, 3, ignore_below_size=False)))
        self.assertEqual(texts, kept)

        # with default arguments only the three-word text comes back
        default_result = list(map(list, utils.iter_windows(texts, 3)))
        self.assertEqual([texts[0]], default_result)

    def test_iter_windows_list_texts(self):
        texts = [['this', 'is', 'a'], ['test', 'document']]
        expected = [['this', 'is'], ['is', 'a'], ['test', 'document']]

        windows = list(utils.iter_windows(texts, 2))
        self.assertListEqual([list(window) for window in windows], expected)

    def test_iter_windows_uses_views(self):
        texts = [np.array(['this', 'is', 'a'], dtype='object'), ['test', 'document']]
        expected = [['this', 'is'], ['is', 'a'], ['test', 'document']]

        windows = list(utils.iter_windows(texts, 2))
        self.assertListEqual([list(window) for window in windows], expected)

        # writing into the first window must change the numpy text behind it
        windows[0][0] = 'modified'
        self.assertEqual('modified', texts[0][0])

    def test_iter_windows_with_copy(self):
        first_text = np.array(['this', 'is', 'a'], dtype='object')
        second_text = np.array(['test', 'document'], dtype='object')
        texts = [first_text, second_text]
        windows = list(utils.iter_windows(texts, 2, copy=True))

        # with copy=True, writes to a window must leave the source texts alone
        windows[0][0] = 'modified'
        self.assertEqual('this', texts[0][0])

        windows[2][0] = 'modified'
        self.assertEqual('test', texts[1][0])

    def test_flatten_nested(self):
        # nested lists come back as one flat list in the original order
        flattened = utils.flatten([[[1, 2, 3], [4, 5]], 6])
        self.assertEqual(flattened, [1, 2, 3, 4, 5, 6])

    def test_flatten_not_nested(self):
        # an already flat list compares equal to its flattened form
        flattened = utils.flatten([1, 2, 3, 4, 5, 6])
        self.assertEqual(flattened, [1, 2, 3, 4, 5, 6])
|
|
|
|
|
|
if __name__ == '__main__':
    # Set the root logger's level to WARNING, then hand over to the
    # unittest command-line runner.
    logging.getLogger().setLevel(logging.WARNING)
    unittest.main()
|