laywerrobot/lib/python3.6/site-packages/gensim/test/test_miislita.py
2020-08-27 21:55:39 +02:00

108 lines
3.6 KiB
Python

#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
"""
This module replicates the miislita vector spaces from
"A Linear Algebra Approach to the Vector Space Model -- A Fast Track Tutorial"
by Dr. E. Garcia, admin@miislita.com
See http://www.miislita.com for further details.
"""
from __future__ import division # always use floats
from __future__ import with_statement
import logging
import os
import unittest
from gensim import utils, corpora, models, similarities
from gensim.test.utils import datapath, get_tmpfile
logger = logging.getLogger('test_miislita')
class CorpusMiislita(corpora.TextCorpus):
stoplist = set('for a of the and to in on'.split())
def get_texts(self):
"""
Parse documents from the .cor file provided in the constructor. Lowercase
each document and ignore some stopwords.
.cor format: one document per line, words separated by whitespace.
"""
for doc in self.getstream():
yield [word for word in utils.to_unicode(doc).lower().split()
if word not in CorpusMiislita.stoplist]
def __len__(self):
"""Define this so we can use `len(corpus)`"""
if 'length' not in self.__dict__:
logger.info("caching corpus size (calculating number of documents)")
self.length = sum(1 for _ in self.get_texts())
return self.length
class TestMiislita(unittest.TestCase):
def test_textcorpus(self):
"""Make sure TextCorpus can be serialized to disk. """
# construct corpus from file
miislita = CorpusMiislita(datapath('head500.noblanks.cor.bz2'))
# make sure serializing works
ftmp = get_tmpfile('test_textcorpus.mm')
corpora.MmCorpus.save_corpus(ftmp, miislita)
self.assertTrue(os.path.exists(ftmp))
# make sure deserializing gives the same result
miislita2 = corpora.MmCorpus(ftmp)
self.assertEqual(list(miislita), list(miislita2))
def test_save_load_ability(self):
"""
Make sure we can save and load (un/pickle) TextCorpus objects (as long
as the underlying input isn't a file-like object; we cannot pickle those).
"""
# construct corpus from file
corpusname = datapath('miIslita.cor')
miislita = CorpusMiislita(corpusname)
# pickle to disk
tmpf = get_tmpfile('tc_test.cpickle')
miislita.save(tmpf)
miislita2 = CorpusMiislita.load(tmpf)
self.assertEqual(len(miislita), len(miislita2))
self.assertEqual(miislita.dictionary.token2id, miislita2.dictionary.token2id)
def test_miislita_high_level(self):
# construct corpus from file
miislita = CorpusMiislita(datapath('miIslita.cor'))
# initialize tfidf transformation and similarity index
tfidf = models.TfidfModel(miislita, miislita.dictionary, normalize=False)
index = similarities.SparseMatrixSimilarity(tfidf[miislita], num_features=len(miislita.dictionary))
# compare to query
query = 'latent semantic indexing'
vec_bow = miislita.dictionary.doc2bow(query.lower().split())
vec_tfidf = tfidf[vec_bow]
# perform a similarity query against the corpus
sims_tfidf = index[vec_tfidf]
# for the expected results see the article
expected = [0.0, 0.2560, 0.7022, 0.1524, 0.3334]
for i, value in enumerate(expected):
self.assertAlmostEqual(sims_tfidf[i], value, 2)
if __name__ == '__main__':
logging.basicConfig(level=logging.DEBUG)
unittest.main()