laywerrobot/lib/python3.6/site-packages/gensim/test/test_big.py
2020-08-27 21:55:39 +02:00

73 lines
2.4 KiB
Python

#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (C) 2014 Radim Rehurek <radimrehurek@seznam.cz>
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
"""
Automated tests for checking processing/storing large inputs.
"""
import logging
import unittest
import os
import numpy as np
import gensim
from gensim.test.utils import get_tmpfile
class BigCorpus(object):
"""A corpus of a large number of docs & large vocab"""
def __init__(self, words_only=False, num_terms=200000, num_docs=1000000, doc_len=100):
self.dictionary = gensim.utils.FakeDict(num_terms)
self.words_only = words_only
self.num_docs = num_docs
self.doc_len = doc_len
def __iter__(self):
for _ in range(self.num_docs):
doc_len = np.random.poisson(self.doc_len)
ids = np.random.randint(0, len(self.dictionary), doc_len)
if self.words_only:
yield [str(idx) for idx in ids]
else:
weights = np.random.poisson(3, doc_len)
yield sorted(zip(ids, weights))
if os.environ.get('GENSIM_BIG', False):
class TestLargeData(unittest.TestCase):
"""Try common operations, using large models. You'll need ~8GB RAM to run these tests"""
def testWord2Vec(self):
corpus = BigCorpus(words_only=True, num_docs=100000, num_terms=3000000, doc_len=200)
tmpf = get_tmpfile('gensim_big.tst')
model = gensim.models.Word2Vec(corpus, size=300, workers=4)
model.save(tmpf, ignore=['syn1'])
del model
gensim.models.Word2Vec.load(tmpf)
def testLsiModel(self):
corpus = BigCorpus(num_docs=50000)
tmpf = get_tmpfile('gensim_big.tst')
model = gensim.models.LsiModel(corpus, num_topics=500, id2word=corpus.dictionary)
model.save(tmpf)
del model
gensim.models.LsiModel.load(tmpf)
def testLdaModel(self):
corpus = BigCorpus(num_docs=5000)
tmpf = get_tmpfile('gensim_big.tst')
model = gensim.models.LdaModel(corpus, num_topics=500, id2word=corpus.dictionary)
model.save(tmpf)
del model
gensim.models.LdaModel.load(tmpf)
if __name__ == '__main__':
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG)
unittest.main()