import unittest
import numpy
import codecs
import pickle

from scipy import sparse
try:
    from sklearn.pipeline import Pipeline
    from sklearn import linear_model, cluster
    from sklearn.exceptions import NotFittedError
except ImportError:
    raise unittest.SkipTest("Test requires scikit-learn to be installed, which is not available")

from gensim.sklearn_api.rpmodel import RpTransformer
from gensim.sklearn_api.ldamodel import LdaTransformer
from gensim.sklearn_api.lsimodel import LsiTransformer
from gensim.sklearn_api.ldaseqmodel import LdaSeqTransformer
from gensim.sklearn_api.w2vmodel import W2VTransformer
from gensim.sklearn_api.atmodel import AuthorTopicTransformer
from gensim.sklearn_api.d2vmodel import D2VTransformer
from gensim.sklearn_api.text2bow import Text2BowTransformer
from gensim.sklearn_api.tfidf import TfIdfTransformer
from gensim.sklearn_api.hdp import HdpTransformer
from gensim.sklearn_api.phrases import PhrasesTransformer
from gensim.corpora import mmcorpus, Dictionary
from gensim import matutils, models
from gensim.test.utils import datapath, common_texts

texts = [
    ['complier', 'system', 'computer'],
    ['eulerian', 'node', 'cycle', 'graph', 'tree', 'path'],
    ['graph', 'flow', 'network', 'graph'],
    ['loading', 'computer', 'system'],
    ['user', 'server', 'system'],
    ['tree', 'hamiltonian'],
    ['graph', 'trees'],
    ['computer', 'kernel', 'malfunction', 'computer'],
    ['server', 'system', 'computer'],
]
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
author2doc = {
    'john': [0, 1, 2, 3, 4, 5, 6],
    'jane': [2, 3, 4, 5, 6, 7, 8],
    'jack': [0, 2, 4, 6, 8],
    'jill': [1, 3, 5, 7]
}

texts_new = texts[0:3]
author2doc_new = {
    'jill': [0],
    'bob': [0, 1],
    'sally': [1, 2]
}
dictionary_new = Dictionary(texts_new)
corpus_new = [dictionary_new.doc2bow(text) for text in texts_new]

texts_ldaseq = [
    [
        u'senior', u'studios', u'studios', u'studios', u'creators', u'award', u'mobile', u'currently',
        u'challenges', u'senior', u'summary', u'senior', u'motivated', u'creative', u'senior'
    ],
    [
        u'performs', u'engineering', u'tasks', u'infrastructure', u'focusing', u'primarily', u'programming',
        u'interaction', u'designers', u'engineers', u'leadership', u'teams', u'teams', u'crews',
        u'responsibilities', u'engineering', u'quality', u'functional', u'functional', u'teams', u'organizing',
        u'prioritizing', u'technical', u'decisions', u'engineering', u'participates', u'participates',
        u'reviews', u'participates', u'hiring', u'conducting', u'interviews'
    ],
    [
        u'feedback', u'departments', u'define', u'focusing', u'engineering', u'teams', u'crews', u'facilitate',
        u'engineering', u'departments', u'deadlines', u'milestones', u'typically', u'spends', u'designing',
        u'developing', u'updating', u'bugs', u'mentoring', u'engineers', u'define', u'schedules', u'milestones',
        u'participating'
    ],
    [
        u'reviews', u'interviews', u'sized', u'teams', u'interacts', u'disciplines', u'knowledge', u'skills',
        u'knowledge', u'knowledge', u'xcode', u'scripting', u'debugging', u'skills', u'skills', u'knowledge',
        u'disciplines', u'animation', u'networking', u'expertise', u'competencies', u'oral', u'skills',
        u'management', u'skills', u'proven', u'effectively', u'teams', u'deadline', u'environment', u'bachelor',
        u'minimum', u'shipped', u'leadership', u'teams', u'location', u'resumes', u'jobs', u'candidates',
        u'openings', u'jobs'
    ],
    [
        u'maryland', u'client', u'producers', u'electricity', u'operates', u'storage', u'utility', u'retail',
        u'customers', u'engineering', u'consultant', u'maryland', u'summary', u'technical', u'technology',
        u'departments', u'expertise', u'maximizing', u'output', u'reduces', u'operating', u'participates',
        u'areas', u'engineering', u'conducts', u'testing', u'solve', u'supports', u'environmental',
        u'understands', u'objectives', u'operates', u'responsibilities', u'handles', u'complex', u'engineering',
        u'aspects', u'monitors', u'quality', u'proficiency', u'optimization', u'recommendations', u'supports',
        u'personnel', u'troubleshooting', u'commissioning', u'startup', u'shutdown', u'supports', u'procedure',
        u'operating', u'units', u'develops', u'simulations', u'troubleshooting', u'tests', u'enhancing',
        u'solving', u'develops', u'estimates', u'schedules', u'scopes', u'understands', u'technical',
        u'management', u'utilize', u'routine', u'conducts', u'hazards', u'utilizing', u'hazard', u'operability',
        u'methodologies', u'participates', u'startup', u'reviews', u'pssr', u'participate', u'teams',
        u'participate', u'regulatory', u'audits', u'define', u'scopes', u'budgets', u'schedules', u'technical',
        u'management', u'environmental', u'awareness', u'interfacing', u'personnel', u'interacts', u'regulatory',
        u'departments', u'input', u'objectives', u'identifying', u'introducing', u'concepts', u'solutions',
        u'peers', u'customers', u'coworkers', u'knowledge', u'skills', u'engineering', u'quality', u'engineering'
    ],
    [
        u'commissioning', u'startup', u'knowledge', u'simulators', u'technologies', u'knowledge', u'engineering',
        u'techniques', u'disciplines', u'leadership', u'skills', u'proven', u'engineers', u'oral', u'skills',
        u'technical', u'skills', u'analytically', u'solve', u'complex', u'interpret', u'proficiency',
        u'simulation', u'knowledge', u'applications', u'manipulate', u'applications', u'engineering'
    ],
    [
        u'calculations', u'programs', u'matlab', u'excel', u'independently', u'environment', u'proven',
        u'skills', u'effectively', u'multiple', u'tasks', u'planning', u'organizational', u'management',
        u'skills', u'rigzone', u'jobs', u'developer', u'exceptional', u'strategies', u'junction', u'exceptional',
        u'strategies', u'solutions', u'solutions', u'biggest', u'insurers', u'operates', u'investment'
    ],
    [
        u'vegas', u'tasks', u'electrical', u'contracting', u'expertise', u'virtually', u'electrical',
        u'developments', u'institutional', u'utilities', u'technical', u'experts', u'relationships',
        u'credibility', u'contractors', u'utility', u'customers', u'customer', u'relationships',
        u'consistently', u'innovations', u'profile', u'construct', u'envision', u'dynamic', u'complex',
        u'electrical', u'management', u'grad', u'internship', u'electrical', u'engineering',
        u'infrastructures', u'engineers', u'documented', u'management', u'engineering', u'quality',
        u'engineering', u'electrical', u'engineers', u'complex', u'distribution', u'grounding', u'estimation',
        u'testing', u'procedures', u'voltage', u'engineering'
    ],
    [
        u'troubleshooting', u'installation', u'documentation', u'bsee', u'certification', u'electrical',
        u'voltage', u'cabling', u'electrical', u'engineering', u'candidates', u'electrical', u'internships',
        u'oral', u'skills', u'organizational', u'prioritization', u'skills', u'skills', u'excel', u'cadd',
        u'calculation', u'autocad', u'mathcad', u'skills', u'skills', u'customer', u'relationships', u'solving',
        u'ethic', u'motivation', u'tasks', u'budget', u'affirmative', u'diversity', u'workforce', u'gender',
        u'orientation', u'disability', u'disabled', u'veteran', u'vietnam', u'veteran', u'qualifying',
        u'veteran', u'diverse', u'candidates', u'respond', u'developing', u'workplace', u'reflects',
        u'diversity', u'communities',
        u'reviews', u'electrical', u'contracting', u'southwest', u'electrical', u'contractors'
    ],
    [
        u'intern', u'electrical', u'engineering', u'idexx', u'laboratories', u'validating', u'idexx',
        u'integrated', u'hardware', u'entails', u'planning', u'debug', u'validation', u'engineers',
        u'validation', u'methodologies', u'healthcare', u'platforms', u'brightest', u'solve', u'challenges',
        u'innovation', u'technology', u'idexx', u'intern', u'idexx', u'interns', u'supplement', u'interns',
        u'teams', u'roles', u'competitive', u'interns', u'idexx', u'interns', u'participate', u'internships',
        u'mentors', u'seminars', u'topics', u'leadership', u'workshops', u'relevant', u'planning', u'topics',
        u'intern', u'presentations', u'mixers', u'applicants', u'ineligible', u'laboratory', u'compliant',
        u'idexx', u'laboratories', u'healthcare', u'innovation', u'practicing', u'veterinarians', u'diagnostic',
        u'technology', u'idexx', u'enhance', u'veterinarians', u'efficiency', u'economically', u'idexx',
        u'worldwide', u'diagnostic', u'tests', u'tests', u'quality', u'headquartered', u'idexx',
        u'laboratories', u'employs', u'customers', u'qualifications', u'applicants', u'idexx', u'interns',
        u'potential', u'demonstrated', u'portfolio', u'recommendation', u'resumes', u'marketing', u'location',
        u'americas', u'verification', u'validation', u'schedule', u'overtime', u'idexx', u'laboratories',
        u'reviews', u'idexx', u'laboratories', u'nasdaq', u'healthcare', u'innovation', u'practicing',
        u'veterinarians'
    ],
    [
        u'location', u'duration', u'temp', u'verification', u'validation', u'tester', u'verification',
        u'validation', u'middleware', u'specifically', u'testing', u'applications', u'clinical', u'laboratory',
        u'regulated', u'environment', u'responsibilities', u'complex', u'hardware', u'testing', u'clinical',
        u'analyzers', u'laboratory', u'graphical', u'interfaces', u'complex', u'sample', u'sequencing',
        u'protocols', u'developers', u'correction', u'tracking', u'tool', u'timely', u'troubleshoot',
        u'testing', u'functional', u'manual', u'automated', u'participate', u'ongoing'
    ],
    [
        u'testing', u'coverage', u'planning', u'documentation', u'testing', u'validation', u'corrections',
        u'monitor', u'implementation', u'recurrence', u'operating', u'statistical', u'quality', u'testing',
        u'global', u'multi', u'teams', u'travel', u'skills', u'concepts', u'waterfall', u'agile',
        u'methodologies', u'debugging', u'skills', u'complex', u'automated', u'instrumentation',
        u'environment', u'hardware', u'mechanical', u'components', u'tracking', u'lifecycle', u'management',
        u'quality', u'organize', u'define', u'priorities', u'organize', u'supervision', u'aggressive',
        u'deadlines', u'ambiguity', u'analyze', u'complex', u'situations', u'concepts', u'technologies',
        u'verbal', u'skills', u'effectively', u'technical', u'clinical', u'diverse', u'strategy', u'clinical',
        u'chemistry', u'analyzer', u'laboratory', u'middleware', u'basic', u'automated', u'testing',
        u'biomedical', u'engineering', u'technologists', u'laboratory', u'technology', u'availability',
        u'click', u'attach'
    ],
    [
        u'scientist', u'linux', u'asrc', u'scientist', u'linux', u'asrc', u'technology', u'solutions',
        u'subsidiary', u'asrc', u'engineering', u'technology', u'contracts'
    ],
    [
        u'multiple', u'agencies', u'scientists', u'engineers', u'management', u'personnel', u'allows',
        u'solutions', u'complex', u'aeronautics', u'aviation', u'management', u'aviation', u'engineering',
        u'hughes', u'technical', u'technical', u'aviation', u'evaluation', u'engineering', u'management',
        u'technical', u'terminal',
        u'surveillance', u'programs', u'currently', u'scientist', u'travel', u'responsibilities', u'develops',
        u'technology', u'modifies', u'technical', u'complex', u'reviews', u'draft', u'conformity',
        u'completeness', u'testing', u'interface', u'hardware', u'regression', u'impact', u'reliability',
        u'maintainability', u'factors', u'standardization', u'skills', u'travel', u'programming', u'linux',
        u'environment', u'cisco', u'knowledge', u'terminal', u'environment', u'clearance', u'clearance',
        u'input', u'output', u'digital', u'automatic', u'terminal', u'management', u'controller',
        u'termination', u'testing', u'evaluating', u'policies', u'procedure', u'interface', u'installation',
        u'verification', u'certification', u'core', u'avionic', u'programs', u'knowledge', u'procedural',
        u'testing', u'interfacing', u'hardware', u'regression', u'impact', u'reliability', u'maintainability',
        u'factors', u'standardization', u'missions', u'asrc', u'subsidiaries', u'affirmative', u'employers',
        u'applicants', u'disability', u'veteran', u'technology', u'location', u'airport', u'bachelor',
        u'schedule', u'travel', u'contributor', u'management', u'asrc', u'reviews'
    ],
    [
        u'technical', u'solarcity', u'niche', u'vegas', u'overview', u'resolving', u'customer', u'clients',
        u'expanding', u'engineers', u'developers', u'responsibilities', u'knowledge', u'planning', u'adapt',
        u'dynamic', u'environment', u'inventive', u'creative', u'solarcity', u'lifecycle',
        u'responsibilities', u'technical', u'analyzing', u'diagnosing', u'troubleshooting', u'customers',
        u'ticketing', u'console', u'escalate', u'knowledge', u'engineering', u'timely', u'basic', u'phone',
        u'functionality', u'customer', u'tracking', u'knowledgebase', u'rotation', u'configure',
        u'deployment', u'sccm', u'technical', u'deployment', u'deploy', u'hardware', u'solarcity',
        u'bachelor', u'knowledge', u'dell', u'laptops', u'analytical', u'troubleshooting', u'solving',
        u'skills', u'knowledge', u'databases', u'preferably', u'server', u'preferably', u'monitoring',
        u'suites', u'documentation', u'procedures', u'knowledge', u'entries', u'verbal', u'skills',
        u'customer', u'skills', u'competitive', u'solar', u'package', u'insurance', u'vacation', u'savings',
        u'referral', u'eligibility', u'equity', u'performers', u'solarcity', u'affirmative', u'diversity',
        u'workplace', u'applicants', u'orientation', u'disability', u'veteran', u'careerrookie'
    ],
    [
        u'embedded', u'exelis', u'junction', u'exelis', u'embedded', u'acquisition', u'networking',
        u'capabilities', u'classified', u'customer', u'motivated', u'develops', u'tests', u'innovative',
        u'solutions', u'minimal', u'supervision', u'paced', u'environment', u'enjoys', u'assignments',
        u'interact', u'multi', u'disciplined', u'challenging', u'focused', u'embedded', u'developments',
        u'spanning', u'engineering', u'lifecycle', u'specification', u'enhancement', u'applications',
        u'embedded', u'freescale', u'applications', u'android', u'platforms', u'interface', u'customers',
        u'developers', u'refine', u'specifications', u'architectures'
    ],
    [
        u'java', u'programming', u'scripts', u'python', u'debug', u'debugging', u'emulators', u'regression',
        u'revisions', u'specialized', u'setups', u'capabilities', u'subversion', u'technical',
        u'documentation', u'multiple', u'engineering', u'techexpousa', u'reviews'
    ],
    [
        u'modeler', u'semantic', u'modeling', u'models', u'skills', u'ontology', u'resource', u'framework',
        u'schema', u'technologies', u'hadoop', u'warehouse', u'oracle', u'relational', u'artifacts',
        u'models', u'dictionaries', u'models', u'interface',
        u'specifications', u'documentation', u'harmonization', u'mappings', u'aligned', u'coordinate',
        u'technical', u'peer', u'reviews', u'stakeholder', u'communities', u'impact', u'domains',
        u'relationships', u'interdependencies', u'models', u'define', u'analyze', u'legacy', u'models',
        u'corporate', u'databases', u'architectural', u'alignment', u'customer', u'expertise',
        u'harmonization', u'modeling', u'modeling', u'consulting', u'stakeholders', u'quality', u'models',
        u'storage', u'agile', u'specifically', u'focus', u'modeling', u'qualifications', u'bachelors',
        u'accredited', u'modeler', u'encompass', u'evaluation', u'skills', u'knowledge', u'modeling',
        u'techniques', u'resource', u'framework', u'schema', u'technologies', u'unified', u'modeling',
        u'technologies', u'schemas', u'ontologies', u'sybase', u'knowledge', u'skills', u'interpersonal',
        u'skills', u'customers', u'clearance', u'applicants', u'eligibility', u'classified', u'clearance',
        u'polygraph', u'techexpousa', u'solutions', u'partnership', u'solutions', u'integration'
    ],
    [
        u'technologies', u'junction', u'develops', u'maintains', u'enhances', u'complex', u'diverse',
        u'intensive', u'analytics', u'algorithm', u'manipulation', u'management', u'documented',
        u'individually', u'reviews', u'tests', u'components', u'adherence', u'resolves', u'utilizes',
        u'methodologies', u'environment', u'input', u'components', u'hardware', u'offs', u'reuse', u'cots',
        u'gots', u'synthesis', u'components', u'tasks', u'individually', u'analyzes', u'modifies', u'debugs',
        u'corrects', u'integrates', u'operating', u'environments', u'develops', u'queries', u'databases',
        u'repositories', u'recommendations', u'improving', u'documentation', u'develops', u'implements',
        u'algorithms', u'functional', u'assists', u'developing', u'executing', u'procedures', u'components',
        u'reviews', u'documentation', u'solutions', u'analyzing', u'conferring', u'users', u'engineers',
        u'analyzing', u'investigating', u'areas', u'adapt', u'hardware', u'mathematical', u'models',
        u'predict', u'outcome', u'implement', u'complex', u'database', u'repository', u'interfaces',
        u'queries', u'bachelors', u'accredited', u'substituted', u'bachelors', u'firewalls', u'ipsec',
        u'vpns', u'technology', u'administering', u'servers', u'apache', u'jboss', u'tomcat', u'developing',
        u'interfaces', u'firefox', u'internet', u'explorer', u'operating', u'mainframe', u'linux',
        u'solaris', u'virtual', u'scripting', u'programming', u'oriented', u'programming', u'ajax',
        u'script', u'procedures', u'cobol', u'cognos', u'fusion', u'focus', u'html', u'java', u'java',
        u'script', u'jquery', u'perl', u'visual', u'basic', u'powershell', u'cots', u'cots', u'oracle',
        u'apex', u'integration', u'competitive', u'package', u'bonus', u'corporate', u'equity', u'tuition',
        u'reimbursement', u'referral', u'bonus', u'holidays', u'insurance', u'flexible', u'disability',
        u'insurance'
    ],
    [u'technologies', u'disability', u'accommodation', u'recruiter', u'techexpousa'],
    ['bank', 'river', 'shore', 'water'],
    ['river', 'water', 'flow', 'fast', 'tree'],
    ['bank', 'water', 'fall', 'flow'],
    ['bank', 'bank', 'water', 'rain', 'river'],
    ['river', 'water', 'mud', 'tree'],
    ['money', 'transaction', 'bank', 'finance'],
    ['bank', 'borrow', 'money'],
    ['bank', 'finance'],
    ['finance', 'money', 'sell', 'bank'],
    ['borrow', 'sell'],
    ['bank', 'loan', 'sell']
]
dictionary_ldaseq = Dictionary(texts_ldaseq)
corpus_ldaseq = [dictionary_ldaseq.doc2bow(text) for text in texts_ldaseq]

w2v_texts = [
    ['calculus', 'is', 'the', 'mathematical', 'study', 'of', 'continuous', 'change'],
    ['geometry', 'is', 'the', 'study', 'of', 'shape'],
    ['algebra', 'is', 'the', 'study', 'of', 'generalizations', 'of', 'arithmetic', 'operations'],
    ['differential', 'calculus', 'is', 'related', 'to', 'rates', 'of', 'change', 'and', 'slopes', 'of', 'curves'],
    ['integral', 'calculus', 'is', 'realted', 'to', 'accumulation', 'of', 'quantities', 'and', 'the', 'areas',
     'under', 'and', 'between', 'curves'],
    ['physics', 'is', 'the', 'natural', 'science', 'that', 'involves', 'the', 'study', 'of', 'matter', 'and',
     'its', 'motion', 'and', 'behavior', 'through', 'space', 'and', 'time'],
    ['the', 'main', 'goal', 'of', 'physics', 'is', 'to', 'understand', 'how', 'the', 'universe', 'behaves'],
    ['physics', 'also', 'makes', 'significant', 'contributions', 'through', 'advances', 'in', 'new',
     'technologies', 'that', 'arise', 'from', 'theoretical', 'breakthroughs'],
    ['advances', 'in', 'the', 'understanding', 'of', 'electromagnetism', 'or', 'nuclear', 'physics', 'led',
     'directly', 'to', 'the', 'development', 'of', 'new', 'products', 'that', 'have', 'dramatically',
     'transformed', 'modern', 'day', 'society']
]
d2v_sentences = [models.doc2vec.TaggedDocument(words, [i]) for i, words in enumerate(w2v_texts)]

dict_texts = [' '.join(text) for text in common_texts]

phrases_sentences = common_texts + [
    ['graph', 'minors', 'survey', 'human', 'interface']
]


class TestLdaWrapper(unittest.TestCase):
    def setUp(self):
        numpy.random.seed(0)  # set fixed seed to get similar values every time
        self.model = LdaTransformer(
            id2word=dictionary, num_topics=2, passes=100, minimum_probability=0,
            random_state=numpy.random.seed(0)
        )
        self.model.fit(corpus)

    def testTransform(self):
        texts_new = ['graph', 'eulerian']
        bow = self.model.id2word.doc2bow(texts_new)
        matrix = self.model.transform(bow)
        self.assertEqual(matrix.shape[0], 1)
        self.assertEqual(matrix.shape[1], self.model.num_topics)
        texts_new = [['graph', 'eulerian'], ['server', 'flow'], ['path', 'system']]
        bow = []
        for i in texts_new:
            bow.append(self.model.id2word.doc2bow(i))
        matrix = self.model.transform(bow)
        self.assertEqual(matrix.shape[0], 3)
        self.assertEqual(matrix.shape[1], self.model.num_topics)

    def testPartialFit(self):
        for i in range(10):
            self.model.partial_fit(X=corpus)  # fit against the model again
        doc = list(corpus)[0]  # transform only the first document
        transformed = self.model.transform(doc)
        expected = numpy.array([0.13, 0.87])
        passed = numpy.allclose(sorted(transformed[0]), sorted(expected), atol=1e-1)
        self.assertTrue(passed)

    def testConsistencyWithGensimModel(self):
        # training an LdaTransformer with `num_topics`=10
        self.model = LdaTransformer(
            id2word=dictionary, num_topics=10, passes=100, minimum_probability=0,
            random_state=numpy.random.seed(0)
        )
        self.model.fit(corpus)

        # training a Gensim LdaModel with the same params
        gensim_ldamodel = models.LdaModel(
            corpus=corpus, id2word=dictionary, num_topics=10, passes=100,
            minimum_probability=0, random_state=numpy.random.seed(0)
        )

        texts_new = ['graph', 'eulerian']
        bow = self.model.id2word.doc2bow(texts_new)
        matrix_transformer_api = self.model.transform(bow)
        matrix_gensim_model = gensim_ldamodel[bow]
        # convert into dense representation to be able to compare with transformer output
        matrix_gensim_model_dense = matutils.sparse2full(matrix_gensim_model, 10)
        passed = numpy.allclose(matrix_transformer_api, matrix_gensim_model_dense, atol=1e-1)
        self.assertTrue(passed)

    def testCSRMatrixConversion(self):
        numpy.random.seed(0)  # set fixed seed to get similar values every time
        arr = numpy.array([[1, 2, 0], [0, 0, 3], [1, 0, 0]])
        sarr = sparse.csr_matrix(arr)
        newmodel = LdaTransformer(num_topics=2, passes=100)
        newmodel.fit(sarr)
        bow = [(0, 1), (1, 2), (2, 0)]
        transformed_vec = newmodel.transform(bow)
        expected_vec = numpy.array([0.12843782, 0.87156218])
        passed = numpy.allclose(transformed_vec, expected_vec, atol=1e-1)
        self.assertTrue(passed)

    def testPipeline(self):
        model = LdaTransformer(num_topics=2, passes=10, minimum_probability=0, random_state=numpy.random.seed(0))
        with open(datapath('mini_newsgroup'), 'rb') as f:
            compressed_content = f.read()
            uncompressed_content = codecs.decode(compressed_content, 'zlib_codec')
            cache = pickle.loads(uncompressed_content)
        data = cache
        id2word = Dictionary([x.split() for x in data.data])
        corpus = [id2word.doc2bow(i.split()) for i in data.data]
        numpy.random.mtrand.RandomState(1)  # set seed for getting same result
        clf = linear_model.LogisticRegression(penalty='l2', C=0.1)
        text_lda = Pipeline([('features', model,), ('classifier', clf)])
        text_lda.fit(corpus, data.target)
        score = text_lda.score(corpus, data.target)
        self.assertGreaterEqual(score, 0.40)

    def testSetGetParams(self):
        # updating only one param
        self.model.set_params(num_topics=3)
        model_params = self.model.get_params()
        self.assertEqual(model_params["num_topics"], 3)
        # verify that the attributes values are also changed for `gensim_model` after fitting
        self.model.fit(corpus)
        self.assertEqual(getattr(self.model.gensim_model, 'num_topics'), 3)

        # updating multiple params
        param_dict = {"eval_every": 20, "decay": 0.7}
        self.model.set_params(**param_dict)
        model_params = self.model.get_params()
        for key in param_dict.keys():
            self.assertEqual(model_params[key], param_dict[key])
        # verify that the attributes values are also changed for `gensim_model` after fitting
        self.model.fit(corpus)
        self.assertEqual(getattr(self.model.gensim_model, 'eval_every'), 20)
        self.assertEqual(getattr(self.model.gensim_model, 'decay'), 0.7)

    def testPersistence(self):
        model_dump = pickle.dumps(self.model)
        model_load = pickle.loads(model_dump)

        texts_new = ['graph', 'eulerian']
        loaded_bow = model_load.id2word.doc2bow(texts_new)
        loaded_matrix = model_load.transform(loaded_bow)

        # sanity check for transformation operation
        self.assertEqual(loaded_matrix.shape[0], 1)
        self.assertEqual(loaded_matrix.shape[1], model_load.num_topics)

        # comparing the original and loaded models
        original_bow = self.model.id2word.doc2bow(texts_new)
        original_matrix = self.model.transform(original_bow)
        passed = numpy.allclose(loaded_matrix, original_matrix, atol=1e-1)
        self.assertTrue(passed)

    def testModelNotFitted(self):
        lda_wrapper = LdaTransformer(
            id2word=dictionary, num_topics=2, passes=100, minimum_probability=0,
            random_state=numpy.random.seed(0)
        )
        texts_new = ['graph', 'eulerian']
        bow = lda_wrapper.id2word.doc2bow(texts_new)
        self.assertRaises(NotFittedError, lda_wrapper.transform, bow)


class TestLsiWrapper(unittest.TestCase):
    def setUp(self):
        numpy.random.seed(0)  # set fixed seed to get similar values every time
        self.model = LsiTransformer(id2word=dictionary, num_topics=2)
        self.model.fit(corpus)

    def testTransform(self):
        texts_new = ['graph', 'eulerian']
        bow = self.model.id2word.doc2bow(texts_new)
        matrix = self.model.transform(bow)
        self.assertEqual(matrix.shape[0], 1)
        self.assertEqual(matrix.shape[1], self.model.num_topics)
        texts_new = [['graph', 'eulerian'], ['server', 'flow'], ['path', 'system']]
        bow = []
        for i in texts_new:
            bow.append(self.model.id2word.doc2bow(i))
        matrix = self.model.transform(bow)
        self.assertEqual(matrix.shape[0], 3)
        self.assertEqual(matrix.shape[1], self.model.num_topics)

    def testPartialFit(self):
        for i in range(10):
            self.model.partial_fit(X=corpus)  # fit against the model again
        doc = list(corpus)[0]  # transform only the first document
        transformed = self.model.transform(doc)
        expected = numpy.array([1.39, 0.0])
        passed = numpy.allclose(transformed[0], expected, atol=1)
        self.assertTrue(passed)

    def testPipeline(self):
        model = LsiTransformer(num_topics=2)
        with open(datapath('mini_newsgroup'), 'rb') as f:
            compressed_content = f.read()
            uncompressed_content = codecs.decode(compressed_content, 'zlib_codec')
            cache = pickle.loads(uncompressed_content)
        data = cache
        id2word = Dictionary([x.split() for x in data.data])
        corpus = [id2word.doc2bow(i.split()) for i in data.data]
        numpy.random.mtrand.RandomState(1)  # set seed for getting same result
        clf = linear_model.LogisticRegression(penalty='l2', C=0.1)
        text_lsi = Pipeline([('features', model,), ('classifier', clf)])
        text_lsi.fit(corpus, data.target)
        score = text_lsi.score(corpus, data.target)
        self.assertGreater(score, 0.50)

    def testSetGetParams(self):
        # updating only one param
        self.model.set_params(num_topics=3)
        model_params = self.model.get_params()
        self.assertEqual(model_params["num_topics"], 3)
        # verify that the attributes values are also changed for `gensim_model` after fitting
        self.model.fit(corpus)
        self.assertEqual(getattr(self.model.gensim_model, 'num_topics'), 3)

        # updating multiple params
        param_dict = {"chunksize": 10000, "decay": 0.9}
        self.model.set_params(**param_dict)
        model_params = self.model.get_params()
        for key in param_dict.keys():
            self.assertEqual(model_params[key], param_dict[key])
        # verify that the attributes values are also changed for `gensim_model` after fitting
        self.model.fit(corpus)
        self.assertEqual(getattr(self.model.gensim_model, 'chunksize'), 10000)
        self.assertEqual(getattr(self.model.gensim_model, 'decay'), 0.9)

    def testPersistence(self):
        model_dump = pickle.dumps(self.model)
        model_load = pickle.loads(model_dump)

        texts_new = ['graph', 'eulerian']
        loaded_bow = model_load.id2word.doc2bow(texts_new)
        loaded_matrix = model_load.transform(loaded_bow)

        # sanity check for transformation operation
        self.assertEqual(loaded_matrix.shape[0], 1)
        self.assertEqual(loaded_matrix.shape[1], model_load.num_topics)

        # comparing the original and loaded models
        original_bow = self.model.id2word.doc2bow(texts_new)
        original_matrix = self.model.transform(original_bow)
        passed = numpy.allclose(loaded_matrix, original_matrix, atol=1e-1)
        self.assertTrue(passed)

    def testModelNotFitted(self):
        lsi_wrapper = LsiTransformer(id2word=dictionary, num_topics=2)
        texts_new = ['graph', 'eulerian']
        bow = lsi_wrapper.id2word.doc2bow(texts_new)
        self.assertRaises(NotFittedError, lsi_wrapper.transform, bow)


class TestLdaSeqWrapper(unittest.TestCase):
    def setUp(self):
        self.model = LdaSeqTransformer(
            id2word=dictionary_ldaseq, num_topics=2, time_slice=[10, 10, 11], initialize='gensim'
        )
        self.model.fit(corpus_ldaseq)

    def testTransform(self):
        # transforming two documents
        docs = [list(corpus_ldaseq)[0], list(corpus_ldaseq)[1]]
        transformed_vecs = self.model.transform(docs)
        self.assertEqual(transformed_vecs.shape[0], 2)
        self.assertEqual(transformed_vecs.shape[1], self.model.num_topics)

        # transforming one document
        doc = list(corpus_ldaseq)[0]
        transformed_vecs = self.model.transform(doc)
        self.assertEqual(transformed_vecs.shape[0], 1)
        self.assertEqual(transformed_vecs.shape[1], self.model.num_topics)

    def testPipeline(self):
        numpy.random.seed(0)  # set fixed seed to get similar values every time
        with open(datapath('mini_newsgroup'), 'rb') as f:
            compressed_content = f.read()
            uncompressed_content = codecs.decode(compressed_content, 'zlib_codec')
            cache = pickle.loads(uncompressed_content)
        data = cache
        test_data = data.data[0:2]
        test_target = data.target[0:2]
        id2word = Dictionary([x.split() for x in test_data])
        corpus = [id2word.doc2bow(i.split()) for i in test_data]
        model = LdaSeqTransformer(id2word=id2word, num_topics=2, time_slice=[1, 1, 1], initialize='gensim')
        clf = linear_model.LogisticRegression(penalty='l2', C=0.1)
        text_ldaseq = Pipeline([('features', model,), ('classifier', clf)])
        text_ldaseq.fit(corpus, test_target)
        score = text_ldaseq.score(corpus, test_target)
        self.assertGreater(score, 0.50)

    def testSetGetParams(self):
        # updating only one param
        self.model.set_params(num_topics=3)
        model_params = self.model.get_params()
        self.assertEqual(model_params["num_topics"], 3)
        # verify that the attributes values are also changed for `gensim_model` after fitting
        self.model.fit(corpus_ldaseq)
        self.assertEqual(getattr(self.model.gensim_model, 'num_topics'), 3)

    def testPersistence(self):
        model_dump = pickle.dumps(self.model)
        model_load = pickle.loads(model_dump)

        doc = list(corpus_ldaseq)[0]
        loaded_transformed_vecs = model_load.transform(doc)

        # sanity check for transformation operation
        self.assertEqual(loaded_transformed_vecs.shape[0], 1)
        self.assertEqual(loaded_transformed_vecs.shape[1], model_load.num_topics)

        # comparing the original and loaded models
        original_transformed_vecs = self.model.transform(doc)
        passed = numpy.allclose(loaded_transformed_vecs, original_transformed_vecs, atol=1e-1)
        self.assertTrue(passed)

    def testModelNotFitted(self):
        ldaseq_wrapper = LdaSeqTransformer(num_topics=2)
        doc = list(corpus_ldaseq)[0]
        self.assertRaises(NotFittedError, ldaseq_wrapper.transform, doc)


class TestRpWrapper(unittest.TestCase):
    def setUp(self):
        numpy.random.seed(13)
        self.model = RpTransformer(num_topics=2)
        self.corpus = mmcorpus.MmCorpus(datapath('testcorpus.mm'))
        self.model.fit(self.corpus)

    def testTransform(self):
        # transform two documents
        docs = [list(self.corpus)[0], list(self.corpus)[1]]
        matrix = self.model.transform(docs)
        self.assertEqual(matrix.shape[0], 2)
        self.assertEqual(matrix.shape[1], self.model.num_topics)

        # transform one document
        doc = list(self.corpus)[0]
        matrix = self.model.transform(doc)
        self.assertEqual(matrix.shape[0], 1)
        self.assertEqual(matrix.shape[1], self.model.num_topics)

    def testPipeline(self):
        numpy.random.seed(0)  # set fixed seed to get similar values every time
        model = RpTransformer(num_topics=2)
        with open(datapath('mini_newsgroup'), 'rb') as f:
            compressed_content = f.read()
            uncompressed_content = codecs.decode(compressed_content, 'zlib_codec')
            cache = pickle.loads(uncompressed_content)
        data = cache
        id2word = Dictionary([x.split() for x in data.data])
        corpus = [id2word.doc2bow(i.split()) for i in data.data]
        numpy.random.mtrand.RandomState(1)  # set seed for getting same result
        clf = linear_model.LogisticRegression(penalty='l2', C=0.1)
        text_rp = Pipeline([('features', model,), ('classifier', clf)])
        text_rp.fit(corpus, data.target)
        score = text_rp.score(corpus, data.target)
        self.assertGreater(score, 0.40)

    def testSetGetParams(self):
        # updating only one param
        self.model.set_params(num_topics=3)
        model_params = self.model.get_params()
        self.assertEqual(model_params["num_topics"], 3)
        # verify that the attributes values are also changed for `gensim_model` after fitting
        self.model.fit(self.corpus)
        self.assertEqual(getattr(self.model.gensim_model, 'num_topics'), 3)

    def testPersistence(self):
        model_dump = pickle.dumps(self.model)
        model_load = pickle.loads(model_dump)

        doc = list(self.corpus)[0]
        loaded_transformed_vecs = model_load.transform(doc)

        # sanity check for transformation operation
        self.assertEqual(loaded_transformed_vecs.shape[0], 1)
        self.assertEqual(loaded_transformed_vecs.shape[1], model_load.num_topics)

        # comparing the original and loaded models
        original_transformed_vecs = self.model.transform(doc)
        passed = numpy.allclose(loaded_transformed_vecs, original_transformed_vecs, atol=1e-1)
        self.assertTrue(passed)

    def testModelNotFitted(self):
        rpmodel_wrapper = RpTransformer(num_topics=2)
        doc = list(self.corpus)[0]
        self.assertRaises(NotFittedError, rpmodel_wrapper.transform, doc)


class TestWord2VecWrapper(unittest.TestCase):
    def setUp(self):
        numpy.random.seed(0)
        self.model = W2VTransformer(size=10, min_count=0, seed=42)
        self.model.fit(texts)

    def testTransform(self):
        # transform multiple words
        words = []
        words = words + texts[0]
        matrix = self.model.transform(words)
        self.assertEqual(matrix.shape[0], 3)
        self.assertEqual(matrix.shape[1], self.model.size)

        # transform one word
        word = texts[0][0]
        matrix = self.model.transform(word)
        self.assertEqual(matrix.shape[0], 1)
        self.assertEqual(matrix.shape[1], self.model.size)

    def testConsistencyWithGensimModel(self):
        # training a W2VTransformer
        self.model = W2VTransformer(size=10, min_count=0, seed=42)
        self.model.fit(texts)

        # training a Gensim Word2Vec model with the same params
        gensim_w2vmodel = models.Word2Vec(texts, size=10, min_count=0, seed=42)

        word = texts[0][0]
        vec_transformer_api = self.model.transform(word)  # vector returned by W2VTransformer
        vec_gensim_model = gensim_w2vmodel[word]  # vector returned by Word2Vec
        passed = numpy.allclose(vec_transformer_api, vec_gensim_model, atol=1e-1)
        self.assertTrue(passed)

    def testPipeline(self):
        numpy.random.seed(0)  # set fixed seed to get similar values every time
        model = W2VTransformer(size=10, min_count=1)
        model.fit(w2v_texts)

        class_dict = {'mathematics': 1, 'physics': 0}
        train_data = [
            ('calculus', 'mathematics'), ('mathematical', 'mathematics'),
            ('geometry', 'mathematics'), ('operations', 'mathematics'),
            ('curves', 'mathematics'), ('natural', 'physics'), ('nuclear', 'physics'),
            ('science', 'physics'), ('electromagnetism', 'physics'), ('natural', 'physics')
        ]
        train_input = [x[0] for x in train_data]
        train_target = [class_dict[x[1]] for x in train_data]

        clf = linear_model.LogisticRegression(penalty='l2', C=0.1)
        clf.fit(model.transform(train_input), train_target)
        text_w2v = Pipeline([('features', model,), ('classifier', clf)])
        score = text_w2v.score(train_input, train_target)
        self.assertGreater(score, 0.40)

    def testSetGetParams(self):
        # updating only one param
        self.model.set_params(negative=20)
        model_params = self.model.get_params()
        self.assertEqual(model_params["negative"], 20)
        # verify that the attributes values are also changed for `gensim_model` after fitting
        self.model.fit(texts)
        self.assertEqual(getattr(self.model.gensim_model, 'negative'), 20)

    def testPersistence(self):
        model_dump = pickle.dumps(self.model)
        model_load = pickle.loads(model_dump)

        word = texts[0][0]
        loaded_transformed_vecs = model_load.transform(word)

        # sanity check for transformation operation
        self.assertEqual(loaded_transformed_vecs.shape[0], 1)
        self.assertEqual(loaded_transformed_vecs.shape[1], model_load.size)

        # comparing the original and loaded models
        original_transformed_vecs = self.model.transform(word)
        passed = numpy.allclose(loaded_transformed_vecs, original_transformed_vecs, atol=1e-1)
        self.assertTrue(passed)

    def testModelNotFitted(self):
        w2vmodel_wrapper = W2VTransformer(size=10, min_count=0, seed=42)
        word = texts[0][0]
        self.assertRaises(NotFittedError, w2vmodel_wrapper.transform, word)


class TestAuthorTopicWrapper(unittest.TestCase):
    def setUp(self):
        self.model = AuthorTopicTransformer(id2word=dictionary, author2doc=author2doc, num_topics=2, passes=100)
        self.model.fit(corpus)

    def testTransform(self):
        # transforming multiple authors
        author_list = ['jill', 'jack']
        author_topics = self.model.transform(author_list)
        self.assertEqual(author_topics.shape[0], 2)
        self.assertEqual(author_topics.shape[1], self.model.num_topics)

        # transforming one author
        jill_topics = self.model.transform('jill')
        self.assertEqual(jill_topics.shape[0], 1)
        self.assertEqual(jill_topics.shape[1], self.model.num_topics)

    def testPartialFit(self):
        self.model.partial_fit(corpus_new, author2doc=author2doc_new)

        # Did we learn something about Sally?
        output_topics = self.model.transform('sally')
        sally_topics = output_topics[0]  # getting the topics corresponding to 'sally' (from the list of lists)
        self.assertTrue(all(sally_topics > 0))

    def testPipeline(self):
        # train the AuthorTopic model first
        model = AuthorTopicTransformer(id2word=dictionary, author2doc=author2doc, num_topics=10, passes=100)
        model.fit(corpus)

        # create and train clustering model
        clstr = cluster.MiniBatchKMeans(n_clusters=2)
        authors_full = ['john', 'jane', 'jack', 'jill']
        clstr.fit(model.transform(authors_full))

        # stack together the two models in a pipeline
        text_atm = Pipeline([('features', model,), ('cluster', clstr)])
        author_list = ['jane', 'jack', 'jill']
        ret_val = text_atm.predict(author_list)
        self.assertEqual(len(ret_val), len(author_list))

    def testSetGetParams(self):
        # updating only one param
        self.model.set_params(num_topics=3)
        model_params = self.model.get_params()
        self.assertEqual(model_params["num_topics"], 3)
        # verify that the attributes values are also changed for `gensim_model` after fitting
        self.model.fit(corpus)
        self.assertEqual(getattr(self.model.gensim_model, 'num_topics'), 3)

        # updating multiple params
        param_dict = {"passes": 5, "iterations": 10}
        self.model.set_params(**param_dict)
        model_params = self.model.get_params()
        for key in param_dict.keys():
            self.assertEqual(model_params[key], param_dict[key])
        # verify that the attributes values are also changed for `gensim_model` after fitting
        self.model.fit(corpus)
        self.assertEqual(getattr(self.model.gensim_model, 'passes'), 5)
        self.assertEqual(getattr(self.model.gensim_model, 'iterations'), 10)

    def testPersistence(self):
        model_dump = pickle.dumps(self.model)
        model_load = pickle.loads(model_dump)

        author_list = ['jill']
        loaded_author_topics = model_load.transform(author_list)

        # sanity check for transformation operation
        self.assertEqual(loaded_author_topics.shape[0], 1)
        self.assertEqual(loaded_author_topics.shape[1], self.model.num_topics)

        # comparing the original and loaded models
        original_author_topics = self.model.transform(author_list)
        passed = numpy.allclose(loaded_author_topics, original_author_topics, atol=1e-1)
        self.assertTrue(passed)

    def testModelNotFitted(self):
        atmodel_wrapper = AuthorTopicTransformer(id2word=dictionary, author2doc=author2doc, num_topics=10, passes=100)
        author_list = ['jill', 'jack']
        self.assertRaises(NotFittedError, atmodel_wrapper.transform, author_list)


class TestD2VTransformer(unittest.TestCase):
    def setUp(self):
        numpy.random.seed(0)
        self.model = D2VTransformer(min_count=1)
        self.model.fit(d2v_sentences)

    def testTransform(self):
        # transform multiple documents
        docs = [w2v_texts[0], w2v_texts[1], w2v_texts[2]]
        matrix = self.model.transform(docs)
        self.assertEqual(matrix.shape[0], 3)
        self.assertEqual(matrix.shape[1], self.model.size)

        # transform one document
        doc = w2v_texts[0]
        matrix = self.model.transform(doc)
        self.assertEqual(matrix.shape[0], 1)
        self.assertEqual(matrix.shape[1], self.model.size)

    def testFitTransform(self):
        model = D2VTransformer(min_count=1)

        # fit and transform multiple documents
        docs = [w2v_texts[0], w2v_texts[1], w2v_texts[2]]
        matrix = model.fit_transform(docs)
        self.assertEqual(matrix.shape[0], 3)
        self.assertEqual(matrix.shape[1], model.size)

        # fit and transform one document
        doc = w2v_texts[0]
        matrix = model.fit_transform(doc)
        self.assertEqual(matrix.shape[0], 1)
        self.assertEqual(matrix.shape[1], model.size)

    def testSetGetParams(self):
        # updating only one param
        self.model.set_params(negative=20)
        model_params = self.model.get_params()
        self.assertEqual(model_params["negative"], 20)
        # verify that the attributes values are also changed for `gensim_model` after fitting
        self.model.fit(d2v_sentences)
        self.assertEqual(getattr(self.model.gensim_model, 'negative'), 20)

    def testPipeline(self):
        numpy.random.seed(0)  # set fixed seed to get similar values every time
        model = D2VTransformer(min_count=1)
        model.fit(d2v_sentences)

        class_dict = {'mathematics': 1, 'physics': 0}
        train_data = [
            (['calculus', 'mathematical'], 'mathematics'),
            (['geometry', 'operations', 'curves'], 'mathematics'),
            (['natural', 'nuclear'], 'physics'),
            (['science', 'electromagnetism', 'natural'], 'physics')
        ]
        train_input = [x[0] for x in train_data]
        train_target = [class_dict[x[1]] for x in train_data]

        clf = linear_model.LogisticRegression(penalty='l2', C=0.1)
        clf.fit(model.transform(train_input), train_target)
        text_w2v = Pipeline([('features', model,), ('classifier', clf)])
        score = text_w2v.score(train_input, train_target)
        self.assertGreater(score, 0.40)

    def testPersistence(self):
        model_dump = pickle.dumps(self.model)
        model_load = pickle.loads(model_dump)

        doc = w2v_texts[0]
        loaded_transformed_vecs = model_load.transform(doc)

        # sanity check for transformation operation
        self.assertEqual(loaded_transformed_vecs.shape[0], 1)
        self.assertEqual(loaded_transformed_vecs.shape[1], model_load.size)

        # comparing the original and loaded models
        original_transformed_vecs = self.model.transform(doc)
        passed = numpy.allclose(sorted(loaded_transformed_vecs), sorted(original_transformed_vecs), atol=1e-1)
        self.assertTrue(passed)

    def testConsistencyWithGensimModel(self):
        # training a D2VTransformer
        self.model = D2VTransformer(min_count=1)
        self.model.fit(d2v_sentences)

        # training a Gensim Doc2Vec model with the same params
        gensim_d2vmodel = models.Doc2Vec(d2v_sentences, min_count=1)

        doc = w2v_texts[0]
        vec_transformer_api = self.model.transform(doc)  # vector returned by D2VTransformer
        vec_gensim_model = gensim_d2vmodel[doc]  # vector returned by Doc2Vec
        passed = numpy.allclose(vec_transformer_api, vec_gensim_model, atol=1e-1)
        self.assertTrue(passed)

    def testModelNotFitted(self):
        d2vmodel_wrapper = D2VTransformer(min_count=1)
        self.assertRaises(NotFittedError, d2vmodel_wrapper.transform, 1)


class TestText2BowTransformer(unittest.TestCase):
    def setUp(self):
        numpy.random.seed(0)
        self.model = Text2BowTransformer()
        self.model.fit(dict_texts)

    def testTransform(self):
        # transform one document
        doc = ['computer system interface time computer system']
        bow_vec = self.model.transform(doc)[0]
        expected_values = [1, 1, 2, 2]  # comparing only the word-counts
        values = [x[1] for x in bow_vec]
        self.assertEqual(sorted(expected_values), sorted(values))

    def testSetGetParams(self):
        # updating only one param
        self.model.set_params(prune_at=1000000)
        model_params = self.model.get_params()
        self.assertEqual(model_params["prune_at"], 1000000)

    def testPipeline(self):
        with open(datapath('mini_newsgroup'), 'rb') as f:
            compressed_content = f.read()
            uncompressed_content = codecs.decode(compressed_content, 'zlib_codec')
            cache = pickle.loads(uncompressed_content)
        data = cache
        text2bow_model = Text2BowTransformer()
        lda_model = LdaTransformer(num_topics=2, passes=10, minimum_probability=0, random_state=numpy.random.seed(0))
        numpy.random.mtrand.RandomState(1)  # set seed for getting same result
        clf = linear_model.LogisticRegression(penalty='l2', C=0.1)
        text_lda = Pipeline([('bow_model', text2bow_model), ('ldamodel', lda_model), ('classifier', clf)])
        text_lda.fit(data.data, data.target)
        score = text_lda.score(data.data, data.target)
        self.assertGreater(score, 0.40)

    def testPersistence(self):
        model_dump = pickle.dumps(self.model)
        model_load = pickle.loads(model_dump)

        doc = dict_texts[0]
        loaded_transformed_vecs = model_load.transform(doc)

        # comparing the original and loaded models
        original_transformed_vecs = self.model.transform(doc)
        self.assertEqual(original_transformed_vecs, loaded_transformed_vecs)

    def testModelNotFitted(self):
        text2bow_wrapper = Text2BowTransformer()
        self.assertRaises(NotFittedError, text2bow_wrapper.transform, dict_texts[0])


class TestTfIdfTransformer(unittest.TestCase):
    def setUp(self):
        numpy.random.seed(0)
        self.model = TfIdfTransformer(normalize=True)
        self.corpus = mmcorpus.MmCorpus(datapath('testcorpus.mm'))
        self.model.fit(self.corpus)

    def testTransform(self):
        # transform one document
        doc = corpus[0]
        transformed_doc = self.model.transform(doc)
        expected_doc = [[(0, 0.5773502691896257), (1, 0.5773502691896257), (2, 0.5773502691896257)]]
        self.assertTrue(numpy.allclose(transformed_doc, expected_doc))

        # transform multiple documents
        docs = [corpus[0], corpus[1]]
        transformed_docs = self.model.transform(docs)
        expected_docs = [
            [(0, 0.5773502691896257), (1, 0.5773502691896257), (2, 0.5773502691896257)],
            [(3, 0.44424552527467476), (4, 0.44424552527467476), (5, 0.3244870206138555),
             (6, 0.44424552527467476), (7, 0.3244870206138555), (8, 0.44424552527467476)]
        ]
        self.assertTrue(numpy.allclose(transformed_docs[0], expected_docs[0]))
        self.assertTrue(numpy.allclose(transformed_docs[1], expected_docs[1]))

    def testSetGetParams(self):
        # updating only one param
        self.model.set_params(smartirs='nnn')
        model_params = self.model.get_params()
        self.assertEqual(model_params["smartirs"], 'nnn')
        # verify that the attributes values are also changed for `gensim_model` after fitting
        self.model.fit(self.corpus)
        self.assertEqual(getattr(self.model.gensim_model, 'smartirs'), 'nnn')

    def testPipeline(self):
        with open(datapath('mini_newsgroup'), 'rb') as f:
            compressed_content = f.read()
            uncompressed_content = codecs.decode(compressed_content, 'zlib_codec')
            cache = pickle.loads(uncompressed_content)
        data = cache
        id2word = Dictionary([x.split() for x in data.data])
        corpus = [id2word.doc2bow(i.split()) for i in data.data]
        tfidf_model = TfIdfTransformer()
        tfidf_model.fit(corpus)
        lda_model = LdaTransformer(num_topics=2, passes=10, minimum_probability=0, random_state=numpy.random.seed(0))
        numpy.random.mtrand.RandomState(1)  # set seed for getting same result
        clf = linear_model.LogisticRegression(penalty='l2', C=0.1)
        text_tfidf = Pipeline([('tfidf_model', tfidf_model), ('ldamodel', lda_model), ('classifier', clf)])
        text_tfidf.fit(corpus, data.target)
        score = text_tfidf.score(corpus, data.target)
        self.assertGreater(score, 0.40)

    def testPersistence(self):
        model_dump = pickle.dumps(self.model)
        model_load = pickle.loads(model_dump)

        doc = corpus[0]
        loaded_transformed_doc = model_load.transform(doc)

        # comparing the original and loaded models
        original_transformed_doc = self.model.transform(doc)
        self.assertEqual(original_transformed_doc, loaded_transformed_doc)

    def testModelNotFitted(self):
        tfidf_wrapper = TfIdfTransformer()
        self.assertRaises(NotFittedError, tfidf_wrapper.transform, corpus[0])


class TestHdpTransformer(unittest.TestCase):
    def setUp(self):
        numpy.random.seed(0)
        self.model = HdpTransformer(id2word=dictionary, random_state=42)
        self.corpus = mmcorpus.MmCorpus(datapath('testcorpus.mm'))
        self.model.fit(self.corpus)

    def testTransform(self):
        # transform one document
        doc = self.corpus[0]
        transformed_doc = self.model.transform(doc)
        expected_doc = [
            [0.81043386270128193, 0.049357139518070477, 0.035840906753517532, 0.026542006926698079,
             0.019925705902962578, 0.014776690981729117, 0.011068909979528148]
        ]
        self.assertTrue(numpy.allclose(transformed_doc, expected_doc, atol=1e-2))

        # transform multiple documents
        docs = [self.corpus[0], self.corpus[1]]
        transformed_docs = self.model.transform(docs)
        expected_docs = [
            [0.81043386270128193, 0.049357139518070477, 0.035840906753517532, 0.026542006926698079,
             0.019925705902962578, 0.014776690981729117, 0.011068909979528148],
            [0.03795908, 0.39542609, 0.50650585, 0.0151082, 0.01132749, 0., 0.]
        ]
        self.assertTrue(numpy.allclose(transformed_docs[0], expected_docs[0], atol=1e-2))
        self.assertTrue(numpy.allclose(transformed_docs[1], expected_docs[1], atol=1e-2))

    def testSetGetParams(self):
        # updating only one param
        self.model.set_params(var_converge=0.05)
        model_params = self.model.get_params()
        self.assertEqual(model_params["var_converge"], 0.05)
        # verify that the attributes values are also changed for `gensim_model` after fitting
        self.model.fit(self.corpus)
        self.assertEqual(getattr(self.model.gensim_model, 'm_var_converge'), 0.05)

    def testPipeline(self):
        with open(datapath('mini_newsgroup'), 'rb') as f:
            compressed_content = f.read()
            uncompressed_content = codecs.decode(compressed_content, 'zlib_codec')
            cache = pickle.loads(uncompressed_content)
        data = cache
        id2word = Dictionary([x.split() for x in data.data])
        corpus = [id2word.doc2bow(i.split()) for i in data.data]
        model = HdpTransformer(id2word=id2word)
        clf = linear_model.LogisticRegression(penalty='l2', C=0.1)
        text_lda = Pipeline([('features', model,), ('classifier', clf)])
        text_lda.fit(corpus, data.target)
        score = text_lda.score(corpus, data.target)
        self.assertGreater(score, 0.40)

    def testPersistence(self):
        model_dump = pickle.dumps(self.model)
        model_load = pickle.loads(model_dump)

        doc = corpus[0]
        loaded_transformed_doc = model_load.transform(doc)

        # comparing the original and loaded models
        original_transformed_doc = self.model.transform(doc)
        self.assertTrue(numpy.allclose(original_transformed_doc, loaded_transformed_doc))

    def testModelNotFitted(self):
        hdp_wrapper = HdpTransformer(id2word=dictionary)
        self.assertRaises(NotFittedError, hdp_wrapper.transform, corpus[0])


class TestPhrasesTransformer(unittest.TestCase):
    def setUp(self):
        numpy.random.seed(0)
        self.model = PhrasesTransformer(min_count=1, threshold=1)
        self.model.fit(phrases_sentences)

    def testTransform(self):
        # transform one document
        doc = phrases_sentences[-1]
        phrase_tokens = self.model.transform(doc)[0]
        expected_phrase_tokens = [u'graph_minors', u'survey', u'human_interface']
        self.assertEqual(phrase_tokens, expected_phrase_tokens)

    def testPartialFit(self):
        new_sentences = [
            ['world', 'peace', 'humans', 'world', 'peace', 'world', 'peace', 'people'],
            ['world', 'peace', 'people'],
            ['world', 'peace', 'humans']
        ]
        self.model.partial_fit(X=new_sentences)  # train model with new sentences

        doc = ['graph', 'minors', 'survey', 'human', 'interface', 'world', 'peace']
        phrase_tokens = self.model.transform(doc)[0]
        expected_phrase_tokens = [u'graph_minors', u'survey', u'human_interface', u'world_peace']
        self.assertEqual(phrase_tokens, expected_phrase_tokens)

    def testSetGetParams(self):
        # updating only one param
        self.model.set_params(progress_per=5000)
        model_params = self.model.get_params()
        self.assertEqual(model_params["progress_per"], 5000)
        # verify that the attributes values are also changed for `gensim_model` after fitting
        self.model.fit(phrases_sentences)
        self.assertEqual(getattr(self.model.gensim_model, 'progress_per'), 5000)

    def testPersistence(self):
        model_dump = pickle.dumps(self.model)
        model_load = pickle.loads(model_dump)

        doc = phrases_sentences[-1]
        loaded_phrase_tokens = model_load.transform(doc)

        # comparing the original and loaded models
        original_phrase_tokens = self.model.transform(doc)
        self.assertEqual(original_phrase_tokens, loaded_phrase_tokens)

    def testModelNotFitted(self):
        phrases_transformer = PhrasesTransformer()
        self.assertRaises(NotFittedError, phrases_transformer.transform, phrases_sentences[0])


# specifically test pluggable scoring in Phrases, because of possible pickling issues with a function parameter

# this is intentionally in main rather than a class method, to support pickling
# all scores will be 1
def dumb_scorer(worda_count, wordb_count, bigram_count, len_vocab, min_count, corpus_word_count):
    return 1


class TestPhrasesTransformerCustomScorer(unittest.TestCase):
    def setUp(self):
        numpy.random.seed(0)
        self.model = PhrasesTransformer(min_count=1, threshold=.9, scoring=dumb_scorer)
        self.model.fit(phrases_sentences)

    def testTransform(self):
        # transform one document
        doc = phrases_sentences[-1]
        phrase_tokens = self.model.transform(doc)[0]
        expected_phrase_tokens = [u'graph_minors', u'survey_human', u'interface']
        self.assertEqual(phrase_tokens, expected_phrase_tokens)

    def testPartialFit(self):
        new_sentences = [
            ['world', 'peace', 'humans', 'world', 'peace', 'world', 'peace', 'people'],
            ['world', 'peace', 'people'],
            ['world', 'peace', 'humans']
        ]
        self.model.partial_fit(X=new_sentences)  # train model with new sentences

        doc = ['graph', 'minors', 'survey', 'human', 'interface', 'world', 'peace']
        phrase_tokens = self.model.transform(doc)[0]
        expected_phrase_tokens = [u'graph_minors', u'survey_human', u'interface', u'world_peace']
        self.assertEqual(phrase_tokens, expected_phrase_tokens)

    def testSetGetParams(self):
        # updating only one param
        self.model.set_params(progress_per=5000)
        model_params = self.model.get_params()
        self.assertEqual(model_params["progress_per"], 5000)
        # verify that the attributes values are also changed for `gensim_model` after fitting
        self.model.fit(phrases_sentences)
        self.assertEqual(getattr(self.model.gensim_model, 'progress_per'), 5000)

    def testPersistence(self):
        model_dump = pickle.dumps(self.model)
        model_load = pickle.loads(model_dump)

        doc = phrases_sentences[-1]
        loaded_phrase_tokens = model_load.transform(doc)

        # comparing the original and loaded models
        original_phrase_tokens = self.model.transform(doc)
        self.assertEqual(original_phrase_tokens, loaded_phrase_tokens)

    def testModelNotFitted(self):
        phrases_transformer = PhrasesTransformer()
        self.assertRaises(NotFittedError, phrases_transformer.transform, phrases_sentences[0])


if __name__ == '__main__':
    unittest.main()