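"""Unit tests for gensim's scikit-learn API wrappers (``gensim.sklearn_api``).

Each test case below exercises one transformer against the usual sklearn
contract: ``fit``/``transform`` output shapes, ``partial_fit``,
``get_params``/``set_params``, ``Pipeline`` compatibility, pickling
round-trips, and raising ``NotFittedError`` on unfitted models.
"""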
import unittest
import codecs
import pickle

import numpy
from scipy import sparse

try:
    from sklearn.pipeline import Pipeline
    from sklearn import linear_model, cluster
    from sklearn.exceptions import NotFittedError
except ImportError:
    raise unittest.SkipTest("Test requires scikit-learn to be installed, which is not available")

from gensim.sklearn_api.rpmodel import RpTransformer
from gensim.sklearn_api.ldamodel import LdaTransformer
from gensim.sklearn_api.lsimodel import LsiTransformer
from gensim.sklearn_api.ldaseqmodel import LdaSeqTransformer
from gensim.sklearn_api.w2vmodel import W2VTransformer
from gensim.sklearn_api.atmodel import AuthorTopicTransformer
from gensim.sklearn_api.d2vmodel import D2VTransformer
from gensim.sklearn_api.text2bow import Text2BowTransformer
from gensim.sklearn_api.tfidf import TfIdfTransformer
from gensim.sklearn_api.hdp import HdpTransformer
from gensim.sklearn_api.phrases import PhrasesTransformer
from gensim.corpora import mmcorpus, Dictionary
from gensim import matutils, models
from gensim.test.utils import datapath, common_texts

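# Shared fixtures: a tiny bag-of-words corpus plus author-to-document mappings
# used by most of the test cases below.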
texts = [
    ['complier', 'system', 'computer'],
    ['eulerian', 'node', 'cycle', 'graph', 'tree', 'path'],
    ['graph', 'flow', 'network', 'graph'],
    ['loading', 'computer', 'system'],
    ['user', 'server', 'system'],
    ['tree', 'hamiltonian'],
    ['graph', 'trees'],
    ['computer', 'kernel', 'malfunction', 'computer'],
    ['server', 'system', 'computer'],
]
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
author2doc = {
    'john': [0, 1, 2, 3, 4, 5, 6],
    'jane': [2, 3, 4, 5, 6, 7, 8],
    'jack': [0, 2, 4, 6, 8],
    'jill': [1, 3, 5, 7]
}

texts_new = texts[0:3]
author2doc_new = {
    'jill': [0],
    'bob': [0, 1],
    'sally': [1, 2]
}
dictionary_new = Dictionary(texts_new)
corpus_new = [dictionary_new.doc2bow(text) for text in texts_new]

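# Larger corpus for the LdaSeqTransformer tests; setUp() below splits its 31
# documents into three time slices via time_slice=[10, 10, 11].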
texts_ldaseq = [
    [
        u'senior', u'studios', u'studios', u'studios', u'creators', u'award', u'mobile', u'currently',
        u'challenges', u'senior', u'summary', u'senior', u'motivated', u'creative', u'senior'
    ],
    [
        u'performs', u'engineering', u'tasks', u'infrastructure', u'focusing', u'primarily', u'programming',
        u'interaction', u'designers', u'engineers', u'leadership', u'teams', u'teams', u'crews', u'responsibilities',
        u'engineering', u'quality', u'functional', u'functional', u'teams', u'organizing', u'prioritizing',
        u'technical', u'decisions', u'engineering', u'participates', u'participates', u'reviews', u'participates',
        u'hiring', u'conducting', u'interviews'
    ],
    [
        u'feedback', u'departments', u'define', u'focusing', u'engineering', u'teams', u'crews', u'facilitate',
        u'engineering', u'departments', u'deadlines', u'milestones', u'typically', u'spends', u'designing',
        u'developing', u'updating', u'bugs', u'mentoring', u'engineers', u'define', u'schedules', u'milestones',
        u'participating'
    ],
    [
        u'reviews', u'interviews', u'sized', u'teams', u'interacts', u'disciplines', u'knowledge', u'skills',
        u'knowledge', u'knowledge', u'xcode', u'scripting', u'debugging', u'skills', u'skills', u'knowledge',
        u'disciplines', u'animation', u'networking', u'expertise', u'competencies', u'oral', u'skills',
        u'management', u'skills', u'proven', u'effectively', u'teams', u'deadline', u'environment', u'bachelor',
        u'minimum', u'shipped', u'leadership', u'teams', u'location', u'resumes', u'jobs', u'candidates',
        u'openings', u'jobs'
    ],
    [
        u'maryland', u'client', u'producers', u'electricity', u'operates', u'storage', u'utility', u'retail',
        u'customers', u'engineering', u'consultant', u'maryland', u'summary', u'technical', u'technology',
        u'departments', u'expertise', u'maximizing', u'output', u'reduces', u'operating', u'participates',
        u'areas', u'engineering', u'conducts', u'testing', u'solve', u'supports', u'environmental', u'understands',
        u'objectives', u'operates', u'responsibilities', u'handles', u'complex', u'engineering', u'aspects',
        u'monitors', u'quality', u'proficiency', u'optimization', u'recommendations', u'supports', u'personnel',
        u'troubleshooting', u'commissioning', u'startup', u'shutdown', u'supports', u'procedure', u'operating',
        u'units', u'develops', u'simulations', u'troubleshooting', u'tests', u'enhancing', u'solving', u'develops',
        u'estimates', u'schedules', u'scopes', u'understands', u'technical', u'management', u'utilize', u'routine',
        u'conducts', u'hazards', u'utilizing', u'hazard', u'operability', u'methodologies', u'participates',
        u'startup', u'reviews', u'pssr', u'participate', u'teams', u'participate', u'regulatory', u'audits',
        u'define', u'scopes', u'budgets', u'schedules', u'technical', u'management', u'environmental', u'awareness',
        u'interfacing', u'personnel', u'interacts', u'regulatory', u'departments', u'input', u'objectives',
        u'identifying', u'introducing', u'concepts', u'solutions', u'peers', u'customers', u'coworkers', u'knowledge',
        u'skills', u'engineering', u'quality', u'engineering'
    ],
    [
        u'commissioning', u'startup', u'knowledge', u'simulators', u'technologies', u'knowledge', u'engineering',
        u'techniques', u'disciplines', u'leadership', u'skills', u'proven', u'engineers', u'oral', u'skills',
        u'technical', u'skills', u'analytically', u'solve', u'complex', u'interpret', u'proficiency', u'simulation',
        u'knowledge', u'applications', u'manipulate', u'applications', u'engineering'
    ],
    [
        u'calculations', u'programs', u'matlab', u'excel', u'independently', u'environment', u'proven', u'skills',
        u'effectively', u'multiple', u'tasks', u'planning', u'organizational', u'management', u'skills', u'rigzone',
        u'jobs', u'developer', u'exceptional', u'strategies', u'junction', u'exceptional', u'strategies', u'solutions',
        u'solutions', u'biggest', u'insurers', u'operates', u'investment'
    ],
    [
        u'vegas', u'tasks', u'electrical', u'contracting', u'expertise', u'virtually', u'electrical', u'developments',
        u'institutional', u'utilities', u'technical', u'experts', u'relationships', u'credibility', u'contractors',
        u'utility', u'customers', u'customer', u'relationships', u'consistently', u'innovations', u'profile',
        u'construct', u'envision', u'dynamic', u'complex', u'electrical', u'management', u'grad', u'internship',
        u'electrical', u'engineering', u'infrastructures', u'engineers', u'documented', u'management', u'engineering',
        u'quality', u'engineering', u'electrical', u'engineers', u'complex', u'distribution', u'grounding',
        u'estimation', u'testing', u'procedures', u'voltage', u'engineering'
    ],
    [
        u'troubleshooting', u'installation', u'documentation', u'bsee', u'certification', u'electrical', u'voltage',
        u'cabling', u'electrical', u'engineering', u'candidates', u'electrical', u'internships', u'oral', u'skills',
        u'organizational', u'prioritization', u'skills', u'skills', u'excel', u'cadd', u'calculation', u'autocad',
        u'mathcad', u'skills', u'skills', u'customer', u'relationships', u'solving', u'ethic', u'motivation', u'tasks',
        u'budget', u'affirmative', u'diversity', u'workforce', u'gender', u'orientation', u'disability', u'disabled',
        u'veteran', u'vietnam', u'veteran', u'qualifying', u'veteran', u'diverse', u'candidates', u'respond',
        u'developing', u'workplace', u'reflects', u'diversity', u'communities', u'reviews', u'electrical',
        u'contracting', u'southwest', u'electrical', u'contractors'
    ],
    [
        u'intern', u'electrical', u'engineering', u'idexx', u'laboratories', u'validating', u'idexx', u'integrated',
        u'hardware', u'entails', u'planning', u'debug', u'validation', u'engineers', u'validation', u'methodologies',
        u'healthcare', u'platforms', u'brightest', u'solve', u'challenges', u'innovation', u'technology', u'idexx',
        u'intern', u'idexx', u'interns', u'supplement', u'interns', u'teams', u'roles', u'competitive', u'interns',
        u'idexx', u'interns', u'participate', u'internships', u'mentors', u'seminars', u'topics', u'leadership',
        u'workshops', u'relevant', u'planning', u'topics', u'intern', u'presentations', u'mixers', u'applicants',
        u'ineligible', u'laboratory', u'compliant', u'idexx', u'laboratories', u'healthcare', u'innovation',
        u'practicing', u'veterinarians', u'diagnostic', u'technology', u'idexx', u'enhance', u'veterinarians',
        u'efficiency', u'economically', u'idexx', u'worldwide', u'diagnostic', u'tests', u'tests', u'quality',
        u'headquartered', u'idexx', u'laboratories', u'employs', u'customers', u'qualifications', u'applicants',
        u'idexx', u'interns', u'potential', u'demonstrated', u'portfolio', u'recommendation', u'resumes', u'marketing',
        u'location', u'americas', u'verification', u'validation', u'schedule', u'overtime', u'idexx', u'laboratories',
        u'reviews', u'idexx', u'laboratories', u'nasdaq', u'healthcare', u'innovation', u'practicing', u'veterinarians'
    ],
    [
        u'location', u'duration', u'temp', u'verification', u'validation', u'tester', u'verification', u'validation',
        u'middleware', u'specifically', u'testing', u'applications', u'clinical', u'laboratory', u'regulated',
        u'environment', u'responsibilities', u'complex', u'hardware', u'testing', u'clinical', u'analyzers',
        u'laboratory', u'graphical', u'interfaces', u'complex', u'sample', u'sequencing', u'protocols', u'developers',
        u'correction', u'tracking', u'tool', u'timely', u'troubleshoot', u'testing', u'functional', u'manual',
        u'automated', u'participate', u'ongoing'
    ],
    [
        u'testing', u'coverage', u'planning', u'documentation', u'testing', u'validation', u'corrections', u'monitor',
        u'implementation', u'recurrence', u'operating', u'statistical', u'quality', u'testing', u'global', u'multi',
        u'teams', u'travel', u'skills', u'concepts', u'waterfall', u'agile', u'methodologies', u'debugging', u'skills',
        u'complex', u'automated', u'instrumentation', u'environment', u'hardware', u'mechanical', u'components',
        u'tracking', u'lifecycle', u'management', u'quality', u'organize', u'define', u'priorities', u'organize',
        u'supervision', u'aggressive', u'deadlines', u'ambiguity', u'analyze', u'complex', u'situations', u'concepts',
        u'technologies', u'verbal', u'skills', u'effectively', u'technical', u'clinical', u'diverse', u'strategy',
        u'clinical', u'chemistry', u'analyzer', u'laboratory', u'middleware', u'basic', u'automated', u'testing',
        u'biomedical', u'engineering', u'technologists', u'laboratory', u'technology', u'availability', u'click',
        u'attach'
    ],
    [
        u'scientist', u'linux', u'asrc', u'scientist', u'linux', u'asrc', u'technology', u'solutions', u'subsidiary',
        u'asrc', u'engineering', u'technology', u'contracts'
    ],
    [
        u'multiple', u'agencies', u'scientists', u'engineers', u'management', u'personnel', u'allows', u'solutions',
        u'complex', u'aeronautics', u'aviation', u'management', u'aviation', u'engineering', u'hughes', u'technical',
        u'technical', u'aviation', u'evaluation', u'engineering', u'management', u'technical', u'terminal',
        u'surveillance', u'programs', u'currently', u'scientist', u'travel', u'responsibilities', u'develops',
        u'technology', u'modifies', u'technical', u'complex', u'reviews', u'draft', u'conformity', u'completeness',
        u'testing', u'interface', u'hardware', u'regression', u'impact', u'reliability', u'maintainability',
        u'factors', u'standardization', u'skills', u'travel', u'programming', u'linux', u'environment', u'cisco',
        u'knowledge', u'terminal', u'environment', u'clearance', u'clearance', u'input', u'output', u'digital',
        u'automatic', u'terminal', u'management', u'controller', u'termination', u'testing', u'evaluating', u'policies',
        u'procedure', u'interface', u'installation', u'verification', u'certification', u'core', u'avionic',
        u'programs', u'knowledge', u'procedural', u'testing', u'interfacing', u'hardware', u'regression', u'impact',
        u'reliability', u'maintainability', u'factors', u'standardization', u'missions', u'asrc', u'subsidiaries',
        u'affirmative', u'employers', u'applicants', u'disability', u'veteran', u'technology', u'location', u'airport',
        u'bachelor', u'schedule', u'travel', u'contributor', u'management', u'asrc', u'reviews'
    ],
    [
        u'technical', u'solarcity', u'niche', u'vegas', u'overview', u'resolving', u'customer', u'clients',
        u'expanding', u'engineers', u'developers', u'responsibilities', u'knowledge', u'planning', u'adapt',
        u'dynamic', u'environment', u'inventive', u'creative', u'solarcity', u'lifecycle', u'responsibilities',
        u'technical', u'analyzing', u'diagnosing', u'troubleshooting', u'customers', u'ticketing', u'console',
        u'escalate', u'knowledge', u'engineering', u'timely', u'basic', u'phone', u'functionality', u'customer',
        u'tracking', u'knowledgebase', u'rotation', u'configure', u'deployment', u'sccm', u'technical', u'deployment',
        u'deploy', u'hardware', u'solarcity', u'bachelor', u'knowledge', u'dell', u'laptops', u'analytical',
        u'troubleshooting', u'solving', u'skills', u'knowledge', u'databases', u'preferably', u'server', u'preferably',
        u'monitoring', u'suites', u'documentation', u'procedures', u'knowledge', u'entries', u'verbal', u'skills',
        u'customer', u'skills', u'competitive', u'solar', u'package', u'insurance', u'vacation', u'savings',
        u'referral', u'eligibility', u'equity', u'performers', u'solarcity', u'affirmative', u'diversity', u'workplace',
        u'applicants', u'orientation', u'disability', u'veteran', u'careerrookie'
    ],
    [
        u'embedded', u'exelis', u'junction', u'exelis', u'embedded', u'acquisition', u'networking', u'capabilities',
        u'classified', u'customer', u'motivated', u'develops', u'tests', u'innovative', u'solutions', u'minimal',
        u'supervision', u'paced', u'environment', u'enjoys', u'assignments', u'interact', u'multi', u'disciplined',
        u'challenging', u'focused', u'embedded', u'developments', u'spanning', u'engineering', u'lifecycle',
        u'specification', u'enhancement', u'applications', u'embedded', u'freescale', u'applications', u'android',
        u'platforms', u'interface', u'customers', u'developers', u'refine', u'specifications', u'architectures'
    ],
    [
        u'java', u'programming', u'scripts', u'python', u'debug', u'debugging', u'emulators', u'regression',
        u'revisions', u'specialized', u'setups', u'capabilities', u'subversion', u'technical', u'documentation',
        u'multiple', u'engineering', u'techexpousa', u'reviews'
    ],
    [
        u'modeler', u'semantic', u'modeling', u'models', u'skills', u'ontology', u'resource', u'framework', u'schema',
        u'technologies', u'hadoop', u'warehouse', u'oracle', u'relational', u'artifacts', u'models', u'dictionaries',
        u'models', u'interface', u'specifications', u'documentation', u'harmonization', u'mappings', u'aligned',
        u'coordinate', u'technical', u'peer', u'reviews', u'stakeholder', u'communities', u'impact', u'domains',
        u'relationships', u'interdependencies', u'models', u'define', u'analyze', u'legacy', u'models', u'corporate',
        u'databases', u'architectural', u'alignment', u'customer', u'expertise', u'harmonization', u'modeling',
        u'modeling', u'consulting', u'stakeholders', u'quality', u'models', u'storage', u'agile', u'specifically',
        u'focus', u'modeling', u'qualifications', u'bachelors', u'accredited', u'modeler', u'encompass', u'evaluation',
        u'skills', u'knowledge', u'modeling', u'techniques', u'resource', u'framework', u'schema', u'technologies',
        u'unified', u'modeling', u'technologies', u'schemas', u'ontologies', u'sybase', u'knowledge', u'skills',
        u'interpersonal', u'skills', u'customers', u'clearance', u'applicants', u'eligibility', u'classified',
        u'clearance', u'polygraph', u'techexpousa', u'solutions', u'partnership', u'solutions', u'integration'
    ],
    [
        u'technologies', u'junction', u'develops', u'maintains', u'enhances', u'complex', u'diverse', u'intensive',
        u'analytics', u'algorithm', u'manipulation', u'management', u'documented', u'individually', u'reviews',
        u'tests', u'components', u'adherence', u'resolves', u'utilizes', u'methodologies', u'environment', u'input',
        u'components', u'hardware', u'offs', u'reuse', u'cots', u'gots', u'synthesis', u'components', u'tasks',
        u'individually', u'analyzes', u'modifies', u'debugs', u'corrects', u'integrates', u'operating',
        u'environments', u'develops', u'queries', u'databases', u'repositories', u'recommendations', u'improving',
        u'documentation', u'develops', u'implements', u'algorithms', u'functional', u'assists', u'developing',
        u'executing', u'procedures', u'components', u'reviews', u'documentation', u'solutions', u'analyzing',
        u'conferring', u'users', u'engineers', u'analyzing', u'investigating', u'areas', u'adapt', u'hardware',
        u'mathematical', u'models', u'predict', u'outcome', u'implement', u'complex', u'database', u'repository',
        u'interfaces', u'queries', u'bachelors', u'accredited', u'substituted', u'bachelors', u'firewalls',
        u'ipsec', u'vpns', u'technology', u'administering', u'servers', u'apache', u'jboss', u'tomcat',
        u'developing', u'interfaces', u'firefox', u'internet', u'explorer', u'operating', u'mainframe',
        u'linux', u'solaris', u'virtual', u'scripting', u'programming', u'oriented', u'programming', u'ajax',
        u'script', u'procedures', u'cobol', u'cognos', u'fusion', u'focus', u'html', u'java', u'java', u'script',
        u'jquery', u'perl', u'visual', u'basic', u'powershell', u'cots', u'cots', u'oracle', u'apex', u'integration',
        u'competitive', u'package', u'bonus', u'corporate', u'equity', u'tuition', u'reimbursement', u'referral',
        u'bonus', u'holidays', u'insurance', u'flexible', u'disability', u'insurance'
    ],
    [u'technologies', u'disability', u'accommodation', u'recruiter', u'techexpousa'],
    ['bank', 'river', 'shore', 'water'],
    ['river', 'water', 'flow', 'fast', 'tree'],
    ['bank', 'water', 'fall', 'flow'],
    ['bank', 'bank', 'water', 'rain', 'river'],
    ['river', 'water', 'mud', 'tree'],
    ['money', 'transaction', 'bank', 'finance'],
    ['bank', 'borrow', 'money'],
    ['bank', 'finance'],
    ['finance', 'money', 'sell', 'bank'],
    ['borrow', 'sell'],
    ['bank', 'loan', 'sell']
]
dictionary_ldaseq = Dictionary(texts_ldaseq)
corpus_ldaseq = [dictionary_ldaseq.doc2bow(text) for text in texts_ldaseq]

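# Plain-text sentences used by the Word2Vec, Doc2Vec, and Phrases tests.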
w2v_texts = [
    ['calculus', 'is', 'the', 'mathematical', 'study', 'of', 'continuous', 'change'],
    ['geometry', 'is', 'the', 'study', 'of', 'shape'],
    ['algebra', 'is', 'the', 'study', 'of', 'generalizations', 'of', 'arithmetic', 'operations'],
    ['differential', 'calculus', 'is', 'related', 'to', 'rates', 'of', 'change', 'and', 'slopes', 'of', 'curves'],
    ['integral', 'calculus', 'is', 'realted', 'to', 'accumulation', 'of', 'quantities', 'and',
     'the', 'areas', 'under', 'and', 'between', 'curves'],
    ['physics', 'is', 'the', 'natural', 'science', 'that', 'involves', 'the', 'study', 'of', 'matter',
     'and', 'its', 'motion', 'and', 'behavior', 'through', 'space', 'and', 'time'],
    ['the', 'main', 'goal', 'of', 'physics', 'is', 'to', 'understand', 'how', 'the', 'universe', 'behaves'],
    ['physics', 'also', 'makes', 'significant', 'contributions', 'through', 'advances', 'in', 'new',
     'technologies', 'that', 'arise', 'from', 'theoretical', 'breakthroughs'],
    ['advances', 'in', 'the', 'understanding', 'of', 'electromagnetism', 'or', 'nuclear', 'physics',
     'led', 'directly', 'to', 'the', 'development', 'of', 'new', 'products', 'that', 'have', 'dramatically',
     'transformed', 'modern', 'day', 'society']
]

d2v_sentences = [models.doc2vec.TaggedDocument(words, [i]) for i, words in enumerate(w2v_texts)]

dict_texts = [' '.join(text) for text in common_texts]

phrases_sentences = common_texts + [
    ['graph', 'minors', 'survey', 'human', 'interface']
]


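# A minimal usage sketch of the wrapper API exercised below (illustrative only;
# the parameter values here are arbitrary):
#
#   model = LdaTransformer(id2word=dictionary, num_topics=2)
#   model.fit(corpus)                   # corpus: iterable of BoW documents
#   topics = model.transform(bow_docs)  # 2-D numpy array, one row per document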
class TestLdaWrapper(unittest.TestCase):
    def setUp(self):
        numpy.random.seed(0)  # set a fixed seed so results are reproducible every time
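        # NB: numpy.random.seed(0) returns None, so random_state below is
        # effectively None; determinism comes from the global seed set above.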
        self.model = LdaTransformer(
            id2word=dictionary, num_topics=2, passes=100, minimum_probability=0, random_state=numpy.random.seed(0)
        )
        self.model.fit(corpus)

    def testTransform(self):
        texts_new = ['graph', 'eulerian']
        bow = self.model.id2word.doc2bow(texts_new)
        matrix = self.model.transform(bow)
        self.assertEqual(matrix.shape[0], 1)
        self.assertEqual(matrix.shape[1], self.model.num_topics)
        texts_new = [['graph', 'eulerian'], ['server', 'flow'], ['path', 'system']]
        bow = []
        for i in texts_new:
            bow.append(self.model.id2word.doc2bow(i))
        matrix = self.model.transform(bow)
        self.assertEqual(matrix.shape[0], 3)
        self.assertEqual(matrix.shape[1], self.model.num_topics)

    def testPartialFit(self):
        for i in range(10):
            self.model.partial_fit(X=corpus)  # update the model with the same corpus again
        doc = list(corpus)[0]  # transform only the first document
        transformed = self.model.transform(doc)
        expected = numpy.array([0.13, 0.87])
        passed = numpy.allclose(sorted(transformed[0]), sorted(expected), atol=1e-1)
        self.assertTrue(passed)

    def testConsistencyWithGensimModel(self):
        # training an LdaTransformer with `num_topics`=10
        self.model = LdaTransformer(
            id2word=dictionary, num_topics=10, passes=100, minimum_probability=0, random_state=numpy.random.seed(0)
        )
        self.model.fit(corpus)

        # training a Gensim LdaModel with the same params
        gensim_ldamodel = models.LdaModel(
            corpus=corpus, id2word=dictionary, num_topics=10, passes=100,
            minimum_probability=0, random_state=numpy.random.seed(0)
        )

        texts_new = ['graph', 'eulerian']
        bow = self.model.id2word.doc2bow(texts_new)
        matrix_transformer_api = self.model.transform(bow)
        matrix_gensim_model = gensim_ldamodel[bow]
        # convert into a dense representation to be able to compare with the transformer output
        matrix_gensim_model_dense = matutils.sparse2full(matrix_gensim_model, 10)
        passed = numpy.allclose(matrix_transformer_api, matrix_gensim_model_dense, atol=1e-1)
        self.assertTrue(passed)

    def testCSRMatrixConversion(self):
        numpy.random.seed(0)  # set a fixed seed so results are reproducible every time
        arr = numpy.array([[1, 2, 0], [0, 0, 3], [1, 0, 0]])
        sarr = sparse.csr_matrix(arr)
        newmodel = LdaTransformer(num_topics=2, passes=100)
        newmodel.fit(sarr)
        bow = [(0, 1), (1, 2), (2, 0)]
        transformed_vec = newmodel.transform(bow)
        expected_vec = numpy.array([0.12843782, 0.87156218])
        passed = numpy.allclose(transformed_vec, expected_vec, atol=1e-1)
        self.assertTrue(passed)

    def testPipeline(self):
        model = LdaTransformer(num_topics=2, passes=10, minimum_probability=0, random_state=numpy.random.seed(0))
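        # 'mini_newsgroup' is a zlib-compressed pickle holding a small corpus
        # with sklearn-style `.data` and `.target` attributes.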
        with open(datapath('mini_newsgroup'), 'rb') as f:
            compressed_content = f.read()
            uncompressed_content = codecs.decode(compressed_content, 'zlib_codec')
            cache = pickle.loads(uncompressed_content)
        data = cache
        id2word = Dictionary([x.split() for x in data.data])
        corpus = [id2word.doc2bow(i.split()) for i in data.data]
        numpy.random.mtrand.RandomState(1)  # NB: creates a RandomState but does not reseed the global generator
        clf = linear_model.LogisticRegression(penalty='l2', C=0.1)
        text_lda = Pipeline([('features', model,), ('classifier', clf)])
        text_lda.fit(corpus, data.target)
        score = text_lda.score(corpus, data.target)
        self.assertGreaterEqual(score, 0.40)

    def testSetGetParams(self):
        # updating only one param
        self.model.set_params(num_topics=3)
        model_params = self.model.get_params()
        self.assertEqual(model_params["num_topics"], 3)
        # verify that the attribute values are also changed for `gensim_model` after fitting
        self.model.fit(corpus)
        self.assertEqual(getattr(self.model.gensim_model, 'num_topics'), 3)

        # updating multiple params
        param_dict = {"eval_every": 20, "decay": 0.7}
        self.model.set_params(**param_dict)
        model_params = self.model.get_params()
        for key in param_dict.keys():
            self.assertEqual(model_params[key], param_dict[key])
        # verify that the attribute values are also changed for `gensim_model` after fitting
        self.model.fit(corpus)
        self.assertEqual(getattr(self.model.gensim_model, 'eval_every'), 20)
        self.assertEqual(getattr(self.model.gensim_model, 'decay'), 0.7)

    def testPersistence(self):
        model_dump = pickle.dumps(self.model)
        model_load = pickle.loads(model_dump)

        texts_new = ['graph', 'eulerian']
        loaded_bow = model_load.id2word.doc2bow(texts_new)
        loaded_matrix = model_load.transform(loaded_bow)

        # sanity check for the transformation operation
        self.assertEqual(loaded_matrix.shape[0], 1)
        self.assertEqual(loaded_matrix.shape[1], model_load.num_topics)

        # comparing the original and loaded models
        original_bow = self.model.id2word.doc2bow(texts_new)
        original_matrix = self.model.transform(original_bow)
        passed = numpy.allclose(loaded_matrix, original_matrix, atol=1e-1)
        self.assertTrue(passed)

    def testModelNotFitted(self):
        lda_wrapper = LdaTransformer(
            id2word=dictionary, num_topics=2, passes=100,
            minimum_probability=0, random_state=numpy.random.seed(0)
        )
        texts_new = ['graph', 'eulerian']
        bow = lda_wrapper.id2word.doc2bow(texts_new)
        self.assertRaises(NotFittedError, lda_wrapper.transform, bow)


class TestLsiWrapper(unittest.TestCase):
    def setUp(self):
        numpy.random.seed(0)  # set a fixed seed so results are reproducible every time
        self.model = LsiTransformer(id2word=dictionary, num_topics=2)
        self.model.fit(corpus)

    def testTransform(self):
        texts_new = ['graph', 'eulerian']
        bow = self.model.id2word.doc2bow(texts_new)
        matrix = self.model.transform(bow)
        self.assertEqual(matrix.shape[0], 1)
        self.assertEqual(matrix.shape[1], self.model.num_topics)
        texts_new = [['graph', 'eulerian'], ['server', 'flow'], ['path', 'system']]
        bow = []
        for i in texts_new:
            bow.append(self.model.id2word.doc2bow(i))
        matrix = self.model.transform(bow)
        self.assertEqual(matrix.shape[0], 3)
        self.assertEqual(matrix.shape[1], self.model.num_topics)

    def testPartialFit(self):
        for i in range(10):
            self.model.partial_fit(X=corpus)  # update the model with the same corpus again
        doc = list(corpus)[0]  # transform only the first document
        transformed = self.model.transform(doc)
        expected = numpy.array([1.39, 0.0])
        passed = numpy.allclose(transformed[0], expected, atol=1)
        self.assertTrue(passed)

    def testPipeline(self):
        model = LsiTransformer(num_topics=2)
        with open(datapath('mini_newsgroup'), 'rb') as f:
            compressed_content = f.read()
            uncompressed_content = codecs.decode(compressed_content, 'zlib_codec')
            cache = pickle.loads(uncompressed_content)
        data = cache
        id2word = Dictionary([x.split() for x in data.data])
        corpus = [id2word.doc2bow(i.split()) for i in data.data]
        numpy.random.mtrand.RandomState(1)  # NB: creates a RandomState but does not reseed the global generator
        clf = linear_model.LogisticRegression(penalty='l2', C=0.1)
        text_lsi = Pipeline([('features', model,), ('classifier', clf)])
        text_lsi.fit(corpus, data.target)
        score = text_lsi.score(corpus, data.target)
        self.assertGreater(score, 0.50)

    def testSetGetParams(self):
        # updating only one param
        self.model.set_params(num_topics=3)
        model_params = self.model.get_params()
        self.assertEqual(model_params["num_topics"], 3)
        # verify that the attribute values are also changed for `gensim_model` after fitting
        self.model.fit(corpus)
        self.assertEqual(getattr(self.model.gensim_model, 'num_topics'), 3)

        # updating multiple params
        param_dict = {"chunksize": 10000, "decay": 0.9}
        self.model.set_params(**param_dict)
        model_params = self.model.get_params()
        for key in param_dict.keys():
            self.assertEqual(model_params[key], param_dict[key])
        # verify that the attribute values are also changed for `gensim_model` after fitting
        self.model.fit(corpus)
        self.assertEqual(getattr(self.model.gensim_model, 'chunksize'), 10000)
        self.assertEqual(getattr(self.model.gensim_model, 'decay'), 0.9)

    def testPersistence(self):
        model_dump = pickle.dumps(self.model)
        model_load = pickle.loads(model_dump)

        texts_new = ['graph', 'eulerian']
        loaded_bow = model_load.id2word.doc2bow(texts_new)
        loaded_matrix = model_load.transform(loaded_bow)

        # sanity check for the transformation operation
        self.assertEqual(loaded_matrix.shape[0], 1)
        self.assertEqual(loaded_matrix.shape[1], model_load.num_topics)

        # comparing the original and loaded models
        original_bow = self.model.id2word.doc2bow(texts_new)
        original_matrix = self.model.transform(original_bow)
        passed = numpy.allclose(loaded_matrix, original_matrix, atol=1e-1)
        self.assertTrue(passed)

    def testModelNotFitted(self):
        lsi_wrapper = LsiTransformer(id2word=dictionary, num_topics=2)
        texts_new = ['graph', 'eulerian']
        bow = lsi_wrapper.id2word.doc2bow(texts_new)
        self.assertRaises(NotFittedError, lsi_wrapper.transform, bow)


class TestLdaSeqWrapper(unittest.TestCase):
    def setUp(self):
        self.model = LdaSeqTransformer(
            id2word=dictionary_ldaseq, num_topics=2, time_slice=[10, 10, 11], initialize='gensim'
        )
        self.model.fit(corpus_ldaseq)

    def testTransform(self):
        # transforming two documents
        docs = [list(corpus_ldaseq)[0], list(corpus_ldaseq)[1]]
        transformed_vecs = self.model.transform(docs)
        self.assertEqual(transformed_vecs.shape[0], 2)
        self.assertEqual(transformed_vecs.shape[1], self.model.num_topics)

        # transforming one document
        doc = list(corpus_ldaseq)[0]
        transformed_vecs = self.model.transform(doc)
        self.assertEqual(transformed_vecs.shape[0], 1)
        self.assertEqual(transformed_vecs.shape[1], self.model.num_topics)

    def testPipeline(self):
        numpy.random.seed(0)  # set a fixed seed so results are reproducible every time
        with open(datapath('mini_newsgroup'), 'rb') as f:
            compressed_content = f.read()
            uncompressed_content = codecs.decode(compressed_content, 'zlib_codec')
            cache = pickle.loads(uncompressed_content)
        data = cache
        test_data = data.data[0:2]
        test_target = data.target[0:2]
        id2word = Dictionary([x.split() for x in test_data])
        corpus = [id2word.doc2bow(i.split()) for i in test_data]
        model = LdaSeqTransformer(id2word=id2word, num_topics=2, time_slice=[1, 1, 1], initialize='gensim')
        clf = linear_model.LogisticRegression(penalty='l2', C=0.1)
        text_ldaseq = Pipeline([('features', model,), ('classifier', clf)])
        text_ldaseq.fit(corpus, test_target)
        score = text_ldaseq.score(corpus, test_target)
        self.assertGreater(score, 0.50)

    def testSetGetParams(self):
        # updating only one param
        self.model.set_params(num_topics=3)
        model_params = self.model.get_params()
        self.assertEqual(model_params["num_topics"], 3)
        # verify that the attribute values are also changed for `gensim_model` after fitting
        self.model.fit(corpus_ldaseq)
        self.assertEqual(getattr(self.model.gensim_model, 'num_topics'), 3)

    def testPersistence(self):
        model_dump = pickle.dumps(self.model)
        model_load = pickle.loads(model_dump)

        doc = list(corpus_ldaseq)[0]
        loaded_transformed_vecs = model_load.transform(doc)

        # sanity check for the transformation operation
        self.assertEqual(loaded_transformed_vecs.shape[0], 1)
        self.assertEqual(loaded_transformed_vecs.shape[1], model_load.num_topics)

        # comparing the original and loaded models
        original_transformed_vecs = self.model.transform(doc)
        passed = numpy.allclose(loaded_transformed_vecs, original_transformed_vecs, atol=1e-1)
        self.assertTrue(passed)

    def testModelNotFitted(self):
        ldaseq_wrapper = LdaSeqTransformer(num_topics=2)
        doc = list(corpus_ldaseq)[0]
        self.assertRaises(NotFittedError, ldaseq_wrapper.transform, doc)


class TestRpWrapper(unittest.TestCase):
    def setUp(self):
        numpy.random.seed(13)
        self.model = RpTransformer(num_topics=2)
        self.corpus = mmcorpus.MmCorpus(datapath('testcorpus.mm'))
        self.model.fit(self.corpus)

    def testTransform(self):
        # transform two documents
        docs = [list(self.corpus)[0], list(self.corpus)[1]]
        matrix = self.model.transform(docs)
        self.assertEqual(matrix.shape[0], 2)
        self.assertEqual(matrix.shape[1], self.model.num_topics)

        # transform one document
        doc = list(self.corpus)[0]
        matrix = self.model.transform(doc)
        self.assertEqual(matrix.shape[0], 1)
        self.assertEqual(matrix.shape[1], self.model.num_topics)

    def testPipeline(self):
        numpy.random.seed(0)  # set a fixed seed so results are reproducible every time
        model = RpTransformer(num_topics=2)
        with open(datapath('mini_newsgroup'), 'rb') as f:
            compressed_content = f.read()
            uncompressed_content = codecs.decode(compressed_content, 'zlib_codec')
            cache = pickle.loads(uncompressed_content)
        data = cache
        id2word = Dictionary([x.split() for x in data.data])
        corpus = [id2word.doc2bow(i.split()) for i in data.data]
        numpy.random.mtrand.RandomState(1)  # NB: creates a RandomState but does not reseed the global generator
        clf = linear_model.LogisticRegression(penalty='l2', C=0.1)
        text_rp = Pipeline([('features', model,), ('classifier', clf)])
        text_rp.fit(corpus, data.target)
        score = text_rp.score(corpus, data.target)
        self.assertGreater(score, 0.40)

    def testSetGetParams(self):
        # updating only one param
        self.model.set_params(num_topics=3)
        model_params = self.model.get_params()
        self.assertEqual(model_params["num_topics"], 3)
        # verify that the attribute values are also changed for `gensim_model` after fitting
        self.model.fit(self.corpus)
        self.assertEqual(getattr(self.model.gensim_model, 'num_topics'), 3)

    def testPersistence(self):
        model_dump = pickle.dumps(self.model)
        model_load = pickle.loads(model_dump)

        doc = list(self.corpus)[0]
        loaded_transformed_vecs = model_load.transform(doc)

        # sanity check for the transformation operation
        self.assertEqual(loaded_transformed_vecs.shape[0], 1)
        self.assertEqual(loaded_transformed_vecs.shape[1], model_load.num_topics)

        # comparing the original and loaded models
        original_transformed_vecs = self.model.transform(doc)
        passed = numpy.allclose(loaded_transformed_vecs, original_transformed_vecs, atol=1e-1)
        self.assertTrue(passed)

    def testModelNotFitted(self):
        rpmodel_wrapper = RpTransformer(num_topics=2)
        doc = list(self.corpus)[0]
        self.assertRaises(NotFittedError, rpmodel_wrapper.transform, doc)


class TestWord2VecWrapper(unittest.TestCase):
    def setUp(self):
        numpy.random.seed(0)
        self.model = W2VTransformer(size=10, min_count=0, seed=42)
        self.model.fit(texts)

    def testTransform(self):
        # transform multiple words
        words = []
        words = words + texts[0]
        matrix = self.model.transform(words)
        self.assertEqual(matrix.shape[0], 3)
        self.assertEqual(matrix.shape[1], self.model.size)

        # transform one word
        word = texts[0][0]
        matrix = self.model.transform(word)
        self.assertEqual(matrix.shape[0], 1)
        self.assertEqual(matrix.shape[1], self.model.size)

    def testConsistencyWithGensimModel(self):
        # training a W2VTransformer
        self.model = W2VTransformer(size=10, min_count=0, seed=42)
        self.model.fit(texts)

        # training a Gensim Word2Vec model with the same params
        gensim_w2vmodel = models.Word2Vec(texts, size=10, min_count=0, seed=42)

        word = texts[0][0]
        vec_transformer_api = self.model.transform(word)  # vector returned by W2VTransformer
        vec_gensim_model = gensim_w2vmodel[word]  # vector returned by Word2Vec
        passed = numpy.allclose(vec_transformer_api, vec_gensim_model, atol=1e-1)
        self.assertTrue(passed)

    def testPipeline(self):
        numpy.random.seed(0)  # set a fixed seed so results are reproducible every time
        model = W2VTransformer(size=10, min_count=1)
        model.fit(w2v_texts)

        class_dict = {'mathematics': 1, 'physics': 0}
        train_data = [
            ('calculus', 'mathematics'), ('mathematical', 'mathematics'),
            ('geometry', 'mathematics'), ('operations', 'mathematics'),
            ('curves', 'mathematics'), ('natural', 'physics'), ('nuclear', 'physics'),
            ('science', 'physics'), ('electromagnetism', 'physics'), ('natural', 'physics')
        ]
        train_input = [x[0] for x in train_data]
        train_target = [class_dict[x[1]] for x in train_data]

        clf = linear_model.LogisticRegression(penalty='l2', C=0.1)
        clf.fit(model.transform(train_input), train_target)
        text_w2v = Pipeline([('features', model,), ('classifier', clf)])
        score = text_w2v.score(train_input, train_target)
        self.assertGreater(score, 0.40)

    def testSetGetParams(self):
        # updating only one param
        self.model.set_params(negative=20)
        model_params = self.model.get_params()
        self.assertEqual(model_params["negative"], 20)
        # verify that the attribute values are also changed for `gensim_model` after fitting
        self.model.fit(texts)
        self.assertEqual(getattr(self.model.gensim_model, 'negative'), 20)

    def testPersistence(self):
        model_dump = pickle.dumps(self.model)
        model_load = pickle.loads(model_dump)

        word = texts[0][0]
        loaded_transformed_vecs = model_load.transform(word)

        # sanity check for the transformation operation
        self.assertEqual(loaded_transformed_vecs.shape[0], 1)
        self.assertEqual(loaded_transformed_vecs.shape[1], model_load.size)

        # comparing the original and loaded models
        original_transformed_vecs = self.model.transform(word)
        passed = numpy.allclose(loaded_transformed_vecs, original_transformed_vecs, atol=1e-1)
        self.assertTrue(passed)

    def testModelNotFitted(self):
        w2vmodel_wrapper = W2VTransformer(size=10, min_count=0, seed=42)
        word = texts[0][0]
        self.assertRaises(NotFittedError, w2vmodel_wrapper.transform, word)


class TestAuthorTopicWrapper(unittest.TestCase):
    def setUp(self):
        self.model = AuthorTopicTransformer(id2word=dictionary, author2doc=author2doc, num_topics=2, passes=100)
        self.model.fit(corpus)

    def testTransform(self):
        # transforming multiple authors
        author_list = ['jill', 'jack']
        author_topics = self.model.transform(author_list)
        self.assertEqual(author_topics.shape[0], 2)
        self.assertEqual(author_topics.shape[1], self.model.num_topics)

        # transforming one author
        jill_topics = self.model.transform('jill')
        self.assertEqual(jill_topics.shape[0], 1)
        self.assertEqual(jill_topics.shape[1], self.model.num_topics)

    def testPartialFit(self):
        self.model.partial_fit(corpus_new, author2doc=author2doc_new)

        # did we learn something about Sally?
        output_topics = self.model.transform('sally')
        sally_topics = output_topics[0]  # getting the topics corresponding to 'sally' (from the list of lists)
        self.assertTrue(all(sally_topics > 0))

    def testPipeline(self):
        # train the AuthorTopic model first
        model = AuthorTopicTransformer(id2word=dictionary, author2doc=author2doc, num_topics=10, passes=100)
        model.fit(corpus)

        # create and train the clustering model
        clstr = cluster.MiniBatchKMeans(n_clusters=2)
        authors_full = ['john', 'jane', 'jack', 'jill']
        clstr.fit(model.transform(authors_full))

        # stack the two models together in a pipeline
        text_atm = Pipeline([('features', model,), ('cluster', clstr)])
        author_list = ['jane', 'jack', 'jill']
        ret_val = text_atm.predict(author_list)
        self.assertEqual(len(ret_val), len(author_list))

    def testSetGetParams(self):
        # updating only one param
        self.model.set_params(num_topics=3)
        model_params = self.model.get_params()
        self.assertEqual(model_params["num_topics"], 3)
        # verify that the attribute values are also changed for `gensim_model` after fitting
        self.model.fit(corpus)
        self.assertEqual(getattr(self.model.gensim_model, 'num_topics'), 3)

        # updating multiple params
        param_dict = {"passes": 5, "iterations": 10}
        self.model.set_params(**param_dict)
        model_params = self.model.get_params()
        for key in param_dict.keys():
            self.assertEqual(model_params[key], param_dict[key])
        # verify that the attribute values are also changed for `gensim_model` after fitting
        self.model.fit(corpus)
        self.assertEqual(getattr(self.model.gensim_model, 'passes'), 5)
        self.assertEqual(getattr(self.model.gensim_model, 'iterations'), 10)

    def testPersistence(self):
        model_dump = pickle.dumps(self.model)
        model_load = pickle.loads(model_dump)

        author_list = ['jill']
        loaded_author_topics = model_load.transform(author_list)

        # sanity check for the transformation operation
        self.assertEqual(loaded_author_topics.shape[0], 1)
        self.assertEqual(loaded_author_topics.shape[1], self.model.num_topics)

        # comparing the original and loaded models
        original_author_topics = self.model.transform(author_list)
        passed = numpy.allclose(loaded_author_topics, original_author_topics, atol=1e-1)
        self.assertTrue(passed)

    def testModelNotFitted(self):
        atmodel_wrapper = AuthorTopicTransformer(id2word=dictionary, author2doc=author2doc, num_topics=10, passes=100)
        author_list = ['jill', 'jack']
        self.assertRaises(NotFittedError, atmodel_wrapper.transform, author_list)


class TestD2VTransformer(unittest.TestCase):
    def setUp(self):
        numpy.random.seed(0)
        self.model = D2VTransformer(min_count=1)
        self.model.fit(d2v_sentences)

    def testTransform(self):
        # transform multiple documents
        docs = [w2v_texts[0], w2v_texts[1], w2v_texts[2]]
        matrix = self.model.transform(docs)
        self.assertEqual(matrix.shape[0], 3)
        self.assertEqual(matrix.shape[1], self.model.size)

        # transform one document
        doc = w2v_texts[0]
        matrix = self.model.transform(doc)
        self.assertEqual(matrix.shape[0], 1)
        self.assertEqual(matrix.shape[1], self.model.size)

    def testFitTransform(self):
        model = D2VTransformer(min_count=1)

        # fit and transform multiple documents
        docs = [w2v_texts[0], w2v_texts[1], w2v_texts[2]]
        matrix = model.fit_transform(docs)
        self.assertEqual(matrix.shape[0], 3)
        self.assertEqual(matrix.shape[1], model.size)

        # fit and transform one document
        doc = w2v_texts[0]
        matrix = model.fit_transform(doc)
        self.assertEqual(matrix.shape[0], 1)
        self.assertEqual(matrix.shape[1], model.size)

    def testSetGetParams(self):
        # updating only one param
        self.model.set_params(negative=20)
        model_params = self.model.get_params()
        self.assertEqual(model_params["negative"], 20)

        # verify that the attribute values are also changed for `gensim_model` after fitting
        self.model.fit(d2v_sentences)
        self.assertEqual(getattr(self.model.gensim_model, 'negative'), 20)

    def testPipeline(self):
        numpy.random.seed(0)  # set a fixed seed so results are reproducible every time
        model = D2VTransformer(min_count=1)
        model.fit(d2v_sentences)

        class_dict = {'mathematics': 1, 'physics': 0}
        train_data = [
            (['calculus', 'mathematical'], 'mathematics'), (['geometry', 'operations', 'curves'], 'mathematics'),
            (['natural', 'nuclear'], 'physics'), (['science', 'electromagnetism', 'natural'], 'physics')
        ]
        train_input = [x[0] for x in train_data]
        train_target = [class_dict[x[1]] for x in train_data]

        clf = linear_model.LogisticRegression(penalty='l2', C=0.1)
        clf.fit(model.transform(train_input), train_target)
        text_w2v = Pipeline([('features', model,), ('classifier', clf)])
        score = text_w2v.score(train_input, train_target)
        self.assertGreater(score, 0.40)

    def testPersistence(self):
        model_dump = pickle.dumps(self.model)
        model_load = pickle.loads(model_dump)

        doc = w2v_texts[0]
        loaded_transformed_vecs = model_load.transform(doc)

        # sanity check for the transformation operation
        self.assertEqual(loaded_transformed_vecs.shape[0], 1)
        self.assertEqual(loaded_transformed_vecs.shape[1], model_load.size)

        # comparing the original and loaded models
        original_transformed_vecs = self.model.transform(doc)
        passed = numpy.allclose(sorted(loaded_transformed_vecs), sorted(original_transformed_vecs), atol=1e-1)
        self.assertTrue(passed)

    def testConsistencyWithGensimModel(self):
        # training a D2VTransformer
        self.model = D2VTransformer(min_count=1)
        self.model.fit(d2v_sentences)

        # training a Gensim Doc2Vec model with the same params
        gensim_d2vmodel = models.Doc2Vec(d2v_sentences, min_count=1)

        doc = w2v_texts[0]
        vec_transformer_api = self.model.transform(doc)  # vector returned by D2VTransformer
        vec_gensim_model = gensim_d2vmodel[doc]  # vector returned by Doc2Vec
        passed = numpy.allclose(vec_transformer_api, vec_gensim_model, atol=1e-1)
        self.assertTrue(passed)

    def testModelNotFitted(self):
        d2vmodel_wrapper = D2VTransformer(min_count=1)
        self.assertRaises(NotFittedError, d2vmodel_wrapper.transform, 1)


class TestText2BowTransformer(unittest.TestCase):
    def setUp(self):
        numpy.random.seed(0)
        self.model = Text2BowTransformer()
        self.model.fit(dict_texts)

    def testTransform(self):
        # transform one document
        doc = ['computer system interface time computer system']
        bow_vec = self.model.transform(doc)[0]
        expected_values = [1, 1, 2, 2]  # comparing only the word counts
        values = [x[1] for x in bow_vec]
        self.assertEqual(sorted(expected_values), sorted(values))

    def testSetGetParams(self):
        # updating only one param
        self.model.set_params(prune_at=1000000)
        model_params = self.model.get_params()
        self.assertEqual(model_params["prune_at"], 1000000)

    def testPipeline(self):
        with open(datapath('mini_newsgroup'), 'rb') as f:
            compressed_content = f.read()
            uncompressed_content = codecs.decode(compressed_content, 'zlib_codec')
            cache = pickle.loads(uncompressed_content)
        data = cache
        text2bow_model = Text2BowTransformer()
        lda_model = LdaTransformer(num_topics=2, passes=10, minimum_probability=0, random_state=numpy.random.seed(0))
        numpy.random.mtrand.RandomState(1)  # NB: creates a RandomState but does not reseed the global generator
        clf = linear_model.LogisticRegression(penalty='l2', C=0.1)
        text_lda = Pipeline([('bow_model', text2bow_model), ('ldamodel', lda_model), ('classifier', clf)])
        text_lda.fit(data.data, data.target)
        score = text_lda.score(data.data, data.target)
        self.assertGreater(score, 0.40)

    def testPersistence(self):
        model_dump = pickle.dumps(self.model)
        model_load = pickle.loads(model_dump)

        doc = dict_texts[0]
        loaded_transformed_vecs = model_load.transform(doc)

        # comparing the original and loaded models
        original_transformed_vecs = self.model.transform(doc)
        self.assertEqual(original_transformed_vecs, loaded_transformed_vecs)

    def testModelNotFitted(self):
        text2bow_wrapper = Text2BowTransformer()
        self.assertRaises(NotFittedError, text2bow_wrapper.transform, dict_texts[0])


class TestTfIdfTransformer(unittest.TestCase):
    def setUp(self):
        numpy.random.seed(0)
        self.model = TfIdfTransformer(normalize=True)
        self.corpus = mmcorpus.MmCorpus(datapath('testcorpus.mm'))
        self.model.fit(self.corpus)

    def testTransform(self):
        # transform one document
        doc = corpus[0]
        transformed_doc = self.model.transform(doc)
        expected_doc = [[(0, 0.5773502691896257), (1, 0.5773502691896257), (2, 0.5773502691896257)]]
        self.assertTrue(numpy.allclose(transformed_doc, expected_doc))

        # transform multiple documents
        docs = [corpus[0], corpus[1]]
        transformed_docs = self.model.transform(docs)
        expected_docs = [
            [(0, 0.5773502691896257), (1, 0.5773502691896257), (2, 0.5773502691896257)],
            [(3, 0.44424552527467476), (4, 0.44424552527467476), (5, 0.3244870206138555),
             (6, 0.44424552527467476), (7, 0.3244870206138555), (8, 0.44424552527467476)]
        ]
        self.assertTrue(numpy.allclose(transformed_docs[0], expected_docs[0]))
        self.assertTrue(numpy.allclose(transformed_docs[1], expected_docs[1]))

    def testSetGetParams(self):
        # updating only one param
        self.model.set_params(smartirs='nnn')
        model_params = self.model.get_params()
        self.assertEqual(model_params["smartirs"], 'nnn')

        # verify that the attribute values are also changed for `gensim_model` after fitting
        self.model.fit(self.corpus)
        self.assertEqual(getattr(self.model.gensim_model, 'smartirs'), 'nnn')

    def testPipeline(self):
        with open(datapath('mini_newsgroup'), 'rb') as f:
            compressed_content = f.read()
            uncompressed_content = codecs.decode(compressed_content, 'zlib_codec')
            cache = pickle.loads(uncompressed_content)
        data = cache
        id2word = Dictionary([x.split() for x in data.data])
        corpus = [id2word.doc2bow(i.split()) for i in data.data]
        tfidf_model = TfIdfTransformer()
        tfidf_model.fit(corpus)
        lda_model = LdaTransformer(num_topics=2, passes=10, minimum_probability=0, random_state=numpy.random.seed(0))
        numpy.random.mtrand.RandomState(1)  # NB: creates a RandomState but does not reseed the global generator
        clf = linear_model.LogisticRegression(penalty='l2', C=0.1)
        text_tfidf = Pipeline([('tfidf_model', tfidf_model), ('ldamodel', lda_model), ('classifier', clf)])
        text_tfidf.fit(corpus, data.target)
        score = text_tfidf.score(corpus, data.target)
        self.assertGreater(score, 0.40)

    def testPersistence(self):
        model_dump = pickle.dumps(self.model)
        model_load = pickle.loads(model_dump)

        doc = corpus[0]
        loaded_transformed_doc = model_load.transform(doc)

        # comparing the original and loaded models
        original_transformed_doc = self.model.transform(doc)
        self.assertEqual(original_transformed_doc, loaded_transformed_doc)

    def testModelNotFitted(self):
        tfidf_wrapper = TfIdfTransformer()
        self.assertRaises(NotFittedError, tfidf_wrapper.transform, corpus[0])


class TestHdpTransformer(unittest.TestCase):
    def setUp(self):
        numpy.random.seed(0)
        self.model = HdpTransformer(id2word=dictionary, random_state=42)
        self.corpus = mmcorpus.MmCorpus(datapath('testcorpus.mm'))
        self.model.fit(self.corpus)

    def testTransform(self):
        # transform one document
        doc = self.corpus[0]
        transformed_doc = self.model.transform(doc)
        expected_doc = [
            [0.81043386270128193, 0.049357139518070477, 0.035840906753517532,
             0.026542006926698079, 0.019925705902962578, 0.014776690981729117, 0.011068909979528148]
        ]
        self.assertTrue(numpy.allclose(transformed_doc, expected_doc, atol=1e-2))

        # transform multiple documents
        docs = [self.corpus[0], self.corpus[1]]
        transformed_docs = self.model.transform(docs)
        expected_docs = [
            [0.81043386270128193, 0.049357139518070477, 0.035840906753517532,
             0.026542006926698079, 0.019925705902962578, 0.014776690981729117, 0.011068909979528148],
            [0.03795908, 0.39542609, 0.50650585, 0.0151082, 0.01132749, 0., 0.]
        ]
        self.assertTrue(numpy.allclose(transformed_docs[0], expected_docs[0], atol=1e-2))
        self.assertTrue(numpy.allclose(transformed_docs[1], expected_docs[1], atol=1e-2))

    def testSetGetParams(self):
        # updating only one param
        self.model.set_params(var_converge=0.05)
        model_params = self.model.get_params()
        self.assertEqual(model_params["var_converge"], 0.05)

        # verify that the attribute values are also changed for `gensim_model` after fitting
        self.model.fit(self.corpus)
        self.assertEqual(getattr(self.model.gensim_model, 'm_var_converge'), 0.05)

    def testPipeline(self):
        with open(datapath('mini_newsgroup'), 'rb') as f:
            compressed_content = f.read()
            uncompressed_content = codecs.decode(compressed_content, 'zlib_codec')
            cache = pickle.loads(uncompressed_content)
        data = cache
        id2word = Dictionary([x.split() for x in data.data])
        corpus = [id2word.doc2bow(i.split()) for i in data.data]
        model = HdpTransformer(id2word=id2word)
        clf = linear_model.LogisticRegression(penalty='l2', C=0.1)
        text_lda = Pipeline([('features', model,), ('classifier', clf)])
        text_lda.fit(corpus, data.target)
        score = text_lda.score(corpus, data.target)
        self.assertGreater(score, 0.40)

    def testPersistence(self):
        model_dump = pickle.dumps(self.model)
        model_load = pickle.loads(model_dump)

        doc = corpus[0]
        loaded_transformed_doc = model_load.transform(doc)

        # comparing the original and loaded models
        original_transformed_doc = self.model.transform(doc)
        self.assertTrue(numpy.allclose(original_transformed_doc, loaded_transformed_doc))

    def testModelNotFitted(self):
        hdp_wrapper = HdpTransformer(id2word=dictionary)
        self.assertRaises(NotFittedError, hdp_wrapper.transform, corpus[0])


class TestPhrasesTransformer(unittest.TestCase):
    def setUp(self):
        numpy.random.seed(0)
        self.model = PhrasesTransformer(min_count=1, threshold=1)
        self.model.fit(phrases_sentences)

    def testTransform(self):
        # transform one document
        doc = phrases_sentences[-1]
        phrase_tokens = self.model.transform(doc)[0]
        expected_phrase_tokens = [u'graph_minors', u'survey', u'human_interface']
        self.assertEqual(phrase_tokens, expected_phrase_tokens)

    def testPartialFit(self):
        new_sentences = [
            ['world', 'peace', 'humans', 'world', 'peace', 'world', 'peace', 'people'],
            ['world', 'peace', 'people'],
            ['world', 'peace', 'humans']
        ]
        self.model.partial_fit(X=new_sentences)  # train the model on the new sentences

        doc = ['graph', 'minors', 'survey', 'human', 'interface', 'world', 'peace']
        phrase_tokens = self.model.transform(doc)[0]
        expected_phrase_tokens = [u'graph_minors', u'survey', u'human_interface', u'world_peace']
        self.assertEqual(phrase_tokens, expected_phrase_tokens)

    def testSetGetParams(self):
        # updating only one param
        self.model.set_params(progress_per=5000)
        model_params = self.model.get_params()
        self.assertEqual(model_params["progress_per"], 5000)

        # verify that the attribute values are also changed for `gensim_model` after fitting
        self.model.fit(phrases_sentences)
        self.assertEqual(getattr(self.model.gensim_model, 'progress_per'), 5000)

    def testPersistence(self):
        model_dump = pickle.dumps(self.model)
        model_load = pickle.loads(model_dump)

        doc = phrases_sentences[-1]
        loaded_phrase_tokens = model_load.transform(doc)

        # comparing the original and loaded models
        original_phrase_tokens = self.model.transform(doc)
        self.assertEqual(original_phrase_tokens, loaded_phrase_tokens)

    def testModelNotFitted(self):
        phrases_transformer = PhrasesTransformer()
        self.assertRaises(NotFittedError, phrases_transformer.transform, phrases_sentences[0])


# The tests below specifically exercise pluggable scoring in Phrases, because
# custom scoring functions can cause pickling issues.

# This scorer is intentionally defined at module scope rather than as a class
# method, so that it can be pickled. It scores every bigram as 1.
def dumb_scorer(worda_count, wordb_count, bigram_count, len_vocab, min_count, corpus_word_count):
    return 1


class TestPhrasesTransformerCustomScorer(unittest.TestCase):

    def setUp(self):
        numpy.random.seed(0)

        self.model = PhrasesTransformer(min_count=1, threshold=.9, scoring=dumb_scorer)
        self.model.fit(phrases_sentences)

    def testTransform(self):
        # transform one document
        doc = phrases_sentences[-1]
        phrase_tokens = self.model.transform(doc)[0]
        expected_phrase_tokens = [u'graph_minors', u'survey_human', u'interface']
        self.assertEqual(phrase_tokens, expected_phrase_tokens)

    def testPartialFit(self):
        new_sentences = [
            ['world', 'peace', 'humans', 'world', 'peace', 'world', 'peace', 'people'],
            ['world', 'peace', 'people'],
            ['world', 'peace', 'humans']
        ]
        self.model.partial_fit(X=new_sentences)  # train the model on the new sentences

        doc = ['graph', 'minors', 'survey', 'human', 'interface', 'world', 'peace']
        phrase_tokens = self.model.transform(doc)[0]
        expected_phrase_tokens = [u'graph_minors', u'survey_human', u'interface', u'world_peace']
        self.assertEqual(phrase_tokens, expected_phrase_tokens)

    def testSetGetParams(self):
        # updating only one param
        self.model.set_params(progress_per=5000)
        model_params = self.model.get_params()
        self.assertEqual(model_params["progress_per"], 5000)

        # verify that the attribute values are also changed for `gensim_model` after fitting
        self.model.fit(phrases_sentences)
        self.assertEqual(getattr(self.model.gensim_model, 'progress_per'), 5000)

    def testPersistence(self):
        model_dump = pickle.dumps(self.model)
        model_load = pickle.loads(model_dump)

        doc = phrases_sentences[-1]
        loaded_phrase_tokens = model_load.transform(doc)

        # comparing the original and loaded models
        original_phrase_tokens = self.model.transform(doc)
        self.assertEqual(original_phrase_tokens, loaded_phrase_tokens)

    def testModelNotFitted(self):
        phrases_transformer = PhrasesTransformer()
        self.assertRaises(NotFittedError, phrases_transformer.transform, phrases_sentences[0])


if __name__ == '__main__':
    unittest.main()