laywerrobot/lib/python3.6/site-packages/gensim/models/__init__.py

63 lines
2.3 KiB
Python
Raw Normal View History

2020-08-27 21:55:39 +02:00
"""
This package contains algorithms for extracting document representations from their raw
bag-of-word counts.
"""
# bring model classes directly into package namespace, to save some typing
from .coherencemodel import CoherenceModel # noqa:F401
from .hdpmodel import HdpModel # noqa:F401
from .ldamodel import LdaModel # noqa:F401
from .lsimodel import LsiModel # noqa:F401
from .tfidfmodel import TfidfModel # noqa:F401
from .rpmodel import RpModel # noqa:F401
from .logentropy_model import LogEntropyModel # noqa:F401
from .word2vec import Word2Vec # noqa:F401
from .doc2vec import Doc2Vec # noqa:F401
from .keyedvectors import KeyedVectors # noqa:F401
from .ldamulticore import LdaMulticore # noqa:F401
from .phrases import Phrases # noqa:F401
from .normmodel import NormModel # noqa:F401
from .atmodel import AuthorTopicModel # noqa:F401
from .ldaseqmodel import LdaSeqModel # noqa:F401
from .fasttext import FastText # noqa:F401
from .translation_matrix import TranslationMatrix, BackMappingTranslationMatrix # noqa:F401
from . import wrappers # noqa:F401
from . import deprecated # noqa:F401
from gensim import interfaces, utils
class VocabTransform(interfaces.TransformationABC):
"""
Remap feature ids to new values.
Given a mapping between old ids and new ids (some old ids may be missing = these
features are to be discarded), this will wrap a corpus so that iterating over
`VocabTransform[corpus]` returns the same vectors but with the new ids.
Old features that have no counterpart in the new ids are discarded. This
can be used to filter vocabulary of a corpus "online"::
>>> old2new = {oldid: newid for newid, oldid in enumerate(ids_you_want_to_keep)}
>>> vt = VocabTransform(old2new)
>>> for vec_with_new_ids in vt[corpus_with_old_ids]:
>>> ...
"""
def __init__(self, old2new, id2token=None):
self.old2new = old2new
self.id2token = id2token
def __getitem__(self, bow):
"""
Return representation with the ids transformed.
"""
# if the input vector is in fact a corpus, return a transformed corpus as a result
is_corpus, bow = utils.is_corpus(bow)
if is_corpus:
return self._apply(bow)
return sorted((self.old2new[oldid], weight) for oldid, weight in bow if oldid in self.old2new)