# import modules and set up logging
from gensim.models import word2vec
import logging
import gensim
import pickle

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# load up unzipped corpus from http://mattmahoney.net/dc/text8.zip
#sentences = word2vec.Text8Corpus('corpus/dewiki.txt')  # uncomment this and point it at the output of preprocessing.py

# train the skip-gram model; default window=5
#model = word2vec.Word2Vec(sentences, size=200)  # uncomment this as well

# ... and some hours later... just as advertised...
#model.most_similar(positive=['woman', 'king'], negative=['man'], topn=1)

# pickle the entire model to disk, so we can load & resume training later
#model.save('models/wiki.model')

# store the learned weights, in a format the original C tool understands
#model.wv.save_word2vec_format('models/wiki.model.bin', binary=True)  # yes, this line needs to be uncommented too; with those three it should work

# or, import word weights created by the (faster) C word2vec
# this way, you can switch between the C/Python toolkits easily
#model = gensim.models.KeyedVectors.load_word2vec_format('models/wiki.model.bin', binary=True)
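
# A hedged sketch of the "load & resume training" idea mentioned above, assuming a
# gensim 2.x/3.x-style train() call; `more_sentences` is a hypothetical iterable of
# tokenized sentences (e.g. another Text8Corpus) and epochs=5 is an arbitrary choice:
#model = gensim.models.Word2Vec.load('models/wiki.model')
#model.train(more_sentences, total_examples=model.corpus_count, epochs=5)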

# Here comes an example of how this can then be put to good use. Comment all of
# this out while training, i.e. everything from here on. Then load the .bin model
# and look at what comes out. I was thinking of clusters built from similar_by_word
# sets; those will of course have to be converted into numbers so we get fast DB
# access (see the sketch after the similarity examples below).

print('loading the big model')
model = gensim.models.KeyedVectors.load_word2vec_format('german.model.big', binary=True)  # C binary format
print('done')

print('the vocab is')
print(model.vocab)  # note: this prints the entire vocabulary dict, which can be huge

# analogy query: Koenig + Koenigin - Mann = ?
print(1, model.most_similar(positive=['Koenig', 'Koenigin'], negative=['Mann'], topn=1))

# odd-one-out: which word does not belong in the list?
#print(2, model.doesnt_match("das nichts dazu".split()))

# the 20 nearest neighbours of a single word
print(3, model.similar_by_word("Asylbewerber", topn=20))

# n_similarity computes the cosine similarity between the means of two word sets;
# with single-word lists it is just the similarity between the two words
print(4, model.n_similarity(['dazu'], ['nicht']))

print(5, model.n_similarity(['dazu'], ['nichts']))
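
# A minimal sketch of the clustering idea from the comment above: collect the
# similar_by_word neighbourhood of a few seed words as "clusters", and map every
# word to an integer ID so the clusters can be stored compactly for fast DB access.
# The seed words, topn=10 and the ID scheme are illustrative assumptions, not fixed choices.
seed_words = ['Koenig', 'Asylbewerber']  # hypothetical seeds; pick your own

word_to_id = {}  # word -> integer ID, assigned on first sight

def get_id(word):
    # consecutive integer IDs, so a DB can store ints instead of strings
    if word not in word_to_id:
        word_to_id[word] = len(word_to_id)
    return word_to_id[word]

clusters = {}
for seed in seed_words:
    if seed in model.vocab:  # skip seeds missing from the model
        neighbours = model.similar_by_word(seed, topn=10)
        clusters[get_id(seed)] = [get_id(word) for word, _score in neighbours]

print('clusters as integer IDs:', clusters)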

'''
print()
print('The vector for Warschau is:')
print()
x = model['Warschau']
print(x)
print()
print('And its length is:')
print()
print(len(model['Warschau']))

pickle.dump(x, open("Vektoren.p", "wb"))
y = pickle.load(open("Vektoren.p", "rb"))
print(y)
'''