# import modules and set up logging
from gensim.models import word2vec
import logging
import gensim
import pickle
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
# load up unzipped corpus from http://mattmahoney.net/dc/text8.zip
#sentences = word2vec.Text8Corpus('corpus/dewiki.txt') # uncomment for training and point this at the text that preprocessing.py outputs
# train the model; default window=5 (sg=1 selects skip-gram, sg=0 CBOW)
#model = word2vec.Word2Vec(sentences, size=200) # uncomment this line for training
# ... and some hours later... just as advertised...
#model.most_similar(positive=['woman', 'king'], negative=['man'], topn=1)
# pickle the entire model to disk, so we can load&resume training later
#model.save('models/wiki.model')
# store the learned weights, in a format the original C tool understands
#model.save_word2vec_format('models/wiki.model.bin', binary=True) # yes, uncomment this line too; with those three it should work
# or, import word weights created by the (faster) C word2vec
# this way, you can switch between the C/Python toolkits easily
#model = gensim.models.Word2Vec.load('models/wiki.model')
# Below is an example of how to put the model to use. While training, comment out
# everything from here on; afterwards load the .bin model and look at what comes out.
# The idea: build clusters from similar_by_word sets, then convert everything to
# numbers so we get fast DB access (see the sketch after the examples below).
print('loading the big model')
model = gensim.models.KeyedVectors.load_word2vec_format('german.model.big', binary=True) # C binary format
print('done')
print('the vocab is')
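# note: .vocab is the gensim < 4.0 attribute (a word -> Vocab mapping, replaced by
# key_to_index in 4.x); printing it dumps the entire vocabulary, which is very
# large for a full Wikipedia model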
print(model.vocab)
# "boy" is to "father" as "girl" is to ...?
print(1 ,model.most_similar(positive=['Koenig','Koenigin'], negative=['Mann'], topn=1))
#print(2 ,model.doesnt_match("das nichts dazu".split()))
print(3 , model.similar_by_word("Asylbewerber", topn=20))
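# n_similarity computes the cosine similarity between the mean vectors of two word
# lists (single-word lists here, so this is plain word-to-word cosine similarity)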
print(4, model.n_similarity(['dazu'], ['nicht']))
print(5, model.n_similarity(['dazu'], ['nichts']))
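# A minimal sketch of the clustering idea described above, left commented out so it
# does not run during training. The seed words and topn are placeholder assumptions,
# not fixed anywhere in this repo: collect the similar_by_word neighbours of a few
# seed words, then map every word to an integer id so the clusters can be stored
# and queried in a DB as numbers instead of strings.
#seeds = ['Koenig', 'Asylbewerber']
#clusters = {s: [w for w, _ in model.similar_by_word(s, topn=20)] for s in seeds}
#vocab_words = sorted(set(seeds) | {w for words in clusters.values() for w in words})
#word_to_id = {w: i for i, w in enumerate(vocab_words)}
#id_clusters = {word_to_id[s]: [word_to_id[w] for w in words] for s, words in clusters.items()}
#print(id_clusters)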
'''
print()
print("The vector for 'Warschau' is:")
print()
x = model['Warschau']
print(x)
print()
print('And its length is:')
print()
print(len(x))
# round-trip the vector through pickle (uses the pickle import at the top)
pickle.dump(x, open('Vektoren.p', 'wb'))
y = pickle.load(open('Vektoren.p', 'rb'))
print(y)
'''