# alpcentaur / laywerrobot

# Demo script: load a pre-trained German word2vec model in the C binary
# format and print a few similarity queries against it.  The training
# section is kept below as commented-out reference code.

# import modules and set up logging
import _pickle as cPickle
import logging

import gensim
from gensim.models import word2vec

# Log at INFO level with timestamp, level name, and message.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# --- Training (disabled) ----------------------------------------------------
# load up unzipped corpus from http://mattmahoney.net/dc/text8.zip
# NOTE: enable the next line for training and point it at the text file
# that preprocessing.py produces.
#sentences = word2vec.Text8Corpus('corpus/dewiki.txt')
# train the skip-gram model; default window=5
# NOTE: enable the next line for training.
#model = word2vec.Word2Vec(sentences, size=200)
# ... and some hours later... just as advertised...
#model.most_similar(positive=['woman', 'king'], negative=['man'], topn=1)
# pickle the entire model to disk, so we can load&resume training later
#model.save('models/wiki.model')
# store the learned weights, in a format the original C tool understands
# NOTE: this line must be enabled for training as well; with these three
# lines enabled, training should work.
#model.save_word2vec_format('models/wiki.model.bin', binary=True)
# or, import word weights created by the (faster) C word2vec
# this way, you can switch between the C/Python toolkits easily
#model = gensim.models.Word2Vec.load('models/wiki.model')

# --- Usage example ----------------------------------------------------------
# Here is an example of how the trained model can be used.  While training,
# comment out everything from here on.  Afterwards load the binary model and
# look at the results.  The idea is to build clusters from similar_by_word
# result sets; all of that still has to be converted to numbers so that
# database access is fast.
print('loading the big model')
model = gensim.models.KeyedVectors.load_word2vec_format('german.model.big', binary=True)  # C binary format
print('done')

print('the vocab is')
# NOTE(review): gensim >= 4.0 replaced KeyedVectors.vocab with key_to_index;
# confirm which gensim version this script is run against.
print(model.vocab)

# "boy" is to "father" as "girl" is to ...?
# (the actual query is: Koenig + Koenigin - Mann, best single match)
print(1, model.most_similar(positive=['Koenig', 'Koenigin'], negative=['Mann'], topn=1))
#print(2 ,model.doesnt_match("das nichts dazu".split()))

print(3, model.similar_by_word("Asylbewerber", topn=20))
print(4, model.n_similarity(['dazu'], ['nicht']))
print(5, model.n_similarity(['dazu'], ['nichts']))

# --- Disabled legacy snippet (Python 2 syntax) -------------------------------
# Prints the vector for one word and round-trips it through a pickle file.
# NOTE(review): it refers to `pickle`, but this file only imports
# `_pickle as cPickle`; the name must be adjusted before re-enabling.
#print
#print 'Der Vektor fuer Warschau ist:'
#print
#x = model['Warschau']
#print x
#print
#print 'Und die laenge betraegt:'
#print
#print len(model['Warschau'])
#pickle.dump( x, open( "Vektoren.p", "wb" ) )
#y = pickle.load( open( "Vektoren.p", "rb" ) )
#
#print y