# import modules and set up logging
from gensim.models import word2vec
import logging
import gensim
import pickle

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# load up unzipped corpus from http://mattmahoney.net/dc/text8.zip
#sentences = word2vec.Text8Corpus('corpus/dewiki.txt')  # uncomment this and point it at the output of preprocessing.py

# train the skip-gram model; default window=5
#model = word2vec.Word2Vec(sentences, size=200)  # uncomment this as well

# ... and some hours later... just as advertised...
#model.most_similar(positive=['woman', 'king'], negative=['man'], topn=1)

# pickle the entire model to disk, so we can load & resume training later
#model.save('models/wiki.model')

# store the learned weights, in a format the original C tool understands
#model.wv.save_word2vec_format('models/wiki.model.bin', binary=True)  # yes, this line needs to be uncommented too; with those three it should work

# or, import word weights created by the (faster) C word2vec
# this way, you can switch between the C/Python toolkits easily
#model = gensim.models.KeyedVectors.load_word2vec_format('models/wiki.model.bin', binary=True)
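
# A hedged sketch of the "load & resume training" idea mentioned above, assuming a
# gensim 2.x/3.x-style train() call; `more_sentences` is a hypothetical iterable of
# tokenized sentences (e.g. another Text8Corpus) and epochs=5 is an arbitrary choice:
#model = gensim.models.Word2Vec.load('models/wiki.model')
#model.train(more_sentences, total_examples=model.corpus_count, epochs=5)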

# Here comes an example of how this can then be put to good use. Comment all of
# this out while training, i.e. everything from here on. Then load the .bin model
# and look at what comes out. I was thinking of clusters built from similar_by_word
# sets; those will of course have to be converted into numbers so we get fast DB
# access (see the sketch after the similarity examples below).

print('loading the big model')
model = gensim.models.KeyedVectors.load_word2vec_format('german.model.big', binary=True)  # C binary format
print('done')

print('the vocab is')
print(model.vocab)  # note: this prints the entire vocabulary dict, which can be huge

# analogy query: Koenig + Koenigin - Mann = ?
print(1, model.most_similar(positive=['Koenig', 'Koenigin'], negative=['Mann'], topn=1))

# odd-one-out: which word does not belong in the list?
#print(2, model.doesnt_match("das nichts dazu".split()))

# the 20 nearest neighbours of a single word
print(3, model.similar_by_word("Asylbewerber", topn=20))

# n_similarity computes the cosine similarity between the means of two word sets;
# with single-word lists it is just the similarity between the two words
print(4, model.n_similarity(['dazu'], ['nicht']))

print(5, model.n_similarity(['dazu'], ['nichts']))
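
# A minimal sketch of the clustering idea from the comment above: collect the
# similar_by_word neighbourhood of a few seed words as "clusters", and map every
# word to an integer ID so the clusters can be stored compactly for fast DB access.
# The seed words, topn=10 and the ID scheme are illustrative assumptions, not fixed choices.
seed_words = ['Koenig', 'Asylbewerber']  # hypothetical seeds; pick your own

word_to_id = {}  # word -> integer ID, assigned on first sight

def get_id(word):
    # consecutive integer IDs, so a DB can store ints instead of strings
    if word not in word_to_id:
        word_to_id[word] = len(word_to_id)
    return word_to_id[word]

clusters = {}
for seed in seed_words:
    if seed in model.vocab:  # skip seeds missing from the model
        neighbours = model.similar_by_word(seed, topn=10)
        clusters[get_id(seed)] = [get_id(word) for word, _score in neighbours]

print('clusters as integer IDs:', clusters)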

'''
print()
print('The vector for Warschau is:')
print()
x = model['Warschau']
print(x)
print()
print('And its length is:')
print()
print(len(model['Warschau']))

pickle.dump(x, open("Vektoren.p", "wb"))
y = pickle.load(open("Vektoren.p", "rb"))
print(y)
'''