Search on legal documents using Tensorflow and a web_actix web interface
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

42 lines
1.1 KiB

  # Lower-case the input first: stop words written in upper case won't be recognized.
"""Expand tokenised documents with their nearest word2vec neighbours.

Usage::

    python <this script> INPUT_FILE OUTPUT_FILE

Every non-blank line of INPUT_FILE must hold the ``str()`` form of a Python
list of words, e.g. ``['haus', 'garten']``.  For each such line, one line is
appended to OUTPUT_FILE: the ``str()`` form of a list in which every word is
preceded by up to three of its most similar words from the model.  Words the
model does not know are passed through unchanged.
"""
import ast
import logging
import os
import sys

# Pre-trained vectors in the word2vec C binary format, looked up relative to
# the current working directory.
MODEL_PATH = 'german.model.big'


def parse_word_list(line):
    """Return the Python literal written on *line*.

    The text is parsed with ``ast.literal_eval`` instead of ``eval`` so that a
    crafted input file cannot execute code.  Surrounding whitespace is
    stripped rather than blindly cutting the last character, so a final line
    without a trailing newline keeps its closing bracket.

    Raises:
        ValueError, SyntaxError: if *line* is not a valid Python literal.
    """
    return ast.literal_eval(line.strip())


def expand_with_synonyms(words, similar_by_word, topn=3):
    """Return *words* with each word preceded by its nearest neighbours.

    Args:
        words: iterable of words to expand.
        similar_by_word: callable invoked as ``similar_by_word(word, topn=topn)``.
            It must return a sequence whose items hold the neighbour word at
            index 0 and must raise ``KeyError`` for unknown words.
        topn: number of neighbours requested per word.

    Returns:
        A new list: for every input word, its neighbours (if any) followed by
        the word itself, in input order.
    """
    expanded = []
    for word in words:
        try:
            neighbours = similar_by_word(word, topn=topn)
        except KeyError:
            # Unknown to the model: keep the word, add nothing.
            neighbours = ()
        expanded.extend(neighbour[0] for neighbour in neighbours)
        expanded.append(word)
    return expanded


def main(argv):
    """Run the expansion for ``argv[1]`` (input path) and ``argv[2]`` (output path)."""
    if len(argv) < 3:
        sys.exit('usage: {} INPUT_FILE OUTPUT_FILE'.format(os.path.basename(argv[0])))
    in_path, out_path = argv[1], argv[2]

    # Imported here rather than at module level so the helpers above can be
    # imported (and tested) without gensim or the model being available.
    import gensim
    from gensim.models import word2vec  # noqa: F401  (kept from the original script)

    print('loading model...')
    model = gensim.models.KeyedVectors.load_word2vec_format(MODEL_PATH, binary=True)  # C binary format
    print('done')
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

    # The output file is opened in append mode: repeated runs add to it.
    with open(in_path) as in_file, open(out_path, 'a') as out_file:
        for line in in_file:
            if not line.strip():
                # Blank lines (e.g. a trailing empty line) carry no word list.
                continue
            words = parse_word_list(line)
            out_file.write(str(expand_with_synonyms(words, model.similar_by_word)))
            out_file.write('\n')


if __name__ == '__main__':
    main(sys.argv)