Search on legal documents using Tensorflow and a web_actix web interface
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

29 lines
700 B

  # Lowercase the input first: stop words written in uppercase won't be recognized.
"""Remove German stop words from a file of tokenized lines.

Usage: python <script> INDOK OUTDOK

Every non-blank line of INDOK must be the ``str()`` form of a Python list of
tokens, e.g. ``['der', 'hund', 'bellt']``.  For each such line the list with
all NLTK German stop words removed is appended to OUTDOK, one list per line.
Tokens are compared as-is (case-sensitively) against the stop-word list.
"""
import ast
import os
import sys


def load_german_stopwords():
    """Return the NLTK German stop-word list as a set.

    NLTK is imported here rather than at module level so that the pure
    filtering helpers below can be imported without NLTK being installed
    and without triggering a download.
    """
    import nltk
    # Fetch the corpus before the corpus reader is imported and used.
    nltk.download('stopwords')
    from nltk.corpus import stopwords  # Import the stop word list
    return set(stopwords.words("german"))


def remove_stopwords(words, stops):
    """Return the items of *words* that are not in *stops*, order preserved."""
    return [w for w in words if w not in stops]


def filter_lines(lines, stops):
    """Yield the stop-word-free ``str(list)`` for every non-blank input line.

    Args:
        lines: iterable of strings, each the literal form of a token list.
        stops: container of stop words to drop.

    Raises:
        ValueError, SyntaxError: if a line is not a valid Python literal.
    """
    for line in lines:
        # strip() instead of line[:-1]: a last line without a trailing
        # newline would otherwise lose its closing bracket.
        text = line.strip()
        if not text:
            # Blank lines (typically at the end of the file) carry no tokens.
            continue
        # SECURITY: this used to be eval(), which executes arbitrary code
        # found in the input file.  literal_eval only accepts literals and
        # parses the list-of-strings lines this script expects.
        words = ast.literal_eval(text)
        yield str(remove_stopwords(words, stops))


def main(argv=None):
    """Filter the file named by argv[1] and append the result to argv[2].

    Args:
        argv: argument vector including the program name; defaults to
            ``sys.argv``.
    """
    args = sys.argv if argv is None else argv
    if len(args) < 3:
        sys.exit("usage: python <script> INDOK OUTDOK")
    indok, outdok = args[1], args[2]

    stops = load_german_stopwords()
    # The output is opened in append mode, so existing content is kept.
    with open(indok) as in_dok, open(outdok, 'a') as out_dok:
        for filtered in filter_lines(in_dok, stops):
            out_dok.write(filtered)
            out_dok.write('\n')


if __name__ == "__main__":
    main()