Search on legal documents using Tensorflow and a web_actix web interface
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

24 lines
849 B

  1. import os
  2. import sys
  3. Indok = sys.argv[1]
  4. Outdok = sys.argv[2]
  5. with open(Indok) as InDok:
  6. with open(Outdok, 'a') as OutDok:
  7. for line in InDok:
  8. linelist_lower = []
  9. linelist = eval(line[:-1])
  10. for word in linelist:
  11. if '_' and '^' and 'x0' and 'x1' not in word:
  12. word = word.replace('ü', 'ue')
  13. word = word.replace('ö', 'oe')
  14. word = word.replace('ä', 'ae')
  15. word = word.replace('ß', 'ss')
  16. word = word.strip('-.,?!<>|#+~}{][&%$^°*;:-_')
  17. word = word.encode('ascii', 'ignore').decode('ascii')
  18. if len(word) > 1:
  19. linelist_lower.append(word.lower())
  20. OutDok.write(str(linelist_lower))
  21. OutDok.write('\n')