Search on legal documents using TensorFlow and a web_actix web interface
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

24 lines
849 B

import os
import sys
# Positional command-line arguments: the first is the path the script reads
# from, the second is the path it appends its output to.
Indok, Outdok = sys.argv[1], sys.argv[2]
with open(Indok) as InDok:
with open(Outdok, 'a') as OutDok:
for line in InDok:
linelist_lower = []
linelist = eval(line[:-1])
for word in linelist:
if '_' and '^' and 'x0' and 'x1' not in word:
word = word.replace('ü', 'ue')
word = word.replace('ö', 'oe')
word = word.replace('ä', 'ae')
word = word.replace('ß', 'ss')
word = word.strip('-.,?!<>|#+~}{][&%$^°*;:-_')
word = word.encode('ascii', 'ignore').decode('ascii')
if len(word) > 1: