30 lines
700 B
Python
30 lines
700 B
Python
|
# Lowercase the tokens first, because stop words written in uppercase won't be recognized
|
||
|
|
||
|
import ast
import os
import sys

import nltk
from nltk.corpus import stopwords  # Import the stop word list

# Remove German stop words from a file of tokenized lines.
#
# Usage: <script> INPUT OUTPUT
#   INPUT  - text file; each non-blank line is expected to hold one Python
#            list literal of tokens, e.g. ['das', 'haus', 'ist', 'rot']
#   OUTPUT - file that receives one filtered list per parsed input line,
#            written with str() so it can be parsed the same way again

# Fail with a readable message instead of a bare IndexError when the two
# required paths are missing. Extra arguments are still tolerated.
if len(sys.argv) < 3:
    sys.exit("usage: {} INPUT OUTPUT".format(sys.argv[0]))

Indok = sys.argv[1]
Outdok = sys.argv[2]

# Make sure the stop word corpus is available before it is read below.
nltk.download('stopwords')

# A set gives constant-time membership tests inside the loop.
stops = set(stopwords.words("german"))

# NOTE(review): both files are opened with the platform default encoding;
# confirm that this matches the encoding of the token files.
# The output is opened in append mode, so an existing file is extended
# rather than overwritten.
with open(Indok) as infile, open(Outdok, 'a') as outfile:
    for line in infile:
        # strip() instead of line[:-1]: the last line of a file may have no
        # trailing newline, in which case [:-1] would cut off the closing
        # bracket of the list.
        text = line.strip()
        if not text:
            # Blank lines carry no token list; skip them instead of crashing.
            continue
        # SECURITY: this used to be eval(), which executes arbitrary code
        # found in the input file. literal_eval only accepts Python literals
        # (lists, strings, numbers, ...), which covers lists written by str().
        words = ast.literal_eval(text)
        # The membership test is case-sensitive.
        kept = [w for w in words if w not in stops]
        outfile.write(str(kept) + '\n')
|
||
|
|