328 lines
13 KiB
Python
328 lines
13 KiB
Python
|
# Class for converting negated statements ("nicht ...", "kein ...") into positive statements.
|
||
|
|
||
|
|
||
|
# Note: "nicht" + adjective can be replaced directly by the adjective's antonym.
|
||
|
# "nicht" + verb can be replaced by the verb's antonym, but the complement then has to be cut, e.g. "Er ging nicht nach Hause." -> "Er blieb." (drops "nach Hause").
|
||
|
|
||
|
|
||
|
# See the {{Gegenwörter}} (antonyms) category in Wiktionary.
|
||
|
|
||
|
import spacy
|
||
|
import nltk
|
||
|
from nltk.stem.snowball import SnowballStemmer
|
||
|
|
||
|
import hickle as hkl
|
||
|
import FASTsearch
|
||
|
|
||
|
# Module-level German Snowball stemmer (SayYes.__init__ also creates its own instance).
stemmer = SnowballStemmer("german")
|
||
|
|
||
|
|
||
|
class SayYes(object):
    """Rewrite negated German statements as positive ones.

    A sentence containing "nicht" or a word starting with "kein" is searched
    for the negated word (an adjective/adverb found by spaCy, or the word next
    to the negation).  That word is looked up in two parallel opposites
    databases and replaced by its counterpart; the negation itself is then
    removed ("nicht" is dropped, "kein..." becomes "ein...").
    """

    # Inflection endings copied from the replaced word onto the opposite.
    # They are mutually exclusive, so at most one of them ever applies.
    _ENDINGS = ('es', 'er', 'em', 'e')

    def __init__(self, hklDatabaseDir_Opposites, hklDatabaseDir_Opposites_All):
        """Load the spaCy model, the stemmer and optionally an opposites DB.

        Args:
            hklDatabaseDir_Opposites: Path of a hickle dump loaded into
                ``self.OppositesDB``; ``None`` skips loading.
            hklDatabaseDir_Opposites_All: Accepted for interface
                compatibility; not used by this constructor.
        """
        # Always define the attribute so later reads cannot raise
        # AttributeError when no database path was given.
        self.OppositesDB = None
        if hklDatabaseDir_Opposites is not None:
            self.OppositesDB = hkl.load(hklDatabaseDir_Opposites)

        self.nlp = spacy.load('de_core_news_sm')
        self.stemmer = SnowballStemmer("german")

    def create_hklDB_from_csv(self, csvDbDir):
        """Build the three hickle dumps from a text database file.

        Every non-blank line of the file is evaluated as a Python expression
        whose first two items are sequences of words (a word group and the
        group of its opposites).  For each entry, two rows are produced: the
        first word of the group followed by the stems of all its words.

        Args:
            csvDbDir: Path of the input file.

        Returns:
            str: The literal ``'done'``.
        """
        self.OppositesDB_All = []
        with open(csvDbDir) as lines:
            for line in lines:
                if not line.strip():
                    # A blank (e.g. trailing) line is not an expression.
                    continue
                # SECURITY NOTE(review): eval() executes arbitrary code from
                # the file.  Only feed trusted files; consider
                # ast.literal_eval if every line is a plain literal.
                self.OppositesDB_All.append(list(eval(line)))

        self.hkldbOpposites1 = []
        self.hkldbOpposites2 = []

        for counter, entry in enumerate(self.OppositesDB_All, start=1):
            if counter % 1000 == 0:
                print(counter)

            words = entry[0]
            self.hkldbOpposites1.append(
                [words[0]] + [self.stemmer.stem(word) for word in words]
            )
            opposites = entry[1]
            # Use the instance stemmer for both rows so that both databases
            # are guaranteed to be stemmed the same way.
            self.hkldbOpposites2.append(
                [opposites[0]] + [self.stemmer.stem(word) for word in opposites]
            )

        print('creating the hkl dump of OppositesDBAll')
        hkl.dump(self.OppositesDB_All, 'hkldbOpposites_All.hkl', mode='w', compression='gzip')
        print('done..')

        print('Creating the hkl dump of OppositesDB 1')
        hkl.dump(self.hkldbOpposites1, 'hkldbOpposites1.hkl', mode='w', compression='gzip')
        print('done..')

        print('Creating the hkl dump of OppositesDB 2')
        hkl.dump(self.hkldbOpposites2, 'hkldbOpposites2.hkl', mode='w', compression='gzip')
        print('done..')

        return 'done'

    def load_DB_into_FASTsearch(self):
        """Load both opposites dumps and their FASTsearch bag-of-words models.

        Reads ``hkldbOpposites1.hkl`` / ``hkldbOpposites2.hkl`` and the
        matching model files from the current working directory and stores
        the results on ``self`` (``hkldbOpposites1/2``, ``fsearch1/2``).
        """
        self.hkldbOpposites1 = hkl.load('hkldbOpposites1.hkl')
        self.hkldbOpposites2 = hkl.load('hkldbOpposites2.hkl')

        self.fsearch1 = FASTsearch.FASTsearch('hkldbOpposites1.hkl')
        self.fsearch2 = FASTsearch.FASTsearch('hkldbOpposites2.hkl')

        # The models are loaded from disk; they are assumed to have been
        # generated beforehand (e.g. with FASTsearch's Gen_BoW_Model).
        self.fsearch1.Load_BoW_Model('bagofwordshkldbOpposites1.pkl', 'DataBaseOneZeroshkldbOpposites1.hkl')
        self.fsearch2.Load_BoW_Model('bagofwordshkldbOpposites2.pkl', 'DataBaseOneZeroshkldbOpposites2.hkl')

    def replaceOpposites(self, sentences):
        """Replace negated words by their opposites in every sentence.

        Args:
            sentences: Iterable of sentences, each a mutable list of token
                strings.  The sentence lists are modified in place.

        Returns:
            list: The same sentence lists, in input order.  Sentences without
            a negation, or without a database match, are returned unchanged.
        """
        outsentences = []
        for sentence in sentences:
            self._replace_in_sentence(sentence)
            outsentences.append(sentence)
        return outsentences

    def _replace_in_sentence(self, sentence):
        """Rewrite one tokenised sentence in place if it contains a negation."""
        nichtIndex, KeinIndex = self._find_negations(sentence)
        if nichtIndex is None and KeinIndex is None:
            # Nothing to do; in particular no search results exist that
            # could be evaluated for this sentence.
            return

        adjectives = self._tagged_adjectives(sentence)
        target = self._select_target(sentence, nichtIndex, KeinIndex, adjectives)
        if target is None:
            return
        word, Austauschindex = target

        query = self.stemmer.stem(word)
        # presumably each search returns (best matches, (row index, score));
        # only the second element is used here -- confirm against FASTsearch.
        _, matchindex1 = self.fsearch1.search_with_highest_multiplikation_Output(query, 1)
        _, matchindex2 = self.fsearch2.search_with_highest_multiplikation_Output(query, 1)

        # A hit in database 1 is answered from database 2 and vice versa,
        # because row i of one database is the opposite of row i of the other.
        opposites = []
        if matchindex1[1] >= 1:
            opposites.append(self.hkldbOpposites2[matchindex1[0]][0])
        if matchindex2[1] >= 1:
            opposites.append(self.hkldbOpposites1[matchindex2[0]][0])
        if not opposites:
            return

        for opposite in opposites:
            sentence[Austauschindex] = self._inflect_like(sentence[Austauschindex], opposite)

        if KeinIndex is not None:
            # Dropping the first character turns "kein..." into "ein...".
            sentence[KeinIndex] = sentence[KeinIndex][1:]
        if nichtIndex is not None:
            sentence.pop(nichtIndex)

    @staticmethod
    def _find_negations(sentence):
        """Return (index of last 'nicht', index of last 'kein...' token).

        Matching is case-sensitive; a missing negation yields ``None``.
        """
        nichtIndex = None
        KeinIndex = None
        for position, token in enumerate(sentence):
            if token == 'nicht':
                nichtIndex = position
            if token[:4] == 'kein':
                KeinIndex = position
        return nichtIndex, KeinIndex

    def _tagged_adjectives(self, sentence):
        """Return ``[text, index]`` for every spaCy token tagged ``AD*``.

        Comma tokens are not counted so that the index refers to the
        whitespace-separated input tokens rather than to spaCy tokens.
        """
        found = []
        position = 0
        for word in self.nlp(' '.join(sentence)):
            if word.text != ',':
                position += 1
            if word.tag_[:2] == 'AD':
                found.append([word.text, position - 1])
        return found

    @staticmethod
    def _select_target(sentence, nichtIndex, KeinIndex, adjectives):
        """Pick the word to replace; return ``(word, index)`` or ``None``.

        With exactly one tagged adjective/adverb that word is used.  Otherwise
        the neighbour of the negation is used; "nicht" takes precedence over
        "kein..." when both occur.  ``None`` means no valid position exists.
        """
        if len(adjectives) == 1:
            word, index = adjectives[0]
        elif nichtIndex is not None:
            # A trailing "nicht" negates the word before it.
            if nichtIndex == len(sentence) - 1:
                index = nichtIndex - 1
            else:
                index = nichtIndex + 1
            word = None
        else:
            offset = 1
            for n, (text, position) in enumerate(adjectives):
                follows_zu = (
                    n + 1 < len(adjectives)
                    and adjectives[n + 1][1] == KeinIndex + 3
                )
                if position == KeinIndex + 2 and text == 'zu' and follows_zu:
                    offset = 2
            index = KeinIndex + offset
            word = None

        # Guards sentences where the negation is the last/only token and
        # cases where spaCy's tokenisation disagrees with the input tokens.
        if not 0 <= index < len(sentence):
            return None
        if word is None:
            word = sentence[index]
        return word, index

    @classmethod
    def _inflect_like(cls, token, opposite):
        """Return *opposite* with the ending and trailing comma of *token*."""
        has_comma = token[-1:] == ','
        bare = token[:-1] if has_comma else token
        for ending in cls._ENDINGS:
            if bare.endswith(ending):
                opposite = opposite + ending
                break
        return opposite + ',' if has_comma else opposite
|