# Class for converting negated statements ("nicht ..." / "kein ...") into
# positive statements.
#
# Notes:
# - "nicht" + adjective can be replaced directly by the adjective's antonym.
# - "nicht" + verb can be replaced by the verb's antonym, but then the rest of
#   the phrase has to be cut away, e.g. "Er ging nicht nach Hause." -> "Er blieb".
# - Antonyms are listed in the {{Gegenwörter}} category on Wiktionary.

import spacy
import nltk
from nltk.stem.snowball import SnowballStemmer

import hickle as hkl

import FASTsearch

stemmer = SnowballStemmer("german")


class SayYes(object):
    """Rewrite German negations by swapping a word for its opposite.

    The opposites come from two row-aligned hickle databases
    (``hkldbOpposites1.hkl`` / ``hkldbOpposites2.hkl``): a match in row *i* of
    one database is replaced by the first entry of row *i* of the other one.
    """

    def __init__(self, hklDatabaseDir_Opposites, hklDatabaseDir_Opposites_All):
        """Load the optional opposites database, the spaCy model and a stemmer.

        Args:
            hklDatabaseDir_Opposites: Path of a hickle file that is loaded into
                ``self.OppositesDB``; pass ``None`` to skip loading.
            hklDatabaseDir_Opposites_All: Accepted for interface compatibility;
                it is not used by this class.
        """
        if hklDatabaseDir_Opposites is not None:
            self.OppositesDB = hkl.load(hklDatabaseDir_Opposites)

        self.nlp = spacy.load('de_core_news_sm')
        self.stemmer = SnowballStemmer("german")

    def create_hklDB_from_csv(self, csvDbDir):
        """Build the three hickle dumps of the opposites database from a file.

        Every non-blank line of ``csvDbDir`` is evaluated as a Python
        expression; item 0 and item 1 of the result are treated as two word
        sequences (presumably a word group and its opposites -- confirm
        against the data file). For both sequences a row
        ``[first word, stem(word_0), stem(word_1), ...]`` is stored.

        Writes ``hkldbOpposites_All.hkl``, ``hkldbOpposites1.hkl`` and
        ``hkldbOpposites2.hkl`` into the current working directory.

        Returns:
            The string ``'done'``.
        """
        self.OppositesDB_All = []
        with open(csvDbDir) as lines:
            for line in lines:
                # Blank lines cannot be evaluated; skip them instead of crashing.
                if not line.strip():
                    continue
                # SECURITY: eval() executes arbitrary code contained in the
                # file. Only feed trusted files to this method; if every line
                # is a plain literal, ast.literal_eval would be the safe choice.
                self.OppositesDB_All.append(list(eval(line)))

        stem = self.stemmer.stem
        self.hkldbOpposites1 = []
        self.hkldbOpposites2 = []
        for counter, entry in enumerate(self.OppositesDB_All, start=1):
            # Progress output for large databases.
            if counter % 1000 == 0:
                print(counter)
            words, opposites = entry[0], entry[1]
            self.hkldbOpposites1.append([words[0]] + [stem(word) for word in words])
            self.hkldbOpposites2.append([opposites[0]] + [stem(word) for word in opposites])

        print('creating the hkl dump of OppositesDBAll')
        hkl.dump(self.OppositesDB_All, 'hkldbOpposites_All.hkl', mode='w', compression='gzip')
        print('done..')

        print('Creating the hkl dump of OppositesDB 1')
        hkl.dump(self.hkldbOpposites1, 'hkldbOpposites1.hkl', mode='w', compression='gzip')
        print('done..')

        print('Creating the hkl dump of OppositesDB 2')
        hkl.dump(self.hkldbOpposites2, 'hkldbOpposites2.hkl', mode='w', compression='gzip')
        print('done..')

        return 'done'

    def load_DB_into_FASTsearch(self):
        """Load both opposites databases and their FASTsearch indexes.

        Reads ``hkldbOpposites1.hkl`` / ``hkldbOpposites2.hkl`` and the
        previously generated bag-of-words model files from the current working
        directory. The model files must already exist; earlier revisions of
        this method generated them with
        ``Gen_BoW_Model(3000, "word", punctuation=False)``.
        """
        self.hkldbOpposites1 = hkl.load('hkldbOpposites1.hkl')
        self.hkldbOpposites2 = hkl.load('hkldbOpposites2.hkl')

        self.fsearch1 = FASTsearch.FASTsearch('hkldbOpposites1.hkl')
        self.fsearch2 = FASTsearch.FASTsearch('hkldbOpposites2.hkl')

        self.fsearch1.Load_BoW_Model('bagofwordshkldbOpposites1.pkl', 'DataBaseOneZeroshkldbOpposites1.hkl')
        self.fsearch2.Load_BoW_Model('bagofwordshkldbOpposites2.pkl', 'DataBaseOneZeroshkldbOpposites2.hkl')

    def replaceOpposites(self, sentences):
        """Replace negated words by their opposites in every sentence.

        Args:
            sentences: Iterable of sentences, each a mutable list of word
                strings. The lists are modified in place.

        Returns:
            A list containing the same (now possibly modified) sentence lists,
            in input order. Sentences without 'nicht' / 'kein...' or without a
            database match are returned unchanged.
        """
        outsentences = []
        for sentence in sentences:
            self._replace_in_sentence(sentence)
            outsentences.append(sentence)
        return outsentences

    def _replace_in_sentence(self, sentence):
        """Rewrite one tokenised sentence in place (see ``replaceOpposites``)."""
        # The last occurrence of each negation word wins.
        nichtIndex = None
        KeinIndex = None
        for m, token in enumerate(sentence):
            if token == 'nicht':
                nichtIndex = m
            # NOTE: prefix match, so every token starting with 'kein' counts.
            if token[:4] == 'kein':
                KeinIndex = m

        # Nothing negated: leave the sentence alone. This also guarantees that
        # no search result of an earlier sentence can leak into this one.
        if KeinIndex is None and nichtIndex is None:
            return

        adjectives = self._collect_adjectives(sentence)
        target = self._pick_target(sentence, KeinIndex, nichtIndex, adjectives)
        if target is None:
            return
        Austauschindex, query = target

        # NOTE(review): presumably the second return value is
        # (row index, score) -- confirm against FASTsearch.
        stemmed = self.stemmer.stem(query)
        _, matchindex1 = self.fsearch1.search_with_highest_multiplikation_Output(stemmed, 1)
        _, matchindex2 = self.fsearch2.search_with_highest_multiplikation_Output(stemmed, 1)

        # A hit in one database is answered with the same row of the other.
        listOfOpposites = []
        if matchindex1[1] >= 1:
            listOfOpposites.append(self.hkldbOpposites2[matchindex1[0]][0])
        if matchindex2[1] >= 1:
            listOfOpposites.append(self.hkldbOpposites1[matchindex2[0]][0])
        if not listOfOpposites:
            return

        # If both databases matched, the later match wins. The ending is always
        # copied from the ORIGINAL word, never from an intermediate replacement.
        sentence[Austauschindex] = self._inflect_like(
            listOfOpposites[-1], sentence[Austauschindex]
        )

        if KeinIndex is not None:
            # 'kein' -> 'ein', 'keine' -> 'eine', ...
            sentence[KeinIndex] = sentence[KeinIndex][1:]
        if nichtIndex is not None:
            sentence.pop(nichtIndex)

    def _collect_adjectives(self, sentence):
        """Return ``[text, position]`` for every spaCy token tagged 'AD...'.

        Positions count spaCy tokens but skip ',' tokens.
        NOTE(review): presumably this is meant to line the positions up with
        the indices of ``sentence`` (commas attached to words); other
        punctuation can still shift them -- confirm against real input.
        """
        adjectives = []
        position = -1
        for word in self.nlp(' '.join(sentence)):
            if word.text != ',':
                position += 1
            if word.tag_[:2] == 'AD':
                adjectives.append([word.text, position])
        return adjectives

    @staticmethod
    def _pick_target(sentence, KeinIndex, nichtIndex, adjectives):
        """Choose the word to replace; return ``(index, query word)`` or ``None``.

        With exactly one adjective in the sentence that adjective is the
        target. Otherwise a neighbour of the negation word is used. When both
        'nicht' and 'kein...' occur, 'nicht' decides the target.
        """
        if len(adjectives) == 1:
            query, index = adjectives[0]
        elif nichtIndex is not None:
            # Word after 'nicht', or the word before it if 'nicht' is last.
            if nichtIndex == len(sentence) - 1:
                index = nichtIndex - 1
            else:
                index = nichtIndex + 1
            query = None
        else:
            # Word after 'kein...'; one further if a 'zu' adjective entry at
            # KeinIndex + 2 is directly followed by one at KeinIndex + 3.
            offset = 1
            for n, (text, position) in enumerate(adjectives):
                if (
                    position == KeinIndex + 2
                    and text == 'zu'
                    and n + 1 < len(adjectives)
                    and adjectives[n + 1][1] == KeinIndex + 3
                ):
                    offset = 2
            index = KeinIndex + offset
            query = None

        # No usable neighbour / misaligned spaCy position: replace nothing.
        if not 0 <= index < len(sentence):
            return None
        if query is None:
            query = sentence[index]
        return index, query

    @staticmethod
    def _inflect_like(opposite, original):
        """Copy the ending ('es', 'er', 'em', 'e') and a trailing comma of
        ``original`` onto ``opposite``.

        NOTE: purely suffix based; an uninflected word that merely ends in
        one of these letters (e.g. 'teuer') is treated as inflected.
        """
        has_comma = original.endswith(',')
        core = original[:-1] if has_comma else original
        for ending in ('es', 'er', 'em', 'e'):
            if core.endswith(ending):
                opposite = opposite + ending
                break
        return opposite + ',' if has_comma else opposite