|
|
- # Class for converting negated ("nicht" / "kein") statements into positive statements.
-
-
- # Note: "nicht" + adjective can be replaced directly by the adjective's antonym.
- # "nicht" + verb can be replaced by the verb's antonym, but then the rest of the phrase has to be cut, e.g. "nach Hause" in "Er ging nicht nach Hause." -> "Er blieb."
-
-
- # See the {{Gegenwörter}} (antonyms) category in Wiktionary.
-
- import spacy
- import nltk
- from nltk.stem.snowball import SnowballStemmer
-
- import hickle as hkl
- import FASTsearch
-
- stemmer = SnowballStemmer("german")  # module-level German Snowball stemmer, created once at import time
-
-
class SayYes(object):
    """Rewrite negated German sentences as positive statements.

    A sentence is a list of string tokens.  When it contains "nicht" or a
    token starting with "kein", one word is looked up in two antonym
    databases and, on a hit, replaced by its opposite while the negation is
    removed ("nicht" is dropped, "kein..." loses its leading "k").

    Attributes set by the methods of this class:
        OppositesDB: data loaded from the path given to ``__init__`` or None.
        nlp: spaCy pipeline ``de_core_news_sm``.
        stemmer: German Snowball stemmer.
        OppositesDB_All: rows read by ``create_hklDB_from_csv``.
        hkldbOpposites1 / hkldbOpposites2: per row, the first word of the
            respective column followed by the stems of all its words.
        fsearch1 / fsearch2: FASTsearch indexes over the two lists above.
    """

    # Inflection endings carried over from the replaced word to its opposite.
    # They are mutually exclusive (different last letter), so order is free.
    _ENDINGS = ('es', 'er', 'em', 'e')

    def __init__(self, hklDatabaseDir_Opposites, hklDatabaseDir_Opposites_All):
        """Load the optional opposites database, the spaCy model and a stemmer.

        Args:
            hklDatabaseDir_Opposites: path of a hickle file, or None to skip
                loading (``OppositesDB`` is then None).
            hklDatabaseDir_Opposites_All: accepted for interface
                compatibility; not used by this constructor.
        """
        # Always define the attribute so later reads cannot hit AttributeError.
        self.OppositesDB = None
        if hklDatabaseDir_Opposites is not None:
            self.OppositesDB = hkl.load(hklDatabaseDir_Opposites)

        self.nlp = spacy.load('de_core_news_sm')
        self.stemmer = SnowballStemmer("german")

    def _surface_and_stems(self, words):
        """Return ``[words[0]]`` followed by the stem of every word in *words*."""
        return [words[0]] + [self.stemmer.stem(word) for word in words]

    def create_hklDB_from_csv(self, csvDbDir):
        """Build the three hickle dumps from a text file of opposite pairs.

        Every line of *csvDbDir* is evaluated as a Python expression whose
        items [0] and [1] are sequences of words (the two sides of an
        opposite pair).  Writes ``hkldbOpposites_All.hkl``,
        ``hkldbOpposites1.hkl`` and ``hkldbOpposites2.hkl`` into the current
        working directory.

        Returns:
            The string ``'done'``.
        """
        with open(csvDbDir) as lines:
            # SECURITY: eval() executes arbitrary code found in the file.
            # Only feed trusted files; if every line is a plain literal,
            # ast.literal_eval would be the safe replacement.
            self.OppositesDB_All = [list(eval(line)) for line in lines]

        self.hkldbOpposites1 = []
        self.hkldbOpposites2 = []

        for counter, pair in enumerate(self.OppositesDB_All, start=1):
            # Progress output every 1000 rows.
            if counter % 1000 == 0:
                print(counter)

            # Both columns use the instance stemmer (the second column used
            # to go through a separate module-level stemmer object).
            self.hkldbOpposites1.append(self._surface_and_stems(pair[0]))
            self.hkldbOpposites2.append(self._surface_and_stems(pair[1]))

        print('creating the hkl dump of OppositesDBAll')
        hkl.dump(self.OppositesDB_All, 'hkldbOpposites_All.hkl', mode='w', compression='gzip')
        print('done..')

        print('Creating the hkl dump of OppositesDB 1')
        hkl.dump(self.hkldbOpposites1, 'hkldbOpposites1.hkl', mode='w', compression='gzip')
        print('done..')

        print('Creating the hkl dump of OppositesDB 2')
        hkl.dump(self.hkldbOpposites2, 'hkldbOpposites2.hkl', mode='w', compression='gzip')
        print('done..')

        return 'done'

    def load_DB_into_FASTsearch(self):
        """Load both opposite lists and their FASTsearch bag-of-words models.

        Reads ``hkldbOpposites1.hkl`` / ``hkldbOpposites2.hkl`` and the
        matching ``bagofwords*.pkl`` / ``DataBaseOneZeros*.hkl`` files from
        the current working directory.  The models are only loaded here, not
        generated, so those files must already exist.
        """
        self.hkldbOpposites1 = hkl.load('hkldbOpposites1.hkl')
        self.hkldbOpposites2 = hkl.load('hkldbOpposites2.hkl')

        self.fsearch1 = FASTsearch.FASTsearch('hkldbOpposites1.hkl')
        self.fsearch2 = FASTsearch.FASTsearch('hkldbOpposites2.hkl')

        self.fsearch1.Load_BoW_Model('bagofwordshkldbOpposites1.pkl', 'DataBaseOneZeroshkldbOpposites1.hkl')
        self.fsearch2.Load_BoW_Model('bagofwordshkldbOpposites2.pkl', 'DataBaseOneZeroshkldbOpposites2.hkl')

    @staticmethod
    def _find_negations(sentence):
        """Return ``(nicht_index, kein_index)``; the last occurrence wins, None if absent."""
        nicht_index = None
        kein_index = None
        for position, token in enumerate(sentence):
            if token == 'nicht':
                nicht_index = position
            # Prefix test: matches "kein", "keine", "keinen", ... (lower case only).
            if token[:4] == 'kein':
                kein_index = position
        return nicht_index, kein_index

    def _collect_adjectives(self, sentence):
        """Return ``[text, sentence_index]`` for every token whose tag starts with 'AD'."""
        adjectives = []
        position = -1
        for word in self.nlp(' '.join(sentence)):
            # A comma token does not advance the position, so the index keeps
            # pointing into the original token list where a comma stays
            # attached to the word before it.
            if word.text != ',':
                position += 1
            if word.tag_[:2] == 'AD':
                adjectives.append([word.text, position])
        return adjectives

    @staticmethod
    def _pick_target(sentence, adjectives, nicht_index, kein_index):
        """Choose the word to replace.

        Returns:
            ``(query_word, sentence_index)`` or None when the chosen index
            falls outside the sentence.
        """
        if len(adjectives) == 1:
            # Exactly one adjective: that is the word to flip.
            query, target = adjectives[0]
        else:
            if nicht_index is not None:
                # The "nicht" rule takes precedence over the "kein" rule.
                # Use the neighbour of "nicht": the following word, or the
                # preceding one when "nicht" closes the sentence.
                if nicht_index == len(sentence) - 1:
                    target = nicht_index - 1
                else:
                    target = nicht_index + 1
            else:
                offset = 1
                for n, (text, position) in enumerate(adjectives):
                    if position == kein_index + 2 and text == 'zu':
                        # Bounds check: "zu" may be the last adjective entry.
                        if n + 1 < len(adjectives) and adjectives[n + 1][1] == kein_index + 3:
                            offset = 2
                target = kein_index + offset
            if not 0 <= target < len(sentence):
                return None
            query = sentence[target]

        if not 0 <= target < len(sentence):
            return None
        return query, target

    def _lookup_opposites(self, query):
        """Return the opposites found for *query*, database 1 hit first."""
        stem = self.stemmer.stem(query)
        # Each search returns (matches, match_info); match_info[0] is used as
        # a row index and match_info[1] must be >= 1 to count as a hit
        # (presumably a match score -- confirm against FASTsearch).
        _, match1 = self.fsearch1.search_with_highest_multiplikation_Output(stem, 1)
        _, match2 = self.fsearch2.search_with_highest_multiplikation_Output(stem, 1)

        opposites = []
        if match1[1] >= 1:
            # Found in list 1 -> the opposite is the same row of list 2.
            opposites.append(self.hkldbOpposites2[match1[0]][0])
        if match2[1] >= 1:
            # Found in list 2 -> the opposite is the same row of list 1.
            opposites.append(self.hkldbOpposites1[match2[0]][0])
        return opposites

    def _replace_in_sentence(self, sentence, nicht_index, kein_index):
        """Flip one word of *sentence* in place and remove the negation."""
        adjectives = self._collect_adjectives(sentence)
        picked = self._pick_target(sentence, adjectives, nicht_index, kein_index)
        if picked is None:
            return
        query, target = picked

        opposites = self._lookup_opposites(query)
        if not opposites:
            # No opposite known: keep the sentence, including its negation.
            return

        # Take comma and inflection ending from the ORIGINAL token so that a
        # second hit is not inflected from the first replacement.
        original = sentence[target]
        has_comma = original.endswith(',')
        bare = original[:-1] if has_comma else original
        ending = next((e for e in self._ENDINGS if bare.endswith(e)), '')

        # When both databases hit, the later entry wins.
        sentence[target] = opposites[-1] + ending + (',' if has_comma else '')

        if kein_index is not None:
            # "kein..." -> "ein..."
            sentence[kein_index] = sentence[kein_index][1:]
        if nicht_index is not None:
            sentence.pop(nicht_index)

    def replaceOpposites(self, sentences):
        """Replace negated words by their opposites in every sentence.

        Args:
            sentences: iterable of sentences, each a list of string tokens.
                The lists are modified in place.

        Returns:
            A list with the same sentence objects in the same order.
            Sentences without "nicht" / "kein..." or without a known
            opposite are left unchanged.
        """
        outsentences = []
        for sentence in sentences:
            nicht_index, kein_index = self._find_negations(sentence)
            # Only sentences that actually contain a negation are analysed,
            # so no lookup result can leak from one sentence to the next.
            if nicht_index is not None or kein_index is not None:
                self._replace_in_sentence(sentence, nicht_index, kein_index)
            outsentences.append(sentence)
        return outsentences
|