basabuuka_prototyp/Prototyp/SayYes.py

328 lines
13 KiB
Python
Raw Normal View History

2020-08-16 19:36:44 +02:00
# Class for converting negated ("nicht") statements into positive statements.
# Note: "nicht" + adjective can be replaced directly by the adjective's antonym.
# "nicht" + verb can be replaced by the verb's antonym, but then the complement has to be cut away,
# e.g. "Er ging nicht nach Hause." -> "Er blieb." ("nach Hause" must be dropped).
# See the {{Gegenwörter}} (antonyms) category in Wiktionary.
import spacy
import nltk
from nltk.stem.snowball import SnowballStemmer
import hickle as hkl
import FASTsearch
# Module-level Snowball stemmer configured for German.
stemmer = SnowballStemmer("german")
class SayYes(object):
    """Rewrite negated German sentences ("nicht ..." / "kein ...") positively.

    A sentence is a list of whitespace-separated tokens.  When it contains
    "nicht" or a token starting with "kein", the negated word is looked up in
    two parallel opposite databases (row ``i`` of database 1 is treated as the
    opposite of row ``i`` of database 2), replaced by its opposite, and the
    negation itself is removed ("nicht" is dropped, "kein..." loses its
    leading "k").
    """

    def __init__(self, hklDatabaseDir_Opposites, hklDatabaseDir_Opposites_All):
        """Load the spaCy model, the stemmer and optionally an opposites DB.

        Args:
            hklDatabaseDir_Opposites: Path of a hickle file that is loaded
                into ``self.OppositesDB``.  ``None`` skips loading; the
                attribute is then not created.
            hklDatabaseDir_Opposites_All: Kept for interface compatibility;
                this constructor does not use it.
        """
        if hklDatabaseDir_Opposites is not None:
            self.OppositesDB = hkl.load(hklDatabaseDir_Opposites)

        self.nlp = spacy.load('de_core_news_sm')
        self.stemmer = SnowballStemmer("german")

    def _with_stems(self, words):
        """Return ``[words[0]]`` followed by the stem of every word in *words*."""
        return [words[0]] + [self.stemmer.stem(word) for word in words]

    def create_hklDB_from_csv(self, csvDbDir):
        """Build the opposite databases from a text file and dump them as hickle.

        Every line of the file is evaluated as a Python expression; element 0
        and element 1 of the result are used as two sequences of words that
        are opposites of each other.  Three files are written to the current
        working directory: ``hkldbOpposites_All.hkl`` (the raw rows),
        ``hkldbOpposites1.hkl`` and ``hkldbOpposites2.hkl`` (first word plus
        stems of column 0 / column 1).

        Args:
            csvDbDir: Path of the input file.

        Returns:
            The string ``'done'``.
        """
        with open(csvDbDir) as lines:
            self.OppositesDB_All = []
            for line in lines:
                # NOTE(review): eval() executes arbitrary code found in the
                # file.  Only feed trusted files; if every row is a plain
                # literal, ast.literal_eval would be the safe replacement.
                self.OppositesDB_All.append(list(eval(line)))

        self.hkldbOpposites1 = []
        self.hkldbOpposites2 = []
        for counter, row in enumerate(self.OppositesDB_All, start=1):
            if counter % 1000 == 0:
                # Progress indicator for large inputs.
                print(counter)
            self.hkldbOpposites1.append(self._with_stems(row[0]))
            self.hkldbOpposites2.append(self._with_stems(row[1]))

        print('creating the hkl dump of OppositesDBAll')
        hkl.dump(self.OppositesDB_All, 'hkldbOpposites_All.hkl', mode='w', compression='gzip')
        print('done..')

        print('Creating the hkl dump of OppositesDB 1')
        hkl.dump(self.hkldbOpposites1, 'hkldbOpposites1.hkl', mode='w', compression='gzip')
        print('done..')

        print('Creating the hkl dump of OppositesDB 2')
        hkl.dump(self.hkldbOpposites2, 'hkldbOpposites2.hkl', mode='w', compression='gzip')
        print('done..')

        return 'done'

    def load_DB_into_FASTsearch(self):
        """Load both opposite databases and their FASTsearch indexes.

        Reads ``hkldbOpposites1.hkl`` / ``hkldbOpposites2.hkl`` and the
        matching bag-of-words files from the current working directory and
        stores the results on ``self`` (``hkldbOpposites1/2``,
        ``fsearch1/2``).

        NOTE(review): the bag-of-words files are only loaded here, never
        generated; presumably they were created beforehand with
        ``Gen_BoW_Model(3000, "word", punctuation=False)`` -- confirm.
        """
        self.hkldbOpposites1 = hkl.load('hkldbOpposites1.hkl')
        self.hkldbOpposites2 = hkl.load('hkldbOpposites2.hkl')

        self.fsearch1 = FASTsearch.FASTsearch('hkldbOpposites1.hkl')
        self.fsearch2 = FASTsearch.FASTsearch('hkldbOpposites2.hkl')

        self.fsearch1.Load_BoW_Model('bagofwordshkldbOpposites1.pkl', 'DataBaseOneZeroshkldbOpposites1.hkl')
        self.fsearch2.Load_BoW_Model('bagofwordshkldbOpposites2.pkl', 'DataBaseOneZeroshkldbOpposites2.hkl')

    def replaceOpposites(self, sentences):
        """Replace negated words by their opposites in every sentence.

        Args:
            sentences: Iterable of sentences, each a list of string tokens.
                The token lists are modified in place.

        Returns:
            A list containing the same (possibly modified) token lists, in
            input order.  Sentences without "nicht"/"kein..." or without a
            database match are returned unchanged.
        """
        outsentences = []
        for sentence in sentences:
            nichtIndex = None
            KeinIndex = None
            # The last occurrence of each negation word wins.
            for position, token in enumerate(sentence):
                if token == 'nicht':
                    nichtIndex = position
                if token[:4] == 'kein':
                    KeinIndex = position

            if KeinIndex is not None or nichtIndex is not None:
                self._rewrite_negation(sentence, KeinIndex, nichtIndex)

            outsentences.append(sentence)
        return outsentences

    def _tagged_adjectives(self, sentence):
        """Return ``[text, index]`` for every token whose tag starts with 'AD'.

        The index counts spaCy tokens but skips standalone commas, so that it
        lines up with the whitespace tokens of *sentence* (where a comma stays
        attached to the preceding word).
        """
        adjectives = []
        position = 0
        for word in self.nlp(' '.join(sentence)):
            if word.text != ',':
                position += 1
            if word.tag_[:2] == 'AD':
                adjectives.append([word.text, position - 1])
        return adjectives

    def _search_opposites(self, word):
        """Look up the stem of *word* in both databases; return both match indexes."""
        query = self.stemmer.stem(word)
        _, match1 = self.fsearch1.search_with_highest_multiplikation_Output(query, 1)
        _, match2 = self.fsearch2.search_with_highest_multiplikation_Output(query, 1)
        return match1, match2

    @staticmethod
    def _candidate_for_kein(sentence, adjectives, KeinIndex):
        """Pick the word negated by "kein..."; return ``(word, index)`` or None."""
        if len(adjectives) == 1:
            word, index = adjectives[0]
        else:
            offset = 1
            for n, (text, position) in enumerate(adjectives):
                if position == KeinIndex + 2 and text == 'zu':
                    # Guard n + 1: "zu" may be the last tagged token.
                    if n + 1 < len(adjectives) and adjectives[n + 1][1] == KeinIndex + 3:
                        offset = 2
            index = KeinIndex + offset
            if index >= len(sentence):
                # "kein..." ends the sentence: nothing follows to replace.
                return None
            word = sentence[index]
        if not 0 <= index < len(sentence):
            # spaCy tokenisation did not line up with the whitespace tokens.
            return None
        return word, index

    @staticmethod
    def _candidate_for_nicht(sentence, adjectives, nichtIndex):
        """Pick the word negated by "nicht"; return ``(word, index)`` or None."""
        if len(adjectives) == 1:
            word, index = adjectives[0]
        else:
            # Use the following word, or the preceding one when "nicht" is last.
            if nichtIndex == len(sentence) - 1:
                index = nichtIndex - 1
            else:
                index = nichtIndex + 1
            if index < 0:
                # The sentence consists of "nicht" only.
                return None
            word = sentence[index]
        if not 0 <= index < len(sentence):
            # spaCy tokenisation did not line up with the whitespace tokens.
            return None
        return word, index

    @staticmethod
    def _inflect_like(original, opposite):
        """Give *opposite* the ending ('es', 'er', 'em' or 'e') and the
        trailing comma of *original*."""
        has_comma = original.endswith(',')
        core = original[:-1] if has_comma else original
        for ending in ('es', 'er', 'em'):
            if core.endswith(ending):
                opposite = opposite + ending
                break
        else:
            if core.endswith('e'):
                opposite = opposite + 'e'
        return opposite + ',' if has_comma else opposite

    def _rewrite_negation(self, sentence, KeinIndex, nichtIndex):
        """Replace the negated word of one sentence in place, if a match exists."""
        adjectives = self._tagged_adjectives(sentence)

        # When both negations occur, the "nicht" lookup overrides the "kein" one.
        found = None
        if KeinIndex is not None:
            candidate = self._candidate_for_kein(sentence, adjectives, KeinIndex)
            if candidate is not None:
                found = (self._search_opposites(candidate[0]), candidate[1])
        if nichtIndex is not None:
            candidate = self._candidate_for_nicht(sentence, adjectives, nichtIndex)
            if candidate is not None:
                found = (self._search_opposites(candidate[0]), candidate[1])
        if found is None:
            return

        (match1, match2), target = found
        # match[0] is used as the database row, match[1] is compared against 1
        # (presumably a match score -- confirm against FASTsearch).  A hit in
        # one database yields the head word of the same row in the other.
        opposites = []
        if match1[1] >= 1:
            opposites.append(self.hkldbOpposites2[match1[0]][0])
        if match2[1] >= 1:
            opposites.append(self.hkldbOpposites1[match2[0]][0])

        # Applied sequentially: with two hits, the second one wins.
        for opposite in opposites:
            sentence[target] = self._inflect_like(sentence[target], opposite)

        if opposites:
            if KeinIndex is not None:
                # "kein" -> "ein", "keine" -> "eine", ...
                sentence[KeinIndex] = sentence[KeinIndex][1:]
            if nichtIndex is not None:
                sentence.pop(nichtIndex)