"""Passiv2Aktiv: rewrite German passive-voice sentences into active voice.

Relies on pre-built hickle databases of verb conjugations (active,
Vorgangspassiv, Zustandspassiv), FASTsearch indices over them, grammar
utilities (GS_Utils) and a sentence-gluing SGD model (SentGlue).
"""

import spacy
import nltk
from nltk.stem.snowball import SnowballStemmer
import hickle as hkl
import FASTsearch

stemmer = SnowballStemmer("german")

# Finite forms of the passive auxiliaries.  A sentence whose only verb is one
# of these cannot be a full passive construction, so that verb is masked with
# a nonsense token before the database lookup.
PASSIVE_AUX_FORMS = ('wurde', 'wird', 'werden', 'wirst', 'werde', 'war')


class Passiv2Aktiv(object):
    """Converter from German passive sentences to active voice."""

    def __init__(self, hklDatabaseDir_Aktiv, hklDatabaseDir_Vorgangspassiv,
                 hklDatabaseDir_Zustandspassiv):
        """Load the conjugation databases (each argument may be None to skip
        it), the German spacy model and a Snowball stemmer."""
        if hklDatabaseDir_Aktiv is not None:
            self.AktivDB = hkl.load(hklDatabaseDir_Aktiv)
        if hklDatabaseDir_Vorgangspassiv is not None:
            self.VorgangspassivDB = hkl.load(hklDatabaseDir_Vorgangspassiv)
        if hklDatabaseDir_Zustandspassiv is not None:
            self.ZustandspassivDB = hkl.load(hklDatabaseDir_Zustandspassiv)
        self.nlp = spacy.load('de_core_news_sm')
        self.stemmer = SnowballStemmer("german")

    def create_hklDB_from_csv(self, csvDbDir, StemOrNot):
        """Read a csv database (one Python tuple literal per line) and dump it
        as three hkl files: the full database plus one file per column.

        :param csvDbDir: path of the csv file; its name (minus '.csv') is
            embedded in the output file names.
        :param StemOrNot: accepted for interface compatibility; not used here.
        :return: the string 'done'.
        """
        with open(csvDbDir) as lines:
            self.DB_All = []
            for line in lines:
                # SECURITY: eval() executes arbitrary code from the csv file.
                # Only use trusted database files (ast.literal_eval would be
                # the safe alternative).
                self.DB_All.append(list(eval(line)))
        self.hkldb1 = []
        self.hkldb2 = []
        # Column 0 holds the conjugated forms, column 1 the grammar
        # description; progress is reported every 1000 entries.
        for counter, entry in enumerate(self.DB_All, 1):
            if counter % 1000 == 0:
                print(counter)
            self.hkldb1.append([entry[0][0]])
            self.hkldb2.append([entry[1][0]])
        print('creating the hkl dump of DBAll')
        hkl.dump(self.DB_All, 'hkldb_All' + csvDbDir[:-4] + '.hkl', mode='w', compression='lzf')
        print('Creating the hkl dump of DB 1')
        hkl.dump(self.hkldb1, 'hkldb1' + csvDbDir[:-4] + '.hkl', mode='w', compression='lzf')
        print('Creating the hkl dump of DB 2')
        hkl.dump(self.hkldb2, 'hkldb2' + csvDbDir[:-4] + '.hkl', mode='w', compression='lzf')
        return 'done'

    def load_DB_into_FASTsearch(self):
        """Load the dumped hkl databases and initialise one FASTsearch index
        pair (verb forms / grammar descriptions) per voice, plus the grammar
        utilities and the sentence-gluing SGD model."""
        self.hkldbAktiv_All = hkl.load('hkldb_AllAktiv.hkl')
        self.hkldbVorgangspassiv_All = hkl.load('hkldb_AllVorgangspassiv.hkl')
        self.hkldbZustandspassiv_All = hkl.load('hkldb_AllZustandspassiv.hkl')

        self.fsearchAktiv1 = FASTsearch.FASTsearch('hkldb1Aktiv.hkl')
        self.fsearchAktiv2 = FASTsearch.FASTsearch('hkldb2Aktiv.hkl')
        # Generating the BoW models is only necessary the first time:
        # self.fsearchAktiv1.Gen_BoW_Model(20000, "word", punctuation = False)
        # self.fsearchAktiv2.Gen_BoW_Model(20000, "word", punctuation = False)
        self.fsearchAktiv1.Load_BoW_Model('bagofwordshkldb1Aktiv.pkl', 'DataBaseOneZeroshkldb1Aktiv.hkl')
        self.fsearchAktiv2.Load_BoW_Model('bagofwordshkldb2Aktiv.pkl', 'DataBaseOneZeroshkldb2Aktiv.hkl')

        self.fsearchVorgangspassiv1 = FASTsearch.FASTsearch('hkldb1Vorgangspassiv.hkl')
        self.fsearchVorgangspassiv2 = FASTsearch.FASTsearch('hkldb2Vorgangspassiv.hkl')
        # Uncomment if the models are not there yet:
        # self.fsearchVorgangspassiv1.Gen_BoW_Model(20000, "word", punctuation = False)
        # self.fsearchVorgangspassiv2.Gen_BoW_Model(20000, "word", punctuation = False)
        self.fsearchVorgangspassiv1.Load_BoW_Model('bagofwordshkldb1Vorgangspassiv.pkl', 'DataBaseOneZeroshkldb1Vorgangspassiv.hkl')
        self.fsearchVorgangspassiv2.Load_BoW_Model('bagofwordshkldb2Vorgangspassiv.pkl', 'DataBaseOneZeroshkldb2Vorgangspassiv.hkl')

        self.fsearchZustandspassiv1 = FASTsearch.FASTsearch('hkldb1Zustandspassiv.hkl')
        self.fsearchZustandspassiv2 = FASTsearch.FASTsearch('hkldb2Zustandspassiv.hkl')
        # self.fsearchZustandspassiv1.Gen_BoW_Model(20000, "word", punctuation = False)
        # self.fsearchZustandspassiv2.Gen_BoW_Model(20000, "word", punctuation = False)
        self.fsearchZustandspassiv1.Load_BoW_Model('bagofwordshkldb1Zustandspassiv.pkl', 'DataBaseOneZeroshkldb1Zustandspassiv.hkl')
        self.fsearchZustandspassiv2.Load_BoW_Model('bagofwordshkldb2Zustandspassiv.pkl', 'DataBaseOneZeroshkldb2Zustandspassiv.hkl')

        # Imported lazily, as in the original: these heavy modules are only
        # needed once the search indices exist.
        import GS_Utils
        self.gs = GS_Utils.GS_Utils('de_core_news_sm')

        from SentGlue import SentGlueMach
        self.sgm = SentGlueMach('trainedSGD.pkl', 'bagofwords.pkl')
        self.sgm.initialize()

    def replacePassivForms(self, sentences):
        """Rewrite every passive sentence in *sentences* into active voice.

        :param sentences: list of sentences, each a list of word strings.
        :return: list of result sentences (lists of words); sentences not
            recognised as passive, or raising an error, pass through
            unchanged.
        """
        endsentences = []
        for sentence in sentences:
            try:
                doc = self.nlp(' '.join(sentence))
                words = []          # token texts, parallel to 1-based positions
                verbs = []          # tokens whose tag starts with 'V'
                subjects = []       # texts of tokens with subject dependency
                subjectindex = []   # 1-based positions of the subjects
                undindex = []       # 1-based positions of 'und'
                position = 0
                for token in doc:
                    position += 1
                    words.append(token.text)
                    if token.dep_ == 'sb':
                        subjects.append(token.text)
                        subjectindex.append(position)
                    if token.text == 'und':
                        undindex.append(position)
                    if token.tag_[0] == 'V':
                        verbs.append(token.text)

                # A lone auxiliary is no passive: mask it so the database
                # lookup cannot match.  (The original compared against
                # ('wurde' or 'wird' or ...), which evaluates to just
                # 'wurde' -- fixed to a real membership test.)
                if len(verbs) == 1 and verbs[0] in PASSIVE_AUX_FORMS:
                    verbs[0] = 'bliblablubdudidu'

                # Database search keys look like "<verb forms> <char length>".
                verbstring = ' '.join(verbs)
                verbstring += ' ' + str(len(verbstring))

                _, match_z = self.fsearchZustandspassiv1.search_with_highest_multiplikation_Output(verbstring, 1)
                _, match_v = self.fsearchVorgangspassiv1.search_with_highest_multiplikation_Output(verbstring, 1)

                vorgang = 0
                zustand = 0
                workindex = None
                if (len(verbs) + 1) == match_v[1]:
                    workindex = match_v[0]
                    vorgang = 1
                if (len(verbs) + 1) == match_z[1]:
                    # NOTE: when both voices match, the Zustandspassiv index
                    # wins -- kept from the original code.
                    workindex = match_z[0]
                    zustand = 1

                # Reject a match whose stored form has a different word count
                # than the sentence's verb string.
                if vorgang == 1:
                    completeform = self.hkldbVorgangspassiv_All[workindex]
                    if len(verbstring.split()) != len(completeform[0][0].split()):
                        vorgang = 0
                if vorgang == 1:
                    endsentences.append(self._to_active(
                        sentence, self.hkldbVorgangspassiv_All, workindex,
                        verbstring, subjects, subjectindex, undindex, words))

                if zustand == 1:
                    completeform = self.hkldbZustandspassiv_All[workindex]
                    if len(verbstring.split()) != len(completeform[0][0].split()):
                        zustand = 0
                if zustand == 1:
                    endsentences.append(self._to_active(
                        sentence, self.hkldbZustandspassiv_All, workindex,
                        verbstring, subjects, subjectindex, undindex, words))

                if zustand == 0 and vorgang == 0:
                    # Not recognised as passive: pass through unchanged.
                    endsentences.append(sentence)
            except Exception:
                print('the sentence ' + str(sentence) + ' caused an error in the module passive2active')
                # Guard against an empty result list: the original indexed
                # endsentences[-1] unconditionally and could raise here.
                if not endsentences or endsentences[-1] != sentence:
                    endsentences.append(sentence)
        return endsentences

    def _to_active(self, sentence, passive_db, workindex, verbstring,
                   subjects, subjectindex, undindex, words):
        """Convert one matched passive *sentence* (mutated in place) to active
        voice using *passive_db* and return the best re-ordered word list.

        Replaces the two near-identical Vorgangs-/Zustandspassiv code paths of
        the original implementation.
        """
        # Last two words of the grammar description of the matched form
        # (e.g. tense/mood); look up the corresponding active
        # '3. Person Singular' conjugation.
        formToReplace = passive_db[workindex][1][0].split()[-2:]
        formToReplace = '3. Person Singular ' + ' '.join(formToReplace)
        activeindex = self.fsearchAktiv2.search_with_highest_multiplikation_Output(formToReplace, 1)[0]
        thirdPersonActive = ' '.join(self.hkldbAktiv_All[activeindex[0]][0][0].split()[:-1])

        # Remove the passive verb forms (trailing length token excluded).
        for verb in verbstring.split()[:-1]:
            sentence.remove(verb)

        self._subjects_to_accusative(sentence, subjects, subjectindex, undindex, words)
        sentence.append(thirdPersonActive)
        self._rewrite_agent_phrase(sentence)
        return self._pick_best_order(sentence)

    def _subjects_to_accusative(self, sentence, subjects, subjectindex, undindex, words):
        """Replace nominative subject pronouns in *sentence* (in place) by
        their accusative forms, appended at the end of the word list."""
        has_er = 0
        # A subject co-ordinated with 'er' via 'und' also forces 'ihn'.
        # (The original compared the integer position itself against the
        # string 'er', which can never be true; comparing the token text at
        # that position is the evident intent -- fixed, with bounds checks.)
        for index in subjectindex:
            for ind in undindex:
                if index - 1 == ind and index - 3 >= 0 and words[index - 3] in ('er', 'Er'):
                    has_er = 1
                if index + 1 == ind and index + 1 < len(words) and words[index + 1] in ('er', 'Er'):
                    has_er = 1
        if 'er' in subjects or 'Er' in subjects:
            has_er = 1
        if has_er == 1:
            try:
                sentence.remove('Er')
            except ValueError:
                sentence.remove('er')
            sentence.append('ihn')
        for nominative, accusative in (('ich', 'mich'), ('du', 'dich'),
                                       ('ihr', 'euch'), ('wir', 'uns')):
            if nominative in subjects:
                sentence.remove(nominative)
                sentence.append(accusative)

    def _rewrite_agent_phrase(self, sentence):
        """Turn a 'von X' / 'vom X' agent phrase into the new subject
        (mutating *sentence* in place); append 'jemand' when the sentence
        names no agent at all."""
        needs_jemand = True
        todelete = []
        for n in range(len(sentence) - 1):
            if sentence[n] == 'von':
                if sentence[n + 1] == 'ihr':
                    sentence[n + 1] = 'sie'
                    todelete.append(n)
                    needs_jemand = False
                if sentence[n + 1] == 'ihm':
                    sentence[n + 1] = 'er'
                    todelete.append(n)
                    needs_jemand = False
                # 'von' + noun: drop the 'von' and keep the noun as subject.
                # Reuses self.nlp instead of re-running spacy.load() for every
                # phrase, which the original did inside this loop.
                # NOTE(review): for n == 0 this inspects sentence[-1], the
                # last word -- behaviour kept from the original.
                before = self.nlp(sentence[n - 1])
                after = self.nlp(sentence[n + 1])
                for w1 in before:
                    if w1.tag_ != 'NN' and w1.tag_ != 'NE':
                        for w2 in after:
                            if w2.tag_ == 'NN' or w2.tag_ == 'NE':
                                todelete.append(n)
                                needs_jemand = False
                if sentence[n + 1] == 'dem' or sentence[n + 1] == 'einem':
                    for w in self.nlp(sentence[n - 1]):
                        if w.tag_ != 'NN' and w.tag_ != 'NE':
                            sentence[n + 1] = 'ein'
                            todelete.append(n)
                            needs_jemand = False
                if sentence[n + 1] == 'der' or sentence[n + 1] == 'einer':
                    for w in self.nlp(sentence[n - 1]):
                        if w.tag_ != 'NN' and w.tag_ != 'NE':
                            sentence[n + 1] = 'eine'
                            todelete.append(n)
                            needs_jemand = False
            if sentence[n] == 'vom':
                # 'vom X' -> 'ein X': keep the agent as indefinite subject.
                sentence[n] = 'ein'
                needs_jemand = False
        # Delete back-to-front so earlier indices stay valid.
        for idx in todelete[::-1]:
            del sentence[idx]
        if needs_jemand:
            sentence.append('jemand')

    def _pick_best_order(self, sentence):
        """Permute the grammar pieces of *sentence* and return the word list
        of the most grammatical permutation."""
        tuples, triples, quadruples = self.gs.GetTuplesinSentence(sentence)
        grammpieces = self.gs.createTupleofGrammarpieces(sentence, tuples, triples, quadruples)
        if len(grammpieces) > 7:
            print('A sentence is too long, too many permutations. \n piping wrong grammar..')
            return ' '.join(grammpieces).split()
        permutations = self.sgm.GeneratePermutationsOfSentence(grammpieces)
        candidates = [' '.join(permutation) for permutation in permutations]
        # The original passed the *last* permutation as the reference here
        # because its loop variable shadowed 'sentence'; the converted
        # sentence itself is the evident intent -- fixed.
        best = self.sgm.GetBestSentenceFromSentencesAccordingToGrammar(candidates, ' '.join(sentence))
        return best.split()


# Vorgangspassiv is mapped onto the same tense, 3rd person singular.
# Zustandspassiv: always one tense later, i.e.
# Praesens => Praeteritum, Praeteritum => Perfekt