|
|
-
-
- import spacy
- import nltk
- from nltk.stem.snowball import SnowballStemmer
-
- import hickle as hkl
- import FASTsearch
-
# Module-level German Snowball stemmer.
# NOTE(review): appears unused in this file — Passiv2Aktiv.__init__ creates
# its own self.stemmer instance; confirm no external import relies on this
# name before removing.
stemmer = SnowballStemmer("german")
-
-
class Passiv2Aktiv(object):
    """Converts German passive-voice sentences into active voice.

    Relies on three hickle conjugation databases (active, processual
    passive / Vorgangspassiv, statal passive / Zustandspassiv), the
    spaCy German model, and a Snowball stemmer.
    """

    def __init__(self, hklDatabaseDir_Aktiv, hklDatabaseDir_Vorgangspassiv, hklDatabaseDir_Zustandspassiv):
        """Load the optional conjugation databases plus the NLP tooling.

        Each database path may be None, in which case the corresponding
        attribute is simply not set (matching the original behavior).
        """
        db_sources = (
            (hklDatabaseDir_Aktiv, 'AktivDB'),
            (hklDatabaseDir_Vorgangspassiv, 'VorgangspassivDB'),
            (hklDatabaseDir_Zustandspassiv, 'ZustandspassivDB'),
        )
        for db_path, attr_name in db_sources:
            if db_path is not None:
                setattr(self, attr_name, hkl.load(db_path))

        # German spaCy pipeline used for tagging/dependency parsing.
        self.nlp = spacy.load('de_core_news_sm')

        # German Snowball stemmer (kept per-instance as before).
        self.stemmer = SnowballStemmer("german")
-
-
- def create_hklDB_from_csv(self, csvDbDir, StemOrNot):
-
- with open(csvDbDir) as lines:
-
- self.DB_All = []
-
- for line in lines:
-
- #print(line)
-
- self.DB_All.append(list(eval(line)))
-
-
-
-
-
- self.hkldb1 = []
- self.hkldb2 = []
-
-
- counter = 0
- for n in range(len(self.DB_All)):
-
- counter += 1
- if counter % 1000 == 0:
- print(counter)
-
-
- self.hkldb1.append([self.DB_All[n][0][0]] )
- self.hkldb2.append([self.DB_All[n][1][0]] )
-
-
-
- print('creating the hkl dump of DBAll')
- hkl.dump(self.DB_All, 'hkldb_All' + csvDbDir[:-4] + '.hkl', mode='w', compression='lzf')
- #print('done..')
-
- print('Creating the hkl dump of DB 1')
- hkl.dump(self.hkldb1, 'hkldb1' + csvDbDir[:-4] + '.hkl', mode='w', compression='lzf')
- #print('done..')
-
- print('Creating the hkl dump of DB 2')
- hkl.dump(self.hkldb2, 'hkldb2' + csvDbDir[:-4] + '.hkl', mode='w', compression='lzf')
- #print('done..')
-
-
-
- return 'done'
-
-
-
- def load_DB_into_FASTsearch(self):
-
- #print('loading the hkldb_All databases..')
- self.hkldbAktiv_All = hkl.load('hkldb_AllAktiv.hkl')
- #print('first done')
- self.hkldbVorgangspassiv_All = hkl.load('hkldb_AllVorgangspassiv.hkl')
- #print('second done')
- self.hkldbZustandspassiv_All = hkl.load('hkldb_AllZustandspassiv.hkl')
- #print('third done')
-
-
- #print('loading hkldbIndi_Conju 1..')
- self.fsearchAktiv1 = FASTsearch.FASTsearch('hkldb1Aktiv.hkl')
- #print('done')
-
- #print('loading hkldbIndi_Conju 2..')
- self.fsearchAktiv2 = FASTsearch.FASTsearch('hkldb2Aktiv.hkl')
- #print('done')
-
-
- # generate bow model only necessary the first time
- #print('generating BoW Model 1..')
- #self.fsearchAktiv1.Gen_BoW_Model(20000, "word", punctuation = False)
- #print('done')
-
- #print('generating BoW Model 2..')
- #self.fsearchAktiv2.Gen_BoW_Model(20000, "word", punctuation = False)
- #print('done')
-
-
- #print('loading the bow model 1')
- self.fsearchAktiv1.Load_BoW_Model('bagofwordshkldb1Aktiv.pkl', 'DataBaseOneZeroshkldb1Aktiv.hkl')
- #print('done')
-
- #print('loading the bow model 2')
- self.fsearchAktiv2.Load_BoW_Model('bagofwordshkldb2Aktiv.pkl', 'DataBaseOneZeroshkldb2Aktiv.hkl')
- #print('done')
-
- #print('loading hkldbIndi_Conju 1..')
- self.fsearchVorgangspassiv1 = FASTsearch.FASTsearch('hkldb1Vorgangspassiv.hkl')
- #print('done')
-
- #print('loading hkldbIndi_Conju 2..')
- self.fsearchVorgangspassiv2 = FASTsearch.FASTsearch('hkldb2Vorgangspassiv.hkl')
- #print('done')
-
- # uncomment if models are not there
- #print('generating BoW Model 1..')
- #self.fsearchVorgangspassiv1.Gen_BoW_Model(20000, "word", punctuation = False)
- #print('done')
-
- #print('generating BoW Model 2..')
- #self.fsearchVorgangspassiv2.Gen_BoW_Model(20000, "word", punctuation = False)
- #print('done')
-
-
- #print('loading the bow model 1')
- self.fsearchVorgangspassiv1.Load_BoW_Model('bagofwordshkldb1Vorgangspassiv.pkl', 'DataBaseOneZeroshkldb1Vorgangspassiv.hkl')
- #print('done')
-
- #print('loading the bow model 2')
- self.fsearchVorgangspassiv2.Load_BoW_Model('bagofwordshkldb2Vorgangspassiv.pkl', 'DataBaseOneZeroshkldb2Vorgangspassiv.hkl')
- #print('done')
-
- #print('loading hkldbIndi_Conju 1..')
- self.fsearchZustandspassiv1 = FASTsearch.FASTsearch('hkldb1Zustandspassiv.hkl')
- #print('done')
-
- #print('loading hkldbIndi_Conju 2..')
- self.fsearchZustandspassiv2 = FASTsearch.FASTsearch('hkldb2Zustandspassiv.hkl')
- #print('done')
-
- #print('generating BoW Model 1..')
- #self.fsearchZustandspassiv1.Gen_BoW_Model(20000, "word", punctuation = False)
- #print('done')
-
- #print('generating BoW Model 2..')
- #self.fsearchZustandspassiv2.Gen_BoW_Model(20000, "word", punctuation = False)
- #print('done')
-
-
- #print('loading the bow model 1')
- self.fsearchZustandspassiv1.Load_BoW_Model('bagofwordshkldb1Zustandspassiv.pkl', 'DataBaseOneZeroshkldb1Zustandspassiv.hkl')
- #print('done')
-
- #print('loading the bow model 2')
- self.fsearchZustandspassiv2.Load_BoW_Model('bagofwordshkldb2Zustandspassiv.pkl', 'DataBaseOneZeroshkldb2Zustandspassiv.hkl')
- #print('done')
-
- import GS_Utils
- #print('initializing the gs utils..')
- self.gs = GS_Utils.GS_Utils('de_core_news_sm')
- #print('done')
-
-
- from SentGlue import SentGlueMach
- #print('loading the Stochastic Gradient models..')
- self.sgm = SentGlueMach('trainedSGD.pkl', 'bagofwords.pkl')
- #print('done')
- #print('initializing the SGM..')
- self.sgm.initialize()
- #print('done')
-
- #print('oi thats the get_feature_names', self.fsearch1.vectorizer.get_feature_names())
- #print('oi thats the get_feature_names', self.fsearch2.vectorizer.get_feature_names())
-
-
-
    def replacePassivForms(self,sentences):
        """Rewrite German passive sentences into active voice.

        Each sentence's verbs are matched against the Vorgangspassiv and
        Zustandspassiv conjugation databases; on a full match the passive
        verb complex is replaced by the 3rd-person-singular active form,
        the nominative subject pronoun is demoted to its object form, and
        a 'von'/'vom' agent phrase (or 'jemand' when absent) becomes the
        new subject.  Token order is then re-scored with the SentGlue
        grammar model.

        :param sentences: list of tokenized sentences (lists of strings).
            NOTE: the inner lists are mutated in place.
        :return: list of tokenized sentences; sentences not recognized as
            passive, or raising an error, are passed through.
        """
        endsentences = []
        sentencecount = 0
        for sentence in sentences:
            try:
                sentencecount += 1

                doc = self.nlp(' '.join(sentence))

                verbs_of_sentence = []
                wordindex_to_replace = []
                count = 0                  # 1-based token position in doc
                subjectofsentence = []     # texts of subject ('sb') tokens
                subjectindex = []          # their 1-based positions
                erindex = []               # positions of 'er' (collected, unused below)
                Erindex = []               # positions of 'Er' (collected, unused below)
                undindex = []              # positions of 'und'

                for word in doc:

                    count += 1

                    if word.dep_ == 'sb':
                        subjectofsentence.append(word.text)
                        subjectindex.append(count)

                    if word.text == 'er':
                        erindex.append(count)
                    if word.text == 'Er':
                        Erindex.append(count)
                    if word.text == 'und':
                        undindex.append(count)

                    # Any STTS tag starting with 'V' (VVFIN, VAFIN, VVPP, ...)
                    # counts as part of the verb complex.
                    if word.tag_[0] == 'V':
                        verbs_of_sentence.append(word.text)
                        wordindex_to_replace.append(count)

                # Neutralize a lone auxiliary so it cannot match a passive
                # paradigm by itself.
                # NOTE(review): ('wurde' or 'wird' or ...) evaluates to just
                # 'wurde' — the other auxiliaries are never compared here.
                if len(verbs_of_sentence) == 1 and verbs_of_sentence[0] == ('wurde' or 'wird' or 'werden' or 'wirst' or 'werde' or 'war'):
                    verbs_of_sentence[0] = 'bliblablubdudidu'

                verbs_of_sentence_string = ' '.join(verbs_of_sentence)

                # The DB entries carry the form's character count as a
                # trailing token, so append it to the query as well —
                # presumably to disambiguate forms; TODO confirm.
                length_verbs_of_sentence_string = len(verbs_of_sentence_string)

                verbs_of_sentence_string += ' ' + str(length_verbs_of_sentence_string)

                bestmatchesZustandspassiv1, matchindexZustandspassiv1 = self.fsearchZustandspassiv1.search_with_highest_multiplikation_Output(verbs_of_sentence_string, 1)

                bestmatchesVorgangspassiv1, matchindexVorgangspassiv1 = self.fsearchVorgangspassiv1.search_with_highest_multiplikation_Output(verbs_of_sentence_string, 1)

                # A paradigm counts as matched when every queried token
                # (all verbs + the length token) was found.
                vorgangORnot = 0
                zustandORnot = 0
                if (len(verbs_of_sentence) + 1) == matchindexVorgangspassiv1[1]:
                    workindex = matchindexVorgangspassiv1[0]
                    vorgangORnot = 1

                if (len(verbs_of_sentence) + 1) == matchindexZustandspassiv1[1]:
                    workindex = matchindexZustandspassiv1[0]
                    zustandORnot = 1
                # NOTE(review): if both paradigms match, workindex holds the
                # Zustandspassiv index, yet the Vorgangspassiv branch below
                # also uses it — confirm this overlap is intended.

                formToReplace = []

                if vorgangORnot == 1:
                    # Reject the match unless the word counts agree exactly.
                    completeform = self.hkldbVorgangspassiv_All[workindex]
                    if len(verbs_of_sentence_string.split()) != len(completeform[0][0].split()):
                        vorgangORnot = 0

                if vorgangORnot == 1:
                    completeform = self.hkldbVorgangspassiv_All[workindex]
                    # The last two tokens of the DB entry describe tense/mood.
                    formToReplace = self.hkldbVorgangspassiv_All[workindex][1][0].split()[-2:]

                    formToReplace = '3. Person Singular ' + ' '.join(formToReplace)

                    # Look up the matching active form, 3rd person singular.
                    thrdPersonAktivindex = self.fsearchAktiv2.search_with_highest_multiplikation_Output(formToReplace, 1)[0]

                    thrdPersonAktiv = self.hkldbAktiv_All[thrdPersonAktivindex[0]][0][0].split()[:-1]

                    thrdPersonAktiv = ' '.join(thrdPersonAktiv)

                    # Remove the passive verb forms from the sentence
                    # (the trailing element is the length token).
                    dalist = verbs_of_sentence_string.split()[:-1]

                    for verb in dalist:

                        sentence.remove(verb)

                    thereisasubjectEr = 0

                    # Detect an 'er' conjoined with the subject via 'und'.
                    # NOTE(review): `index - 2 == ('er' or 'Er')` and
                    # `index + 2 == 'er'` compare an int position against a
                    # string, so these conditions can never be true.
                    for index in subjectindex:
                        for ind in undindex:
                            if index - 1 == ind:
                                if index - 2 == ('er' or 'Er'):
                                    thereisasubjectEr = 1
                            if index + 1 == ind:
                                if index + 2 == 'er' or index + 2 == 'Er':
                                    thereisasubjectEr = 1

                    thereisasubjectich = 0
                    thereisasubjectdu = 0
                    thereisasubjectihr = 0
                    thereisasubjectwir = 0
                    for word in subjectofsentence:
                        if word == 'er' or word == 'Er':
                            thereisasubjectEr = 1
                        if word == 'ich':
                            thereisasubjectich = 1
                        if word == 'du':
                            thereisasubjectdu = 1
                        if word == 'ihr':
                            thereisasubjectihr = 1
                        if word == 'wir':
                            thereisasubjectwir = 1

                    # Demote the nominative subject pronoun to its object
                    # (accusative) form at the end of the sentence.
                    if thereisasubjectEr == 1:
                        try:
                            sentence.remove('Er')
                        except:
                            sentence.remove('er')
                        sentence.append('ihn')
                    if thereisasubjectich == 1:
                        sentence.remove('ich')
                        sentence.append('mich')
                    if thereisasubjectdu == 1:
                        sentence.remove('du')
                        sentence.append('dich')
                    if thereisasubjectihr == 1:
                        sentence.remove('ihr')
                        sentence.append('euch')
                    if thereisasubjectwir == 1:
                        sentence.remove('wir')
                        sentence.append('uns')

                    sentence.append(thrdPersonAktiv)

                    # Convert the 'von'/'vom' agent phrase into the new
                    # subject; when no agent exists, 'jemand' is appended.
                    jemandornot = 1
                    wordstodelete = []
                    for n in range(len(sentence) - 1):
                        if sentence[n] == 'von':
                            if sentence[n + 1] == 'ihr':
                                sentence[n + 1] = 'sie'
                                wordstodelete.append(n)
                                jemandornot = 0
                            if sentence[n + 1] == 'ihm':
                                sentence[n + 1] = 'er'
                                wordstodelete.append(n)
                                jemandornot = 0
                            # NOTE(review): re-importing and re-loading the
                            # spaCy model inside this loop is very expensive;
                            # self.nlp is already available.
                            import spacy
                            nlp = spacy.load('de_core_news_sm')
                            token1 = nlp(sentence[n - 1])
                            token2 = nlp(sentence[n + 1])
                            # Drop the bare 'von' when it is followed by a
                            # noun/proper noun not already preceded by one.
                            for word in token1:
                                if word.tag_ != 'NN' and word.tag_ != 'NE':
                                    for word in token2:
                                        if word.tag_ == 'NN' or word.tag_ == 'NE':
                                            wordstodelete.append(n)

                                            jemandornot = 0
                            if sentence[n + 1] == 'dem' or sentence[n + 1] == 'einem':

                                token3 = nlp(sentence[n-1])
                                for word in token3:
                                    if word.tag_ != 'NN' and word.tag_ != 'NE':
                                        sentence[n + 1] = 'ein'
                                        wordstodelete.append(n)
                                        jemandornot = 0
                            if sentence[n + 1] == 'der' or sentence[n + 1] == 'einer':
                                token4 = nlp(sentence[n-1])
                                for word in token4:
                                    if word.tag_ != 'NN' and word.tag_ != 'NE':
                                        sentence[n + 1] = 'eine'
                                        wordstodelete.append(n)
                                        jemandornot = 0

                        if sentence[n] == 'vom':

                            sentence[n] = 'ein'
                            jemandornot = 0
                    # Delete back-to-front so earlier indices stay valid.
                    for index in wordstodelete[::-1]:
                        del sentence[index]
                    if jemandornot == 1:
                        sentence.append('jemand')

                    # Re-assemble grammatical word order via SentGlue.
                    tuplesTocheck, triplesTocheck, quadruplesToCheck = self.gs.GetTuplesinSentence(sentence)

                    grammpiecessentence = self.gs.createTupleofGrammarpieces( sentence, tuplesTocheck, triplesTocheck, quadruplesToCheck)

                    if len(grammpiecessentence) > 7:
                        # Too many pieces => factorial blow-up of
                        # permutations; emit the unchecked order instead.
                        print('A sentence is too long, too many permutations. \n piping wrong grammar..')
                        endsentences.append(' '.join(grammpiecessentence).split())

                    else:

                        permutations = self.sgm.GeneratePermutationsOfSentence(grammpiecessentence)

                        # NOTE(review): this loop rebinds `sentence`, so the
                        # ' '.join(sentence) argument below refers to the
                        # LAST permutation, not the original sentence —
                        # confirm this is intended.
                        sentencesToCheck = []
                        for sentence in permutations:
                            sentencesToCheck.append(' '.join(sentence))

                        endsentence = self.sgm.GetBestSentenceFromSentencesAccordingToGrammar(sentencesToCheck, ' '.join(sentence))

                        endsentences.append(endsentence.split())

                if zustandORnot == 1:
                    # Reject the match unless the word counts agree exactly.
                    completeform = self.hkldbZustandspassiv_All[workindex]
                    if len(verbs_of_sentence_string.split()) != len(completeform[0][0].split()):
                        zustandORnot = 0


                if zustandORnot == 1:
                    # Same procedure as the Vorgangspassiv branch above,
                    # but sourced from the Zustandspassiv database.
                    formToReplace = self.hkldbZustandspassiv_All[workindex][1][0].split()[-2:]
                    formToReplace = '3. Person Singular ' + ' '.join(formToReplace)

                    thrdPersonAktivindex = self.fsearchAktiv2.search_with_highest_multiplikation_Output(formToReplace, 1)[0]

                    thrdPersonAktiv = self.hkldbAktiv_All[thrdPersonAktivindex[0]][0][0].split()[:-1]

                    thrdPersonAktiv = ' '.join(thrdPersonAktiv)

                    for verb in verbs_of_sentence_string.split()[:-1]:

                        sentence.remove(verb)

                    thereisasubjectEr = 0

                    # NOTE(review): same int-vs-string comparisons as in the
                    # Vorgangspassiv branch; these can never be true.
                    for index in subjectindex:
                        for ind in undindex:
                            if index - 1 == ind:
                                if index - 2 == ('er' or 'Er'):
                                    thereisasubjectEr = 1
                            if index + 1 == ind:
                                if index + 2 == 'er' or index + 2 == 'Er':
                                    thereisasubjectEr = 1

                    thereisasubjectich = 0
                    thereisasubjectdu = 0
                    thereisasubjectihr = 0
                    thereisasubjectwir = 0
                    for word in subjectofsentence:
                        if word == 'er' or word == 'Er':
                            thereisasubjectEr = 1
                        if word == 'ich':
                            thereisasubjectich = 1
                        if word == 'du':
                            thereisasubjectdu = 1
                        if word == 'ihr':
                            thereisasubjectihr = 1
                        if word == 'wir':
                            thereisasubjectwir = 1
                    if thereisasubjectEr == 1:
                        try:
                            sentence.remove('Er')
                        except:
                            sentence.remove('er')
                        sentence.append('ihn')

                    if thereisasubjectich == 1:
                        sentence.remove('ich')
                        sentence.append('mich')
                    if thereisasubjectdu == 1:
                        sentence.remove('du')
                        sentence.append('dich')
                    if thereisasubjectihr == 1:
                        sentence.remove('ihr')
                        sentence.append('euch')
                    if thereisasubjectwir == 1:
                        sentence.remove('wir')
                        sentence.append('uns')

                    sentence.append(thrdPersonAktiv)

                    jemandornot = 1
                    wordstodelete = []
                    for n in range(len(sentence) - 1):
                        if sentence[n] == 'von':
                            if sentence[n + 1] == 'ihr':
                                sentence[n + 1] = 'sie'
                                wordstodelete.append(n)
                                jemandornot = 0
                            if sentence[n + 1] == 'ihm':
                                sentence[n + 1] = 'er'
                                wordstodelete.append(n)
                                jemandornot = 0

                            # NOTE(review): expensive per-iteration model
                            # reload, duplicated from the branch above.
                            import spacy
                            nlp = spacy.load('de_core_news_sm')
                            token1 = nlp(sentence[n - 1])
                            token2 = nlp(sentence[n + 1])
                            for word in token1:
                                if word.tag_ != 'NN' and word.tag_ != 'NE':
                                    for word in token2:
                                        if word.tag_ == 'NN' or word.tag_ == 'NE':
                                            wordstodelete.append(n)

                                            jemandornot = 0
                            if sentence[n + 1] == 'dem' or sentence[n + 1] == 'einem':

                                token3 = nlp(sentence[n-1])
                                for word in token3:
                                    if word.tag_ != 'NN' and word.tag_ != 'NE':
                                        sentence[n + 1] = 'ein'
                                        wordstodelete.append(n)
                                        jemandornot = 0
                            if sentence[n + 1] == 'der' or sentence[n + 1] == 'einer':
                                token4 = nlp(sentence[n-1])
                                for word in token4:
                                    if word.tag_ != 'NN' and word.tag_ != 'NE':
                                        sentence[n + 1] = 'eine'
                                        wordstodelete.append(n)
                                        jemandornot = 0

                        if sentence[n] == 'vom':

                            sentence[n] = 'ein'
                            jemandornot = 0

                    # Delete back-to-front so earlier indices stay valid.
                    for index in wordstodelete[::-1]:
                        del sentence[index]

                    if jemandornot == 1:
                        sentence.append('jemand')

                    tuplesTocheck, triplesTocheck, quadruplesTocheck = self.gs.GetTuplesinSentence(sentence)

                    grammpiecessentence = self.gs.createTupleofGrammarpieces( sentence, tuplesTocheck, triplesTocheck, quadruplesTocheck)

                    if len(grammpiecessentence) > 7:
                        print('A sentence is too long, too many permutations. \n piping wrong grammar..')
                        endsentences.append(' '.join(grammpiecessentence).split())

                    else:

                        permutations = self.sgm.GeneratePermutationsOfSentence(grammpiecessentence)

                        # NOTE(review): rebinds `sentence` (see note in the
                        # Vorgangspassiv branch).
                        sentencesToCheck = []
                        for sentence in permutations:
                            sentencesToCheck.append(' '.join(sentence))

                        endsentence = self.sgm.GetBestSentenceFromSentencesAccordingToGrammar(sentencesToCheck, ' '.join(sentence))

                        endsentences.append(endsentence.split())



                if zustandORnot == 0 and vorgangORnot == 0:
                    # Not recognized as passive: pass the sentence through.
                    endsentences.append(sentence)

            except:
                # NOTE(review): bare except hides the real error type, and
                # endsentences[-1] raises IndexError when the very first
                # sentence fails (empty list).
                print('the sentence ' + str(sentence) + ' caused an error in the module passive2active')
                if endsentences[-1] == sentence:
                    pass
                else:
                    endsentences.append(sentence)

        return endsentences
-
-
-
    # Vorgangspassiv (processual passive) is mapped onto the same tense,
    # 3rd person singular.  Zustandspassiv (statal passive) is always mapped
    # one tense further back, i.e. Praesens => Praeteritum, Praeteritum => Perfekt.
-
|