You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 
 

679 lines
28 KiB

import spacy
import nltk
from nltk.stem.snowball import SnowballStemmer
import hickle as hkl
import FASTsearch

# Module-level German stemmer. NOTE: Passiv2Aktiv.__init__ creates its own
# instance as self.stemmer as well; this global appears to be a leftover.
stemmer = SnowballStemmer("german")
class Passiv2Aktiv(object):
    """Rule-based converter that rewrites German passive-voice sentences
    (Vorgangspassiv / Zustandspassiv) into active voice, backed by hickle
    conjugation databases and FASTsearch indexes."""

    def __init__(self, hklDatabaseDir_Aktiv, hklDatabaseDir_Vorgangspassiv, hklDatabaseDir_Zustandspassiv):
        """Load the optional conjugation databases plus NLP helpers.

        Each ``hklDatabaseDir_*`` argument is a path to a hickle dump, or
        None to skip loading that database (the corresponding attribute is
        then left unset).
        """
        # Data-driven loading keeps the three optional databases symmetric.
        for attr, path in (
            ('AktivDB', hklDatabaseDir_Aktiv),
            ('VorgangspassivDB', hklDatabaseDir_Vorgangspassiv),
            ('ZustandspassivDB', hklDatabaseDir_Zustandspassiv),
        ):
            if path is not None:
                setattr(self, attr, hkl.load(path))
        # German spaCy model for tagging/dependency parsing.
        self.nlp = spacy.load('de_core_news_sm')
        # German Snowball stemmer.
        self.stemmer = SnowballStemmer("german")
def create_hklDB_from_csv(self, csvDbDir, StemOrNot):
    """Convert a csv conjugation database into three hickle dumps.

    Reads ``csvDbDir`` line by line (each line must be a Python literal
    evaluating to a pair of [form, ...] lists), stores the full table as
    ``hkldb_All<name>.hkl`` and the first element of each column as
    ``hkldb1<name>.hkl`` / ``hkldb2<name>.hkl``.

    ``StemOrNot`` is accepted for interface compatibility but currently
    unused. Returns the string ``'done'``.
    """
    # SECURITY: eval() executes arbitrary code from the csv file — only
    # ever run this on trusted database files.
    self.DB_All = []
    with open(csvDbDir) as lines:
        for line in lines:
            self.DB_All.append(list(eval(line)))
    self.hkldb1 = []
    self.hkldb2 = []
    # enumerate replaces the original hand-rolled counter variable;
    # progress is printed every 1000 entries as before.
    for n, entry in enumerate(self.DB_All, start=1):
        if n % 1000 == 0:
            print(n)
        self.hkldb1.append([entry[0][0]])
        self.hkldb2.append([entry[1][0]])
    # Assumes csvDbDir ends in a 4-character extension such as '.csv'.
    basename = csvDbDir[:-4]
    print('creating the hkl dump of DBAll')
    hkl.dump(self.DB_All, 'hkldb_All' + basename + '.hkl', mode='w', compression='lzf')
    print('Creating the hkl dump of DB 1')
    hkl.dump(self.hkldb1, 'hkldb1' + basename + '.hkl', mode='w', compression='lzf')
    print('Creating the hkl dump of DB 2')
    hkl.dump(self.hkldb2, 'hkldb2' + basename + '.hkl', mode='w', compression='lzf')
    return 'done'
def load_DB_into_FASTsearch(self):
    """Load all pre-built databases, FASTsearch indexes and helper models.

    Must be called before replacePassivForms(); expects the hickle dumps
    produced by create_hklDB_from_csv() plus the pickled bag-of-words
    models in the working directory.
    """
    # Full conjugation tables (form string <-> grammatical description).
    self.hkldbAktiv_All = hkl.load('hkldb_AllAktiv.hkl')
    self.hkldbVorgangspassiv_All = hkl.load('hkldb_AllVorgangspassiv.hkl')
    self.hkldbZustandspassiv_All = hkl.load('hkldb_AllZustandspassiv.hkl')

    def _load_index_pair(name):
        # One FASTsearch index per database column. The BoW models must
        # have been generated once beforehand with
        # Gen_BoW_Model(20000, "word", punctuation=False).
        first = FASTsearch.FASTsearch('hkldb1' + name + '.hkl')
        second = FASTsearch.FASTsearch('hkldb2' + name + '.hkl')
        first.Load_BoW_Model('bagofwordshkldb1' + name + '.pkl', 'DataBaseOneZeroshkldb1' + name + '.hkl')
        second.Load_BoW_Model('bagofwordshkldb2' + name + '.pkl', 'DataBaseOneZeroshkldb2' + name + '.hkl')
        return first, second

    self.fsearchAktiv1, self.fsearchAktiv2 = _load_index_pair('Aktiv')
    self.fsearchVorgangspassiv1, self.fsearchVorgangspassiv2 = _load_index_pair('Vorgangspassiv')
    self.fsearchZustandspassiv1, self.fsearchZustandspassiv2 = _load_index_pair('Zustandspassiv')

    # Grammar utilities (tuple/triple detection, grammar-piece grouping).
    import GS_Utils
    self.gs = GS_Utils.GS_Utils('de_core_news_sm')

    # Stochastic-gradient sentence-gluing model that ranks permutations.
    from SentGlue import SentGlueMach
    self.sgm = SentGlueMach('trainedSGD.pkl', 'bagofwords.pkl')
    self.sgm.initialize()
def replacePassivForms(self,sentences):
    """Rewrite German passive-voice sentences into active voice.

    Parameters
    ----------
    sentences : list of list of str
        Tokenised sentences. NOTE: the inner token lists are mutated
        in place.

    Returns
    -------
    list of list of str
        One (possibly rewritten) token list per input sentence.

    Requires load_DB_into_FASTsearch() to have been called first (uses
    self.fsearch*, self.hkldb*_All, self.gs and self.sgm).
    """
    endsentences = []
    sentencecount = 0
    for sentence in sentences:
        try:
            sentencecount += 1
            # Re-parse so spaCy provides POS tags and dependency labels.
            doc = self.nlp(' '.join(sentence))
            verbs_of_sentence = []
            wordindex_to_replace = []
            count = 0                # 1-based token position
            subjectofsentence = []   # tokens with subject dependency 'sb'
            subjectindex = []        # their positions
            erindex = []             # positions of 'er' (collected, unused below)
            Erindex = []             # positions of 'Er' (collected, unused below)
            undindex = []            # positions of 'und'
            for word in doc:
                count += 1
                if word.dep_ == 'sb':
                    subjectofsentence.append(word.text)
                    subjectindex.append(count)
                if word.text == 'er':
                    erindex.append(count)
                if word.text == 'Er':
                    Erindex.append(count)
                if word.text == 'und':
                    undindex.append(count)
                # Any verb tag (VVFIN, VAFIN, VVPP, ...) starts with 'V'.
                if word.tag_[0] == 'V':
                    verbs_of_sentence.append(word.text)
                    wordindex_to_replace.append(count)
            # Replace a lone auxiliary with a placeholder so it cannot match
            # a database form on its own.
            # NOTE(review): ('wurde' or 'wird' or ...) evaluates to just
            # 'wurde', so only that single literal is compared — likely bug.
            if len(verbs_of_sentence) == 1 and verbs_of_sentence[0] == ('wurde' or 'wird' or 'werden' or 'wirst' or 'werde' or 'war'):
                verbs_of_sentence[0] = 'bliblablubdudidu'
            # Query string: the verbs plus their joined character length,
            # presumably mirroring the format stored in the hkldb1* DBs —
            # TODO confirm against create_hklDB_from_csv input data.
            verbs_of_sentence_string = ' '.join(verbs_of_sentence)
            length_verbs_of_sentence_string = len(verbs_of_sentence_string)
            verbs_of_sentence_string += ' ' + str(length_verbs_of_sentence_string)
            bestmatchesZustandspassiv1, matchindexZustandspassiv1 = self.fsearchZustandspassiv1.search_with_highest_multiplikation_Output(verbs_of_sentence_string, 1)
            bestmatchesVorgangspassiv1, matchindexVorgangspassiv1 = self.fsearchVorgangspassiv1.search_with_highest_multiplikation_Output(verbs_of_sentence_string, 1)
            # A form counts as matched when the match score equals the
            # number of query tokens (verbs + length token).
            vorgangORnot = 0
            zustandORnot = 0
            if (len(verbs_of_sentence) + 1) == matchindexVorgangspassiv1[1]:
                workindex = matchindexVorgangspassiv1[0]
                vorgangORnot = 1
            if (len(verbs_of_sentence) + 1) == matchindexZustandspassiv1[1]:
                workindex = matchindexZustandspassiv1[0]
                zustandORnot = 1
            formToReplace = []
            # Double-check the matched form has exactly as many words as the
            # query; otherwise discard the match.
            if vorgangORnot == 1:
                completeform = self.hkldbVorgangspassiv_All[workindex]
                if len(verbs_of_sentence_string.split()) != len(completeform[0][0].split()):
                    vorgangORnot = 0
            if vorgangORnot == 1:
                # --- Vorgangspassiv branch ---------------------------------
                completeform = self.hkldbVorgangspassiv_All[workindex]
                # Last two tokens of the grammatical description = tense/mood.
                formToReplace = self.hkldbVorgangspassiv_All[workindex][1][0].split()[-2:]
                formToReplace = '3. Person Singular ' + ' '.join(formToReplace)
                # Look up the corresponding active 3rd-person-singular form.
                thrdPersonAktivindex = self.fsearchAktiv2.search_with_highest_multiplikation_Output(formToReplace, 1)[0]
                thrdPersonAktiv = self.hkldbAktiv_All[thrdPersonAktivindex[0]][0][0].split()[:-1]
                thrdPersonAktiv = ' '.join(thrdPersonAktiv)
                # Remove the passive verb forms from the sentence.
                dalist = verbs_of_sentence_string.split()[:-1]
                for verb in dalist:
                    sentence.remove(verb)
                # Detect a subject 'er'/'Er' coordinated via 'und'.
                thereisasubjectEr = 0
                for index in subjectindex:
                    for ind in undindex:
                        if index - 1 == ind:
                            # NOTE(review): int compared to ('er' or 'Er') ->
                            # 'er'; this condition can never be True.
                            if index - 2 == ('er' or 'Er'):
                                thereisasubjectEr = 1
                        if index + 1 == ind:
                            # NOTE(review): same int-vs-str comparison issue.
                            if index + 2 == 'er' or index + 2 == 'Er':
                                thereisasubjectEr = 1
                thereisasubjectich = 0
                thereisasubjectdu = 0
                thereisasubjectihr = 0
                thereisasubjectwir = 0
                for word in subjectofsentence:
                    if word == 'er' or word == 'Er':
                        thereisasubjectEr = 1
                    if word == 'ich':
                        thereisasubjectich = 1
                    if word == 'du':
                        thereisasubjectdu = 1
                    if word == 'ihr':
                        thereisasubjectihr = 1
                    if word == 'wir':
                        thereisasubjectwir = 1
                # The passive subject becomes the active object: swap the
                # pronoun to its accusative form and move it to the end.
                if thereisasubjectEr == 1:
                    try:
                        sentence.remove('Er')
                    except:
                        sentence.remove('er')
                    sentence.append('ihn')
                if thereisasubjectich == 1:
                    sentence.remove('ich')
                    sentence.append('mich')
                if thereisasubjectdu == 1:
                    sentence.remove('du')
                    sentence.append('dich')
                if thereisasubjectihr == 1:
                    sentence.remove('ihr')
                    sentence.append('euch')
                if thereisasubjectwir == 1:
                    sentence.remove('wir')
                    sentence.append('uns')
                sentence.append(thrdPersonAktiv)
                # Turn a 'von ...' agent phrase into the active subject; if
                # no agent is found, insert indefinite 'jemand' (someone).
                jemandornot = 1
                wordstodelete = []
                for n in range(len(sentence) - 1):
                    if sentence[n] == 'von':
                        if sentence[n + 1] == 'ihr':
                            sentence[n + 1] = 'sie'
                            wordstodelete.append(n)
                            jemandornot = 0
                        if sentence[n + 1] == 'ihm':
                            sentence[n + 1] = 'er'
                            wordstodelete.append(n)
                            jemandornot = 0
                        # NOTE(review): reloading the spaCy model inside the
                        # loop is very slow; self.nlp is already available.
                        import spacy
                        nlp = spacy.load('de_core_news_sm')
                        token1 = nlp(sentence[n - 1])
                        token2 = nlp(sentence[n + 1])
                        # Drop 'von' when it links a non-noun to a noun/name.
                        for word in token1:
                            if word.tag_ != 'NN' and word.tag_ != 'NE':
                                for word in token2:
                                    if word.tag_ == 'NN' or word.tag_ == 'NE':
                                        wordstodelete.append(n)
                                        jemandornot = 0
                        if sentence[n + 1] == 'dem' or sentence[n + 1] == 'einem':
                            token3 = nlp(sentence[n-1])
                            for word in token3:
                                if word.tag_ != 'NN' and word.tag_ != 'NE':
                                    sentence[n + 1] = 'ein'
                                    wordstodelete.append(n)
                                    jemandornot = 0
                        if sentence[n + 1] == 'der' or sentence[n + 1] == 'einer':
                            token4 = nlp(sentence[n-1])
                            for word in token4:
                                if word.tag_ != 'NN' and word.tag_ != 'NE':
                                    sentence[n + 1] = 'eine'
                                    wordstodelete.append(n)
                                    jemandornot = 0
                    if sentence[n] == 'vom':
                        sentence[n] = 'ein'
                        jemandornot = 0
                # Delete marked positions back-to-front so indices stay valid.
                for index in wordstodelete[::-1]:
                    del sentence[index]
                if jemandornot == 1:
                    sentence.append('jemand')
                # Keep multi-word grammar pieces together and let the grammar
                # classifier pick the most plausible word order.
                tuplesTocheck, triplesTocheck, quadruplesToCheck = self.gs.GetTuplesinSentence(sentence)
                grammpiecessentence = self.gs.createTupleofGrammarpieces( sentence, tuplesTocheck, triplesTocheck, quadruplesToCheck)
                if len(grammpiecessentence) > 7:
                    # Too many pieces -> factorial blow-up; pass through as-is.
                    print('A sentence is too long, too many permutations. \n piping wrong grammar..')
                    endsentences.append(' '.join(grammpiecessentence).split())
                else:
                    permutations = self.sgm.GeneratePermutationsOfSentence(grammpiecessentence)
                    sentencesToCheck = []
                    # NOTE(review): this rebinds the outer loop variable
                    # 'sentence'; afterwards it holds the last permutation.
                    for sentence in permutations:
                        sentencesToCheck.append(' '.join(sentence))
                    endsentence = self.sgm.GetBestSentenceFromSentencesAccordingToGrammar(sentencesToCheck, ' '.join(sentence))
                    endsentences.append(endsentence.split())
            # Same word-count double-check for the Zustandspassiv match.
            if zustandORnot == 1:
                completeform = self.hkldbZustandspassiv_All[workindex]
                if len(verbs_of_sentence_string.split()) != len(completeform[0][0].split()):
                    zustandORnot = 0
            if zustandORnot == 1:
                # --- Zustandspassiv branch (mirror of the branch above) ----
                formToReplace = self.hkldbZustandspassiv_All[workindex][1][0].split()[-2:]
                formToReplace = '3. Person Singular ' + ' '.join(formToReplace)
                thrdPersonAktivindex = self.fsearchAktiv2.search_with_highest_multiplikation_Output(formToReplace, 1)[0]
                thrdPersonAktiv = self.hkldbAktiv_All[thrdPersonAktivindex[0]][0][0].split()[:-1]
                thrdPersonAktiv = ' '.join(thrdPersonAktiv)
                for verb in verbs_of_sentence_string.split()[:-1]:
                    sentence.remove(verb)
                thereisasubjectEr = 0
                for index in subjectindex:
                    for ind in undindex:
                        if index - 1 == ind:
                            # NOTE(review): same always-False comparison as above.
                            if index - 2 == ('er' or 'Er'):
                                thereisasubjectEr = 1
                        if index + 1 == ind:
                            if index + 2 == 'er' or index + 2 == 'Er':
                                thereisasubjectEr = 1
                thereisasubjectich = 0
                thereisasubjectdu = 0
                thereisasubjectihr = 0
                thereisasubjectwir = 0
                for word in subjectofsentence:
                    if word == 'er' or word == 'Er':
                        thereisasubjectEr = 1
                    if word == 'ich':
                        thereisasubjectich = 1
                    if word == 'du':
                        thereisasubjectdu = 1
                    if word == 'ihr':
                        thereisasubjectihr = 1
                    if word == 'wir':
                        thereisasubjectwir = 1
                if thereisasubjectEr == 1:
                    try:
                        sentence.remove('Er')
                    except:
                        sentence.remove('er')
                    sentence.append('ihn')
                if thereisasubjectich == 1:
                    sentence.remove('ich')
                    sentence.append('mich')
                if thereisasubjectdu == 1:
                    sentence.remove('du')
                    sentence.append('dich')
                if thereisasubjectihr == 1:
                    sentence.remove('ihr')
                    sentence.append('euch')
                if thereisasubjectwir == 1:
                    sentence.remove('wir')
                    sentence.append('uns')
                sentence.append(thrdPersonAktiv)
                jemandornot = 1
                wordstodelete = []
                for n in range(len(sentence) - 1):
                    if sentence[n] == 'von':
                        if sentence[n + 1] == 'ihr':
                            sentence[n + 1] = 'sie'
                            wordstodelete.append(n)
                            jemandornot = 0
                        if sentence[n + 1] == 'ihm':
                            sentence[n + 1] = 'er'
                            wordstodelete.append(n)
                            jemandornot = 0
                        # NOTE(review): spaCy model reloaded per iteration
                        # here too — see the note in the branch above.
                        import spacy
                        nlp = spacy.load('de_core_news_sm')
                        token1 = nlp(sentence[n - 1])
                        token2 = nlp(sentence[n + 1])
                        for word in token1:
                            if word.tag_ != 'NN' and word.tag_ != 'NE':
                                for word in token2:
                                    if word.tag_ == 'NN' or word.tag_ == 'NE':
                                        wordstodelete.append(n)
                                        jemandornot = 0
                        if sentence[n + 1] == 'dem' or sentence[n + 1] == 'einem':
                            token3 = nlp(sentence[n-1])
                            for word in token3:
                                if word.tag_ != 'NN' and word.tag_ != 'NE':
                                    sentence[n + 1] = 'ein'
                                    wordstodelete.append(n)
                                    jemandornot = 0
                        if sentence[n + 1] == 'der' or sentence[n + 1] == 'einer':
                            token4 = nlp(sentence[n-1])
                            for word in token4:
                                if word.tag_ != 'NN' and word.tag_ != 'NE':
                                    sentence[n + 1] = 'eine'
                                    wordstodelete.append(n)
                                    jemandornot = 0
                    if sentence[n] == 'vom':
                        sentence[n] = 'ein'
                        jemandornot = 0
                for index in wordstodelete[::-1]:
                    del sentence[index]
                if jemandornot == 1:
                    sentence.append('jemand')
                tuplesTocheck, triplesTocheck, quadruplesTocheck = self.gs.GetTuplesinSentence(sentence)
                grammpiecessentence = self.gs.createTupleofGrammarpieces( sentence, tuplesTocheck, triplesTocheck, quadruplesTocheck)
                if len(grammpiecessentence) > 7:
                    print('A sentence is too long, too many permutations. \n piping wrong grammar..')
                    endsentences.append(' '.join(grammpiecessentence).split())
                else:
                    permutations = self.sgm.GeneratePermutationsOfSentence(grammpiecessentence)
                    sentencesToCheck = []
                    # NOTE(review): 'sentence' rebound here as well.
                    for sentence in permutations:
                        sentencesToCheck.append(' '.join(sentence))
                    endsentence = self.sgm.GetBestSentenceFromSentencesAccordingToGrammar(sentencesToCheck, ' '.join(sentence))
                    endsentences.append(endsentence.split())
            if zustandORnot == 0 and vorgangORnot == 0:
                # No passive form recognised -> pass the sentence through.
                endsentences.append(sentence)
        except:
            # NOTE(review): bare except hides real errors, and
            # endsentences[-1] raises IndexError when the very first
            # sentence fails (endsentences is still empty).
            print('the sentence ' + str(sentence) + ' caused an error in the module passive2active')
            if endsentences[-1] == sentence:
                pass
            else:
                endsentences.append(sentence)
    return endsentences
# Vorgangspassiv (processual passive) is mapped onto the same tense, 3rd person singular.
# Zustandspassiv (statal passive) is always mapped one tense later, i.e.
# Präsens => Präteritum, Präteritum => Perfekt