alpcentaur
/
basabuuka_prototyp



								import spacy

								import nltk

								from nltk.stem.snowball import SnowballStemmer


								import hickle as hkl

								import FASTsearch


								stemmer = SnowballStemmer("german")


								class Passiv2Aktiv(object):


								    def __init__(self, hklDatabaseDir_Aktiv, hklDatabaseDir_Vorgangspassiv, hklDatabaseDir_Zustandspassiv):


								        if hklDatabaseDir_Aktiv is not None:

								            self.AktivDB = hkl.load(hklDatabaseDir_Aktiv)


								        if hklDatabaseDir_Vorgangspassiv is not None:

								            self.VorgangspassivDB = hkl.load(hklDatabaseDir_Vorgangspassiv)


								        if hklDatabaseDir_Zustandspassiv is not None:

								            self.ZustandspassivDB = hkl.load(hklDatabaseDir_Zustandspassiv)


								        #print('loading the german spacy model..')

								        self.nlp = spacy.load('de_core_news_sm')

								        #print('done')


								        #print('loading the stemmer..')

								        self.stemmer = SnowballStemmer("german")

								        #print('done')


								        return


								    def create_hklDB_from_csv(self, csvDbDir, StemOrNot):


								        with open(csvDbDir) as lines:


								            self.DB_All = []


								            for line in lines:


								                #print(line)


								                self.DB_All.append(list(eval(line)))


								            self.hkldb1 = []

								            self.hkldb2 = []


								            counter = 0

								            for n in range(len(self.DB_All)):


								                counter += 1

								                if counter % 1000 == 0:

								                    print(counter)


								                self.hkldb1.append([self.DB_All[n][0][0]] )

								                self.hkldb2.append([self.DB_All[n][1][0]] )


								            print('creating the hkl dump of DBAll')

								            hkl.dump(self.DB_All, 'hkldb_All' + csvDbDir[:-4] + '.hkl', mode='w', compression='lzf')

								            #print('done..')


								            print('Creating the hkl dump of DB 1')

								            hkl.dump(self.hkldb1, 'hkldb1' + csvDbDir[:-4] + '.hkl', mode='w', compression='lzf')

								            #print('done..')


								            print('Creating the hkl dump of DB 2')

								            hkl.dump(self.hkldb2, 'hkldb2' + csvDbDir[:-4] + '.hkl', mode='w', compression='lzf')

								            #print('done..')


								        return 'done'


								    def load_DB_into_FASTsearch(self):


								        #print('loading the hkldb_All databases..')

								        self.hkldbAktiv_All = hkl.load('hkldb_AllAktiv.hkl')

								        #print('first done')

								        self.hkldbVorgangspassiv_All = hkl.load('hkldb_AllVorgangspassiv.hkl')

								        #print('second done')

								        self.hkldbZustandspassiv_All = hkl.load('hkldb_AllZustandspassiv.hkl')

								        #print('third done')


								        #print('loading hkldbIndi_Conju 1..')

								        self.fsearchAktiv1 = FASTsearch.FASTsearch('hkldb1Aktiv.hkl')

								        #print('done')


								        #print('loading hkldbIndi_Conju 2..')

								        self.fsearchAktiv2 = FASTsearch.FASTsearch('hkldb2Aktiv.hkl')

								        #print('done')


								        # generate bow model only necessary the first time

								        #print('generating BoW Model 1..')

								        #self.fsearchAktiv1.Gen_BoW_Model(20000, "word", punctuation = False)

								        #print('done')


								        #print('generating BoW Model 2..')

								        #self.fsearchAktiv2.Gen_BoW_Model(20000, "word", punctuation = False)

								        #print('done')


								        #print('loading the bow model 1')

								        self.fsearchAktiv1.Load_BoW_Model('bagofwordshkldb1Aktiv.pkl', 'DataBaseOneZeroshkldb1Aktiv.hkl')

								        #print('done')


								        #print('loading the bow model 2')

								        self.fsearchAktiv2.Load_BoW_Model('bagofwordshkldb2Aktiv.pkl', 'DataBaseOneZeroshkldb2Aktiv.hkl')

								        #print('done')


								        #print('loading hkldbIndi_Conju 1..')

								        self.fsearchVorgangspassiv1 = FASTsearch.FASTsearch('hkldb1Vorgangspassiv.hkl')

								        #print('done')


								        #print('loading hkldbIndi_Conju 2..')

								        self.fsearchVorgangspassiv2 = FASTsearch.FASTsearch('hkldb2Vorgangspassiv.hkl')

								        #print('done')


								        # uncomment if models are not there

								        #print('generating BoW Model 1..')

								        #self.fsearchVorgangspassiv1.Gen_BoW_Model(20000, "word", punctuation = False)

								        #print('done')


								        #print('generating BoW Model 2..')

								        #self.fsearchVorgangspassiv2.Gen_BoW_Model(20000, "word", punctuation = False)

								        #print('done')


								        #print('loading the bow model 1')

								        self.fsearchVorgangspassiv1.Load_BoW_Model('bagofwordshkldb1Vorgangspassiv.pkl', 'DataBaseOneZeroshkldb1Vorgangspassiv.hkl')

								        #print('done')


								        #print('loading the bow model 2')

								        self.fsearchVorgangspassiv2.Load_BoW_Model('bagofwordshkldb2Vorgangspassiv.pkl', 'DataBaseOneZeroshkldb2Vorgangspassiv.hkl')

								        #print('done')


								        #print('loading hkldbIndi_Conju 1..')

								        self.fsearchZustandspassiv1 = FASTsearch.FASTsearch('hkldb1Zustandspassiv.hkl')

								        #print('done')


								        #print('loading hkldbIndi_Conju 2..')

								        self.fsearchZustandspassiv2 = FASTsearch.FASTsearch('hkldb2Zustandspassiv.hkl')

								        #print('done')


								        #print('generating BoW Model 1..')

								        #self.fsearchZustandspassiv1.Gen_BoW_Model(20000, "word", punctuation = False)

								        #print('done')


								        #print('generating BoW Model 2..')

								        #self.fsearchZustandspassiv2.Gen_BoW_Model(20000, "word", punctuation = False)

								        #print('done')


								        #print('loading the bow model 1')

								        self.fsearchZustandspassiv1.Load_BoW_Model('bagofwordshkldb1Zustandspassiv.pkl', 'DataBaseOneZeroshkldb1Zustandspassiv.hkl')

								        #print('done')


								        #print('loading the bow model 2')

								        self.fsearchZustandspassiv2.Load_BoW_Model('bagofwordshkldb2Zustandspassiv.pkl', 'DataBaseOneZeroshkldb2Zustandspassiv.hkl')

								        #print('done')


								        import GS_Utils

								        #print('initializing the gs utils..')

								        self.gs = GS_Utils.GS_Utils('de_core_news_sm')

								        #print('done')


								        from SentGlue import SentGlueMach

								        #print('loading the Stochastic Gradient models..')

								        self.sgm = SentGlueMach('trainedSGD.pkl', 'bagofwords.pkl')

								        #print('done')

								        #print('initializing the SGM..')

								        self.sgm.initialize()

								        #print('done')


								        #print('oi thats the get_feature_names', self.fsearch1.vectorizer.get_feature_names())

								        #print('oi thats the get_feature_names', self.fsearch2.vectorizer.get_feature_names())


								    def replacePassivForms(self,sentences):


								        endsentences = []

								        sentencecount = 0

								        for sentence in sentences:

								            try:

								                sentencecount += 1

								                #print('processing sentence', sentencecount)


								                doc = self.nlp(' '.join(sentence))


								                verbs_of_sentence = []

								                wordindex_to_replace = []

								                count = 0

								                subjectofsentence = []

								                subjectindex = []

								                erindex = []

								                Erindex = []

								                undindex = []


								                for word in doc:


								                    count += 1


								                    #print(word.text)

								                    #print(word.dep_)


								                    if word.dep_ == 'sb':


								                        #print('oi')

								                        subjectofsentence.append(word.text)

								                        subjectindex.append(count)


								                    if word.text == 'er':

								                        erindex.append(count)

								                    if word.text == 'Er':

								                        Erindex.append(count)

								                    if word.text == 'und':

								                        undindex.append(count)


								                    if word.tag_[0] == 'V':

								                        verbs_of_sentence.append(word.text)

								                        wordindex_to_replace.append(count)


								                if len(verbs_of_sentence) == 1 and verbs_of_sentence[0] == ('wurde' or 'wird' or 'werden' or 'wirst' or 'werde' or 'war'):

								                    verbs_of_sentence[0] = 'bliblablubdudidu'


								                verbs_of_sentence_string = ' '.join(verbs_of_sentence)


								                length_verbs_of_sentence_string = len(verbs_of_sentence_string)


								                verbs_of_sentence_string += ' ' + str(length_verbs_of_sentence_string)

								                #print(verbs_of_sentence_string)

								                bestmatchesZustandspassiv1, matchindexZustandspassiv1 = self.fsearchZustandspassiv1.search_with_highest_multiplikation_Output(verbs_of_sentence_string, 1)


								                bestmatchesVorgangspassiv1, matchindexVorgangspassiv1 = self.fsearchVorgangspassiv1.search_with_highest_multiplikation_Output(verbs_of_sentence_string, 1)


								                #print('verbs of sentence string', verbs_of_sentence_string)

								                #print(len(verbs_of_sentence))

								                #print(matchindexVorgangspassiv1)

								                #print(matchindexZustandspassiv1)


								                vorgangORnot = 0

								                zustandORnot = 0

								                if (len(verbs_of_sentence) + 1) == matchindexVorgangspassiv1[1]:

								                    workindex = matchindexVorgangspassiv1[0]

								                    vorgangORnot = 1


								                if (len(verbs_of_sentence) + 1) == matchindexZustandspassiv1[1]:

								                    workindex = matchindexZustandspassiv1[0]

								                    zustandORnot = 1


								                #print(workindex)


								                #print(self.hkldbAktiv_All[matchindexVorgangspassiv1[0]])

								                #print(self.hkldbVorgangspassiv_All[matchindexVorgangspassiv1[0]])

								                #print(self.hkldbZustandspassiv_All[matchindexZustandspassiv1[0]])


								                formToReplace = []


								                if vorgangORnot == 1:

								                    completeform = self.hkldbVorgangspassiv_All[workindex]

								                    if len(verbs_of_sentence_string.split()) != len(completeform[0][0].split()):

								                        vorgangORnot = 0


								                if vorgangORnot == 1:

								                    completeform = self.hkldbVorgangspassiv_All[workindex]

								                    formToReplace = self.hkldbVorgangspassiv_All[workindex][1][0].split()[-2:]


								                    #print('formtoreplace vorgang',formToReplace)

								                    #print('complete form', completeform)


								                    formToReplace = '3. Person Singular ' + ' '.join(formToReplace)


								                    #print(formToReplace)


								                    thrdPersonAktivindex = self.fsearchAktiv2.search_with_highest_multiplikation_Output(formToReplace, 1)[0]


								                    thrdPersonAktiv = self.hkldbAktiv_All[thrdPersonAktivindex[0]][0][0].split()[:-1]


								                    #print(thrdPersonAktiv)


								                    thrdPersonAktiv = ' '.join(thrdPersonAktiv)


								                    dalist = verbs_of_sentence_string.split()[:-1]


								                    for verb in dalist:

								                        #print(sentence)

								                        #print(index)


								                        sentence.remove(verb)


								                    thereisasubjectEr = 0


								                    for index in subjectindex:

								                        for ind in undindex:

								                            if index - 1 == ind:

								                                if index - 2 == ('er' or 'Er'):

								                                    thereisasubjectEr = 1

								                            if index + 1 == ind:

								                                if index + 2 == 'er' or index + 2 == 'Er':

								                                    thereisasubjectEr = 1

								                    #print('subjectofsentence', subjectofsentence)

								                    thereisasubjectich = 0

								                    thereisasubjectdu = 0

								                    thereisasubjectihr = 0

								                    thereisasubjectwir = 0

								                    for word in subjectofsentence:

								                        if word == 'er' or word == 'Er':

								                            thereisasubjectEr = 1

								                        if word == 'ich':

								                            thereisasubjectich = 1

								                        if word == 'du':

								                            thereisasubjectdu = 1

								                        if word == 'ihr':

								                            thereisasubjectihr = 1

								                        if word == 'wir':

								                            thereisasubjectwir = 1

								                    #print('there is a subjecter', thereisasubjectEr)

								                    if thereisasubjectEr == 1:

								                        try:

								                            sentence.remove('Er')

								                        except:

								                            sentence.remove('er')

								                        sentence.append('ihn')

								                    if thereisasubjectich == 1:

								                        sentence.remove('ich')

								                        sentence.append('mich')

								                    if thereisasubjectdu == 1:

								                        sentence.remove('du')

								                        sentence.append('dich')

								                    if thereisasubjectihr == 1:

								                        sentence.remove('ihr')

								                        sentence.append('euch')

								                    if thereisasubjectwir == 1:

								                        sentence.remove('wir')

								                        sentence.append('uns')


								                    sentence.append(thrdPersonAktiv)

								                    #print('sentence in the vorgangornot', sentence)

								                    jemandornot = 1

								                    wordstodelete = []

								                    for n in range(len(sentence) - 1):

								                        if sentence[n] == 'von':

								                            if sentence[n + 1] == 'ihr':

								                                sentence[n + 1] = 'sie'

								                                wordstodelete.append(n)

								                                jemandornot = 0

								                            if sentence[n + 1] == 'ihm':

								                                sentence[n + 1] = 'er'

								                                wordstodelete.append(n)

								                                jemandornot = 0

								                            import spacy

								                            nlp = spacy.load('de_core_news_sm')

								                            token1 = nlp(sentence[n - 1])

								                            token2 = nlp(sentence[n + 1])

								                            for word in token1:

								                                if word.tag_ != 'NN' and word.tag_ != 'NE':

								                                    for word in token2:

								                                        if word.tag_ == 'NN' or word.tag_ == 'NE':

								                                            wordstodelete.append(n)


								                                            jemandornot = 0

								                            if sentence[n + 1] == 'dem' or sentence[n + 1] == 'einem':


								                                token3 = nlp(sentence[n-1])

								                                for word in token3:

								                                    if word.tag_ != 'NN' and word.tag_ != 'NE':

								                                        sentence[n + 1] = 'ein'

								                                        wordstodelete.append(n)

								                                        jemandornot = 0

								                            if sentence[n + 1] == 'der' or sentence[n + 1] == 'einer':

								                                token4 = nlp(sentence[n-1])

								                                for word in token4:

								                                    if word.tag_ != 'NN' and word.tag_ != 'NE':

								                                        sentence[n + 1] = 'eine'

								                                        wordstodelete.append(n)

								                                        jemandornot = 0


								                        if sentence[n] == 'vom':


								                            sentence[n] = 'ein'

								                            jemandornot = 0

								                    for index in wordstodelete[::-1]:

								                        del sentence[index]

								                    if jemandornot == 1:

								                        sentence.append('jemand')


								                    #print('sentence checkpoint 2', sentence)


								                    #print('get the tuples and triples to check..')

								                    tuplesTocheck, triplesTocheck, quadruplesToCheck = self.gs.GetTuplesinSentence(sentence)

								                    #print('done')

								                    #print(tuplesTocheck, triplesTocheck)


								                    grammpiecessentence = self.gs.createTupleofGrammarpieces( sentence, tuplesTocheck, triplesTocheck, quadruplesToCheck)


								                    if len(grammpiecessentence) > 7:

								                        print('A sentence is too long, too many permutations. \n piping wrong grammar..')

								                        endsentences.append(' '.join(grammpiecessentence).split())


								                    else:


								                        #print('the grammpiecessentence', grammpiecessentence)

								                        #print('genrating the permutations')

								                        permutations = self.sgm.GeneratePermutationsOfSentence(grammpiecessentence)

								                        #print('done')

								                        #print(permutations)

								                        #if (len(tuplesTocheck) != 0) or (len(triplesTocheck) != 0):

								                        #    print('filtering the permutations based on the tuples and triples..')

								                        #    filteredpermutations = self.gs.filterpermutationsaccordingtotuples(permutations, tuplesTocheck, triplesTocheck)

								                        #    print('done')

								                        #else:

								                        #    print('there are no triples or tuples to check..')

								                        #    filteredpermutations = permutations


								                        sentencesToCheck = []

								                        for sentence in permutations:

								                            sentencesToCheck.append(' '.join(sentence))


								                        #print('sentencesToCheck', sentencesToCheck)

								                        #print('classifying the probability for right grammar in the filtered permutations..')

								                        #print(' '.join(sentence))

								                        endsentence = self.sgm.GetBestSentenceFromSentencesAccordingToGrammar(sentencesToCheck, ' '.join(sentence))

								                        #print('done')


								                        #print('the endsentence', endsentence)

								                        endsentences.append(endsentence.split())


								                #count1 = 0


								                #print(subjectindex)

								                #subjectindex = subjectindex[0]

								                #if subjectindex != 0:

								                    #for word in sentence[subjectindex - 1:subjectindex + 1]:

								                        #count1 += 1

								                        #if word == 'und':

								                            #thereIsanUnd = count1

								                #if subjectindex == 0:

								                    #for word in sentence[subjectindex:subjectindex + 1]:

								                        #count1 += 1

								                        #if word == 'und':

								                            #thereIsanUnd = count1

								                #thereisanEr = 0

								                #if sentence[subjectindex - 1 + thereIsanUnd] == 'er' or sentence[subjectindex - 1 + thereIsanUnd] == 'Er':


								                    #thereisanEr = 1


								                #if thereisanEr == 1:


								                    #sentence.remove('Er')

								                    #sentence.remove('er')

								                    #sentence.append('ihn')


								                #print('zustandornot',zustandORnot)

								                #print('vorgang', vorgangORnot)


								                if zustandORnot == 1:

								                    completeform = self.hkldbZustandspassiv_All[workindex]

								                    if len(verbs_of_sentence_string.split()) != len(completeform[0][0].split()):

								                        zustandORnot = 0


								                if zustandORnot == 1:

								                    #completeform = self.hkldbZustandspassiv_All[workindex]

								                    formToReplace = self.hkldbZustandspassiv_All[workindex][1][0].split()[-2:]

								                    formToReplace = '3. Person Singular ' + ' '.join(formToReplace)

								                    #print('formtoreplace zustand',formToReplace)

								                    #print('complete form', completeform)


								                    thrdPersonAktivindex = self.fsearchAktiv2.search_with_highest_multiplikation_Output(formToReplace, 1)[0]


								                    thrdPersonAktiv = self.hkldbAktiv_All[thrdPersonAktivindex[0]][0][0].split()[:-1]


								                    thrdPersonAktiv = ' '.join(thrdPersonAktiv)


								                    for verb in verbs_of_sentence_string.split()[:-1]:

								                        #print(sentence)

								                        #print(index)


								                        sentence.remove(verb)


								                    thereisasubjectEr = 0


								                    for index in subjectindex:

								                        for ind in undindex:

								                            if index - 1 == ind:

								                                if index - 2 == ('er' or 'Er'):

								                                    thereisasubjectEr = 1

								                            if index + 1 == ind:

								                                if index + 2 == 'er' or index + 2 == 'Er':

								                                    thereisasubjectEr = 1

								                    #print('subjectofsentence', subjectofsentence)


								                    thereisasubjectich = 0

								                    thereisasubjectdu = 0

								                    thereisasubjectihr = 0

								                    thereisasubjectwir = 0

								                    for word in subjectofsentence:

								                        if word == 'er' or word == 'Er':

								                            thereisasubjectEr = 1

								                        if word == 'ich':

								                            thereisasubjectich = 1

								                        if word == 'du':

								                            thereisasubjectdu = 1

								                        if word == 'ihr':

								                            thereisasubjectihr = 1

								                        if word == 'wir':

								                            thereisasubjectwir = 1

								                    if thereisasubjectEr == 1:

								                        try:

								                            sentence.remove('Er')

								                        except:

								                            sentence.remove('er')

								                        sentence.append('ihn')


								                    if thereisasubjectich == 1:

								                        sentence.remove('ich')

								                        sentence.append('mich')

								                    if thereisasubjectdu == 1:

								                        sentence.remove('du')

								                        sentence.append('dich')

								                    if thereisasubjectihr == 1:

								                        sentence.remove('ihr')

								                        sentence.append('euch')

								                    if thereisasubjectwir == 1:

								                        sentence.remove('wir')

								                        sentence.append('uns')


								                    sentence.append(thrdPersonAktiv)


								                    jemandornot = 1

								                    wordstodelete = []

								                    for n in range(len(sentence) - 1):

								                        if sentence[n] == 'von':

								                            if sentence[n + 1] == 'ihr':

								                                sentence[n + 1] = 'sie'

								                                wordstodelete.append(n)

								                                jemandornot = 0

								                            if sentence[n + 1] == 'ihm':

								                                sentence[n + 1] = 'er'

								                                wordstodelete.append(n)

								                                jemandornot = 0


								                            import spacy

								                            nlp = spacy.load('de_core_news_sm')

								                            token1 = nlp(sentence[n - 1])

								                            token2 = nlp(sentence[n + 1])

								                            for word in token1:

								                                if word.tag_ != 'NN' and word.tag_ != 'NE':

								                                    for word in token2:

								                                        if word.tag_ == 'NN' or word.tag_ == 'NE':

								                                            wordstodelete.append(n)


								                                            jemandornot = 0

								                            if sentence[n + 1] == 'dem' or sentence[n + 1] == 'einem':


								                                token3 = nlp(sentence[n-1])

								                                for word in token3:

								                                    if word.tag_ != 'NN' and word.tag_ != 'NE':

								                                        sentence[n + 1] = 'ein'

								                                        wordstodelete.append(n)

								                                        jemandornot = 0

								                            if sentence[n + 1] == 'der' or sentence[n + 1] == 'einer':

								                                token4 = nlp(sentence[n-1])

								                                for word in token4:

								                                    if word.tag_ != 'NN' and word.tag_ != 'NE':

								                                        sentence[n + 1] = 'eine'

								                                        wordstodelete.append(n)

								                                        jemandornot = 0


								                        if sentence[n] == 'vom':


								                            sentence[n] = 'ein'

								                            jemandornot = 0


								                    for index in wordstodelete[::-1]:

								                        del sentence[index]


								                    if jemandornot == 1:

								                        sentence.append('jemand')


								                    #print(sentence)


								                    #print('get the tuples and triples to check..')

								                    tuplesTocheck, triplesTocheck, quadruplesTocheck = self.gs.GetTuplesinSentence(sentence)

								                    #print('done')

								                    #print(tuplesTocheck, triplesTocheck)


								                    grammpiecessentence = self.gs.createTupleofGrammarpieces( sentence, tuplesTocheck, triplesTocheck, quadruplesTocheck)


								                    if len(grammpiecessentence) > 7:

								                        print('A sentence is too long, too many permutations. \n piping wrong grammar..')

								                        endsentences.append(' '.join(grammpiecessentence).split())


								                    else:


								                        #print('the grammpiecessentence', grammpiecessentence)

								                        #print('genrating the permutations')

								                        permutations = self.sgm.GeneratePermutationsOfSentence(grammpiecessentence)

								                        #print('done')

								                        #print(permutations)

								                        #if (len(tuplesTocheck) != 0) or (len(triplesTocheck) != 0):

								                        #    print('filtering the permutations based on the tuples and triples..')

								                        #    filteredpermutations = self.gs.filterpermutationsaccordingtotuples(permutations, tuplesTocheck, triplesTocheck)

								                        #    print('done')

								                        #else:

								                        #    print('there are no triples or tuples to check..')

								                        #    filteredpermutations = permutations


								                        sentencesToCheck = []

								                        for sentence in permutations:

								                            sentencesToCheck.append(' '.join(sentence))


								                        #print('sentencesToCheck', sentencesToCheck)

								                        #print('classifying the probability for right grammar in the filtered permutations..')

								                        #print(' '.join(sentence))

								                        endsentence = self.sgm.GetBestSentenceFromSentencesAccordingToGrammar(sentencesToCheck, ' '.join(sentence))

								                        #print('done')


								                        #print('the endsentence', endsentence)

								                        endsentences.append(endsentence.split())


								                if zustandORnot == 0 and vorgangORnot == 0:

								                    #print('it is coming to the else')

								                    endsentences.append(sentence)


								            except:

								                print('the sentence ' + str(sentence) + ' caused an error in the module passive2active')

								                if endsentences[-1] == sentence:

								                    pass

								                else:

								                    endsentences.append(sentence)


								        return endsentences


								            # Vorgangspassiv wird auf selbe Zeit gemappt, 3. Person Singular.

								            # Zustandspassiv: Immer eine Zeit dahinter. D.h.

								            # Präsens => Präteritum, Präteritum => Perfekt