# First, check what kind of database Leo has parsed.
# Then I can see what kind of DB still needs to be created, or what still needs to be extended.
# If a form is in the DB as conjunctive but not as indicative (conjunctive and indicative are often
# identical, in which case swapping makes no sense), then replace it.
import spacy
import nltk
from nltk.stem.snowball import SnowballStemmer
import hickle as hkl
import FASTsearch
stemmer = SnowballStemmer("german")
class ConjunctSolve(object):
    """Replace German conjunctive (Konjunktiv) verb forms with their indicative forms,
    using an indicative/conjunctive database indexed via FASTsearch."""

    def __init__(self, hklDatabaseDir_Indi_Conju, hklDatabaseDir_Indi_Conju_All):

        if hklDatabaseDir_Indi_Conju_All is not None:
            self.Indi_ConjuDB_All = hkl.load(hklDatabaseDir_Indi_Conju_All)

        #print('loading the german spacy model..')
        self.nlp = spacy.load('de_core_news_sm')
        #print('done')

        #print('loading the stemmer..')
        self.stemmer = SnowballStemmer("german")
        #print('done')

        return
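    # Note (assumption): each line of the CSV read below is expected to be a Python
    # literal that eval() turns into a pair of entries, roughly of the form
    #   [['ich habe du hast er/sie/es hat ...'], ['ich habe du habest er/sie/es habe ...']]
    # where entry 0 holds the indicative forms and entry 1 the conjunctive forms;
    # only the first string of each entry is indexed for FASTsearch.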
    def create_hklDB_from_csv(self, csvDbDir, StemOrNot):
        """Build the hkl dumps (full DB, indicative column, conjunctive column) from a CSV file."""

        with open(csvDbDir) as lines:
            self.Indi_ConjuDB_All = []
            for line in lines:
                #print(line)
                self.Indi_ConjuDB_All.append(list(eval(line)))

        self.hkldbIndi_Conju1 = []
        self.hkldbIndi_Conju2 = []

        counter = 0
        for n in range(len(self.Indi_ConjuDB_All)):
            counter += 1
            if counter % 1000 == 0:
                print(counter)

            self.hkldbIndi_Conju1.append([self.Indi_ConjuDB_All[n][0][0]])
            self.hkldbIndi_Conju2.append([self.Indi_ConjuDB_All[n][1][0]])

        print('creating the hkl dump of Indi_ConjuDBAll')
        hkl.dump(self.Indi_ConjuDB_All, 'hkldbIndi_Conju_All.hkl', mode='w', compression='lzf')
        print('done..')

        print('Creating the hkl dump of Indi_ConjuDB 1')
        hkl.dump(self.hkldbIndi_Conju1, 'hkldbIndi_Conju1.hkl', mode='w', compression='lzf')
        #print('done..')

        print('Creating the hkl dump of Indi_ConjuDB 2')
        hkl.dump(self.hkldbIndi_Conju2, 'hkldbIndi_Conju2.hkl', mode='w', compression='lzf')
        #print('done..')

        return 'done'
    def load_DB_into_FASTsearch(self):

        #print('loading the hkldbIndi_Conju1...')
        self.hkldbIndi_Conju1 = hkl.load('hkldbIndi_Conju1.hkl')
        #print('done')

        #print('loading the hkldbIndi_Conju2...')
        self.hkldbIndi_Conju2 = hkl.load('hkldbIndi_Conju2.hkl')
        #print('done')

        #print('loading hkldbIndi_Conju 1 into FASTsearch..')
        self.fsearch1 = FASTsearch.FASTsearch('hkldbIndi_Conju1.hkl')
        #print('done')

        #print('loading hkldbIndi_Conju 2 into FASTsearch..')
        self.fsearch2 = FASTsearch.FASTsearch('hkldbIndi_Conju2.hkl')
        #print('done')

        #print('generating BoW Model 1..')
        #self.fsearch1.Gen_BoW_Model(50000, "word", punctuation = False)
        #print('done')

        #print('generating BoW Model 2..')
        #self.fsearch2.Gen_BoW_Model(50000, "word", punctuation = False)
        #print('done')

        #print('loading the bow model 1')
        self.fsearch1.Load_BoW_Model('bagofwordshkldbIndi_Conju1.pkl', 'DataBaseOneZeroshkldbIndi_Conju1.hkl')
        #print('done')

        #print('loading the bow model 2')
        self.fsearch2.Load_BoW_Model('bagofwordshkldbIndi_Conju2.pkl', 'DataBaseOneZeroshkldbIndi_Conju2.hkl')
        #print('done')

        #print('oi thats the get_feature_names', self.fsearch1.vectorizer.get_feature_names())
        #print('oi thats the get_feature_names', self.fsearch2.vectorizer.get_feature_names())
    def replaceConjunctives(self, sentences):
        """Replace conjunctive verb forms in tokenized sentences with their indicative forms."""

        outsentences = []
        sentencecount = 0

        for sentence in sentences:
            sentencecount += 1
            #print('processing sentence', sentencecount)

            doc = self.nlp(' '.join(sentence))

            verbs_of_sentence = []
            wordindex_to_replace = []

            count = 0
            thereisanIch = 0
            thereisaDu = 0
            thereisaWir = 0
            thereisanIhr = 0
            thereisaSie = 0

            # Collect the verbs of the sentence and note which personal pronouns occur.
            for word in doc:
                count += 1

                if word.text == 'ich' or word.text == 'Ich':
                    thereisanIch = 1
                if word.text == 'du' or word.text == 'Du':
                    thereisaDu = 1
                if word.text == 'wir' or word.text == 'Wir':
                    thereisaWir = 1
                if word.text == 'ihr' or word.text == 'Ihr':
                    thereisanIhr = 1
                if word.text == 'sie' or word.text == 'Sie':
                    thereisaSie = 1

                if word.tag_[0] == 'V':
                    #print(word.tag_)
                    #print(word.text)
                    verbs_of_sentence.append(word.text)

            # Remember the (1-based) positions of these verbs in the token list.
            for verb in verbs_of_sentence:
                verbcounter = 0
                for word in sentence:
                    verbcounter += 1
                    if word == verb or word[:-1] == verb or word[1:] == verb:
                        wordindex_to_replace.append(verbcounter)

            # 'habe' and 'sei' are ambiguous in person, so append the detected
            # subject pronoun to sharpen the database lookup.
            for n in range(len(verbs_of_sentence)):
                if verbs_of_sentence[n] == 'habe' or verbs_of_sentence[n] == 'sei':
                    if thereisanIch == 0:
                        verbs_of_sentence.append('er/sie/es')
                    if thereisanIch == 1:
                        verbs_of_sentence.append('ich')
                    if thereisaDu == 1:
                        verbs_of_sentence.append('du')
                    if thereisaWir == 1:
                        verbs_of_sentence.append('wir')
                    if thereisanIhr == 1:
                        verbs_of_sentence.append('ihr')
                    if thereisaSie == 1:
                        verbs_of_sentence.append('sie')

            nothingtodo = 0
            if nothingtodo == 0:
                verbs_of_sentence_string = ' '.join(verbs_of_sentence)

                bestmatches2, matchindex2 = self.fsearch2.search_with_highest_multiplikation_Output(verbs_of_sentence_string, 1)
                #print(bestmatches2, matchindex2)

                indicative_form = self.hkldbIndi_Conju1[matchindex2[0]][0].split()
                conjunctive_form = self.hkldbIndi_Conju2[matchindex2[0]][0].split()
                #print('verbsofsentencestring', verbs_of_sentence_string)
                #print('indikativform', indicative_form)
                #print('conjunctive_form', conjunctive_form)

                # Check whether any verb of the sentence really matches a conjunctive form.
                therewasaconjunctive = 0
                for n in range(len(conjunctive_form)):
                    for m in range(len(verbs_of_sentence)):
                        if conjunctive_form[n] == verbs_of_sentence[m] and n != 0:
                            therewasaconjunctive = 1

                if therewasaconjunctive == 1:
                    count = 0
                    exchangeindizee = []
                    for verb in conjunctive_form:
                        count += 1
                        count2 = 0
                        for ver in verbs_of_sentence:
                            count2 += 1
                            if verb == ver:
                                exchangeindizee.append([count, count2])

                    #print('indicative form', indicative_form)
                    #print('the exchangeindizee ', exchangeindizee)
                    #print('verbs of sentence before split', verbs_of_sentence)
                    #print('before exchange')
                    #print('conjunctive form', conjunctive_form)
                    #print('verbs of sentence', verbs_of_sentence)
                    #print('indicative form', indicative_form)

                    # Swap each matched conjunctive verb for its indicative counterpart,
                    # skipping database entries that are only pronouns.
                    for indizee in exchangeindizee:
                        #print('indizee', indizee)
                        #print(indicative_form[indizee[0] - 1])
                        #print(len(verbs_of_sentence))
                        if indicative_form[indizee[0] - 1] not in ['euch', 'ihr', 'wir', 'sie', 'du', 'er/sie/es']:
                            verbs_of_sentence[indizee[1] - 1] = indicative_form[indizee[0] - 1]
                    #print('verbs of sentence after change', verbs_of_sentence)

                donothing = 0
                if therewasaconjunctive == 0:
                    donothing = 1

                #print(conjunctive_form)
                #print(conjunctive_form[0].split())
                #print(conjunctive_form[0].split()[0])
                if thereisanIch == 1 and conjunctive_form[0].split()[0] == 'er/sie/es':
                    donothing = 1

                if donothing == 0:
                    #print(wordindex_to_replace)
                    if len(verbs_of_sentence) < len(wordindex_to_replace):
                        thelen = len(verbs_of_sentence)
                    else:
                        thelen = len(wordindex_to_replace)

                    #print('cs sentence and verbsofsentence', sentence, verbs_of_sentence, wordindex_to_replace)

                    # Write the (possibly replaced) verbs back into the sentence,
                    # keeping attached punctuation and parentheses intact.
                    for n in range(thelen):
                        #print(indicative_form, wordindex_to_replace, sentence, verbs_of_sentence)
                        wasreplaced = 0

                        if sentence[wordindex_to_replace[n] - 1][-1] == ',':
                            changesent = list(sentence[wordindex_to_replace[n] - 1])
                            changesent[:-1] = list(verbs_of_sentence[n])
                            sentence[wordindex_to_replace[n] - 1] = ''.join(changesent)
                            wasreplaced = 1

                        if sentence[wordindex_to_replace[n] - 1][-1] == '.':
                            changesent = list(sentence[wordindex_to_replace[n] - 1])
                            changesent[:-1] = list(verbs_of_sentence[n])
                            sentence[wordindex_to_replace[n] - 1] = ''.join(changesent)
                            wasreplaced = 1

                        if sentence[wordindex_to_replace[n] - 1][-1] == ')':
                            changesent = list(sentence[wordindex_to_replace[n] - 1])
                            changesent[:-1] = list(verbs_of_sentence[n])
                            sentence[wordindex_to_replace[n] - 1] = ''.join(changesent)
                            wasreplaced = 1

                        if sentence[wordindex_to_replace[n] - 1][0] == '(':
                            changesent = list(sentence[wordindex_to_replace[n] - 1])
                            changesent[1:] = list(verbs_of_sentence[n])
                            sentence[wordindex_to_replace[n] - 1] = ''.join(changesent)
                            wasreplaced = 1

                        if wasreplaced == 0:
                            sentence[wordindex_to_replace[n] - 1] = verbs_of_sentence[n]

            #print(word.tag_)
            outsentences.append(sentence)
            #print('the endsentence', sentence)

        return outsentences
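
# Usage sketch (assumption): the hkl/BoW file names follow the dumps written by
# create_hklDB_from_csv() and loaded in load_DB_into_FASTsearch(); the CSV path
# and the example sentence below are made up for illustration. Input sentences
# are lists of tokens, as replaceConjunctives() expects.
if __name__ == '__main__':
    cs = ConjunctSolve(None, None)

    # One-time preprocessing: build the hkl dumps from a CSV database
    # (hypothetical path), then load them into FASTsearch.
    #cs.create_hklDB_from_csv('Indi_Conju_DB.csv', False)
    cs.load_DB_into_FASTsearch()

    # Tokenized example sentence in reported speech (Konjunktiv I).
    sentences = [['Er', 'sagte', ',', 'er', 'habe', 'keine', 'Zeit', '.']]
    print(cs.replaceConjunctives(sentences))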