# First inspect what kind of database Leo parsed; from that, decide which kind
# of DB still needs to be created or which parts must be extended.
# Replacement rule: if a form is in the DB as conjunctive (subjunctive) but not
# as indicative, replace it -- conjunctive and indicative are often identical,
# in which case an exchange makes no sense.

import spacy
import nltk
from nltk.stem.snowball import SnowballStemmer
import hickle as hkl
import FASTsearch

# Module-level stemmer kept for backward compatibility with existing importers;
# ConjunctSolve also creates its own instance in __init__.
stemmer = SnowballStemmer("german")


class ConjunctSolve(object):
    """Replace German conjunctive (subjunctive) verb forms with indicative ones.

    Operates on pre-tokenized sentences (lists of word strings).  Verb forms
    are looked up in two parallel hickle databases -- one holding indicative
    forms, one holding the corresponding conjunctive forms -- matched via
    FASTsearch over pre-built bag-of-words models.
    """

    def __init__(self, hklDatabaseDir_Indi_Conju, hklDatabaseDir_Indi_Conju_All):
        """Load the spaCy model, the stemmer and (optionally) the combined DB.

        hklDatabaseDir_Indi_Conju     -- currently unused; kept for interface
                                         compatibility.
        hklDatabaseDir_Indi_Conju_All -- path of the combined indicative/
                                         conjunctive hickle dump; loaded when
                                         not None.
        """
        if hklDatabaseDir_Indi_Conju_All is not None:
            self.Indi_ConjuDB_All = hkl.load(hklDatabaseDir_Indi_Conju_All)
        # German spaCy model, used for POS tagging (verb detection).
        self.nlp = spacy.load('de_core_news_sm')
        self.stemmer = SnowballStemmer("german")
        return

    def create_hklDB_from_csv(self, csvDbDir, StemOrNot):
        """Convert a csv database into three hkl dumps (all / column 1 / column 2).

        Each input line must be a Python literal of the form
        ``[[indicative, ...], [conjunctive, ...]]``.

        csvDbDir  -- path of the csv file to read.
        StemOrNot -- currently unused; kept for interface compatibility.

        Returns the string 'done'.
        """
        with open(csvDbDir) as lines:
            self.Indi_ConjuDB_All = []
            for line in lines:
                # SECURITY NOTE(review): eval() executes arbitrary code from
                # the csv file -- only run this on trusted input (consider
                # ast.literal_eval as a safe drop-in for literal rows).
                self.Indi_ConjuDB_All.append(list(eval(line)))

        self.hkldbIndi_Conju1 = []
        self.hkldbIndi_Conju2 = []

        counter = 0
        for n in range(len(self.Indi_ConjuDB_All)):
            counter += 1
            if counter % 1000 == 0:
                # Progress indicator for large databases.
                print(counter)
            # Keep only the first element of each column (the entry headword).
            self.hkldbIndi_Conju1.append([self.Indi_ConjuDB_All[n][0][0]])
            self.hkldbIndi_Conju2.append([self.Indi_ConjuDB_All[n][1][0]])

        print('creating the hkl dump of Indi_ConjuDBAll')
        hkl.dump(self.Indi_ConjuDB_All, 'hkldbIndi_Conju_All.hkl', mode='w', compression='lzf')
        print('done..')

        print('Creating the hkl dump of Indi_ConjuDB 1')
        hkl.dump(self.hkldbIndi_Conju1, 'hkldbIndi_Conju1.hkl', mode='w', compression='lzf')

        print('Creating the hkl dump of Indi_ConjuDB 2')
        hkl.dump(self.hkldbIndi_Conju2, 'hkldbIndi_Conju2.hkl', mode='w', compression='lzf')

        return 'done'

    def load_DB_into_FASTsearch(self):
        """Load the two hkl dumps and their bag-of-words models into FASTsearch.

        Expects ``hkldbIndi_Conju1.hkl`` / ``hkldbIndi_Conju2.hkl`` (as written
        by create_hklDB_from_csv) plus the matching pre-generated BoW model
        files in the working directory.
        """
        self.hkldbIndi_Conju1 = hkl.load('hkldbIndi_Conju1.hkl')
        self.hkldbIndi_Conju2 = hkl.load('hkldbIndi_Conju2.hkl')

        self.fsearch1 = FASTsearch.FASTsearch('hkldbIndi_Conju1.hkl')
        self.fsearch2 = FASTsearch.FASTsearch('hkldbIndi_Conju2.hkl')

        # The BoW models were generated once with:
        #   self.fsearch1.Gen_BoW_Model(50000, "word", punctuation=False)
        #   self.fsearch2.Gen_BoW_Model(50000, "word", punctuation=False)
        # and are only loaded from disk here.
        self.fsearch1.Load_BoW_Model('bagofwordshkldbIndi_Conju1.pkl', 'DataBaseOneZeroshkldbIndi_Conju1.hkl')
        self.fsearch2.Load_BoW_Model('bagofwordshkldbIndi_Conju2.pkl', 'DataBaseOneZeroshkldbIndi_Conju2.hkl')

    def replaceConjunctives(self, sentences):
        """Replace conjunctive verb forms with indicative ones, per sentence.

        sentences -- list of tokenized sentences (each a list of word strings).
                     NOTE: the inner lists are modified in place; the returned
                     list contains the same list objects.

        Returns the list of (possibly modified) sentences.
        """
        outsentences = []
        sentencecount = 0
        for sentence in sentences:
            sentencecount += 1
            doc = self.nlp(' '.join(sentence))

            verbs_of_sentence = []
            wordindex_to_replace = []
            count = 0
            # Pronoun presence flags; they steer person disambiguation below.
            thereisanIch = 0
            thereisaDu = 0
            thereisaWir = 0
            thereisanIhr = 0
            thereisaSie = 0
            for word in doc:
                count += 1
                if word.text == 'ich' or word.text == 'Ich':
                    thereisanIch = 1
                if word.text == 'du' or word.text == 'Du':
                    thereisaDu = 1
                if word.text == 'wir' or word.text == 'Wir':
                    thereisaWir = 1
                if word.text == 'ihr' or word.text == 'Ihr':
                    thereisanIhr = 1
                if word.text == 'sie' or word.text == 'Sie':
                    thereisaSie = 1
                # STTS verb tags all start with 'V' (VVFIN, VAFIN, VMFIN, ...).
                if word.tag_[0] == 'V':
                    verbs_of_sentence.append(word.text)

            # Map each detected verb back to its 1-based position in the raw
            # token list.  The fuzzy compares also match tokens that carry one
            # extra leading/trailing character (e.g. attached punctuation).
            for verb in verbs_of_sentence:
                verbcounter = 0
                for word in sentence:
                    verbcounter += 1
                    if word == verb or word[:-1] == verb or word[1:] == verb:
                        wordindex_to_replace.append(verbcounter)

            # 'habe'/'sei' are ambiguous in person; append the pronouns found
            # in the sentence so the DB lookup can disambiguate.
            for n in range(len(verbs_of_sentence)):
                if verbs_of_sentence[n] == 'habe' or verbs_of_sentence[n] == 'sei':
                    if thereisanIch == 0:
                        verbs_of_sentence.append('er/sie/es')
                    if thereisanIch == 1:
                        verbs_of_sentence.append('ich')
                    if thereisaDu == 1:
                        verbs_of_sentence.append('du')
                    if thereisaWir == 1:
                        verbs_of_sentence.append('wir')
                    if thereisanIhr == 1:
                        verbs_of_sentence.append('ihr')
                    if thereisaSie == 1:
                        verbs_of_sentence.append('sie')

            nothingtodo = 0
            if nothingtodo == 0:  # kept from original control flow; always taken
                verbs_of_sentence_string = ' '.join(verbs_of_sentence)
                # Best match in the conjunctive DB; matchindex2[0] indexes both
                # parallel databases.
                bestmatches2, matchindex2 = self.fsearch2.search_with_highest_multiplikation_Output(verbs_of_sentence_string, 1)
                indicative_form = self.hkldbIndi_Conju1[matchindex2[0]][0].split()
                conjunctive_form = self.hkldbIndi_Conju2[matchindex2[0]][0].split()

                # Did any sentence verb match a conjunctive form other than the
                # headword at DB position 0?
                therewasaconjunctive = 0
                for n in range(len(conjunctive_form)):
                    for m in range(len(verbs_of_sentence)):
                        if conjunctive_form[n] == verbs_of_sentence[m] and n != 0:
                            therewasaconjunctive = 1

                if therewasaconjunctive == 1:
                    # Collect [db position, sentence-verb position] pairs (1-based).
                    count = 0
                    exchangeindizee = []
                    for verb in conjunctive_form:
                        count += 1
                        count2 = 0
                        for ver in verbs_of_sentence:
                            count2 += 1
                            if verb == ver:
                                exchangeindizee.append([count, count2])
                    for indizee in exchangeindizee:
                        # Never substitute a bare pronoun for a verb.
                        if indicative_form[indizee[0] - 1] not in ['euch', 'ihr', 'wir', 'sie', 'du', 'er/sie/es']:
                            verbs_of_sentence[indizee[1] - 1] = indicative_form[indizee[0] - 1]

                donothing = 0
                if therewasaconjunctive == 0:
                    donothing = 1
                # With an explicit 'ich' in the sentence, a third-person DB
                # entry would be the wrong person -- leave the sentence alone.
                if thereisanIch == 1 and conjunctive_form[0].split()[0] == 'er/sie/es':
                    donothing = 1

                if donothing == 0:
                    # Guard against the two index lists having drifted apart.
                    if len(verbs_of_sentence) < len(wordindex_to_replace):
                        thelen = len(verbs_of_sentence)
                    else:
                        thelen = len(wordindex_to_replace)
                    for n in range(thelen):
                        # Write the (possibly exchanged) verb back into the
                        # sentence, preserving a leading '(' or a trailing
                        # ',', '.' or ')'.  NOTE(review): these are deliberate
                        # independent ifs, so a token such as '(war)' passes
                        # through more than one branch, as in the original.
                        wasreplaced = 0
                        if sentence[wordindex_to_replace[n] - 1][-1] == ',':
                            changesent = list(sentence[wordindex_to_replace[n] - 1])
                            changesent[:-1] = list(verbs_of_sentence[n])
                            sentence[wordindex_to_replace[n] - 1] = ''.join(changesent)
                            wasreplaced = 1
                        if sentence[wordindex_to_replace[n] - 1][-1] == '.':
                            changesent = list(sentence[wordindex_to_replace[n] - 1])
                            changesent[:-1] = list(verbs_of_sentence[n])
                            sentence[wordindex_to_replace[n] - 1] = ''.join(changesent)
                            wasreplaced = 1
                        if sentence[wordindex_to_replace[n] - 1][-1] == ')':
                            changesent = list(sentence[wordindex_to_replace[n] - 1])
                            changesent[:-1] = list(verbs_of_sentence[n])
                            sentence[wordindex_to_replace[n] - 1] = ''.join(changesent)
                            wasreplaced = 1
                        if sentence[wordindex_to_replace[n] - 1][0] == '(':
                            changesent = list(sentence[wordindex_to_replace[n] - 1])
                            changesent[1:] = list(verbs_of_sentence[n])
                            sentence[wordindex_to_replace[n] - 1] = ''.join(changesent)
                            wasreplaced = 1
                        if wasreplaced == 0:
                            sentence[wordindex_to_replace[n] - 1] = verbs_of_sentence[n]

            outsentences.append(sentence)
        return outsentences