|
|
# TODO: First check what kind of database Leo parsed.
# Then determine what kind of DB still has to be created, or what has to be extended.


# Replacement rule: if a verb form is in the DB as conjunctive but not as
# indicative, replace it (conjunctive and indicative are often identical,
# in which case swapping makes no sense).
-
- import spacy
- import nltk
- from nltk.stem.snowball import SnowballStemmer
-
- import hickle as hkl
- import FASTsearch
-
- stemmer = SnowballStemmer("german")
-
-
- class ConjunctSolve(object):
-
- def __init__(self, hklDatabaseDir_Indi_Conju, hklDatabaseDir_Indi_Conju_All):
-
- if hklDatabaseDir_Indi_Conju_All is not None:
- self.Indi_ConjuDB_All = hkl.load(hklDatabaseDir_Indi_Conju_All)
-
-
- #print('loading the german spacy model..')
- self.nlp = spacy.load('de_core_news_sm')
- #print('done')
-
- #print('loading the stemmer..')
- self.stemmer = SnowballStemmer("german")
- #print('done')
-
- return
-
-
- def create_hklDB_from_csv(self, csvDbDir, StemOrNot):
-
- with open(csvDbDir) as lines:
-
- self.Indi_ConjuDB_All = []
-
- for line in lines:
-
- #print(line)
-
- self.Indi_ConjuDB_All.append(list(eval(line)))
-
-
-
-
-
- self.hkldbIndi_Conju1 = []
- self.hkldbIndi_Conju2 = []
-
-
- counter = 0
- for n in range(len(self.Indi_ConjuDB_All)):
-
- counter += 1
- if counter % 1000 == 0:
- print(counter)
-
-
- self.hkldbIndi_Conju1.append([self.Indi_ConjuDB_All[n][0][0]] )
- self.hkldbIndi_Conju2.append([self.Indi_ConjuDB_All[n][1][0]] )
-
-
-
- print('creating the hkl dump of Indi_ConjuDBAll')
- hkl.dump(self.Indi_ConjuDB_All, 'hkldbIndi_Conju_All.hkl', mode='w', compression='lzf')
- print('done..')
-
- print('Creating the hkl dump of Indi_ConjuDB 1')
- hkl.dump(self.hkldbIndi_Conju1, 'hkldbIndi_Conju1.hkl', mode='w', compression='lzf')
- #print('done..')
-
- print('Creating the hkl dump of Indi_ConjuDB 2')
- hkl.dump(self.hkldbIndi_Conju2, 'hkldbIndi_Conju2.hkl', mode='w', compression='lzf')
- #print('done..')
-
-
-
- return 'done'
-
- def load_DB_into_FASTsearch(self):
-
- #print('loading the hkldbIndi_Conju1...')
- self.hkldbIndi_Conju1 = hkl.load('hkldbIndi_Conju1.hkl')
- #print('done')
-
- #print('loading the hkldbIndi_Conju2...')
- self.hkldbIndi_Conju2 = hkl.load('hkldbIndi_Conju2.hkl')
- #print('done')
-
- #print('loading hkldbIndi_Conju 1 into FASTsearch..')
- self.fsearch1 = FASTsearch.FASTsearch('hkldbIndi_Conju1.hkl')
- #print('done')
-
- #print('loading hkldbIndi_Conju 2 into FASTsearch..')
- self.fsearch2 = FASTsearch.FASTsearch('hkldbIndi_Conju2.hkl')
- #print('done')
-
- #print('generating BoW Model 1..')
- #self.fsearch1.Gen_BoW_Model(50000, "word", punctuation = False)
- #print('done')
-
- #print('generating BoW Model 2..')
- #self.fsearch2.Gen_BoW_Model(50000, "word", punctuation = False)
- #print('done')
-
-
- #print('loading the bow model 1')
- self.fsearch1.Load_BoW_Model('bagofwordshkldbIndi_Conju1.pkl', 'DataBaseOneZeroshkldbIndi_Conju1.hkl')
- #print('done')
-
- #print('loading the bow model 2')
- self.fsearch2.Load_BoW_Model('bagofwordshkldbIndi_Conju2.pkl', 'DataBaseOneZeroshkldbIndi_Conju2.hkl')
- #print('done')
-
-
- #print('oi thats the get_feature_names', self.fsearch1.vectorizer.get_feature_names())
- #print('oi thats the get_feature_names', self.fsearch2.vectorizer.get_feature_names())
-
-
-
- def replaceConjunctives(self, sentences):
- outsentences = []
-
- sentencecount = 0
- for sentence in sentences:
-
- sentencecount += 1
- #print('processing sentence', sentencecount)
-
- doc = self.nlp(' '.join(sentence))
-
- verbs_of_sentence = []
- wordindex_to_replace = []
- count = 0
- thereisanIch = 0
- thereisaDu = 0
- thereisaWir = 0
- thereisanIhr = 0
- thereisaSie = 0
-
- for word in doc:
- count += 1
-
- if word.text == 'ich' or word.text == 'Ich':
- thereisanIch = 1
- if word.text == 'du' or word.text == 'Du':
- thereisaDu = 1
- if word.text == 'wir' or word.text == 'Wir':
- thereisaWir = 1
- if word.text == 'ihr' or word.text == 'Ihr':
- thereisanIhr = 1
- if word.text == 'sie' or word.text == 'Sie':
- thereisaSie = 1
-
- if word.tag_[0] == 'V':
- #print(word.tag_)
- #print(word.text)
- verbs_of_sentence.append(word.text)
-
- for verb in verbs_of_sentence:
- verbcounter = 0
- for word in sentence:
- verbcounter += 1
- if word == verb or word[:-1] == verb or word[1:] == verb:
- wordindex_to_replace.append(verbcounter)
-
- for n in range(len(verbs_of_sentence)):
- if verbs_of_sentence[n] == 'habe' or verbs_of_sentence[n] == 'sei':
- if thereisanIch == 0:
- verbs_of_sentence.append('er/sie/es')
-
- if thereisanIch == 1:
- verbs_of_sentence.append('ich')
- if thereisaDu == 1:
- verbs_of_sentence.append('du')
- if thereisaWir == 1:
- verbs_of_sentence.append('wir')
- if thereisanIhr == 1:
- verbs_of_sentence.append('ihr')
- if thereisaSie == 1:
- verbs_of_sentence.append('sie')
-
-
-
- nothingtodo = 0
-
- if nothingtodo == 0:
- verbs_of_sentence_string = ' '.join(verbs_of_sentence)
-
- bestmatches2, matchindex2 = self.fsearch2.search_with_highest_multiplikation_Output(verbs_of_sentence_string, 1)
-
- #print(bestmatches2, matchindex2)
-
- indicative_form = self.hkldbIndi_Conju1[matchindex2[0]][0].split()
- conjunctive_form = self.hkldbIndi_Conju2[matchindex2[0]][0].split()
- #print('oioioioioi')
-
- #print('verbsofsentencestring',verbs_of_sentence_string)
- #print('indikativform',indicative_form)
- #print('conjunctive_form', conjunctive_form)
-
-
-
- therewasaconjunctive = 0
- for n in range(len(conjunctive_form)):
- for m in range(len(verbs_of_sentence)):
- if conjunctive_form[n] == verbs_of_sentence[m] and n != 0:
- therewasaconjunctive = 1
-
- if therewasaconjunctive == 1:
-
-
-
-
-
-
- count = 0
- exchangeindizee = []
- for verb in conjunctive_form:
- count += 1
- count2 = 0
- for ver in verbs_of_sentence:
- count2 += 1
- #print('Aye')
- #print(verb)
- #print(ver)
- if verb == ver:
- exchangeindizee.append([count, count2])
-
- #print('indicative form', indicative_form)
-
- #print('the exchangeindizee ', exchangeindizee)
-
- #print('verbs of sentence before split', verbs_of_sentence)
-
- #print('before exchange')
- #print('conjunctive form', conjunctive_form)
- #print('verbs of sentence', verbs_of_sentence)
- #print('indicative form', indicative_form)
- for indizee in exchangeindizee:
- #print('indizee',indizee)
- #print(indicative_form[indizee[0]-1])
- #print(len(verbs_of_sentence))
-
- if indicative_form[indizee[0] - 1] not in ['euch','ihr','wir','sie','du', 'er/sie/es']:
- verbs_of_sentence[indizee[1] - 1] = indicative_form[indizee[0] - 1]
-
- #print('verbs of sentence after change', verbs_of_sentence)
-
- donothing = 0
-
- if therewasaconjunctive == 0:
- donothing = 1
- #print(conjunctive_form)
- #print(conjunctive_form[0].split())
- #print(conjunctive_form[0].split()[0])
- if thereisanIch == 1 and conjunctive_form[0].split()[0] == 'er/sie/es':
- donothing = 1
- if donothing == 0:
- #print(wordindex_to_replace)
-
- if len(verbs_of_sentence) < len(wordindex_to_replace):
- thelen = len(verbs_of_sentence)
- else:
- thelen = len(wordindex_to_replace)
- #print('cs sentence and verbsofsentence', sentence, verbs_of_sentence, wordindex_to_replace)
- for n in range(thelen):
- #print(indicative_form, wordindex_to_replace, sentence, verbs_of_sentence)
- wasreplaced = 0
- if sentence[wordindex_to_replace[n] - 1][-1] == ',':
- changesent = list(sentence[wordindex_to_replace[n] - 1])
- changesent[:-1] = list(verbs_of_sentence[n])
- sentence[wordindex_to_replace[n] - 1] = ''.join(changesent)
- wasreplaced = 1
- if sentence[wordindex_to_replace[n] - 1][-1] == '.':
- changesent = list(sentence[wordindex_to_replace[n] - 1])
- changesent[:-1] = list(verbs_of_sentence[n])
- sentence[wordindex_to_replace[n] - 1] = ''.join(changesent)
- wasreplaced = 1
- if sentence[wordindex_to_replace[n] - 1][-1] == ')':
- changesent = list(sentence[wordindex_to_replace[n] - 1])
- changesent[:-1] = list(verbs_of_sentence[n])
-
- sentence[wordindex_to_replace[n] - 1] = ''.join(changesent)
- wasreplaced = 1
- if sentence[wordindex_to_replace[n] - 1][0] == '(':
- changesent = list(sentence[wordindex_to_replace[n] - 1])
- changesent[1:] = list(verbs_of_sentence[n])
- sentence[wordindex_to_replace[n] - 1] = ''.join(changesent)
- wasreplaced = 1
- if wasreplaced == 0:
-
- sentence[wordindex_to_replace[n] - 1] = verbs_of_sentence[n]
- #print(word.tag_ )
-
-
- outsentences.append(sentence)
-
- #print('the endsentence',sentence)
- return outsentences
-
-
|