310 lines
12 KiB
Python
310 lines
12 KiB
Python
# First check which kind of database Leo has parsed.
|
|
# Then I can see which kind of DB still has to be created, or what still needs to be extended.
|
|
|
|
|
|
# If a form is in the DB as conjunctive but not as indicative, replace it (conjunctive and indicative are often identical, in which case an exchange makes no sense).
|
|
|
|
|
|
|
|
|
|
import spacy
|
|
import nltk
|
|
from nltk.stem.snowball import SnowballStemmer
|
|
|
|
import hickle as hkl
|
|
import FASTsearch
|
|
|
|
# Module-level German Snowball stemmer, created once at import time.
# ConjunctSolve additionally builds its own instance in __init__.
stemmer = SnowballStemmer("german")
|
|
|
|
|
|
class ConjunctSolve(object):
    """Replace German conjunctive verb forms in sentences by indicative forms.

    The class works on two parallel hickle databases: entry ``i`` of
    ``hkldbIndi_Conju1`` holds an indicative form and entry ``i`` of
    ``hkldbIndi_Conju2`` the matching conjunctive form, each stored as a
    one-element list containing a whitespace-separated string.  The
    conjunctive database is queried through a ``FASTsearch`` index.

    Typical use: construct the object, call :meth:`load_DB_into_FASTsearch`
    once, then call :meth:`replaceConjunctives`.

    NOTE(review): this class was reconstructed from a copy of the file whose
    indentation had been stripped; the nesting of the statements was inferred
    from their meaning and should be confirmed against version control.
    """

    # Person markers that can occur as tokens of a database entry.  They only
    # steer the lookup and must never be written into a sentence.
    _PERSON_MARKERS = ('euch', 'ihr', 'wir', 'sie', 'du', 'er/sie/es')

    # Pronouns looked for in a sentence (lower-case or capitalised), in the
    # order in which they are appended to the lookup query.
    _PRONOUNS = ('ich', 'du', 'wir', 'ihr', 'sie')

    # Verb forms for which 'er/sie/es' is added to the query when the
    # sentence contains no "ich".
    _THIRD_PERSON_TRIGGERS = ('habe', 'sei')

    def __init__(self, hklDatabaseDir_Indi_Conju, hklDatabaseDir_Indi_Conju_All):
        """Load the optional full database, the spaCy model and the stemmer.

        Args:
            hklDatabaseDir_Indi_Conju: Not used by this constructor; kept so
                that existing callers keep working.
            hklDatabaseDir_Indi_Conju_All: Path of a hickle dump that is
                loaded into ``self.Indi_ConjuDB_All``, or ``None`` to skip
                loading.
        """
        # Define the attribute unconditionally so that a later read yields
        # None instead of raising AttributeError.
        self.Indi_ConjuDB_All = None
        if hklDatabaseDir_Indi_Conju_All is not None:
            self.Indi_ConjuDB_All = hkl.load(hklDatabaseDir_Indi_Conju_All)

        self.nlp = spacy.load('de_core_news_sm')
        self.stemmer = SnowballStemmer("german")

    def create_hklDB_from_csv(self, csvDbDir, StemOrNot):
        """Build the hickle databases from a text file of Python literals.

        Every non-blank line of ``csvDbDir`` is evaluated as a Python
        expression and converted to a list.  For each resulting entry,
        ``entry[0][0]`` is stored in ``hkldbIndi_Conju1`` and ``entry[1][0]``
        in ``hkldbIndi_Conju2``.  Three files are written to the current
        working directory: ``hkldbIndi_Conju_All.hkl``,
        ``hkldbIndi_Conju1.hkl`` and ``hkldbIndi_Conju2.hkl``.

        Args:
            csvDbDir: Path of the input file.
            StemOrNot: Not used by this method; kept for interface
                compatibility.

        Returns:
            The string ``'done'``.
        """
        with open(csvDbDir) as lines:
            # SECURITY: eval() executes arbitrary code contained in the file.
            # Only run this on database files from a trusted source; consider
            # ast.literal_eval once all rows are known to be plain literals.
            # Blank lines are skipped because eval('') raises SyntaxError.
            self.Indi_ConjuDB_All = [
                list(eval(line)) for line in lines if line.strip()
            ]

        self.hkldbIndi_Conju1 = []
        self.hkldbIndi_Conju2 = []

        for processed, entry in enumerate(self.Indi_ConjuDB_All, start=1):
            # Progress indicator for large databases.
            if processed % 1000 == 0:
                print(processed)

            self.hkldbIndi_Conju1.append([entry[0][0]])
            self.hkldbIndi_Conju2.append([entry[1][0]])

        print('creating the hkl dump of Indi_ConjuDBAll')
        hkl.dump(self.Indi_ConjuDB_All, 'hkldbIndi_Conju_All.hkl', mode='w', compression='lzf')
        print('done..')

        print('Creating the hkl dump of Indi_ConjuDB 1')
        hkl.dump(self.hkldbIndi_Conju1, 'hkldbIndi_Conju1.hkl', mode='w', compression='lzf')

        print('Creating the hkl dump of Indi_ConjuDB 2')
        hkl.dump(self.hkldbIndi_Conju2, 'hkldbIndi_Conju2.hkl', mode='w', compression='lzf')

        return 'done'

    def load_DB_into_FASTsearch(self):
        """Load both databases and their FASTsearch indices from disk.

        Reads ``hkldbIndi_Conju1.hkl`` and ``hkldbIndi_Conju2.hkl`` from the
        current working directory, wraps each in a ``FASTsearch`` instance and
        loads the previously stored bag-of-words models
        (``bagofwordshkldbIndi_Conju{1,2}.pkl`` together with
        ``DataBaseOneZeroshkldbIndi_Conju{1,2}.hkl``).

        The model files are not created here.  An earlier revision generated
        them with ``Gen_BoW_Model(50000, "word", punctuation=False)`` on each
        FASTsearch instance; presumably that has to be run once beforehand.
        """
        self.hkldbIndi_Conju1 = hkl.load('hkldbIndi_Conju1.hkl')
        self.hkldbIndi_Conju2 = hkl.load('hkldbIndi_Conju2.hkl')

        self.fsearch1 = FASTsearch.FASTsearch('hkldbIndi_Conju1.hkl')
        self.fsearch2 = FASTsearch.FASTsearch('hkldbIndi_Conju2.hkl')

        self.fsearch1.Load_BoW_Model('bagofwordshkldbIndi_Conju1.pkl', 'DataBaseOneZeroshkldbIndi_Conju1.hkl')
        self.fsearch2.Load_BoW_Model('bagofwordshkldbIndi_Conju2.pkl', 'DataBaseOneZeroshkldbIndi_Conju2.hkl')

    def replaceConjunctives(self, sentences):
        """Replace conjunctive verb forms by indicative ones in each sentence.

        Requires ``self.nlp``, ``self.fsearch2``, ``self.hkldbIndi_Conju1``
        and ``self.hkldbIndi_Conju2`` to be set (see ``__init__`` and
        :meth:`load_DB_into_FASTsearch`).

        Args:
            sentences: Iterable of sentences, each a list of word strings.
                The lists are modified in place.

        Returns:
            A new list containing the same (possibly modified) sentence
            lists, in input order.
        """
        outsentences = []
        for sentence in sentences:
            self._replace_in_sentence(sentence)
            outsentences.append(sentence)
        return outsentences

    def _replace_in_sentence(self, sentence):
        """Rewrite the conjunctive verbs of one sentence in place."""
        tokens = list(self.nlp(' '.join(sentence)))
        seen_texts = {token.text for token in tokens}

        # Tokens whose tag starts with 'V' are treated as verbs.
        verbs = [token.text for token in tokens if token.tag_.startswith('V')]
        pronouns = [
            pronoun for pronoun in self._PRONOUNS
            if pronoun in seen_texts or pronoun.capitalize() in seen_texts
        ]

        targets = self._find_targets(sentence, verbs)
        if not targets:
            # No verb can be located in the sentence, so no replacement is
            # possible and the database lookup can be skipped.
            return

        # The lookup query consists of the verbs followed by person markers.
        # NOTE(review): the pronoun markers are appended once per sentence;
        # the stripped source would also allow reading them as nested in the
        # 'habe'/'sei' check - confirm against version control.
        query_terms = list(verbs)
        if 'ich' not in pronouns:
            query_terms.extend(
                'er/sie/es' for verb in verbs
                if verb in self._THIRD_PERSON_TRIGGERS
            )
        query_terms.extend(pronouns)

        _, match_index = self.fsearch2.search_with_highest_multiplikation_Output(
            ' '.join(query_terms), 1)
        indicative_form = self.hkldbIndi_Conju1[match_index[0]][0].split()
        conjunctive_form = self.hkldbIndi_Conju2[match_index[0]][0].split()

        # Position 0 of a database entry is skipped: a match there alone does
        # not count as a conjunctive verb.
        if not any(form in query_terms for form in conjunctive_form[1:]):
            return

        # With "ich" in the sentence a third-person database entry is not
        # applied.
        if 'ich' in pronouns and conjunctive_form[0] == 'er/sie/es':
            return

        # Collect all (entry position, query position) pairs first, then
        # exchange, so that an exchange cannot influence later comparisons.
        exchanges = [
            (form_idx, term_idx)
            for form_idx, form in enumerate(conjunctive_form)
            for term_idx, term in enumerate(query_terms)
            if form == term
        ]
        for form_idx, term_idx in exchanges:
            indicative = indicative_form[form_idx]
            if indicative not in self._PERSON_MARKERS:
                query_terms[term_idx] = indicative

        # Each target remembers which verb it belongs to, so a verb that
        # matched no token or several tokens cannot shift other replacements.
        for position, verb_idx, prefix, suffix in targets:
            sentence[position] = prefix + query_terms[verb_idx] + suffix

    @classmethod
    def _find_targets(cls, sentence, verbs):
        """Return ``(position, verb_index, prefix, suffix)`` for verb tokens.

        ``position`` is the 0-based index of a sentence token that equals
        ``verbs[verb_index]`` apart from the punctuation ``prefix``/``suffix``.
        """
        targets = []
        for verb_idx, verb in enumerate(verbs):
            for position, token in enumerate(sentence):
                affixes = cls._match_affixes(token, verb)
                if affixes is not None:
                    targets.append((position, verb_idx, affixes[0], affixes[1]))
        return targets

    @staticmethod
    def _match_affixes(token, verb):
        """Return the ``(prefix, suffix)`` around ``verb`` in ``token``.

        At most one leading and one trailing character are tolerated and both
        must be non-alphanumeric, so "sei," or "(sei" match the verb "sei"
        while "seit" does not.  Returns ``None`` when the token does not match.
        """
        for prefix_len in (0, 1):
            for suffix_len in (0, 1):
                end = len(token) - suffix_len
                prefix = token[:prefix_len]
                core = token[prefix_len:end]
                suffix = token[end:]
                if core != verb:
                    continue
                if any(char.isalnum() for char in prefix + suffix):
                    continue
                return prefix, suffix
        return None
|
|
|
|
|