basabuuka_prototyp/Prototyp/SentSeg.py
2020-08-16 19:36:44 +02:00

2212 lines
91 KiB
Python

# Split sentences.
# The word lists below are still missing "sondern" (and a few other things).
# The following conjunctions need no sentence transformation:
# woraufhin, zudem, zumal, umso - desto.
# "sondern" is hard to handle; the best approach is probably to drop "sondern" and run SentGlue afterwards.
class SentSeg(object):
def __init__(self, language):
    """Store the language and set up the trigger-word lists.

    Every ``*_list`` attribute holds the words of one category (given in
    lower-case and, for most of them, capitalised form). ``full_list`` is
    the concatenation of all conjunction categories in a fixed order.

    Args:
        language: Stored unchanged in ``self.language``.
    """
    self.language = language

    # Punctuation marks.
    self.punktuation_list = ['.', '?', '!', ';', ':']
    self.wrappunktuation_list = [',', '-']

    # Conjunction categories.
    self.adversativ_list = [
        'wohingegen', 'Wohingegen', 'aber', 'Aber', 'wobei', 'Wobei',
        'hingegen',
    ]
    self.final_list = ['damit', 'Damit', 'um', 'Um']
    self.kausal_list = ['weil', 'Weil', 'da', 'Da', 'denn', 'falls', 'Falls']
    self.konditional_list = ['wenn', 'Wenn', 'sobald', 'Sobald', 'als', 'falls']
    self.konsekutiv_list = ['dass', 'Dass']
    self.konzessiv_list = [
        'obwohl', 'Obwohl', 'obgleich', 'Obgleich', 'trotzdem', 'Trotzdem',
        'wenngleich', 'doch',
    ]
    self.lokal_list = ['wo', 'Wo']
    self.temporal_list_vor = ['bevor', 'Bevor']
    self.temporal_list_nach = ['nachdem', 'Nachdem']
    self.instrumental_list = ['indem', 'Indem']
    self.indirectspeech_list = [
        'ob', 'Ob', 'wann', 'Wann', 'wer', 'Wer', 'wie', 'Wie',
        'warum', 'Warum', 'weshalb', 'Weshalb', 'wieso', 'Wieso',
    ]

    # Currently empty; a longer candidate list is kept here, disabled:
    # ['wann', 'Wann', 'wer', 'Wer', 'wie', 'Wie', 'warum', 'Warum', 'weshalb', 'Weshalb', 'wieso', 'Wieso', 'dies', 'dann', 'jedoch', 'deswegen', 'trotzdem', 'danach', 'davor', 'wenn', 'sobald']
    self.firstwordlist = []

    # Order matters for anyone indexing into full_list, so keep it fixed.
    ordered_groups = (
        self.adversativ_list,
        self.final_list,
        self.kausal_list,
        self.konditional_list,
        self.konsekutiv_list,
        self.konzessiv_list,
        self.lokal_list,
        self.temporal_list_nach,
        self.temporal_list_vor,
        self.instrumental_list,
        self.indirectspeech_list,
    )
    self.full_list = [word for group in ordered_groups for word in group]
def ReadDoc2Sent(self, document):
    """Read a text file and cut it into sentences of whitespace-separated words.

    Words are collected across line breaks. A sentence is closed after a
    word that is longer than two characters and either ends in one of
    ``self.punktuation_list`` or is itself an entry of that list; shorter
    words such as ``"z."`` never close a sentence. Words that follow the
    last closing word are not returned. The line number is printed every
    1000 lines as a progress indicator.

    Args:
        document: Path of the text file. It is decoded as UTF-8 so that
            umlauts are read the same way on every platform.

    Returns:
        A list with one entry per sentence; each entry is a one-element
        list wrapping the sentence's word list, i.e. ``[[word, ...]]``.
    """
    marks = self.punktuation_list
    splitsentences = []
    splitsentence = []
    with open(document, encoding='utf-8') as lines:
        for line_number, line in enumerate(lines, start=1):
            if line_number % 1000 == 0:
                print(line_number)
            for word in line.split():
                splitsentence.append(word)
                if len(word) > 2 and (word[-1] in marks or word in marks):
                    splitsentences.append([splitsentence])
                    splitsentence = []
    return splitsentences
def AndOrSolver(self, sentences, punctuations):
# Purpose: cut a sentence into several sentences where comma-separated parts
# (optionally joined by words such as 'und' / 'oder') each carry their own
# subject, and insert matching entries into `punctuations`.
#
# NOTE(review): the leading whitespace of this file is missing in this copy,
# so the nesting of the statements below cannot be read from the text. The
# comments describe individual statements and do not assert block boundaries.
#
# Parameters:
#   sentences    - iterable of entries; this method reads entry[0] (token
#                  list), entry[1] (comma info) and entry[2] (utterance number).
#   punctuations - list of punctuation marks, read at index counter - 1 for
#                  the sentence being processed; it is modified in place.
# Returns: the tuple (newsentences, punctuations); every element appended to
#   newsentences is a one-element list wrapping a token list.
#
# Treat ':' and '-' like a full stop.
for n in range(len(punctuations)):
if punctuations[n] == ':' or punctuations[n] == '-':
punctuations[n] = '.'
# splitsentences is not read again in this method.
splitsentences = []
counter = 0
newsentences = []
for sentence in sentences:
newpunctuationsindexes = []
# utterancenumber and commaornot are assigned but not read again here.
utterancenumber = sentence[2]
commainfo = sentence[1]
commaornot = commainfo[0]
# From here on `sentence` is the plain token list of the entry.
sentence = sentence[0]
counter += 1
doc = self.nlp(' '.join(sentence))
subjectcount = 0
separationwords = []
subjectcounts = []
doccounter = 0
subjectindex = []
rcornot = 0
for word in doc:
doccounter += 1
# Count tokens labelled 'sb' or 'ep' (presumably subject / expletive in the
# German spaCy dependency scheme - TODO confirm) and remember their index.
if word.dep_ == 'sb' or word.dep_ == 'ep':
subjectcount += 1
subjectindex.append(doccounter - 1)
# NOTE(review): rcornot is set to 0 once per sentence and never reset after
# a comma, so every comma record after the first 'rc' token carries 1.
if word.dep_ == 'rc':
rcornot = 1
# At a comma token, store one record for the part that just ended:
# [subject count, doccounter - 2, subject indexes, rc flag].
if word.tag_ == '$,':
subjectcounts.append([subjectcount, doccounter - 2, subjectindex, rcornot])
subjectindex = []
subjectcount = 0
# Presumably compensates for a comma that is glued to the preceding word in
# `sentence` but is a separate token in `doc` - TODO confirm.
if len(sentence[doccounter - 2]) > 1:
doccounter -= 1
# Remember the positions of words that may join two clauses.
if word.text == 'und' or word.text == 'also' or word.text == 'oder' or word.text == 'schon' or word.text == 'bald' or word.text == 'doch' or word.text == 'jedoch' or word.text == 'sondern':
separationwords.append(doccounter - 1)
separationwordstocut = []
listofownsentencessubjectindexes = []
# Compare each comma record with the following one: the first test requires
# a subject on both sides and no 'rc' flag on the later record.
for n in range(len(subjectcounts) - 1):
if subjectcounts[n][0] > 0 and subjectcounts[n + 1][0] > 0 and subjectcounts[n + 1][3] == 0:
listofownsentencessubjectindexes.append(subjectcounts[n])
# A separation word between the two comma positions is kept as a cut point
# when the later record has more than one subject and the word lies after
# its first and not after its last subject index.
for m in range(len(separationwords)):
if subjectcounts[n][1] < separationwords[m] < subjectcounts[n + 1][1]:
if subjectcounts[n + 1][0] > 1:
if subjectcounts[n + 1][2][0] < separationwords[m] <= subjectcounts[n + 1][2][-1]:
separationwordstocut.append(separationwords[m])
processed = 0
# Mark every cut position by appending the marker 'alohaseparator' to the
# token at that position, and queue the sentence's punctuation mark so that
# it can be inserted into `punctuations` later.
if len(listofownsentencessubjectindexes) > 0:
for n in range(len(listofownsentencessubjectindexes)):
sentence[listofownsentencessubjectindexes[n][1]] = sentence[listofownsentencessubjectindexes[n][1]] + 'alohaseparator'
newpunctuationsindexes.append([punctuations[counter - 1], counter - 1])
processed = 1
if len(separationwordstocut) > 0:
for n in range(len(separationwordstocut)):
sentence[separationwordstocut[n] - 1] = sentence[separationwordstocut[n] - 1] + 'alohaseparator'
newpunctuationsindexes.append([punctuations[counter - 1], counter - 1])
processed = 1
# No marker set: the sentence is appended as it is.
if processed == 0:
newsentences.append([sentence])
if processed == 1:
splitsentence = []
for word in sentence:
splitsentence.append(word)
# 'alohaseparator' is 14 characters long; when the character before the
# marker is a comma, that comma is removed together with the marker.
if word[-14:] == 'alohaseparator':
if splitsentence[-1][-15] == ',':
splitsentence[-1] = splitsentence[-1][:-15]
else:
splitsentence[-1] = splitsentence[-1][:-14]
newsentences.append([splitsentence])
splitsentence = []
# Words after the last marker form the final part.
newsentences.append([splitsentence])
# Insert the queued marks in reversed order.
newpunctuationsindexes = newpunctuationsindexes[::-1]
for n in range(len(newpunctuationsindexes)):
punctuations.insert(newpunctuationsindexes[n][1], newpunctuationsindexes[n][0])
return newsentences, punctuations
def LoadBoWModelAndDatabaseOnesZeros(self):
    """Create the two FASTsearch objects and load their bag-of-words models.

    ``self.fsearch1`` is built from the ``word.tag_`` database file and
    ``self.fsearch2`` from the ``word.dep_`` database file. For each of
    them a bag-of-words model is generated first and the stored model and
    one-zero database files are loaded afterwards.
    """
    import FASTsearch

    # Search object over the word.tag_ database.
    tag_search = FASTsearch.FASTsearch('GS_DB_word.tag_.hkl')
    self.fsearch1 = tag_search
    tag_search.Gen_BoW_Model(1000, "word")
    tag_search.Load_BoW_Model(
        'bagofwordsGS_DB_word.tag_.pkl',
        'DataBaseOneZerosGS_DB_word.tag_.hkl',
    )

    # Search object over the word.dep_ database.
    dep_search = FASTsearch.FASTsearch('GS_DB_word.dep_.hkl')
    self.fsearch2 = dep_search
    dep_search.Gen_BoW_Model(1000, "word")
    dep_search.Load_BoW_Model(
        'bagofwordsGS_DB_word.dep_.pkl',
        'DataBaseOneZerosGS_DB_word.dep_.hkl',
    )
def LoadSentGlueSGDandGSUtils(self):
    """Load the helper objects used by the solver methods.

    Sets ``self.gs`` (a ``GS_Utils`` object), ``self.sgm`` (a
    ``SentGlueMach`` object on which ``initialize()`` has been called) and
    ``self.nlp`` (the pipeline returned by ``spacy.load``).

    Returns:
        The string ``'done'``.
    """
    spacy_model = 'de_core_news_sm'

    # Imports stay local and in this order, so each module is only loaded
    # right before the attribute that needs it is set.
    import GS_Utils
    self.gs = GS_Utils.GS_Utils(spacy_model)

    from SentGlue import SentGlueMach
    glue_machine = SentGlueMach('trainedSGD_twolabel.pkl', 'bagofwordstwolabel.pkl')
    self.sgm = glue_machine
    glue_machine.initialize()

    import spacy
    self.nlp = spacy.load(spacy_model)

    return 'done'
def CommaSentenceOrNot(self, sentences):
    """Append comma information to every sentence entry.

    For each entry the token list ``entry[0]`` is joined with spaces and
    parsed with ``self.nlp``. The pair ``[comma_count, words]`` is then
    appended to the entry, where ``comma_count`` is the number of tokens
    tagged ``'$,'`` and ``words`` holds the text of the first token and of
    every token that directly follows a ``'$,'`` token.

    Args:
        sentences: Iterable of list entries whose element 0 is a token list.
            The entries are extended in place.

    Returns:
        A new list containing the same (now extended) entry objects.
    """
    parse = self.nlp
    annotated = []
    for entry in sentences:
        comma_count = 0
        leading_words = []
        # True for the very first token and for any token right after a comma.
        record_next = True
        for tok in parse(' '.join(entry[0])):
            if record_next:
                leading_words.append(tok.text)
            record_next = tok.tag_ == '$,'
            if record_next:
                comma_count += 1
        entry.append([comma_count, leading_words])
        annotated.append(entry)
    return annotated
def EnumerationSolver(self, sentences):
# Purpose: detect sentences that contain an enumeration (an 'und' / 'oder'
# after at least one comma), split them at ',' and 'und', and rebuild one
# sentence per enumerated part by combining it with a piece of the "main"
# part. Presumably sentences without an enumeration are passed through
# (see the `else` near the end) - TODO confirm against the indented original.
#
# NOTE(review): the leading whitespace of this file is missing in this copy,
# so the nesting of the statements below cannot be read from the text. The
# comments describe individual statements and do not assert block boundaries.
#
# Parameters:
#   sentences - iterable of entries whose element [0] is a token list.
# Uses self.gs (checkSPO, checkForAnnotationTuple, checkForAnnotationTriple),
#   self.nlp and self.sgm (predictprobsOnSentenceList).
# Returns: output[0]; the final loop rewrites each of its elements into a
#   one-element list.
gs = self.gs
nlp = self.nlp
sgm = self.sgm
enumerationsentences = []
counter = 0
NOTenumerations = []
for sentence in sentences:
doc = nlp(' '.join(sentence[0]))
counter += 1
n = 0
firstone = 0
token = []
nextword = 0
enumeration = False
splitsentence = []
splitsentence_deps = []
splitsentence_tags = []
splitsentences = []
splitsentences_deps = []
splitsentences_tags = []
# First pass: count comma tokens and stop at the first 'und' / 'oder' that
# comes after at least one comma.
for word in doc:
# word.pos_ is suited for noun and verb here, word.dep_ for sb / pd, and possibly tag.
nextword = 0
if word.tag_ == '$,':
n += 1
nextword = 1
if (word.text == 'und' or word.text == 'oder') and n >= 1:
enumeration = True
break
output = []
if enumeration == True:
# Second pass: collect text, dep_ and tag_ per part; a part ends at ',' or
# 'und'. ',', '.' and 'und' themselves are not stored.
# NOTE(review): 'oder' triggers the enumeration above but is neither removed
# nor used as a part boundary here.
for word in doc:
if word.text != ',' and word.text != '.' and word.text != 'und':
splitsentence.append(word.text)
splitsentence_deps.append(word.dep_)
splitsentence_tags.append(word.tag_)
if word.text == ',' or word.text == 'und':
splitsentences.append(splitsentence)
splitsentences_deps.append(splitsentence_deps)
splitsentences_tags.append(splitsentence_tags)
splitsentence = []
splitsentence_deps = []
splitsentence_tags = []
splitsentences.append(splitsentence)
splitsentences_deps.append(splitsentence_deps)
splitsentences_tags.append(splitsentence_tags)
token = []
enumerations = []
enumerationsSPOs = []
NOTenumerations = []
# Parts whose first word is in self.full_list are kept apart in
# NOTenumerations; all other parts are enumeration candidates and get a
# gs.checkSPO result.
# NOTE(review): the loop variable reuses the name `sentence`.
for sentence in splitsentences:
token.append(sentence[0])
if sentence[0] not in self.full_list:
enumerations.append(sentence)
enumerationsSPOs.append(gs.checkSPO(sentence, 0))
else:
NOTenumerations.append(sentence)
# Rank the candidates by the sum of their checkSPO result.
biggest = []
for i in range(len(enumerationsSPOs)):
biggest.append([i, sum(enumerationsSPOs[i])])
sortedbiggest = sorted(biggest[::-1], key=lambda tup: tup[1], reverse=True)
# The main part is the first or the last candidate, whichever comes first
# in the ranking; lastornot records which of the two it is.
for i in range(len(sortedbiggest)):
if sortedbiggest[i][0] == 0:
mainsentenceIndex = sortedbiggest[i][0]
lastornot = 0
break
if sortedbiggest[i][0] == len(biggest) - 1:
mainsentenceIndex = sortedbiggest[i][0]
lastornot = 1
break
# TODO (translated): the case "Er, sie und der Beamte LACHTEN den Clown aus"
# still has to be handled with the database of cases, i.e. for an enumeration
# with SPO 1 0 0 plus plural, the plural verb must become singular depending
# on the articles.
mainsentence = enumerations[mainsentenceIndex]
# For every other candidate build all combinations with a prefix of the main
# part (main part is first) or a suffix of it (main part is last).
probablemainsentences = []
for i in range(len(enumerations)):
if i != mainsentenceIndex:
iprobablemainsentences = []
probablemainsentence = []
if lastornot == 0:
for j in range(1, len(mainsentence)):
probablemainsentence = mainsentence[0:j] + enumerations[i]
iprobablemainsentences.append(' '.join(probablemainsentence))
if lastornot == 1:
for j in range(1, len(mainsentence)):
probablemainsentence = enumerations[i] + mainsentence[-j:]
iprobablemainsentences.append(' '.join(probablemainsentence))
probablemainsentences.append(iprobablemainsentences)
# TODO (translated): this checks whether a tuple is still present, but in
# this case it matters more that a tuple does not occur torn apart - CHANGE.
# Collect the tag tuples / triples that gs reports for the main part
# (result index 2 for tuples, 3 for triples).
tuplesToCheck = []
tuples = [['ART', 'NN'], ['APPR','NN'], ['ART', 'CARD']]
for tupl in tuples:
checktupleindex, tupleInWords = gs.checkForAnnotationTuple(mainsentence, tupl , 'word.tag_', 'None')
if checktupleindex == 2:
tuplesToCheck.append([tupl, tupleInWords])
triplesToCheck = []
triples = [['ART','ADJA','NN'], ['APPR', 'ART', 'NN'], ['KOKOM', 'ART', 'NN']]
for tripl in triples:
checktripleindex, tripleInWords = gs.checkForAnnotationTriple(mainsentence, tripl, 'word.tag_', 'None')
if checktripleindex == 3:
triplesToCheck.append([tripl, tripleInWords])
# Keep the combinations in which gs still reports the tuple / triple
# (2 resp. 3); the flags hold the result of the last tuple / triple checked.
for probsentences in probablemainsentences:
checktripleindexes = []
checktupleindexes = []
filteredprobsentences = []
for sentence in probsentences:
tuplchecked = 0
triplchecked = 0
for tupl in tuplesToCheck:
checkedsecondtime, tupleinWords = gs.checkForAnnotationTuple(sentence.split(), tupl[0], 'word.tag_', tupl[1])
if checkedsecondtime == 1:
tuplchecked = 0
if checkedsecondtime == 2:
tuplchecked = 1
for tripl in triplesToCheck:
checkedsecondtime, tripleinWords = gs.checkForAnnotationTriple(sentence.split(), tripl[0], 'word.tag_', tripl[1])
if checkedsecondtime == 1 or checkedsecondtime == 2:
triplchecked = 0
if checkedsecondtime == 3:
triplchecked = 1
if triplchecked == 1 or tuplchecked == 1:
filteredprobsentences.append(sentence)
# Nothing passed the filter: fall back to all combinations.
if len(filteredprobsentences) == 0:
filteredprobsentences = probsentences
# here is still the problem, that there are lists of words instead of proper sentences..
probsMatrix = sgm.predictprobsOnSentenceList(filteredprobsentences, filteredprobsentences)
# Column 0 is overwritten with the row index so the best row can be found
# again after sorting by column 1.
for i in range(len(probsMatrix)):
probsMatrix[i][0] = i
sortedprobsMatrix = sorted(probsMatrix[::-1], key=lambda tup: tup[1], reverse=True)
bestindex = sortedprobsMatrix[0][0]
probablemainsentence = filteredprobsentences[int(bestindex)]
enumerationsentences.append([probablemainsentence])
# The main part itself, followed by the NOTenumerations parts joined to it
# with ', '.
enumerationsentences.append([' '.join(mainsentence)])
for notenum in NOTenumerations:
enumerationsentences[-1].append(' '.join(notenum))
enumerationsentences[-1] = [', '.join(enumerationsentences[-1])]
else:
enumerationsentences.append([sentence])
output.append(enumerationsentences)
# Normalise the result: a wrapped string is split into tokens; if that
# raises (bare except), the first item of the wrapped element is used.
for n in range(len(output[0])):
try:
output[0][n] = [output[0][n][0].split()]
except:
output[0][n] = [output[0][n][0][0]]
return output[0]
def GetUtteranceNumber(self, sentences):
    """Append the number of 'sb' / 'ep' tokens to every sentence entry.

    For each entry the token list ``entry[0]`` is joined with spaces and
    parsed with ``self.nlp``; the count of tokens whose ``dep_`` is
    ``'sb'`` or ``'ep'`` is appended to the entry.

    Args:
        sentences: Iterable of list entries whose element 0 is a token list.
            The entries are extended in place.

    Returns:
        A new list containing the same (now extended) entry objects.
    """
    parse = self.nlp
    counted = []
    for entry in sentences:
        parsed = parse(' '.join(entry[0]))
        subject_total = sum(1 for tok in parsed if tok.dep_ in ('sb', 'ep'))
        entry.append(subject_total)
        counted.append(entry)
    return counted
def GetQuestionOrNot(self, sentences):
    """Append a question flag to every sentence entry.

    For each entry the token list ``entry[0]`` is joined with spaces and
    parsed with ``self.nlp``. ``1`` is appended when one of the parsed
    tokens has the text ``'?'``, otherwise ``0``.

    The flag is computed per sentence. It must not be carried over from one
    sentence to the next, otherwise every sentence after the first question
    would be marked as a question as well.

    Args:
        sentences: Iterable of list entries whose element 0 is a token list.
            The entries are extended in place.

    Returns:
        A new list containing the same (now extended) entry objects.
    """
    parse = self.nlp
    flagged = []
    for sentence in sentences:
        doc = parse(' '.join(sentence[0]))
        questionmark = 1 if any(tok.text == '?' for tok in doc) else 0
        sentence.append(questionmark)
        flagged.append(sentence)
    return flagged
def SplitSentencesIntoHauptNebenTuple(self, sentences, punctuations):
oldsplitsentences = []
#print('hauptneben inputsentences', sentences)
gs = self.gs
#print('importing spacy..')
import spacy
#print('done')
nlp = self.nlp
outputsentences = []
sentencesThatAreOutoutput = []
outsentences = []
for generalindex in range(len(sentences)):
presentence = sentences[generalindex]
splitsentence = []
splitsentence_deps = []
splitsentence_tags = []
splitsentences = []
splitsentences_deps = []
splitsentences_tags = []
commainfo = presentence[1]
outputsentence = []
token = commainfo[1]
commaornot = commainfo[0]
numberutterances = presentence[2]
sentence = presentence[0]
oldsentence = presentence[0]
#print(commaornot)
if commaornot >= 2:
#print('nla')
sentence[0] = sentence[0].title()
doc = nlp(' '.join(sentence))
for word in doc:
#print(word.text)
if word.text != ',' and word.text != '.':
splitsentence.append(word.text)
splitsentence_deps.append(word.dep_)
splitsentence_tags.append(word.tag_)
if word.text == ',':
#print('oi')
splitsentences.append(splitsentence)
splitsentences_deps.append(splitsentence_deps)
splitsentences_tags.append(splitsentence_tags)
splitsentence = []
splitsentence_deps = []
splitsentence_tags = []
splitsentences.append(splitsentence)
splitsentences[0][0] = splitsentences[0][0].lower()
splitsentences_deps.append(splitsentence_deps)
splitsentences_tags.append(splitsentence_tags)
oldsplitsentences = splitsentences
#print(splitsentences)
#print(splitsentences_tags)
#print(splitsentences_deps)
spo = []
for n in range(len(splitsentences)):
prespo = []
prespo = gs.checkSPO(splitsentences_deps[n], 1)
prespo.append( gs.checkForAnnotation(splitsentences[n], 'VVINF', 'word.tag_'))
prespo.append(gs.checkForAnnotation(splitsentences[n], 'VAFIN', 'word.tag_'))
prespo.append(gs.checkForAnnotation(splitsentences[n], 'VVFIN', 'word.tag_'))
prespo.append(gs.checkForAnnotation(splitsentences[n], 'VMFIN', 'word.tag_'))
spo.append(prespo)
#print(splitsentences_deps)
#print(splitsentences)
#print(spo)
indexSPO = []
lastm = len(splitsentences)
for o in range(len(splitsentences)):
m = len(splitsentences) - 1 - o
for n in range(len(splitsentences)):
if m < n - 1 and n < lastm:
#print('spo s',spo[m], spo[n])
sb = spo[m][0] + spo[n][0]
Vafin = 1
if spo[m][3] == 1 or spo[n][3] == 1:
Vafin = spo[m][3] + spo[n][3]
Vvinf = 1
if spo[m][4] == 1 or spo[n][4] == 1:
Vvinf = spo[m][4] + spo[n][4]
Vvfin = 1
if spo[m][5] == 1 or spo[n][5] == 1:
Vvfin = spo[m][5] + spo[n][5]
Vmfin = 1
if spo[m][6] == 1 or spo[n][6] == 1:
Vmfin == spo[m][6] + spo[n][6]
#wrapped = 0
#for n in range(len(indexSPO)):
#if n == indexSPO[n][0] + 1 and n == indexSPO[n][1] - 1:
#wrapped = 1
#print(sb, Vafin, Vvinf, Vvfin, Vmfin, 'm n', m, n)
if sb == 1 and Vafin == 1 and Vvinf == 1 and (Vvfin == 1 or Vmfin == 1):
indexSPO.append([m,n])
#print([m,n])
lastm = m
#print('lastm',lastm)
#print(splitsentences)
Hauptsentences = []
for n in range(len(indexSPO)):
if indexSPO[n][0] > indexSPO[n][1]:
i = 1
j = 0
else:
i = 0
j = 1
Hauptsentences.append([splitsentences[indexSPO[n][i]] + splitsentences[indexSPO[n][j]] , indexSPO[n][i], indexSPO[n][j] ])
HauptSentences = []
for n in range(len(Hauptsentences)):
m = len(Hauptsentences) - 1 - n
HauptSentences.append(Hauptsentences[m])
#print('Hauptsentences', Hauptsentences)
#print('HauptSentences', HauptSentences)
sentencesThatAreOut =[]
for n in range(len(HauptSentences)):
index = HauptSentences[n][1]
finish = 0
#print('Oi',HauptSentences[n])
if n == len(HauptSentences) - 1:
#print('lenHauptsentences', len(HauptSentences))
stopindex = len(splitsentences)
finish = 1
else:
stopindex = HauptSentences[n + 1][1]
#print('stopindex', stopindex)
vvfinisthere = 0
if finish == 0:
if splitsentences_tags[stopindex][0] == 'VVFIN':
stopindex -= 1
vvfinisthere = 1
if splitsentences_tags[index][0] == 'VVFIN':
vvfinisthere = 1
if vvfinisthere == 1:
HNTuple = HauptSentences[n][0] + [','] + splitsentences[index - 1]
outputsentence.append(HNTuple)
sentencesThatAreOut.append(index - 1)
sentencesThatAreOut.append(Hauptsentences[n][1])
sentencesThatAreOut.append(Hauptsentences[n][2])
for m in range(index + 1, stopindex ):
if m != HauptSentences[n][2]:
HNTuple = HauptSentences[n][0] + [','] + splitsentences[m]
#print('check', HauptSentences[n], n)
#print('check', splitsentences[m], m)
#print('double', HNTuple)
outputsentence.append(HNTuple)
sentencesThatAreOut.append(m)
sentencesThatAreOut.append(Hauptsentences[n][1])
sentencesThatAreOut.append(Hauptsentences[n][2])
sentencesThatAreOutoutput.append(sentencesThatAreOut)
cpOrNots = []
rcOrNots = []
for splitsentence in splitsentences_deps:
cpOrNot = gs.checkForAnnotationInTokenizedSentence(splitsentence, 'cp')
cpOrNots.append(cpOrNot)
rcOrNot = gs.checkForAnnotationInTokenizedSentence(splitsentence, 'rc')
rcOrNots.append(rcOrNot)
#print('Laenge splitsentences', len(splitsentences))
#print('laenge cpOrNots', len(cpOrNots))
#print(cpOrNots)
#print('rc or nots', rcOrNots)
pairs = []
for n in range(len(cpOrNots)):
index = len(cpOrNots) - 1 - n
done = 0
if rcOrNots[index] == 1:
pairs.append([index, index - 1])
done = 1
if done == 0 and cpOrNots[index] == 1:
try:
if splitsentences_tags[index + 1][0] == 'VVFIN':
pairs.append([index, index + 1])
done = 1
except:
pass
try:
if done == 0 and rcOrNots[index - 1] == 0:
pairs.append([index, index - 1])
done = 1
except:
pass
try:
if done == 0 and rcOrNots[index - 1] == 1:
if rcOrNots[index - 2] == 0:
pairs.append([index, index - 2])
except:
pass
for pair in pairs[::-1]:
if pair[0] not in set(sentencesThatAreOut) or pair[1] not in set(sentencesThatAreOut):
outputsentence.append(splitsentences[pair[1]] + [','] + splitsentences[pair[0]])
#print('hnhn',sentences)
sentences[generalindex][0] = outputsentence
#print('outputsentence hntuple',outputsentence)
#outputsentences.append([outputsentence , i])
#print('Oio', outputsentences)
#print(sentencesThatAreOutoutput)
#print(splitsentences)
#print('oioioioioioioio',sentences)
#print(sentences[0][0])
#print('oioi',sentences[n])
#print('malatesta', sentences[n][0][0])
#print('generalindex sentences index 0', sentences[generalindex][0])
try:
if type(sentences[generalindex][0][0]) == str:
sentences[generalindex][0] = [sentences[generalindex][0]]
except:
pass
#print('generalindex sentences index 0', sentences[generalindex][0])
#print('oldsentence', oldsentence)
newgeneratedsentences = len(sentences[generalindex][0])
if newgeneratedsentences > 1:
#print('goti t')
for sentence in sentences[generalindex][0]:
punctuations.insert(generalindex, punctuations[generalindex])
outsentences.append(sentence)
del punctuations[generalindex]
if newgeneratedsentences == 1:
if len(sentences[generalindex][0][0]) > 1:
outsentences.append(sentences[generalindex][0][0])
else:
outsentences.append(oldsentence)
if newgeneratedsentences == 0:
#print('case oldsentence', oldsentence)
outsentences.append(oldsentence)
#print('oioi', sentences[n])
# connect alonestanding commatas with the word before
#print('theoutsentences', outsentences)
for outsentence in outsentences:
todelete = []
for n in range(len(outsentence)):
if outsentence[n] == ',':
todelete.append(n)
outsentence[n-1] = outsentence[n-1] + ','
for deleteindex in todelete[::-1]:
del outsentence[deleteindex]
for index in range(len(outsentences)):
outsentences[index] = [outsentences[index]]
#print('theoutsentences', outsentences)
#removing doubles
doubledsentences = []
for o in range(len(outsentences)):
sentence = outsentences[o][0]
for m in range(len(outsentences)):
if m != o:
count = 0
for n in range(len(sentence)):
if sentence[n] in outsentences[m][0] or sentence[n][:-1] in outsentences[m][0]:
count += 1
if count == len(sentence):
doubledsentences.append(sentence)
punctdeleteindex = []
tmp = set()
for sentence in doubledsentences:
tmp.add(tuple(sentence))
#print(list(tmp))
doubledsentences = []
for tup in tmp:
doubledsentences.append([list(tup)])
#print('doubledsentences',doubledsentences)
punctdeleteindexes = []
for double in doubledsentences:
if double in outsentences:
punctdeleteindex = outsentences[::-1].index(double)
del outsentences[len(outsentences) - 1 - punctdeleteindex]
punctdeleteindexes.append(punctdeleteindex)
for index in punctdeleteindexes[::-1]:
del punctuations[len(outsentences) - 1 - index]
#print('oldsplit',oldsplitsentences)
#print('outsents',outsentences)
for o in range(len(oldsplitsentences)):
for m in range(len(outsentences)):
counter = 0
for n in range(len(oldsplitsentences[o])):
if oldsplitsentences[o][n] in outsentences[m][0] or oldsplitsentences[o][n] + ',' in outsentences[m][0]:
counter += 1
if counter >= len(oldsplitsentences[o]):
break
if m == len(outsentences) - 1 and counter < len(oldsplitsentences[o]):
if o == 0:
outsentences.insert(0,[oldsplitsentences[o]])
punctuations.insert(0, punctuations[0])
else:
newones = []
for i in range(len(outsentences)):
if outsentences[i][0][-1] == oldsplitsentences[o - 1][-1]:
if len(outsentences[i][0]) > 2 and len(oldsplitsentences[o - 1]) > 2:
if outsentences[i][0][-2] == oldsplitsentences[o - 1][-2]:
if outsentences[i][0][-3] == oldsplitsentences[o - 1][-3]:
newones.append([i + 1, [oldsplitsentences[o]]])
for newone in newones[::-1]:
#print(newones)
outsentences.insert(newone[0], newone[1])
punctuations.insert(newone[0], punctuations[newone[0] - 1])
#print('outsentences at the very end ', outsentences, punctuations)
return outsentences, punctuations
# Note: the input here must always be pairs of main clause / subordinate clause, i.e. a further upstream class is required.
def SplitCommatas(self, Inputsentences, punctuations):
gs = self.gs
nlp = self.nlp
gramcorr_splitsentences = []
counter = 0
newpunctuationsindex = []
for Inputsentence in Inputsentences:
counter += 1
commainfo = Inputsentence[1]
token = commainfo[1]
commaornot = commainfo[0]
numberutterances = Inputsentence[2]
if commaornot == 0:
gramcorr_splitsentences.append(Inputsentence[0])
if commaornot > 1:
gramcorr_splitsentences.append(Inputsentence[0])
if commaornot == 1:
oldsentence = Inputsentence[0]
Inputsentence = [[Inputsentence[0]]]
for sentence in Inputsentence[0]:
splitsentence = []
splitsentences = []
processed = 0
wasNotInAnyList = 0
try:
for n in range(len(token)):
if token[n] in self.final_list:
splitsentence = []
for word in sentence:
if word != token[n]:
if word[-1] == ',':
splitsentence.append(word[:-1])
if word[-1] != ',':
splitsentence.append(word)
if word[-1] == ',' or word == ',':
splitsentences.append(splitsentence)
splitsentence = []
splitsentences.append(splitsentence)
if n == 1:
if token[n] == 'um' or token[n] == 'Um':
splitsentences[n].insert(0,'dies')
splitsentences[n].insert(0,'um')
else:
splitsentences[n].insert(0,'dann')
if n == 0:
if token[n] == 'um' or token[n] == 'Um':
splitsentences[n].insert(0,'dies')
splitsentences[n].insert(0,'um')
splitsentences = splitsentences[::-1]
else:
splitsentences[n].insert(0,'dann')
splitsentences[0], splitsentences[1] = splitsentences[1] , splitsentences[0]
generalrules = [['ADV','VAFIN'], ['ADV', 'VVFIN']]
processed = 1
if token[n] in self.adversativ_list:
splitsentence = []
for word in sentence:
if word != token[n]:
if word[-1] == ',':
splitsentence.append(word[:-1])
if word == ',':
pass
if word[-1] != ',':
splitsentence.append(word)
if word[-1] == ',' or word == ',':
splitsentences.append(splitsentence)
splitsentence = []
splitsentences.append(splitsentence)
splitsentences[n].append('jedoch')
generalrules = [['ADV','VAFIN'], ['ADV', 'VVFIN']]
processed = 1
if token[n] in self.kausal_list:
splitsentence = []
for word in sentence:
if word != token[n]:
if word[-1] == ',':
splitsentence.append(word[:-1])
if word == ',':
pass
if word[-1] != ',':
splitsentence.append(word)
if word[-1] == ',' or word == ',':
splitsentences.append(splitsentence)
splitsentence = []
splitsentences.append(splitsentence)
# Da deswegen an den anderen Satz gehaengt wird, muss der input zu commasentences immer ZWEI sentences sein.
#print('splitsentences in kausal', splitsentences)
if n == 1:
splitsentences[n - 1].insert(0,'deswegen')
splitsentences[0], splitsentences[1] = splitsentences[1] , splitsentences[0]
if n == 0:
splitsentences[n + 1].insert(0,'deswegen')
#print('splitsentences in kausal', splitsentences)
generalrules = [['PROAV','VAFIN'], ['PROAV', 'VVFIN']]
processed = 1
# from here come konsekutiv sentences, they have to be split according https://www.deutschplus.net/pages/Konsekutivsatz
if token[n] in self.konsekutiv_list:
#print('oi konsekutiv')
splitsentence = []
for word in sentence:
if word != token[n]:
if word[-1] == ',':
splitsentence.append(word[:-1])
if word == ',':
pass
if word[-1] != ',':
splitsentence.append(word)
if word[-1] == ',' or word == ',':
splitsentences.append(splitsentence)
splitsentence = []
splitsentences.append(splitsentence)
generalrules = [['KOUS','PPER']]
processed = 1
if token[n] in self.konditional_list:
splitsentence = []
for word in sentence:
if word[-1] == ',':
splitsentence.append(word[:-1])
if word == ',':
pass
if word[-1] != ',':
splitsentence.append(word)
if word[-1] == ',' or word == ',':
splitsentences.append(splitsentence)
splitsentence = []
splitsentences.append(splitsentence)
if n == 1:
spoCount = gs.checkSPO(splitsentences[n], 0)
spoCount = sum(spoCount)
if spoCount == 2:
thereisanes = 0
for word in splitsentences[n]:
if word == 'es' or word == 'Es':
thereisanes = 1
if thereisanes == 0:
splitsentences[n].append('es')
if n == 0:
spoCount = gs.checkSPO(splitsentences[n], 0)
spoCount = sum(spoCount)
if spoCount == 2:
thereisanes = 0
for word in splitsentences[n]:
if word == 'es' or word == 'Es':
thereisanes = 1
if thereisanes == 0:
splitsentences[n].append('es')
splitsentences[0], splitsentences[1] = splitsentences[1] , splitsentences[0]
generalrules = [['KOUS','PPER']]
processed = 1
if token[n] in self.konzessiv_list:
splitsentence = []
for word in sentence:
if word != token[n]:
if word[-1] == ',':
splitsentence.append(word[:-1])
if word == ',':
pass
if word[-1] != ',':
splitsentence.append(word)
if word[-1] == ',' or word == ',':
splitsentences.append(splitsentence)
splitsentence = []
splitsentences.append(splitsentence)
if n == 1:
splitsentences[n - 1].insert(0,'trotzdem')
splitsentences[0], splitsentences[1] = splitsentences[1] , splitsentences[0]
if n == 0:
splitsentences[n + 1].insert(0,'trotzdem')
generalrules = [['PROAV','VAFIN'], ['PROAV', 'VVFIN']]
processed = 1
if token[n] in self.lokal_list:
#print('lokal ole ole ')
splitsentence = []
for word in sentence:
if word != token[n]:
if word[-1] == ',':
splitsentence.append(word[:-1])
if word == ',':
pass
if word[-1] != ',':
splitsentence.append(word)
if word[-1] == ',' or word == ',':
splitsentences.append(splitsentence)
splitsentence = []
splitsentences.append(splitsentence)
if n == 1:
splitsentences[n - 1].insert(0,'dort')
splitsentences[0], splitsentences[1] = splitsentences[1] , splitsentences[0]
if n == 0:
splitsentences[n + 1].insert(0,'dort')
generalrules = [['PROAV','VAFIN'], ['PROAV', 'VVFIN']]
processed = 1
if token[n] in self.instrumental_list:
splitsentence = []
for word in sentence:
if word != token[n]:
if word[-1] == ',':
splitsentence.append(word[:-1])
if word == ',':
pass
if word[-1] != ',':
splitsentence.append(word)
if word[-1] == ',' or word == ',':
splitsentences.append(splitsentence)
splitsentence = []
splitsentences.append(splitsentence)
if n == 1:
splitsentences[n - 1].insert(0,'so')
splitsentences[0], splitsentences[1] = splitsentences[1] , splitsentences[0]
if n == 0:
splitsentences[n + 1].insert(0,'so')
generalrules = [['ADV','VAFIN'], ['ADV', 'VVFIN']]
processed = 1
if token[n] in self.temporal_list_vor:
splitsentence = []
for word in sentence:
if word != token[n]:
if word[-1] == ',':
splitsentence.append(word[:-1])
if word == ',':
pass
if word[-1] != ',':
splitsentence.append(word)
if word[-1] == ',' or word == ',':
splitsentences.append(splitsentence)
splitsentence = []
splitsentences.append(splitsentence)
if n == 1:
splitsentences[n].insert(0,'danach')
if n == 0:
splitsentences[n].insert(0,'danach')
splitsentences[0], splitsentences[1] = splitsentences[1] , splitsentences[0]
generalrules = [['PROAV','VAFIN'], ['PROAV', 'VVFIN']]
processed = 1
if token[n] in self.temporal_list_nach:
splitsentence = []
for word in sentence:
if word != token[n]:
if word[-1] == ',':
splitsentence.append(word[:-1])
if word == ',':
pass
if word[-1] != ',':
splitsentence.append(word)
if word[-1] == ',' or word == ',':
splitsentences.append(splitsentence)
splitsentence = []
splitsentences.append(splitsentence)
if n == 1:
splitsentences[n].insert(0,'davor')
if n == 0:
splitsentences[n].insert(0,'davor')
splitsentences[0], splitsentences[1] = splitsentences[1] , splitsentences[0]
generalrules = [['PROAV','VAFIN'], ['PROAV', 'VVFIN']]
processed = 1
#print(token[n])
if token[n] == 'der' or token[n] == 'welcher':
tokens = self.nlp(' '.join(sentence))
for word in tokens:
if word.dep_ == 'rc':
wordwithrc = word.text
rcORnot = gs.checkForAnnotation(sentence, 'rc', 'word.dep_')
oldsplitsentences = splitsentences
splitsentences = []
if rcORnot == 1:
splitsentence = []
for word in sentence:
if word[-1] == ',':
splitsentence.append(word[:-1])
if word == ',':
pass
if word[-1] != ',':
splitsentence.append(word)
if word[-1] == ',' or word == ',':
splitsentences.append(splitsentence)
splitsentence = []
splitsentences.append(splitsentence)
# das umtauschen wird hier vollzogen, da ansonsten spacy dieser nicht als PDS einliest.. analog in den anderen.
if wordwithrc in splitsentences[n]:
splitsentences[n][0] = 'dieser'
verb = splitsentences[n][-1]
splitsentences[n] = splitsentences[n][:-1]
splitsentences[n].insert(1, verb)
#print('Vorsicht', splitsentences)
generalrules = [['PDS','VAFIN'], ['PDS', 'VVFIN']]
processed = 1
else:
splitsentences = oldsplitsentences
splitsentence = []
if token[n] == 'die' or token[n] == 'welche':
tokens = self.nlp(' '.join(sentence))
for word in tokens:
if word.dep_ == 'rc':
wordwithrc = word.text
rcORnot = gs.checkForAnnotation(sentence, 'rc', 'word.dep_')
oldsplitsentences = splitsentences
splitsentences = []
if rcORnot == 1:
#print('it went to rcornot in case die')
splitsentence = []
for word in sentence:
if word[-1] == ',':
splitsentence.append(word[:-1])
if word == ',':
pass
if word[-1] != ',':
splitsentence.append(word)
if word[-1] == ',' or word == ',':
splitsentences.append(splitsentence)
splitsentence = []
splitsentences.append(splitsentence)
if wordwithrc in splitsentences[n]:
#print('wordwithrc was in sentence')
#print(wordwithrc)
#print(splitsentences[n])
#print('wordwithrcend')
splitsentences[n][0] = 'diese'
verb = splitsentences[n][-1]
splitsentences[n] = splitsentences[n][:-1]
splitsentences[n].insert(1, verb)
generalrules = [['PDS','VAFIN'], ['PDS', 'VVFIN']]
processed = 1
else:
splitsentences = oldsplitsentences
splitsentence = []
if token[n] == 'dem':
tokens = self.nlp(' '.join(sentence))
for word in tokens:
if word.dep_ == 'rc':
wordwithrc = word.text
rcORnot = gs.checkForAnnotation(sentence, 'rc', 'word.dep_')
oldsplitsentences = splitsentences
splitsentences = []
if rcORnot == 1:
splitsentence = []
for word in sentence:
if word[-1] == ',':
splitsentence.append(word[:-1])
if word == ',':
pass
if word[-1] != ',' and word[-1] != '.':
splitsentence.append(word)
if word[-1] == ',':
splitsentences.append(splitsentence)
splitsentence = []
splitsentences.append(splitsentence)
if wordwithrc in splitsentences[n]:
splitsentences[n][0] = 'diesem'
verb = splitsentences[n][-1]
splitsentences[n] = splitsentences[n][:-1]
splitsentences[n].insert(1, verb)
generalrules = [['PDS','VAFIN'], ['PDS', 'VVFIN']]
processed = 1
else:
splitsentences = oldsplitsentences
splitsentence = []
if token[n] == 'das' or token[n] == 'welches':
tokens = self.nlp(' '.join(sentence))
for word in tokens:
if word.dep_ == 'rc':
wordwithrc = word.text
rcORnot = gs.checkForAnnotation(sentence, 'rc', 'word.dep_')
#print('Oeeee',rcORnot)
oldsplitsentences = splitsentences
splitsentences = []
if rcORnot == 1:
splitsentence = []
for word in sentence:
if word[-1] == ',':
splitsentence.append(word[:-1])
if word == ',':
pass
if word[-1] != ',':
splitsentence.append(word)
if word[-1] == ',' or word == ',':
splitsentences.append(splitsentence)
splitsentence = []
splitsentences.append(splitsentence)
#print('splitsentence in das rc', splitsentences)
if wordwithrc in splitsentences[n]:
splitsentences[n][0] = 'dieses'
verb = splitsentences[n][-1]
#print('verb',verb)
splitsentences[n] = splitsentences[n][:-1]
splitsentences[n].insert(1, verb)
generalrules = [['PDS','VAFIN'], ['PDS', 'VVFIN']]
processed = 1
else:
splitsentences = oldsplitsentences
splitsentence = []
if token[n] == 'dessen' or token[n] == 'wessen':
tokens = self.nlp(' '.join(sentence))
for word in tokens:
if word.dep_ == 'rc':
wordwithrc = word.text
rcORnot = gs.checkForAnnotation(sentence, 'rc', 'word.dep_')
oldsplitsentences = splitsentences
splitsentences = []
if rcORnot == 1:
splitsentence = []
for word in sentence:
if word[-1] == ',':
splitsentence.append(word[:-1])
if word == ',':
pass
if word[-1] != ',':
splitsentence.append(word)
if word[-1] == ',' or word == ',':
splitsentences.append(splitsentence)
splitsentence = []
splitsentences.append(splitsentence)
if wordwithrc in splitsentences[n]:
verb = splitsentences[n][-1]
splitsentences[n] = splitsentences[n][:-1]
splitsentences[n].insert(1, verb)
generalrules = [['PDS','VAFIN'], ['PDS', 'VVFIN']]
processed = 1
else:
splitsentences = oldsplitsentences
splitsentence = []
if token[n] == 'den' or token[n] == 'welchen':
tokens = self.nlp(' '.join(sentence))
for word in tokens:
if word.dep_ == 'rc':
wordwithrc = word.text
rcORnot = gs.checkForAnnotation(sentence, 'rc', 'word.dep_')
oldsplitsentences = splitsentences
splitsentences = []
if rcORnot == 1:
splitsentence = []
for word in sentence:
if word[-1] == ',':
splitsentence.append(word[:-1])
if word == ',':
pass
if word[-1] != ',':
splitsentence.append(word)
if word[-1] == ',' or word == ',':
splitsentences.append(splitsentence)
splitsentence = []
splitsentences.append(splitsentence)
if wordwithrc in splitsentences[n]:
splitsentences[n][0] = 'diesen'
verb = splitsentences[n][-1]
splitsentences[n] = splitsentences[n][:-1]
splitsentences[n].insert(1, verb)
generalrules = [['PDS','VAFIN'], ['PDS', 'VVFIN']]
processed = 1
else:
splitsentences = oldsplitsentences
splitsentence = []
if token[n] == 'wem' or token[n] == 'Wem' or token[n] == 'welchem':
daORnot = gs.checkForAnnotation(sentence, 'da', 'word.dep_')
oaORnot = gs.checkForAnnotation(sentence, 'oa', 'word.dep_')
reORnot = gs.checkForAnnotation(sentence, 're', 'word.dep_')
oldsplitsentences = splitsentences
splitsentences = []
for word in sentence:
if word[-1] == ',':
splitsentence.append(word[:-1])
if word == ',':
pass
if word[-1] != ',':
splitsentence.append(word)
if word[-1] == ',' or word == ',':
splitsentences.append(splitsentence)
splitsentence = []
splitsentences.append(splitsentence)
if n == 0:
index = 1
if n == 1:
index = 0
if reORnot == 1:
pass
if daORnot == 1 and reORnot == 0:
splitsentences[index].insert(1, 'das')
if oaORnot == 1 and reORnot == 0:
splitsentences[index].insert(1, 'dem')
if n == 1:
splitsentences[0], splitsentences[1] = splitsentences[1] , splitsentences[0]
generalrules = [['PDS','VAFIN'], ['PDS', 'VVFIN']]
processed = 1
if token[n] in self.indirectspeech_list and token[1] not in self.konsekutiv_list:
reORnot = gs.checkForAnnotation(sentence, 're', 'word.dep_')
oldsplitsentences = splitsentences
splitsentences = []
splitsentence = []
for word in sentence:
if word[-1] == ',':
splitsentence.append(word[:-1])
if word == ',':
pass
if word[-1] != ',':
splitsentence.append(word)
if word[-1] == ',' or word == ',':
splitsentences.append(splitsentence)
splitsentence = []
splitsentences.append(splitsentence)
if n == 0:
index = 1
if n == 1:
index = 0
if reORnot == 0:
if splitsentences[index][0] != 'was':
splitsentences[index].insert(1, 'das')
if n == 1:
splitsentences[0], splitsentences[1] = splitsentences[1] , splitsentences[0]
generalrules = [['PDS','VAFIN'], ['PDS', 'VVFIN']]
processed = 1
if processed == 0 and n == 1:
ZUVINFTupelORnot = gs.checkForAnnotationTuple(sentence, ['PTKZU', 'VVINF'], 'word.tag_', 'None')
if ZUVINFTupelORnot == 0:
ZUVINFTupelORnot = gs.checkForAnnotationTuple(sentence, ['PTKZU', 'VAINF'], 'word.tag_', 'None')
if ZUVINFTupelORnot == 1:
reORnot = gs.checkForAnnotation(sentence, 're', 'word.dep_')
splitsentence = []
for word in sentence:
if word[-1] == ',':
splitsentence.append(word[:-1])
if word == ',':
pass
if word[-1] != ',' :
splitsentence.append(word)
if word[-1] == ',' or word == ',':
splitsentences.append(splitsentence)
processed = 1
splitsentence = []
splitsentences.append(splitsentence)
for m in range(2):
ZUINForNOT = gs.checkForAnnotationTuple(splitsentences[m], ['PTKZU', 'VVINF'], 'word.tag_','None')
if ZUINForNOT == 0:
ZUINForNOT = gs.checkForAnnotationTuple(splitsentences[m], ['PTKZU', 'VAINF'], 'word.tag_','None')
if ZUINForNOT == 1:
r = m
ZUINForNOT = 0
if r == 0:
index = 1
if r == 1:
index = 0
objectORnot = gs.checkForAnnotation(splitsentences[index] , 'oa', 'word.dep_')
if reORnot == 0 and objectORnot == 0:
splitsentences[index].insert(1, 'das')
if r == 1:
splitsentences[0], splitsentences[1] = splitsentences[1] , splitsentences[0]
else:
processed == 2
except:
wasNotInAnyList = 1
#rules = [['ART','ADJA','NN'], ['ART','ADJA','NE'], ['ART', 'NN'], ['ART', 'NE'], ['APPR','NN'], ['APPR','NE'], ['APPR', 'ART', 'NN'], ['APPR', 'ART', 'NE'], ['APPR','ART','NN','ADJA','NN'], ['APPR','ART','NN','ADJA','NE'], ['KOKOM', 'ART', 'NN'], ['KOKOM', 'ART', 'NE'], ['PPOSAT', 'NN'], ['PPOSAT', 'NE'], ['ADV', 'ADJD']]
#print('B',splitsentences)
endsentences = []
if (processed == 2 or processed == 0) and n == 1:
wasNotInAnyList = 1
try:
if wasNotInAnyList == 0:
newpunctuationsindex.insert(0,[counter-1,punctuations[counter-1]])
#print('splitsentencee', splitsentences)
if len(splitsentences) > 2:
splitsentences = splitsentences[:2]
#print('splitsentenceeeees', splitsentences)
for splitsentence in splitsentences:
#print('splitsentenceeeeeeeeeeee!!',splitsentence)
wordtoputfirst = 'nada'
for word in self.firstwordlist:
if word == splitsentence[0]:
wordtoputfirst = word
splitsentence.remove(word)
#print('get the tuples and triples to check..')
tuplesTocheck, triplesTocheck, quadruplesTocheck = self.gs.GetTuplesinSentence(splitsentence)
#print('done')
#print(tuplesTocheck, 'ole', triplesTocheck ,'aiai', quadruplesTocheck)
#print('1')
grammpiecessentence = self.gs.createTupleofGrammarpieces( splitsentence, tuplesTocheck, triplesTocheck, quadruplesTocheck)
#print('grammpiece',grammpiecessentence)
#print('2')
if len(grammpiecessentence) > 7:
print('A sentence is too long, too many permutations. \n piping wrong grammar..')
endsentence = ' '.join(grammpiecessentence)
else:
#print('genrating the permutations')
permutations = self.sgm.GeneratePermutationsOfSentence(grammpiecessentence)
#print('done')
#print(permutations)
#print('3')
firstwordwithverblist = ['deswegen', 'danach']
permutationstodelete = []
for permutation in permutations:
#print('4')
if permutation[0] in firstwordwithverblist:
#print('4.1')
count = 1
for word in self.nlp(permutation[1]):
#print('4.2')
if word.tag_[0] != 'V':
#print('4.3')
permutationstodelete.append(permutation)
break
else:
break
#for word in self.nlp(permutation[0]):
#print('4.2')
#if word.tag_[0] != 'V':
#print('4.3')
#permutationstodelete.append(permutation)
#break
#else:
#break
for delperm in permutationstodelete:
try:
permutations.remove(delperm)
except:
pass
#print('5')
sentencesToCheck = []
if wordtoputfirst in self.firstwordlist:
for sentence in permutations:
sentencesToCheck.append(wordtoputfirst + ' ' + ' '.join(sentence))
else:
for sentence in permutations:
sentencesToCheck.append(' '.join(sentence))
endsentence = self.sgm.GetBestSentenceFromSentencesAccordingToGrammar(sentencesToCheck, ' '.join(splitsentence))
#print('done')
#print('endsent',endsentence)
endsentences.append(endsentence)
except:
#print('there was an error')
wasNotInAnyList = 1
endsentences = []
todelete = []
for index in range(len(newpunctuationsindex)):
if newpunctuationsindex[index][0] == counter - 1:
todelete.append(index)
for todel in todelete[::-1]:
del newpunctuationsindex[todel]
if wasNotInAnyList == 1:
#print('was not in any list')
#print(oldsentence)
endsplisentences = []
splisentence = []
for word in oldsentence:
if word[-1] == ',':
splisentence.append(word[:-1])
if word == ',':
pass
if word[-1] != ',':
splisentence.append(word)
if word[-1] == ',' or word == ',':
endsplisentences.append(splisentence)
splisentence = []
endsplisentences.append(splisentence)
newpunctuationsindex.insert(0,[counter-1,punctuations[counter-1]])
#print('endsplisentences',endsplisentences)
for splsentence in endsplisentences:
endsentences.append(' '.join(splsentence))
'''
fsearch1 = self.fsearch1
spacyclass1 = 'word.tag_'
gs_sentence1 = gs.Sentence2GrammarSchema(' '.join(splitsentence), spacyclass1)
print('searchPatternMatch for tags')
bestmatches1 = fsearch1.searchPatternMatch(' '.join(gs_sentence1), 1)
print('done')
#print('oioi', bestmatches1)
#print(len(fsearch1.database))
right_gs_tupel1 = []
if len(bestmatches1) < 10:
bestndocs1 = len(bestmatches1)
else:
bestndocs1 = 10
for m in range(bestndocs1):
right_gs_tupel1.append(fsearch1.database[bestmatches1[m][0]])
statistically_correct_sentences1 = gs.Sentence2RightGrammarTupel(' '.join(splitsentence), gs_sentence1, right_gs_tupel1)
fsearch2 = self.fsearch2
spacyclass2 = 'word.dep_'
gs_sentence2 = gs.Sentence2GrammarSchema(' '.join(splitsentence), spacyclass2)
print('searchPatternMatch for deps')
bestmatches2 = fsearch2.searchPatternMatch(' '.join(gs_sentence2), 1)
print('done')
right_gs_tupel2 = []
if len(bestmatches2) < 10:
bestndocs2 = len(bestmatches2)
else:
bestndocs2 = 10
for m in range(bestndocs2):
right_gs_tupel2.append(fsearch2.database[bestmatches2[m][0]])
#print(' '.join(splitsentence))
statistically_correct_sentences2 = gs.Sentence2RightGrammarTupel(' '.join(splitsentence), gs_sentence2, right_gs_tupel2)
print(splitsentence)
Rightsentence = gs.GetBestgsAccordingRules(' '.join(splitsentence) , gs_sentence1, right_gs_tupel1, right_gs_tupel2, statistically_correct_sentences1, statistically_correct_sentences2, rules, generalrules)
'''
for endsentence in endsentences:
gramcorr_splitsentences.append(endsentence.split())
for index in newpunctuationsindex:
punctuations.insert(index[0], index[1])
return gramcorr_splitsentences, punctuations
def putAppendixesIntoOwnSentences(self, sentences, punctuations):
    """Move noun appendixes out of their sentence into a sentence of their own.

    Each sentence is searched (via ``self.gs.checkForAnnotationQuadruple``)
    for four-word spans whose POS tags match one of the patterns in
    ``quadruples`` (noun, preposition, article/name, noun).  When the
    leading noun, or the word directly before it, is tagged ``ART`` by
    ``self.nlp`` and the article passes the checks below, a new sentence
    ``<article> <noun> ist <appendix>`` is built (``... hat die/eine
    <noun>`` when the appendix starts with ``mit den`` / ``mit der``).
    The new sentence is inserted right after its source sentence together
    with a copy of that sentence's punctuation entry, and the appendix
    words are deleted from the source sentence.

    Args:
        sentences: list of sentences, each a list of word strings.
            Modified in place.
        punctuations: list holding one punctuation entry per sentence.
            Modified in place.

    Returns:
        The tuple ``(sentences, punctuations)`` (the same list objects).
    """
    gs = self.gs
    # NOTE(review): the first two patterns are identical; possibly one of
    # them was meant to start with 'NE' -- confirm with the author.
    quadruples = [['NN', 'APPR', 'NE', 'NN'], ['NN', 'APPR', 'NE', 'NN'], ['NN', 'APPR', 'ART', 'NN'], ['NE', 'APPR', 'ART', 'NN'], ['NN', 'APPR', 'ART', 'NE'], ['NE', 'APPR', 'ART', 'NE']]
    prepositions = ['von', 'in', 'auf', 'ueber', 'unter', 'nach', 'mit']
    newsentences = []
    newpunctuations = []
    appendix_spans = []

    for hauptindex, sentence in enumerate(sentences):
        # Collect the matches of THIS sentence only, each one once.  Only the
        # first three words are compared below, so they are the dedup key;
        # without this a span found twice would be split off (and deleted)
        # twice.
        matched_heads = []
        for quadruple in quadruples:
            _, quadrupleInWords = gs.checkForAnnotationQuadruple(
                sentence, quadruple, 'word.tag_', 'None')
            for quadrupleInWo in quadrupleInWords:
                head = [quadrupleInWo[0], quadrupleInWo[1], quadrupleInWo[2]]
                if head not in matched_heads:
                    matched_heads.append(head)

        for first, second, third in matched_heads:
            # The upper bound keeps sentence[n + 4] in range.
            for n in range(len(sentence) - 4):
                if sentence[n] != first or sentence[n + 1] != second or sentence[n + 2] != third:
                    continue

                artword = None
                longerWhatisnoun = 0
                # Look for an article at the noun itself (m == 0) and at the
                # word before it (m == 1); the later hit wins.
                for m in range(2):
                    if n - m < 0:
                        # A negative index would wrap to the sentence end.
                        continue
                    for word in self.nlp(sentence[n - m]):
                        if word.tag_ != 'ART':
                            continue
                        Nounthatis = sentence[n - m:n + 1]
                        Whatisnoun = sentence[n + 1:n + 4]
                        # Extend the appendix by one word when the word after
                        # it starts with a noun token.  Uses the pipeline
                        # already held by the instance instead of reloading a
                        # spaCy model on every hit.
                        # NOTE(review): assumes self.nlp is the German
                        # pipeline formerly loaded here ('de_core_news_sm').
                        for counter, wor in enumerate(self.nlp(sentence[n + 4]), start=1):
                            if wor.tag_ == 'NN' or wor.tag_ == 'NE':
                                if counter == 1:
                                    Whatisnoun = sentence[n + 1:n + 5]
                                    longerWhatisnoun = 1
                                if counter == 2:
                                    Whatisnoun = sentence[n + 1:n + 4]
                        artword = word.text

                article_die = artword in ('die', 'Die') and not sentence[n].endswith('n')
                # n >= 2 keeps sentence[n - 2] from wrapping around.
                article_after_preposition = (
                    artword in ('der', 'einer', 'dieser')
                    and n >= 2
                    and sentence[n - 2] in prepositions
                )
                if not (article_die or article_after_preposition):
                    continue

                if artword == 'der':
                    Nounthatis[0] = 'die'

                donothing = 0
                if sentence[n + 1] == 'mit':
                    if sentence[n + 2] == 'den':
                        verb = ' hat die '
                        Whatisnoun = Whatisnoun[2:]
                    elif sentence[n + 2] == 'der':
                        verb = ' hat eine '
                        Whatisnoun = Whatisnoun[2:]
                    else:
                        donothing = 1
                else:
                    verb = ' ist '

                if donothing == 0:
                    newsentence = ' '.join(Nounthatis) + verb + ' '.join(Whatisnoun)
                    newsentences.append([hauptindex + 1, newsentence.split()])
                    newpunctuations.append([hauptindex + 1, punctuations[hauptindex]])
                    if longerWhatisnoun == 0:
                        appendix_spans.append([n + 1, n + 4, hauptindex])
                    else:
                        appendix_spans.append([n + 1, n + 5, hauptindex])

    # Delete right-to-left inside every sentence so that removing one span
    # does not shift the indices of a span that is still to be removed.
    for start, end, index in sorted(appendix_spans, key=lambda span: (span[2], span[0]), reverse=True):
        words = sentences[index]
        if words[end - 1].endswith(','):
            # Keep the comma that closed the appendix on the preceding word.
            words[start - 1] = words[start - 1] + ','
        del words[start:end]

    # Insert back-to-front so the recorded target indices stay valid.
    for newsent in newsentences[::-1]:
        sentences.insert(newsent[0], newsent[1])
    for newpunct in newpunctuations[::-1]:
        punctuations.insert(newpunct[0], newpunct[1])

    # A sentence must not end in a dangling comma.
    for sentence in sentences:
        if sentence and sentence[-1].endswith(','):
            sentence[-1] = sentence[-1][:-1]
    return sentences, punctuations