2223 lines
91 KiB
Python
2223 lines
91 KiB
Python
|
|
# split sentences
|
|
|
|
# TODO: the conjunction lists below still lack 'sondern' (and further entries).
|
|
|
|
|
|
# The following conjunctions need no sentence restructuring:
|
|
# Woraufhin, zudem, zumal, umso - desto,
|
|
|
|
# 'sondern' is hard to handle; the best approach is probably to drop 'sondern' and apply SentGlue afterwards.
|
|
|
|
|
|
|
|
class SentSeg(object):
|
|
|
|
def __init__(self, language):
    """Set up the punctuation and conjunction lookup tables.

    Args:
        language: Stored unchanged as ``self.language``.

    The conjunction tables contain German conjunctions grouped by the kind
    of clause they introduce, mostly in lower-case and capitalised spelling.
    ``self.full_list`` is the concatenation of all conjunction groups.
    """
    self.language = language

    # Punctuation tables.
    self.punktuation_list = ['.', '?', '!', ';', ':']
    self.wrappunktuation_list = [',', '-']

    # Conjunction groups.
    self.adversativ_list = [
        'wohingegen', 'Wohingegen',
        'aber', 'Aber',
        'wobei', 'Wobei',
        'hingegen',
    ]
    self.final_list = ['damit', 'Damit', 'um', 'Um']
    # NOTE(review): 'falls' appears here and in konditional_list below.
    self.kausal_list = [
        'weil', 'Weil',
        'da', 'Da',
        'denn',
        'falls', 'Falls',
    ]
    self.konditional_list = [
        'wenn', 'Wenn',
        'sobald', 'Sobald',
        'als',
        'falls',
    ]
    self.konsekutiv_list = ['dass', 'Dass']
    self.konzessiv_list = [
        'obwohl', 'Obwohl',
        'obgleich', 'Obgleich',
        'trotzdem', 'Trotzdem',
        'wenngleich',
        'doch',
    ]
    self.lokal_list = ['wo', 'Wo']
    self.temporal_list_vor = ['bevor', 'Bevor']
    self.temporal_list_nach = ['nachdem', 'Nachdem']
    self.instrumental_list = ['indem', 'Indem']
    self.indirectspeech_list = [
        'ob', 'Ob',
        'wann', 'Wann',
        'wer', 'Wer',
        'wie', 'Wie',
        'warum', 'Warum',
        'weshalb', 'Weshalb',
        'wieso', 'Wieso',
    ]

    self.firstwordlist = []

    # Concatenate the groups in a fixed order into one flat list.
    combined = []
    for group in (
        self.adversativ_list,
        self.final_list,
        self.kausal_list,
        self.konditional_list,
        self.konsekutiv_list,
        self.konzessiv_list,
        self.lokal_list,
        self.temporal_list_nach,
        self.temporal_list_vor,
        self.instrumental_list,
        self.indirectspeech_list,
    ):
        combined.extend(group)
    self.full_list = combined
|
|
|
|
def ReadDoc2Sent(self, document, encoding='utf-8'):
    """Read a text file and cut it into sentences at sentence-final marks.

    The file is read line by line and split on whitespace.  Words are
    collected until a word longer than two characters ends in one of
    ``self.punktuation_list``; the collected words then form one sentence.
    Sentences may span several lines.

    Args:
        document: Path of the text file to read.
        encoding: Text encoding used to open the file.  Defaults to UTF-8 so
            umlauts are decoded the same way on every platform (previously
            the locale default was used).

    Returns:
        A list with one entry per sentence; each entry is a one-element
        list wrapping the sentence's list of words.

    Notes:
        * Words of one or two characters (for example ``"A."`` or a lone
          ``"."``) never terminate a sentence.
        * Words left over after the last terminator are not returned.
        * The current line number is printed every 1000 lines.
    """
    splitsentences = []
    splitsentence = []

    with open(document, encoding=encoding) as lines:
        for counter, line in enumerate(lines, start=1):
            if counter % 1000 == 0:
                print(counter)

            for word in line.split():
                splitsentence.append(word)
                # A one-character punctuation token fails the length test
                # anyway, so checking the last character is sufficient.
                if len(word) > 2 and word[-1] in self.punktuation_list:
                    splitsentences.append([splitsentence])
                    splitsentence = []

    return splitsentences
|
|
|
|
|
|
|
|
|
|
|
|
def AndOrSolver(self, sentences, punctuations):
# Cut sentences into several pieces at selected clause boundaries and add one
# punctuation mark to `punctuations` for every cut.
#
# sentences:    iterable of entries read as entry[0] (list of words),
#               entry[1] (comma info, only its first element is read) and entry[2].
# punctuations: list indexed by sentence position; it is modified in place.
# Returns (newsentences, punctuations); every element of newsentences is a
# one-element list wrapping a list of words.
#
# Uses self.nlp to parse each sentence, so that attribute must exist.
# NOTE(review): indentation was lost in this copy of the file; remarks about
# which statement is nested in which are assumptions — confirm against the original.
|
|
|
|
# Replace ':' and '-' marks by '.' (this changes the caller's list).
for n in range(len(punctuations)):
|
|
if punctuations[n] == ':' or punctuations[n] == '-':
|
|
punctuations[n] = '.'
|
|
|
|
|
|
#print(sentences, punctuations)
|
|
|
|
# NOTE(review): splitsentences is assigned but not read again in this method.
splitsentences = []
|
|
|
|
# counter becomes the 1-based position of the current sentence; counter - 1 indexes punctuations.
counter = 0
|
|
|
|
newsentences = []
|
|
for sentence in sentences:
|
|
# Punctuation marks queued for insertion, stored as [mark, sentence index].
newpunctuationsindexes = []
|
|
# NOTE(review): utterancenumber and commaornot are read here but not used further in this method.
utterancenumber = sentence[2]
|
|
commainfo = sentence[1]
|
|
commaornot = commainfo[0]
|
|
# From here on `sentence` is the list of words of the entry.
sentence = sentence[0]
|
|
|
|
|
|
|
|
counter += 1
|
|
doc = self.nlp(' '.join(sentence))
|
|
|
|
|
|
subjectcount = 0
|
|
separationwords = []
|
|
subjectcounts = []
|
|
doccounter = 0
|
|
subjectindex = []
|
|
# NOTE(review): rcornot is set to 1 below and never cleared again for this sentence.
rcornot = 0
|
|
# Walk the parsed tokens: count subjects (dep 'sb' or 'ep'), notice relative
# clauses (dep 'rc') and close a clause record at every comma tag ('$,').
for word in doc:
|
|
doccounter += 1
|
|
if word.dep_ == 'sb' or word.dep_ == 'ep':
|
|
subjectcount += 1
|
|
subjectindex.append(doccounter - 1)
|
|
if word.dep_ == 'rc':
|
|
rcornot = 1
|
|
|
|
|
|
|
|
|
|
if word.tag_ == '$,':
|
|
|
|
|
|
# Clause record: [subject count, doccounter - 2, positions of the subjects, rc flag].
subjectcounts.append([subjectcount, doccounter - 2, subjectindex, rcornot])
|
|
subjectindex = []
|
|
subjectcount = 0
|
|
#print('aleaole',sentence[doccounter - 2])
|
|
# presumably re-aligns the token counter with the whitespace-split word list
# when the comma was attached to the preceding word — TODO confirm
if len(sentence[doccounter - 2]) > 1:
|
|
|
|
|
|
doccounter -= 1
|
|
|
|
|
|
# Remember the positions of words that may introduce a new clause.
if word.text == 'und' or word.text == 'also' or word.text == 'oder' or word.text == 'schon' or word.text == 'bald' or word.text == 'doch' or word.text == 'jedoch' or word.text == 'sondern':
|
|
separationwords.append(doccounter - 1)
|
|
|
|
|
|
#print('separationwords', separationwords)
|
|
#print('subjectcounts', subjectcounts)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
separationwordstocut = []
|
|
listofownsentencessubjectindexes = []
|
|
# Compare every clause record with its successor.
for n in range(len(subjectcounts) - 1):
|
|
# Both records have a subject and the successor carries no 'rc' flag:
# queue this record; its second field is used as the cut position below.
if subjectcounts[n][0] > 0 and subjectcounts[n + 1][0] > 0 and subjectcounts[n + 1][3] == 0:
|
|
listofownsentencessubjectindexes.append(subjectcounts[n])
|
|
# NOTE(review): whether this loop was nested under the `if` above cannot be told from this copy.
for m in range(len(separationwords)):
|
|
# Separator word lies strictly between the two records' positions.
if subjectcounts[n][1] < separationwords[m] < subjectcounts[n + 1][1]:
|
|
#print(subjectcounts[n + 1], separationwords[m])
|
|
if subjectcounts[n + 1][0] > 1:
|
|
# ... and after the first, up to the last, subject of the successor record.
if subjectcounts[n + 1][2][0] < separationwords[m] <= subjectcounts[n + 1][2][-1]:
|
|
separationwordstocut.append(separationwords[m])
|
|
|
|
|
|
# processed becomes 1 as soon as at least one cut was marked.
processed = 0
|
|
|
|
|
|
#print('oioioi')
|
|
#print(listofownsentencessubjectindexes)
|
|
#print(separationwordstocut)
|
|
|
|
|
|
if len(listofownsentencessubjectindexes) > 0:
|
|
for n in range(len(listofownsentencessubjectindexes)):
|
|
|
|
|
|
# Mark the cut by appending the sentinel text to the word (changes the word list in place).
sentence[listofownsentencessubjectindexes[n][1]] = sentence[listofownsentencessubjectindexes[n][1]] + 'alohaseparator'
|
|
# Queue one copy of this sentence's punctuation mark per cut.
newpunctuationsindexes.append([punctuations[counter - 1], counter - 1])
|
|
#print('a new punctuation1')
|
|
processed = 1
|
|
if len(separationwordstocut) > 0:
|
|
for n in range(len(separationwordstocut)):
|
|
# Here the word before the separator word receives the sentinel.
sentence[separationwordstocut[n] - 1] = sentence[separationwordstocut[n] - 1] + 'alohaseparator'
|
|
#print('a new punctuation2')
|
|
newpunctuationsindexes.append([punctuations[counter - 1], counter - 1])
|
|
processed = 1
|
|
|
|
|
|
# No cut marked: pass the word list through unchanged.
if processed == 0:
|
|
newsentences.append([sentence])
|
|
|
|
|
|
# At least one cut marked: split the word list at every sentinel.
if processed == 1:
|
|
#print(sentence)
|
|
splitsentence = []
|
|
for word in sentence:
|
|
splitsentence.append(word)
|
|
# The sentinel is 14 characters long.
if word[-14:] == 'alohaseparator':
|
|
# Index -15 is the character directly before the sentinel; a comma there is dropped too.
if splitsentence[-1][-15] == ',':
|
|
splitsentence[-1] = splitsentence[-1][:-15]
|
|
else:
|
|
splitsentence[-1] = splitsentence[-1][:-14]
|
|
newsentences.append([splitsentence])
|
|
splitsentence = []
|
|
# Whatever follows the last cut (possibly an empty list) becomes the final piece.
newsentences.append([splitsentence])
|
|
|
|
|
|
#print(newpunctuationsindexes)
|
|
# Insert the queued marks in reverse order at their recorded sentence index.
# NOTE(review): from this copy it is unclear whether this runs once per sentence or once after
# the loop; in either case later sentences may see shifted punctuation positions — confirm.
newpunctuationsindexes = newpunctuationsindexes[::-1]
|
|
for n in range(len(newpunctuationsindexes)):
|
|
punctuations.insert(newpunctuationsindexes[n][1], newpunctuationsindexes[n][0])
|
|
|
|
|
|
#print(newsentences, punctuations)
|
|
return newsentences, punctuations
|
|
|
|
|
|
|
|
def LoadBoWModelAndDatabaseOnesZeros(self):
    """Create the two FASTsearch objects and load their bag-of-words data.

    ``self.fsearch1`` is built from ``'GS_DB_word.tag_.hkl'`` and
    ``self.fsearch2`` from ``'GS_DB_word.dep_.hkl'``.  On each object
    ``Gen_BoW_Model(1000, "word")`` is called, followed by
    ``Load_BoW_Model`` with the matching ``.pkl`` / ``.hkl`` file names.

    The ``FASTsearch`` module is imported lazily inside this method.
    """
    import FASTsearch

    searcher_cls = FASTsearch.FASTsearch

    # Search object for the word.tag_ database.
    self.fsearch1 = searcher_cls('GS_DB_word.tag_.hkl')
    tag_search = self.fsearch1
    tag_search.Gen_BoW_Model(1000, "word")
    tag_search.Load_BoW_Model(
        'bagofwordsGS_DB_word.tag_.pkl',
        'DataBaseOneZerosGS_DB_word.tag_.hkl',
    )

    # Search object for the word.dep_ database.
    self.fsearch2 = searcher_cls('GS_DB_word.dep_.hkl')
    dep_search = self.fsearch2
    dep_search.Gen_BoW_Model(1000, "word")
    dep_search.Load_BoW_Model(
        'bagofwordsGS_DB_word.dep_.pkl',
        'DataBaseOneZerosGS_DB_word.dep_.hkl',
    )
|
|
|
|
def LoadSentGlueSGDandGSUtils(self):
    """Load the helper objects the solver methods read from ``self``.

    In this order the method sets

    * ``self.gs``  - ``GS_Utils.GS_Utils('de_core_news_sm')``
    * ``self.sgm`` - ``SentGlueMach('trainedSGD_twolabel.pkl',
      'bagofwordstwolabel.pkl')``, on which ``initialize()`` is then called
    * ``self.nlp`` - ``spacy.load('de_core_news_sm')``

    All three modules are imported lazily inside this method.

    Returns:
        The string ``'done'``.
    """
    # Grammar utilities.
    import GS_Utils
    self.gs = GS_Utils.GS_Utils('de_core_news_sm')

    # Sentence-glue classifier.
    from SentGlue import SentGlueMach
    glue_machine = SentGlueMach('trainedSGD_twolabel.pkl', 'bagofwordstwolabel.pkl')
    self.sgm = glue_machine
    glue_machine.initialize()

    # spaCy pipeline.
    import spacy
    self.nlp = spacy.load('de_core_news_sm')

    return 'done'
|
|
|
|
def CommaSentenceOrNot(self, sentences):
    """Annotate every sentence entry with its comma count and clause openers.

    Each entry of ``sentences`` is a list whose first element is a list of
    words.  The words are joined with single spaces and parsed with
    ``self.nlp``.  ``[comma_count, openers]`` is then appended to the entry
    in place, where ``comma_count`` is the number of tokens tagged ``'$,'``
    and ``openers`` holds the text of the first token plus the text of every
    token that directly follows a ``'$,'`` token.

    Args:
        sentences: Iterable of mutable entries ``[words, ...]``.

    Returns:
        A new list containing the same (now extended) entry objects.
    """
    parse = self.nlp
    annotated = []

    for entry in sentences:
        parsed = parse(' '.join(entry[0]))

        comma_total = 0
        openers = []
        # True for the very first token and for a token right after a comma.
        take_next = True
        for tok in parsed:
            if take_next:
                openers.append(tok.text)
            take_next = tok.tag_ == '$,'
            if take_next:
                comma_total += 1

        entry.append([comma_total, openers])
        annotated.append(entry)

    return annotated
|
|
|
|
def EnumerationSolver(self, sentences):
# Detect enumerations (an 'und'/'oder' that follows at least one comma) and
# replace such a sentence by several sentences, one per enumerated part.
#
# sentences: iterable of entries whose first element is a list of words.
# Returns output[0], i.e. the list `enumerationsentences` after its entries
# have been normalised at the end of the method.
#
# Reads self.gs, self.nlp, self.sgm and self.full_list.  The helpers called on
# gs and sgm are defined outside this file section; the meaning of their
# return values is not visible here.
# NOTE(review): indentation was lost in this copy of the file; remarks about
# nesting are assumptions — confirm against the original.
|
|
|
|
|
|
|
|
gs = self.gs
|
|
|
|
|
|
|
|
nlp = self.nlp
|
|
|
|
sgm = self.sgm
|
|
|
|
|
|
|
|
enumerationsentences = []
|
|
# NOTE(review): counter is only incremented below; its value is never read.
counter = 0
|
|
NOTenumerations = []
|
|
#print('processing enumerations..')
|
|
for sentence in sentences:
|
|
|
|
doc = nlp(' '.join(sentence[0]))
|
|
|
|
#print(doc)
|
|
counter += 1
|
|
#if counter % 100 == 0:
|
|
#print(counter)
|
|
|
|
# n counts comma tokens seen so far.
n = 0
|
|
# NOTE(review): firstone is assigned but not read again in this method.
firstone = 0
|
|
token = []
|
|
nextword = 0
|
|
enumeration = False
|
|
|
|
# Per-part accumulators (texts, dependency labels, tags) and the lists of finished parts.
splitsentence = []
|
|
splitsentence_deps = []
|
|
splitsentence_tags = []
|
|
splitsentences = []
|
|
splitsentences_deps = []
|
|
splitsentences_tags = []
|
|
|
|
|
|
|
|
# First pass: flag the sentence as an enumeration when 'und' or 'oder'
# occurs after at least one comma tag ('$,').
for word in doc:
|
|
#print(word.tag_)
|
|
# word.pos_ is suitable here for noun and verb, word.dep_ for sb/pd, and possibly tag
|
|
|
|
|
|
|
|
# NOTE(review): nextword is written here and below but never read in this method.
nextword = 0
|
|
|
|
if word.tag_ == '$,':
|
|
n += 1
|
|
nextword = 1
|
|
|
|
if (word.text == 'und' or word.text == 'oder') and n >= 1:
|
|
enumeration = True
|
|
break
|
|
|
|
|
|
# NOTE(review): if `sentences` is empty this assignment is apparently never reached, while
# `output` is used near the end of the method — confirm.
output = []
|
|
if enumeration == True:
|
|
|
|
# Second pass: split the tokens into parts at ',' and 'und'; '.', ',' and 'und' themselves are dropped.
# NOTE(review): 'oder' triggers the detection above but is not a split point here.
for word in doc:
|
|
|
|
#print(word.text)
|
|
|
|
if word.text != ',' and word.text != '.' and word.text != 'und':
|
|
|
|
splitsentence.append(word.text)
|
|
splitsentence_deps.append(word.dep_)
|
|
splitsentence_tags.append(word.tag_)
|
|
|
|
if word.text == ',' or word.text == 'und':
|
|
|
|
#print('oi')
|
|
|
|
splitsentences.append(splitsentence)
|
|
splitsentences_deps.append(splitsentence_deps)
|
|
splitsentences_tags.append(splitsentence_tags)
|
|
splitsentence = []
|
|
splitsentence_deps = []
|
|
splitsentence_tags = []
|
|
|
|
# The part after the last split point.
splitsentences.append(splitsentence)
|
|
splitsentences_deps.append(splitsentence_deps)
|
|
splitsentences_tags.append(splitsentence_tags)
|
|
|
|
#print( 'splitsentences', splitsentences)
|
|
|
|
token = []
|
|
enumerations = []
|
|
enumerationsSPOs = []
|
|
NOTenumerations = []
|
|
|
|
# Sort the parts: a part whose first word is in self.full_list is kept aside as a
# non-enumeration clause, every other part is an enumeration candidate.
# NOTE(review): the loop variable reuses the name `sentence` of the outer loop.
# NOTE(review): an empty part (e.g. ',' directly followed by 'und') would make sentence[0] raise IndexError.
for sentence in splitsentences:
|
|
token.append(sentence[0])
|
|
|
|
|
|
if sentence[0] not in self.full_list:
|
|
enumerations.append(sentence)
|
|
enumerationsSPOs.append(gs.checkSPO(sentence, 0))
|
|
else:
|
|
NOTenumerations.append(sentence)
|
|
|
|
#print(enumerationsSPOs)
|
|
|
|
|
|
#print('enumerations', enumerations)
|
|
# Rank the candidates by the sum of their checkSPO result.
biggest = []
|
|
for i in range(len(enumerationsSPOs)):
|
|
biggest.append([i, sum(enumerationsSPOs[i])])
|
|
|
|
|
|
sortedbiggest = sorted(biggest[::-1], key=lambda tup: tup[1], reverse=True)
|
|
|
|
# The main sentence is the first or the last candidate, whichever comes first in the ranking;
# lastornot records which of the two it is.
# NOTE(review): with no candidates mainsentenceIndex and lastornot are not assigned here.
for i in range(len(sortedbiggest)):
|
|
if sortedbiggest[i][0] == 0:
|
|
mainsentenceIndex = sortedbiggest[i][0]
|
|
lastornot = 0
|
|
break
|
|
|
|
if sortedbiggest[i][0] == len(biggest) - 1:
|
|
mainsentenceIndex = sortedbiggest[i][0]
|
|
lastornot = 1
|
|
break
|
|
|
|
|
|
# Still to do: the case "Er, sie und der Beamte LACHTEN den Clown aus" --> the "lachten" has to be caught using the database of cases, i.e. an enumeration with SPO 1 0 0 + plural must then become singular, depending on the articles.
|
|
#print('enumerations', enumerations)
|
|
mainsentence = enumerations[mainsentenceIndex]
|
|
#print('main', mainsentence)
|
|
# For every other candidate build the possible combinations with the main sentence.
probablemainsentences = []
|
|
for i in range(len(enumerations)):
|
|
if i != mainsentenceIndex:
|
|
iprobablemainsentences = []
|
|
probablemainsentence = []
|
|
# Main sentence is the first part: prefix of the main sentence + candidate.
if lastornot == 0:
|
|
for j in range(1, len(mainsentence)):
|
|
probablemainsentence = mainsentence[0:j] + enumerations[i]
|
|
#print(probablemainsentence)
|
|
iprobablemainsentences.append(' '.join(probablemainsentence))
|
|
# Main sentence is the last part: candidate + suffix of the main sentence.
if lastornot == 1:
|
|
for j in range(1, len(mainsentence)):
|
|
probablemainsentence = enumerations[i] + mainsentence[-j:]
|
|
iprobablemainsentences.append(' '.join(probablemainsentence))
|
|
probablemainsentences.append(iprobablemainsentences)
|
|
|
|
|
|
# This still checks for presence only, but in this case it matters more that a tuple does not get torn apart - CHANGE THIS!
|
|
|
|
#print('probablemainsentences', probablemainsentences)
|
|
# Collect the tag pairs for which gs.checkForAnnotationTuple returns 2 on the main sentence.
tuplesToCheck = []
|
|
tuples = [['ART', 'NN'], ['APPR','NN'], ['ART', 'CARD']]
|
|
for tupl in tuples:
|
|
|
|
|
|
checktupleindex, tupleInWords = gs.checkForAnnotationTuple(mainsentence, tupl , 'word.tag_', 'None')
|
|
if checktupleindex == 2:
|
|
tuplesToCheck.append([tupl, tupleInWords])
|
|
# Same for tag triples; here a return value of 3 selects the triple.
triplesToCheck = []
|
|
triples = [['ART','ADJA','NN'], ['APPR', 'ART', 'NN'], ['KOKOM', 'ART', 'NN']]
|
|
for tripl in triples:
|
|
checktripleindex, tripleInWords = gs.checkForAnnotationTriple(mainsentence, tripl, 'word.tag_', 'None')
|
|
if checktripleindex == 3:
|
|
triplesToCheck.append([tripl, tripleInWords])
|
|
|
|
#print('tuples to check', tuplesToCheck)
|
|
#print('triples to check', triplesToCheck)
|
|
#print('probablemainsentences', probablemainsentences)
|
|
# For every candidate pick the best combination.
for probsentences in probablemainsentences:
|
|
|
|
# NOTE(review): checktripleindexes and checktupleindexes are assigned but not read again.
checktripleindexes = []
|
|
checktupleindexes = []
|
|
#print(probsentences)
|
|
filteredprobsentences = []
|
|
# Keep the combinations for which a selected pair/triple is reported again (2 resp. 3).
for sentence in probsentences:
|
|
tuplchecked = 0
|
|
triplchecked = 0
|
|
#print('sentence and tuples to check', sentence, tuplesToCheck)
|
|
for tupl in tuplesToCheck:
|
|
|
|
checkedsecondtime, tupleinWords = gs.checkForAnnotationTuple(sentence.split(), tupl[0], 'word.tag_', tupl[1])
|
|
|
|
#print(sentence, checkedsecondtime)
|
|
if checkedsecondtime == 1:
|
|
|
|
tuplchecked = 0
|
|
if checkedsecondtime == 2:
|
|
|
|
tuplchecked = 1
|
|
|
|
for tripl in triplesToCheck:
|
|
checkedsecondtime, tripleinWords = gs.checkForAnnotationTriple(sentence.split(), tripl[0], 'word.tag_', tripl[1])
|
|
if checkedsecondtime == 1 or checkedsecondtime == 2:
|
|
|
|
triplchecked = 0
|
|
if checkedsecondtime == 3:
|
|
|
|
triplchecked = 1
|
|
|
|
|
|
|
|
|
|
if triplchecked == 1 or tuplchecked == 1:
|
|
filteredprobsentences.append(sentence)
|
|
|
|
#print('filteredprobsentences', filteredprobsentences)
|
|
# Nothing survived the filter: fall back to all combinations.
if len(filteredprobsentences) == 0:
|
|
filteredprobsentences = probsentences
|
|
# here is still the problem, that there are lists of words instead of proper sentences..
|
|
#print('filteredprobsentences', filteredprobsentences)
|
|
probsMatrix = sgm.predictprobsOnSentenceList(filteredprobsentences, filteredprobsentences)
|
|
|
|
#print(probsMatrix)
|
|
|
|
# Overwrite column 0 of every row with the row index so it survives sorting.
for i in range(len(probsMatrix)):
|
|
probsMatrix[i][0] = i
|
|
|
|
#print(probsMatrix)
|
|
|
|
# Sort by column 1, highest first, and take the index of the top row.
sortedprobsMatrix = sorted(probsMatrix[::-1], key=lambda tup: tup[1], reverse=True)
|
|
|
|
#print(sortedprobsMatrix)
|
|
|
|
bestindex = sortedprobsMatrix[0][0]
|
|
|
|
#print(bestindex)
|
|
#print('probablemainsentences', filteredprobsentences)
|
|
probablemainsentence = filteredprobsentences[int(bestindex)]
|
|
#print('oi', probablemainsentence)
|
|
|
|
#print('probablemainsentence', probablemainsentence)
|
|
enumerationsentences.append([probablemainsentence])
|
|
|
|
|
|
# The main sentence itself is added as a space-joined string.
enumerationsentences.append([' '.join(mainsentence)])
|
|
|
|
# The clauses kept aside are attached to the last added entry, joined with ', '.
for notenum in NOTenumerations:
|
|
#print(enumerationsentences)
|
|
#print(enumerationsentences[-1])
|
|
#print('enum no1', enumerationsentences)
|
|
#print('notenum', notenum)
|
|
enumerationsentences[-1].append(' '.join(notenum))
|
|
#print('enumsentences',enumerationsentences[-1])
|
|
enumerationsentences[-1] = [', '.join(enumerationsentences[-1])]
|
|
|
|
|
|
# presumably the branch for sentences that are no enumeration: the entry is passed on wrapped in a list — TODO confirm pairing
else:
|
|
enumerationsentences.append([sentence])
|
|
|
|
|
|
|
|
output.append(enumerationsentences)
|
|
|
|
|
|
# Normalise the entries: a string entry is split into words; when .split() is not available
# the fallback takes the first element of the wrapped object instead.
# NOTE(review): the bare `except:` below also hides unrelated errors.
for n in range(len(output[0])):
|
|
#print('out',output[0][n])
|
|
try:
|
|
output[0][n] = [output[0][n][0].split()]
|
|
except:
|
|
output[0][n] = [output[0][n][0][0]]
|
|
|
|
|
|
#print('done')
|
|
return output[0]
|
|
|
|
|
|
def GetUtteranceNumber(self, sentences):
    """Append the number of subject tokens to every sentence entry.

    Each entry's first element (a list of words) is joined with single
    spaces and parsed with ``self.nlp``.  The number of tokens whose
    dependency label is ``'sb'`` or ``'ep'`` is appended to the entry in
    place.

    Args:
        sentences: Iterable of mutable entries ``[words, ...]``.

    Returns:
        A new list containing the same (now extended) entry objects.
    """
    parse = self.nlp
    counted = []

    for entry in sentences:
        parsed = parse(' '.join(entry[0]))
        subject_total = sum(1 for tok in parsed if tok.dep_ in ('sb', 'ep'))
        entry.append(subject_total)
        counted.append(entry)

    return counted
|
|
|
|
def GetQuestionOrNot(self, sentences):
    """Append a question flag (1 or 0) to every sentence entry.

    Each entry's first element (a list of words) is joined with single
    spaces and parsed with ``self.nlp``.  ``1`` is appended to the entry in
    place when one of the parsed tokens has the text ``'?'``, otherwise
    ``0``.

    The flag is computed per sentence.  Previously it was initialised only
    once before the loop, so every sentence after the first question was
    flagged as a question as well.

    Args:
        sentences: Iterable of mutable entries ``[words, ...]``.

    Returns:
        A new list containing the same (now extended) entry objects.
    """
    nlp = self.nlp
    uttersentences = []

    for sentence in sentences:
        doc = nlp(' '.join(sentence[0]))
        questionmark = 1 if any(word.text == '?' for word in doc) else 0
        sentence.append(questionmark)
        uttersentences.append(sentence)

    return uttersentences
|
|
|
|
def SplitSentencesIntoHauptNebenTuple(self, sentences, punctuations):
# Rearrange sentences that contain two or more commas into several
# "main clause , subordinate clause" word lists.
#
# sentences:    list of entries read as entry[0] (list of words), entry[1]
#               (comma info: [0] is compared with 2, [1] is stored in `token`)
#               and entry[2].  Entries are modified in place.
# punctuations: list indexed by sentence position; modified in place.
# Returns (outsentences, punctuations); every element of outsentences is a
# one-element list wrapping a list of words.
#
# Reads self.gs and self.nlp.  The gs helpers (checkSPO, checkForAnnotation,
# checkForAnnotationInTokenizedSentence) are defined outside this file section;
# the meaning of their return values is not visible here.
# NOTE(review): indentation was lost in this copy of the file; remarks about
# nesting are assumptions — confirm against the original.
|
|
|
|
|
|
|
|
|
|
oldsplitsentences = []
|
|
#print('hauptneben inputsentences', sentences)
|
|
|
|
gs = self.gs
|
|
|
|
#print('importing spacy..')
|
|
# NOTE(review): the name `spacy` is not used again in this method; parsing goes through self.nlp.
import spacy
|
|
#print('done')
|
|
|
|
nlp = self.nlp
|
|
|
|
# NOTE(review): outputsentences is only mentioned in commented-out lines below.
outputsentences = []
|
|
sentencesThatAreOutoutput = []
|
|
outsentences = []
|
|
for generalindex in range(len(sentences)):
|
|
presentence = sentences[generalindex]
|
|
|
|
# Per-clause accumulators (texts, dependency labels, tags) and the lists of finished clauses.
splitsentence = []
|
|
splitsentence_deps = []
|
|
splitsentence_tags = []
|
|
splitsentences = []
|
|
splitsentences_deps = []
|
|
splitsentences_tags = []
|
|
commainfo = presentence[1]
|
|
outputsentence = []
|
|
|
|
|
|
# NOTE(review): token and numberutterances are assigned but not read again in this method.
token = commainfo[1]
|
|
|
|
commaornot = commainfo[0]
|
|
|
|
numberutterances = presentence[2]
|
|
|
|
# sentence and oldsentence refer to the same list object.
sentence = presentence[0]
|
|
|
|
oldsentence = presentence[0]
|
|
|
|
#print(commaornot)
|
|
# Only entries whose comma info starts with a value of 2 or more are rearranged.
if commaornot >= 2:
|
|
#print('nla')
|
|
|
|
# Title-cases the first word in place before parsing; str.title() also lower-cases the
# remaining letters of that word, and oldsentence sees the change as well.
sentence[0] = sentence[0].title()
|
|
|
|
doc = nlp(' '.join(sentence))
|
|
|
|
|
|
# Split the tokens into clauses at ','; the ',' and '.' tokens themselves are dropped.
for word in doc:
|
|
|
|
#print(word.text)
|
|
|
|
if word.text != ',' and word.text != '.':
|
|
|
|
splitsentence.append(word.text)
|
|
splitsentence_deps.append(word.dep_)
|
|
splitsentence_tags.append(word.tag_)
|
|
|
|
if word.text == ',':
|
|
|
|
#print('oi')
|
|
|
|
splitsentences.append(splitsentence)
|
|
splitsentences_deps.append(splitsentence_deps)
|
|
splitsentences_tags.append(splitsentence_tags)
|
|
splitsentence = []
|
|
splitsentence_deps = []
|
|
splitsentence_tags = []
|
|
|
|
|
|
# The clause after the last comma.
splitsentences.append(splitsentence)
|
|
# Lower-case the first word of the first clause.
# NOTE(review): raises IndexError when the first clause is empty.
splitsentences[0][0] = splitsentences[0][0].lower()
|
|
splitsentences_deps.append(splitsentence_deps)
|
|
splitsentences_tags.append(splitsentence_tags)
|
|
# Alias (no copy) used by the coverage check at the end of the method.
oldsplitsentences = splitsentences
|
|
#print(splitsentences)
|
|
#print(splitsentences_tags)
|
|
#print(splitsentences_deps)
|
|
spo = []
|
|
|
|
# Per clause: the checkSPO result extended by four flags, appended in the order
# VVINF, VAFIN, VVFIN, VMFIN.
for n in range(len(splitsentences)):
|
|
prespo = []
|
|
prespo = gs.checkSPO(splitsentences_deps[n], 1)
|
|
prespo.append( gs.checkForAnnotation(splitsentences[n], 'VVINF', 'word.tag_'))
|
|
prespo.append(gs.checkForAnnotation(splitsentences[n], 'VAFIN', 'word.tag_'))
|
|
prespo.append(gs.checkForAnnotation(splitsentences[n], 'VVFIN', 'word.tag_'))
|
|
prespo.append(gs.checkForAnnotation(splitsentences[n], 'VMFIN', 'word.tag_'))
|
|
|
|
|
|
|
|
|
|
|
|
spo.append(prespo)
|
|
#print(splitsentences_deps)
|
|
#print(splitsentences)
|
|
#print(spo)
|
|
|
|
# Look for pairs of non-adjacent clauses (m < n - 1) that together look like one main clause.
# m runs from the last clause to the first; lastm limits n to clauses before the last accepted m.
indexSPO = []
|
|
lastm = len(splitsentences)
|
|
for o in range(len(splitsentences)):
|
|
|
|
m = len(splitsentences) - 1 - o
|
|
for n in range(len(splitsentences)):
|
|
|
|
|
|
|
|
if m < n - 1 and n < lastm:
|
|
|
|
#print('spo s',spo[m], spo[n])
|
|
sb = spo[m][0] + spo[n][0]
|
|
# Each of the following values stays 1 unless at least one clause has the flag,
# in which case it becomes the sum of both flags.
# NOTE(review): assuming checkSPO returns three values, index 3 holds the VVINF flag and
# index 4 the VAFIN flag, i.e. the names Vafin/Vvinf look swapped; both are tested the
# same way below — confirm.
Vafin = 1
|
|
if spo[m][3] == 1 or spo[n][3] == 1:
|
|
Vafin = spo[m][3] + spo[n][3]
|
|
Vvinf = 1
|
|
if spo[m][4] == 1 or spo[n][4] == 1:
|
|
Vvinf = spo[m][4] + spo[n][4]
|
|
Vvfin = 1
|
|
if spo[m][5] == 1 or spo[n][5] == 1:
|
|
Vvfin = spo[m][5] + spo[n][5]
|
|
Vmfin = 1
|
|
if spo[m][6] == 1 or spo[n][6] == 1:
|
|
# NOTE(review): `==` only compares and discards the result, so Vmfin stays 1 and the
# `Vmfin == 1` test below is always true; an assignment (`=`) was probably intended.
Vmfin == spo[m][6] + spo[n][6]
|
|
#wrapped = 0
|
|
#for n in range(len(indexSPO)):
|
|
#if n == indexSPO[n][0] + 1 and n == indexSPO[n][1] - 1:
|
|
#wrapped = 1
|
|
#print(sb, Vafin, Vvinf, Vvfin, Vmfin, 'm n', m, n)
|
|
if sb == 1 and Vafin == 1 and Vvinf == 1 and (Vvfin == 1 or Vmfin == 1):
|
|
|
|
indexSPO.append([m,n])
|
|
#print([m,n])
|
|
lastm = m
|
|
#print('lastm',lastm)
|
|
|
|
|
|
|
|
#print(splitsentences)
|
|
# Build [joined words, first clause index, second clause index] for every accepted pair,
# with the lower clause index first.
Hauptsentences = []
|
|
for n in range(len(indexSPO)):
|
|
if indexSPO[n][0] > indexSPO[n][1]:
|
|
i = 1
|
|
j = 0
|
|
else:
|
|
i = 0
|
|
j = 1
|
|
Hauptsentences.append([splitsentences[indexSPO[n][i]] + splitsentences[indexSPO[n][j]] , indexSPO[n][i], indexSPO[n][j] ])
|
|
|
|
# HauptSentences (capital S) is Hauptsentences in reverse order.
HauptSentences = []
|
|
for n in range(len(Hauptsentences)):
|
|
m = len(Hauptsentences) - 1 - n
|
|
HauptSentences.append(Hauptsentences[m])
|
|
|
|
#print('Hauptsentences', Hauptsentences)
|
|
#print('HauptSentences', HauptSentences)
|
|
# Indexes of clauses that have been used in an output sentence.
sentencesThatAreOut =[]
|
|
|
|
# Attach the clauses between a main clause's first part and the next main clause to it.
for n in range(len(HauptSentences)):
|
|
index = HauptSentences[n][1]
|
|
finish = 0
|
|
#print('Oi',HauptSentences[n])
|
|
if n == len(HauptSentences) - 1:
|
|
|
|
#print('lenHauptsentences', len(HauptSentences))
|
|
|
|
stopindex = len(splitsentences)
|
|
finish = 1
|
|
else:
|
|
stopindex = HauptSentences[n + 1][1]
|
|
#print('stopindex', stopindex)
|
|
vvfinisthere = 0
|
|
if finish == 0:
|
|
# The next main clause starts with a finite full verb (tag 'VVFIN').
if splitsentences_tags[stopindex][0] == 'VVFIN':
|
|
stopindex -= 1
|
|
vvfinisthere = 1
|
|
|
|
if splitsentences_tags[index][0] == 'VVFIN':
|
|
vvfinisthere = 1
|
|
|
|
# In that case the clause before `index` is attached as well.
if vvfinisthere == 1:
|
|
|
|
|
|
HNTuple = HauptSentences[n][0] + [','] + splitsentences[index - 1]
|
|
outputsentence.append(HNTuple)
|
|
sentencesThatAreOut.append(index - 1)
|
|
# NOTE(review): the two lines below index `Hauptsentences` (original order) while the loop
# walks `HauptSentences` (reversed order) — confirm this is intended.
sentencesThatAreOut.append(Hauptsentences[n][1])
|
|
sentencesThatAreOut.append(Hauptsentences[n][2])
|
|
|
|
for m in range(index + 1, stopindex ):
|
|
if m != HauptSentences[n][2]:
|
|
HNTuple = HauptSentences[n][0] + [','] + splitsentences[m]
|
|
#print('check', HauptSentences[n], n)
|
|
#print('check', splitsentences[m], m)
|
|
#print('double', HNTuple)
|
|
outputsentence.append(HNTuple)
|
|
|
|
|
|
|
|
sentencesThatAreOut.append(m)
|
|
# NOTE(review): same `Hauptsentences` / `HauptSentences` mix as above.
sentencesThatAreOut.append(Hauptsentences[n][1])
|
|
sentencesThatAreOut.append(Hauptsentences[n][2])
|
|
|
|
sentencesThatAreOutoutput.append(sentencesThatAreOut)
|
|
|
|
|
|
# Per clause: result of checking the dependency labels for 'cp' and for 'rc'.
cpOrNots = []
|
|
rcOrNots = []
|
|
for splitsentence in splitsentences_deps:
|
|
cpOrNot = gs.checkForAnnotationInTokenizedSentence(splitsentence, 'cp')
|
|
cpOrNots.append(cpOrNot)
|
|
rcOrNot = gs.checkForAnnotationInTokenizedSentence(splitsentence, 'rc')
|
|
rcOrNots.append(rcOrNot)
|
|
|
|
#print('Laenge splitsentences', len(splitsentences))
|
|
#print('laenge cpOrNots', len(cpOrNots))
|
|
#print(cpOrNots)
|
|
#print('rc or nots', rcOrNots)
|
|
# Pair every 'rc'/'cp' clause with a partner clause, walking from the last clause to the first.
# A pair is stored as [clause index, partner index].
pairs = []
|
|
for n in range(len(cpOrNots)):
|
|
index = len(cpOrNots) - 1 - n
|
|
done = 0
|
|
# 'rc' clause: partner is the clause before it.
if rcOrNots[index] == 1:
|
|
pairs.append([index, index - 1])
|
|
done = 1
|
|
|
|
|
|
# 'cp' clause: partner is the following clause if that starts with 'VVFIN', otherwise the
# previous clause, or the one before that when the previous clause is an 'rc' clause.
# NOTE(review): the bare `except:` clauses below hide every error, not only index errors;
# negative indexes do not raise but wrap around to the end of the list.
if done == 0 and cpOrNots[index] == 1:
|
|
try:
|
|
if splitsentences_tags[index + 1][0] == 'VVFIN':
|
|
pairs.append([index, index + 1])
|
|
done = 1
|
|
except:
|
|
pass
|
|
try:
|
|
if done == 0 and rcOrNots[index - 1] == 0:
|
|
pairs.append([index, index - 1])
|
|
done = 1
|
|
except:
|
|
pass
|
|
try:
|
|
if done == 0 and rcOrNots[index - 1] == 1:
|
|
if rcOrNots[index - 2] == 0:
|
|
pairs.append([index, index - 2])
|
|
except:
|
|
pass
|
|
|
|
# Emit "partner , clause" for every pair of which at least one clause is not used yet.
for pair in pairs[::-1]:
|
|
if pair[0] not in set(sentencesThatAreOut) or pair[1] not in set(sentencesThatAreOut):
|
|
outputsentence.append(splitsentences[pair[1]] + [','] + splitsentences[pair[0]])
|
|
#print('hnhn',sentences)
|
|
# The entry's word list is replaced by the list of generated sentences.
sentences[generalindex][0] = outputsentence
|
|
|
|
#print('outputsentence hntuple',outputsentence)
|
|
#outputsentences.append([outputsentence , i])
|
|
|
|
#print('Oio', outputsentences)
|
|
#print(sentencesThatAreOutoutput)
|
|
#print(splitsentences)
|
|
#print('oioioioioioioio',sentences)
|
|
|
|
#print(sentences[0][0])
|
|
|
|
|
|
#print('oioi',sentences[n])
|
|
#print('malatesta', sentences[n][0][0])
|
|
#print('generalindex sentences index 0', sentences[generalindex][0])
|
|
# If the entry still holds a flat list of words, wrap it so it looks like one generated sentence.
# An empty list makes the [0] lookup fail; the bare except then leaves the entry unchanged.
try:
|
|
if type(sentences[generalindex][0][0]) == str:
|
|
sentences[generalindex][0] = [sentences[generalindex][0]]
|
|
except:
|
|
pass
|
|
#print('generalindex sentences index 0', sentences[generalindex][0])
|
|
#print('oldsentence', oldsentence)
|
|
newgeneratedsentences = len(sentences[generalindex][0])
|
|
# Several generated sentences: add each one and duplicate the punctuation mark accordingly
# (one copy is inserted per sentence, then one is removed again).
if newgeneratedsentences > 1:
|
|
#print('goti t')
|
|
for sentence in sentences[generalindex][0]:
|
|
punctuations.insert(generalindex, punctuations[generalindex])
|
|
outsentences.append(sentence)
|
|
del punctuations[generalindex]
|
|
# Exactly one generated sentence: use it if it has more than one word, else the original words.
if newgeneratedsentences == 1:
|
|
if len(sentences[generalindex][0][0]) > 1:
|
|
outsentences.append(sentences[generalindex][0][0])
|
|
else:
|
|
outsentences.append(oldsentence)
|
|
# Nothing generated: keep the original words.
if newgeneratedsentences == 0:
|
|
#print('case oldsentence', oldsentence)
|
|
outsentences.append(oldsentence)
|
|
#print('oioi', sentences[n])
|
|
# attach stand-alone commas to the preceding word
|
|
#print('theoutsentences', outsentences)
|
|
for outsentence in outsentences:
|
|
todelete = []
|
|
for n in range(len(outsentence)):
|
|
if outsentence[n] == ',':
|
|
todelete.append(n)
|
|
outsentence[n-1] = outsentence[n-1] + ','
|
|
for deleteindex in todelete[::-1]:
|
|
del outsentence[deleteindex]
|
|
|
|
# Wrap every word list in a one-element list (the shape returned to the caller).
for index in range(len(outsentences)):
|
|
outsentences[index] = [outsentences[index]]
|
|
#print('theoutsentences', outsentences)
|
|
|
|
# remove duplicates: a sentence counts as a duplicate when each of its words (as is, or without
# its last character) occurs in one other output sentence
|
|
doubledsentences = []
|
|
for o in range(len(outsentences)):
|
|
sentence = outsentences[o][0]
|
|
for m in range(len(outsentences)):
|
|
if m != o:
|
|
count = 0
|
|
for n in range(len(sentence)):
|
|
if sentence[n] in outsentences[m][0] or sentence[n][:-1] in outsentences[m][0]:
|
|
count += 1
|
|
if count == len(sentence):
|
|
doubledsentences.append(sentence)
|
|
punctdeleteindex = []
|
|
# De-duplicate the collected duplicates via a set of tuples.
tmp = set()
|
|
for sentence in doubledsentences:
|
|
tmp.add(tuple(sentence))
|
|
#print(list(tmp))
|
|
doubledsentences = []
|
|
for tup in tmp:
|
|
doubledsentences.append([list(tup)])
|
|
#print('doubledsentences',doubledsentences)
|
|
# Delete the last occurrence of every duplicate and remember its offset from the end.
punctdeleteindexes = []
|
|
for double in doubledsentences:
|
|
if double in outsentences:
|
|
punctdeleteindex = outsentences[::-1].index(double)
|
|
del outsentences[len(outsentences) - 1 - punctdeleteindex]
|
|
punctdeleteindexes.append(punctdeleteindex)
|
|
|
|
# Remove punctuation marks at positions derived from the remembered offsets.
# NOTE(review): len(outsentences) is evaluated after the deletions above — confirm the positions line up.
for index in punctdeleteindexes[::-1]:
|
|
del punctuations[len(outsentences) - 1 - index]
|
|
|
|
#print('oldsplit',oldsplitsentences)
|
|
#print('outsents',outsentences)
|
|
|
|
# Coverage check: a clause of oldsplitsentences none of whose ... words are all found in a single
# output sentence is inserted again (oldsplitsentences holds the clauses of the last sentence
# that went through the comma branch above).
for o in range(len(oldsplitsentences)):
|
|
for m in range(len(outsentences)):
|
|
counter = 0
|
|
for n in range(len(oldsplitsentences[o])):
|
|
if oldsplitsentences[o][n] in outsentences[m][0] or oldsplitsentences[o][n] + ',' in outsentences[m][0]:
|
|
counter += 1
|
|
# All words of the clause found in this output sentence: the clause is covered.
if counter >= len(oldsplitsentences[o]):
|
|
break
|
|
# Last output sentence reached without covering the clause.
if m == len(outsentences) - 1 and counter < len(oldsplitsentences[o]):
|
|
# The first clause goes to the front ...
if o == 0:
|
|
outsentences.insert(0,[oldsplitsentences[o]])
|
|
punctuations.insert(0, punctuations[0])
|
|
# ... any other clause goes after every output sentence whose last three words equal the
# last three words of the preceding clause.
else:
|
|
newones = []
|
|
for i in range(len(outsentences)):
|
|
if outsentences[i][0][-1] == oldsplitsentences[o - 1][-1]:
|
|
if len(outsentences[i][0]) > 2 and len(oldsplitsentences[o - 1]) > 2:
|
|
if outsentences[i][0][-2] == oldsplitsentences[o - 1][-2]:
|
|
if outsentences[i][0][-3] == oldsplitsentences[o - 1][-3]:
|
|
newones.append([i + 1, [oldsplitsentences[o]]])
|
|
for newone in newones[::-1]:
|
|
#print(newones)
|
|
outsentences.insert(newone[0], newone[1])
|
|
punctuations.insert(newone[0], punctuations[newone[0] - 1])
|
|
|
|
|
|
|
|
|
|
|
|
#print('outsentences at the very end ', outsentences, punctuations)
|
|
return outsentences, punctuations
|
|
|
|
|
|
# Note: the input here must always consist of main-clause/subordinate-clause pairs, i.e. an additional upstream class is required.
|
|
|
|
def SplitCommatas(self, Inputsentences, punctuations):
|
|
|
|
gs = self.gs
|
|
|
|
nlp = self.nlp
|
|
|
|
gramcorr_splitsentences = []
|
|
counter = 0
|
|
newpunctuationsindex = []
|
|
for Inputsentence in Inputsentences:
|
|
|
|
counter += 1
|
|
|
|
|
|
commainfo = Inputsentence[1]
|
|
|
|
|
|
token = commainfo[1]
|
|
|
|
commaornot = commainfo[0]
|
|
|
|
numberutterances = Inputsentence[2]
|
|
|
|
|
|
if commaornot == 0:
|
|
gramcorr_splitsentences.append(Inputsentence[0])
|
|
|
|
if commaornot > 1:
|
|
gramcorr_splitsentences.append(Inputsentence[0])
|
|
|
|
if commaornot == 1:
|
|
oldsentence = Inputsentence[0]
|
|
Inputsentence = [[Inputsentence[0]]]
|
|
|
|
|
|
|
|
|
|
for sentence in Inputsentence[0]:
|
|
|
|
splitsentence = []
|
|
|
|
splitsentences = []
|
|
|
|
|
|
|
|
|
|
processed = 0
|
|
wasNotInAnyList = 0
|
|
try:
|
|
for n in range(len(token)):
|
|
|
|
if token[n] in self.final_list:
|
|
splitsentence = []
|
|
for word in sentence:
|
|
|
|
if word != token[n]:
|
|
|
|
if word[-1] == ',':
|
|
splitsentence.append(word[:-1])
|
|
|
|
|
|
if word[-1] != ',':
|
|
splitsentence.append(word)
|
|
|
|
if word[-1] == ',' or word == ',':
|
|
|
|
splitsentences.append(splitsentence)
|
|
|
|
splitsentence = []
|
|
|
|
splitsentences.append(splitsentence)
|
|
|
|
if n == 1:
|
|
|
|
|
|
if token[n] == 'um' or token[n] == 'Um':
|
|
|
|
splitsentences[n].insert(0,'dies')
|
|
splitsentences[n].insert(0,'um')
|
|
else:
|
|
splitsentences[n].insert(0,'dann')
|
|
|
|
|
|
|
|
if n == 0:
|
|
|
|
if token[n] == 'um' or token[n] == 'Um':
|
|
splitsentences[n].insert(0,'dies')
|
|
splitsentences[n].insert(0,'um')
|
|
splitsentences = splitsentences[::-1]
|
|
else:
|
|
splitsentences[n].insert(0,'dann')
|
|
|
|
splitsentences[0], splitsentences[1] = splitsentences[1] , splitsentences[0]
|
|
|
|
generalrules = [['ADV','VAFIN'], ['ADV', 'VVFIN']]
|
|
processed = 1
|
|
|
|
if token[n] in self.adversativ_list:
|
|
splitsentence = []
|
|
for word in sentence:
|
|
|
|
if word != token[n]:
|
|
|
|
if word[-1] == ',':
|
|
splitsentence.append(word[:-1])
|
|
if word == ',':
|
|
pass
|
|
if word[-1] != ',':
|
|
splitsentence.append(word)
|
|
|
|
if word[-1] == ',' or word == ',':
|
|
|
|
splitsentences.append(splitsentence)
|
|
|
|
splitsentence = []
|
|
|
|
splitsentences.append(splitsentence)
|
|
|
|
splitsentences[n].append('jedoch')
|
|
|
|
|
|
generalrules = [['ADV','VAFIN'], ['ADV', 'VVFIN']]
|
|
processed = 1
|
|
|
|
if token[n] in self.kausal_list:
|
|
splitsentence = []
|
|
for word in sentence:
|
|
|
|
if word != token[n]:
|
|
|
|
if word[-1] == ',':
|
|
splitsentence.append(word[:-1])
|
|
if word == ',':
|
|
pass
|
|
if word[-1] != ',':
|
|
splitsentence.append(word)
|
|
|
|
if word[-1] == ',' or word == ',':
|
|
|
|
splitsentences.append(splitsentence)
|
|
|
|
splitsentence = []
|
|
|
|
splitsentences.append(splitsentence)
|
|
|
|
# Da deswegen an den anderen Satz gehaengt wird, muss der input zu commasentences immer ZWEI sentences sein.
|
|
#print('splitsentences in kausal', splitsentences)
|
|
if n == 1:
|
|
splitsentences[n - 1].insert(0,'deswegen')
|
|
splitsentences[0], splitsentences[1] = splitsentences[1] , splitsentences[0]
|
|
|
|
|
|
|
|
if n == 0:
|
|
splitsentences[n + 1].insert(0,'deswegen')
|
|
|
|
|
|
|
|
|
|
#print('splitsentences in kausal', splitsentences)
|
|
|
|
|
|
generalrules = [['PROAV','VAFIN'], ['PROAV', 'VVFIN']]
|
|
processed = 1
|
|
|
|
# from here come konsekutiv sentences, they have to be split according https://www.deutschplus.net/pages/Konsekutivsatz
|
|
if token[n] in self.konsekutiv_list:
|
|
#print('oi konsekutiv')
|
|
splitsentence = []
|
|
for word in sentence:
|
|
|
|
if word != token[n]:
|
|
|
|
if word[-1] == ',':
|
|
splitsentence.append(word[:-1])
|
|
if word == ',':
|
|
pass
|
|
if word[-1] != ',':
|
|
splitsentence.append(word)
|
|
|
|
if word[-1] == ',' or word == ',':
|
|
|
|
splitsentences.append(splitsentence)
|
|
|
|
splitsentence = []
|
|
|
|
splitsentences.append(splitsentence)
|
|
|
|
generalrules = [['KOUS','PPER']]
|
|
processed = 1
|
|
|
|
|
|
if token[n] in self.konditional_list:
|
|
splitsentence = []
|
|
for word in sentence:
|
|
|
|
|
|
if word[-1] == ',':
|
|
splitsentence.append(word[:-1])
|
|
if word == ',':
|
|
pass
|
|
if word[-1] != ',':
|
|
splitsentence.append(word)
|
|
|
|
if word[-1] == ',' or word == ',':
|
|
|
|
splitsentences.append(splitsentence)
|
|
|
|
splitsentence = []
|
|
|
|
splitsentences.append(splitsentence)
|
|
|
|
|
|
if n == 1:
|
|
|
|
spoCount = gs.checkSPO(splitsentences[n], 0)
|
|
|
|
spoCount = sum(spoCount)
|
|
|
|
if spoCount == 2:
|
|
thereisanes = 0
|
|
for word in splitsentences[n]:
|
|
if word == 'es' or word == 'Es':
|
|
thereisanes = 1
|
|
if thereisanes == 0:
|
|
splitsentences[n].append('es')
|
|
|
|
|
|
if n == 0:
|
|
|
|
|
|
spoCount = gs.checkSPO(splitsentences[n], 0)
|
|
|
|
spoCount = sum(spoCount)
|
|
|
|
if spoCount == 2:
|
|
|
|
thereisanes = 0
|
|
for word in splitsentences[n]:
|
|
if word == 'es' or word == 'Es':
|
|
thereisanes = 1
|
|
if thereisanes == 0:
|
|
splitsentences[n].append('es')
|
|
|
|
splitsentences[0], splitsentences[1] = splitsentences[1] , splitsentences[0]
|
|
|
|
generalrules = [['KOUS','PPER']]
|
|
processed = 1
|
|
|
|
if token[n] in self.konzessiv_list:
|
|
splitsentence = []
|
|
for word in sentence:
|
|
|
|
if word != token[n]:
|
|
|
|
if word[-1] == ',':
|
|
splitsentence.append(word[:-1])
|
|
if word == ',':
|
|
pass
|
|
if word[-1] != ',':
|
|
splitsentence.append(word)
|
|
|
|
if word[-1] == ',' or word == ',':
|
|
|
|
splitsentences.append(splitsentence)
|
|
|
|
splitsentence = []
|
|
|
|
splitsentences.append(splitsentence)
|
|
|
|
|
|
if n == 1:
|
|
splitsentences[n - 1].insert(0,'trotzdem')
|
|
splitsentences[0], splitsentences[1] = splitsentences[1] , splitsentences[0]
|
|
|
|
|
|
|
|
if n == 0:
|
|
splitsentences[n + 1].insert(0,'trotzdem')
|
|
|
|
|
|
generalrules = [['PROAV','VAFIN'], ['PROAV', 'VVFIN']]
|
|
processed = 1
|
|
|
|
if token[n] in self.lokal_list:
|
|
#print('lokal ole ole ')
|
|
splitsentence = []
|
|
for word in sentence:
|
|
|
|
if word != token[n]:
|
|
|
|
if word[-1] == ',':
|
|
splitsentence.append(word[:-1])
|
|
if word == ',':
|
|
pass
|
|
if word[-1] != ',':
|
|
splitsentence.append(word)
|
|
|
|
if word[-1] == ',' or word == ',':
|
|
|
|
splitsentences.append(splitsentence)
|
|
|
|
splitsentence = []
|
|
|
|
splitsentences.append(splitsentence)
|
|
|
|
|
|
if n == 1:
|
|
splitsentences[n - 1].insert(0,'dort')
|
|
splitsentences[0], splitsentences[1] = splitsentences[1] , splitsentences[0]
|
|
|
|
|
|
|
|
if n == 0:
|
|
splitsentences[n + 1].insert(0,'dort')
|
|
|
|
|
|
generalrules = [['PROAV','VAFIN'], ['PROAV', 'VVFIN']]
|
|
processed = 1
|
|
|
|
if token[n] in self.instrumental_list:
|
|
splitsentence = []
|
|
for word in sentence:
|
|
|
|
if word != token[n]:
|
|
|
|
if word[-1] == ',':
|
|
splitsentence.append(word[:-1])
|
|
if word == ',':
|
|
pass
|
|
if word[-1] != ',':
|
|
splitsentence.append(word)
|
|
|
|
if word[-1] == ',' or word == ',':
|
|
|
|
splitsentences.append(splitsentence)
|
|
|
|
splitsentence = []
|
|
|
|
splitsentences.append(splitsentence)
|
|
|
|
|
|
if n == 1:
|
|
splitsentences[n - 1].insert(0,'so')
|
|
splitsentences[0], splitsentences[1] = splitsentences[1] , splitsentences[0]
|
|
|
|
|
|
|
|
if n == 0:
|
|
splitsentences[n + 1].insert(0,'so')
|
|
|
|
|
|
generalrules = [['ADV','VAFIN'], ['ADV', 'VVFIN']]
|
|
processed = 1
|
|
|
|
if token[n] in self.temporal_list_vor:
|
|
splitsentence = []
|
|
for word in sentence:
|
|
|
|
if word != token[n]:
|
|
|
|
if word[-1] == ',':
|
|
splitsentence.append(word[:-1])
|
|
if word == ',':
|
|
pass
|
|
if word[-1] != ',':
|
|
splitsentence.append(word)
|
|
|
|
if word[-1] == ',' or word == ',':
|
|
|
|
splitsentences.append(splitsentence)
|
|
|
|
splitsentence = []
|
|
|
|
splitsentences.append(splitsentence)
|
|
|
|
|
|
if n == 1:
|
|
splitsentences[n].insert(0,'danach')
|
|
|
|
|
|
|
|
|
|
if n == 0:
|
|
splitsentences[n].insert(0,'danach')
|
|
splitsentences[0], splitsentences[1] = splitsentences[1] , splitsentences[0]
|
|
|
|
generalrules = [['PROAV','VAFIN'], ['PROAV', 'VVFIN']]
|
|
processed = 1
|
|
|
|
if token[n] in self.temporal_list_nach:
|
|
splitsentence = []
|
|
for word in sentence:
|
|
|
|
if word != token[n]:
|
|
|
|
if word[-1] == ',':
|
|
splitsentence.append(word[:-1])
|
|
if word == ',':
|
|
pass
|
|
if word[-1] != ',':
|
|
splitsentence.append(word)
|
|
|
|
if word[-1] == ',' or word == ',':
|
|
|
|
splitsentences.append(splitsentence)
|
|
|
|
splitsentence = []
|
|
|
|
splitsentences.append(splitsentence)
|
|
|
|
|
|
if n == 1:
|
|
splitsentences[n].insert(0,'davor')
|
|
|
|
|
|
|
|
|
|
if n == 0:
|
|
splitsentences[n].insert(0,'davor')
|
|
splitsentences[0], splitsentences[1] = splitsentences[1] , splitsentences[0]
|
|
|
|
generalrules = [['PROAV','VAFIN'], ['PROAV', 'VVFIN']]
|
|
processed = 1
|
|
|
|
#print(token[n])
|
|
if token[n] == 'der' or token[n] == 'welcher':
|
|
|
|
tokens = self.nlp(' '.join(sentence))
|
|
for word in tokens:
|
|
if word.dep_ == 'rc':
|
|
wordwithrc = word.text
|
|
|
|
rcORnot = gs.checkForAnnotation(sentence, 'rc', 'word.dep_')
|
|
|
|
oldsplitsentences = splitsentences
|
|
splitsentences = []
|
|
|
|
if rcORnot == 1:
|
|
splitsentence = []
|
|
for word in sentence:
|
|
|
|
|
|
|
|
if word[-1] == ',':
|
|
splitsentence.append(word[:-1])
|
|
if word == ',':
|
|
pass
|
|
if word[-1] != ',':
|
|
splitsentence.append(word)
|
|
|
|
if word[-1] == ',' or word == ',':
|
|
|
|
splitsentences.append(splitsentence)
|
|
|
|
splitsentence = []
|
|
|
|
splitsentences.append(splitsentence)
|
|
|
|
# das umtauschen wird hier vollzogen, da ansonsten spacy dieser nicht als PDS einliest.. analog in den anderen.
|
|
|
|
if wordwithrc in splitsentences[n]:
|
|
|
|
splitsentences[n][0] = 'dieser'
|
|
|
|
verb = splitsentences[n][-1]
|
|
|
|
splitsentences[n] = splitsentences[n][:-1]
|
|
splitsentences[n].insert(1, verb)
|
|
|
|
|
|
#print('Vorsicht', splitsentences)
|
|
|
|
generalrules = [['PDS','VAFIN'], ['PDS', 'VVFIN']]
|
|
processed = 1
|
|
else:
|
|
splitsentences = oldsplitsentences
|
|
splitsentence = []
|
|
|
|
if token[n] == 'die' or token[n] == 'welche':
|
|
|
|
|
|
tokens = self.nlp(' '.join(sentence))
|
|
for word in tokens:
|
|
if word.dep_ == 'rc':
|
|
wordwithrc = word.text
|
|
|
|
|
|
|
|
rcORnot = gs.checkForAnnotation(sentence, 'rc', 'word.dep_')
|
|
|
|
|
|
oldsplitsentences = splitsentences
|
|
splitsentences = []
|
|
|
|
|
|
if rcORnot == 1:
|
|
#print('it went to rcornot in case die')
|
|
|
|
|
|
splitsentence = []
|
|
for word in sentence:
|
|
|
|
|
|
|
|
if word[-1] == ',':
|
|
splitsentence.append(word[:-1])
|
|
if word == ',':
|
|
pass
|
|
if word[-1] != ',':
|
|
splitsentence.append(word)
|
|
|
|
if word[-1] == ',' or word == ',':
|
|
|
|
splitsentences.append(splitsentence)
|
|
|
|
splitsentence = []
|
|
|
|
splitsentences.append(splitsentence)
|
|
|
|
if wordwithrc in splitsentences[n]:
|
|
#print('wordwithrc was in sentence')
|
|
#print(wordwithrc)
|
|
#print(splitsentences[n])
|
|
#print('wordwithrcend')
|
|
splitsentences[n][0] = 'diese'
|
|
|
|
verb = splitsentences[n][-1]
|
|
|
|
splitsentences[n] = splitsentences[n][:-1]
|
|
splitsentences[n].insert(1, verb)
|
|
|
|
|
|
|
|
generalrules = [['PDS','VAFIN'], ['PDS', 'VVFIN']]
|
|
processed = 1
|
|
else:
|
|
|
|
splitsentences = oldsplitsentences
|
|
splitsentence = []
|
|
|
|
if token[n] == 'dem':
|
|
|
|
tokens = self.nlp(' '.join(sentence))
|
|
for word in tokens:
|
|
if word.dep_ == 'rc':
|
|
wordwithrc = word.text
|
|
|
|
rcORnot = gs.checkForAnnotation(sentence, 'rc', 'word.dep_')
|
|
|
|
oldsplitsentences = splitsentences
|
|
splitsentences = []
|
|
|
|
if rcORnot == 1:
|
|
splitsentence = []
|
|
for word in sentence:
|
|
|
|
|
|
|
|
if word[-1] == ',':
|
|
splitsentence.append(word[:-1])
|
|
if word == ',':
|
|
pass
|
|
if word[-1] != ',' and word[-1] != '.':
|
|
splitsentence.append(word)
|
|
|
|
if word[-1] == ',':
|
|
|
|
splitsentences.append(splitsentence)
|
|
|
|
splitsentence = []
|
|
|
|
splitsentences.append(splitsentence)
|
|
|
|
|
|
if wordwithrc in splitsentences[n]:
|
|
|
|
splitsentences[n][0] = 'diesem'
|
|
|
|
verb = splitsentences[n][-1]
|
|
|
|
splitsentences[n] = splitsentences[n][:-1]
|
|
splitsentences[n].insert(1, verb)
|
|
|
|
|
|
|
|
generalrules = [['PDS','VAFIN'], ['PDS', 'VVFIN']]
|
|
processed = 1
|
|
else:
|
|
splitsentences = oldsplitsentences
|
|
splitsentence = []
|
|
|
|
if token[n] == 'das' or token[n] == 'welches':
|
|
|
|
tokens = self.nlp(' '.join(sentence))
|
|
for word in tokens:
|
|
if word.dep_ == 'rc':
|
|
wordwithrc = word.text
|
|
|
|
|
|
rcORnot = gs.checkForAnnotation(sentence, 'rc', 'word.dep_')
|
|
|
|
#print('Oeeee',rcORnot)
|
|
oldsplitsentences = splitsentences
|
|
splitsentences = []
|
|
if rcORnot == 1:
|
|
splitsentence = []
|
|
for word in sentence:
|
|
|
|
|
|
|
|
if word[-1] == ',':
|
|
splitsentence.append(word[:-1])
|
|
if word == ',':
|
|
pass
|
|
if word[-1] != ',':
|
|
splitsentence.append(word)
|
|
|
|
if word[-1] == ',' or word == ',':
|
|
|
|
splitsentences.append(splitsentence)
|
|
|
|
splitsentence = []
|
|
|
|
splitsentences.append(splitsentence)
|
|
#print('splitsentence in das rc', splitsentences)
|
|
if wordwithrc in splitsentences[n]:
|
|
|
|
splitsentences[n][0] = 'dieses'
|
|
|
|
verb = splitsentences[n][-1]
|
|
#print('verb',verb)
|
|
splitsentences[n] = splitsentences[n][:-1]
|
|
splitsentences[n].insert(1, verb)
|
|
|
|
generalrules = [['PDS','VAFIN'], ['PDS', 'VVFIN']]
|
|
processed = 1
|
|
else:
|
|
splitsentences = oldsplitsentences
|
|
splitsentence = []
|
|
|
|
if token[n] == 'dessen' or token[n] == 'wessen':
|
|
|
|
tokens = self.nlp(' '.join(sentence))
|
|
for word in tokens:
|
|
if word.dep_ == 'rc':
|
|
wordwithrc = word.text
|
|
|
|
|
|
rcORnot = gs.checkForAnnotation(sentence, 'rc', 'word.dep_')
|
|
|
|
oldsplitsentences = splitsentences
|
|
splitsentences = []
|
|
|
|
if rcORnot == 1:
|
|
splitsentence = []
|
|
for word in sentence:
|
|
|
|
|
|
|
|
|
|
if word[-1] == ',':
|
|
splitsentence.append(word[:-1])
|
|
if word == ',':
|
|
pass
|
|
if word[-1] != ',':
|
|
splitsentence.append(word)
|
|
|
|
if word[-1] == ',' or word == ',':
|
|
|
|
splitsentences.append(splitsentence)
|
|
|
|
splitsentence = []
|
|
|
|
splitsentences.append(splitsentence)
|
|
|
|
if wordwithrc in splitsentences[n]:
|
|
verb = splitsentences[n][-1]
|
|
|
|
splitsentences[n] = splitsentences[n][:-1]
|
|
splitsentences[n].insert(1, verb)
|
|
|
|
|
|
|
|
generalrules = [['PDS','VAFIN'], ['PDS', 'VVFIN']]
|
|
processed = 1
|
|
else:
|
|
splitsentences = oldsplitsentences
|
|
splitsentence = []
|
|
|
|
if token[n] == 'den' or token[n] == 'welchen':
|
|
|
|
tokens = self.nlp(' '.join(sentence))
|
|
for word in tokens:
|
|
if word.dep_ == 'rc':
|
|
wordwithrc = word.text
|
|
|
|
|
|
rcORnot = gs.checkForAnnotation(sentence, 'rc', 'word.dep_')
|
|
|
|
oldsplitsentences = splitsentences
|
|
splitsentences = []
|
|
|
|
if rcORnot == 1:
|
|
splitsentence = []
|
|
for word in sentence:
|
|
|
|
|
|
|
|
if word[-1] == ',':
|
|
splitsentence.append(word[:-1])
|
|
if word == ',':
|
|
pass
|
|
if word[-1] != ',':
|
|
splitsentence.append(word)
|
|
|
|
if word[-1] == ',' or word == ',':
|
|
|
|
splitsentences.append(splitsentence)
|
|
|
|
splitsentence = []
|
|
|
|
splitsentences.append(splitsentence)
|
|
|
|
|
|
if wordwithrc in splitsentences[n]:
|
|
|
|
splitsentences[n][0] = 'diesen'
|
|
|
|
verb = splitsentences[n][-1]
|
|
|
|
splitsentences[n] = splitsentences[n][:-1]
|
|
splitsentences[n].insert(1, verb)
|
|
|
|
|
|
generalrules = [['PDS','VAFIN'], ['PDS', 'VVFIN']]
|
|
processed = 1
|
|
else:
|
|
splitsentences = oldsplitsentences
|
|
splitsentence = []
|
|
|
|
|
|
if token[n] == 'wem' or token[n] == 'Wem' or token[n] == 'welchem':
|
|
|
|
daORnot = gs.checkForAnnotation(sentence, 'da', 'word.dep_')
|
|
|
|
oaORnot = gs.checkForAnnotation(sentence, 'oa', 'word.dep_')
|
|
|
|
reORnot = gs.checkForAnnotation(sentence, 're', 'word.dep_')
|
|
|
|
oldsplitsentences = splitsentences
|
|
splitsentences = []
|
|
|
|
for word in sentence:
|
|
|
|
|
|
|
|
if word[-1] == ',':
|
|
splitsentence.append(word[:-1])
|
|
if word == ',':
|
|
pass
|
|
if word[-1] != ',':
|
|
splitsentence.append(word)
|
|
|
|
if word[-1] == ',' or word == ',':
|
|
|
|
splitsentences.append(splitsentence)
|
|
|
|
splitsentence = []
|
|
|
|
splitsentences.append(splitsentence)
|
|
|
|
|
|
if n == 0:
|
|
index = 1
|
|
if n == 1:
|
|
index = 0
|
|
|
|
if reORnot == 1:
|
|
pass
|
|
if daORnot == 1 and reORnot == 0:
|
|
splitsentences[index].insert(1, 'das')
|
|
|
|
if oaORnot == 1 and reORnot == 0:
|
|
splitsentences[index].insert(1, 'dem')
|
|
|
|
if n == 1:
|
|
splitsentences[0], splitsentences[1] = splitsentences[1] , splitsentences[0]
|
|
|
|
|
|
generalrules = [['PDS','VAFIN'], ['PDS', 'VVFIN']]
|
|
processed = 1
|
|
|
|
if token[n] in self.indirectspeech_list and token[1] not in self.konsekutiv_list:
|
|
|
|
|
|
reORnot = gs.checkForAnnotation(sentence, 're', 'word.dep_')
|
|
oldsplitsentences = splitsentences
|
|
splitsentences = []
|
|
splitsentence = []
|
|
for word in sentence:
|
|
|
|
|
|
if word[-1] == ',':
|
|
splitsentence.append(word[:-1])
|
|
if word == ',':
|
|
pass
|
|
if word[-1] != ',':
|
|
splitsentence.append(word)
|
|
|
|
if word[-1] == ',' or word == ',':
|
|
|
|
splitsentences.append(splitsentence)
|
|
|
|
splitsentence = []
|
|
|
|
splitsentences.append(splitsentence)
|
|
|
|
|
|
if n == 0:
|
|
index = 1
|
|
if n == 1:
|
|
index = 0
|
|
|
|
if reORnot == 0:
|
|
if splitsentences[index][0] != 'was':
|
|
splitsentences[index].insert(1, 'das')
|
|
|
|
if n == 1:
|
|
splitsentences[0], splitsentences[1] = splitsentences[1] , splitsentences[0]
|
|
|
|
|
|
generalrules = [['PDS','VAFIN'], ['PDS', 'VVFIN']]
|
|
processed = 1
|
|
|
|
if processed == 0 and n == 1:
|
|
|
|
ZUVINFTupelORnot = gs.checkForAnnotationTuple(sentence, ['PTKZU', 'VVINF'], 'word.tag_', 'None')
|
|
if ZUVINFTupelORnot == 0:
|
|
ZUVINFTupelORnot = gs.checkForAnnotationTuple(sentence, ['PTKZU', 'VAINF'], 'word.tag_', 'None')
|
|
|
|
if ZUVINFTupelORnot == 1:
|
|
|
|
reORnot = gs.checkForAnnotation(sentence, 're', 'word.dep_')
|
|
splitsentence = []
|
|
for word in sentence:
|
|
|
|
|
|
if word[-1] == ',':
|
|
splitsentence.append(word[:-1])
|
|
if word == ',':
|
|
pass
|
|
if word[-1] != ',' :
|
|
splitsentence.append(word)
|
|
|
|
if word[-1] == ',' or word == ',':
|
|
|
|
splitsentences.append(splitsentence)
|
|
processed = 1
|
|
splitsentence = []
|
|
|
|
splitsentences.append(splitsentence)
|
|
|
|
for m in range(2):
|
|
ZUINForNOT = gs.checkForAnnotationTuple(splitsentences[m], ['PTKZU', 'VVINF'], 'word.tag_','None')
|
|
|
|
if ZUINForNOT == 0:
|
|
ZUINForNOT = gs.checkForAnnotationTuple(splitsentences[m], ['PTKZU', 'VAINF'], 'word.tag_','None')
|
|
|
|
|
|
if ZUINForNOT == 1:
|
|
r = m
|
|
ZUINForNOT = 0
|
|
|
|
|
|
|
|
if r == 0:
|
|
index = 1
|
|
if r == 1:
|
|
index = 0
|
|
|
|
objectORnot = gs.checkForAnnotation(splitsentences[index] , 'oa', 'word.dep_')
|
|
|
|
if reORnot == 0 and objectORnot == 0:
|
|
splitsentences[index].insert(1, 'das')
|
|
|
|
if r == 1:
|
|
splitsentences[0], splitsentences[1] = splitsentences[1] , splitsentences[0]
|
|
else:
|
|
processed == 2
|
|
|
|
|
|
except:
|
|
wasNotInAnyList = 1
|
|
|
|
|
|
#rules = [['ART','ADJA','NN'], ['ART','ADJA','NE'], ['ART', 'NN'], ['ART', 'NE'], ['APPR','NN'], ['APPR','NE'], ['APPR', 'ART', 'NN'], ['APPR', 'ART', 'NE'], ['APPR','ART','NN','ADJA','NN'], ['APPR','ART','NN','ADJA','NE'], ['KOKOM', 'ART', 'NN'], ['KOKOM', 'ART', 'NE'], ['PPOSAT', 'NN'], ['PPOSAT', 'NE'], ['ADV', 'ADJD']]
|
|
|
|
#print('B',splitsentences)
|
|
endsentences = []
|
|
if (processed == 2 or processed == 0) and n == 1:
|
|
wasNotInAnyList = 1
|
|
|
|
|
|
try:
|
|
if wasNotInAnyList == 0:
|
|
newpunctuationsindex.insert(0,[counter-1,punctuations[counter-1]])
|
|
#print('splitsentencee', splitsentences)
|
|
if len(splitsentences) > 2:
|
|
splitsentences = splitsentences[:2]
|
|
|
|
#print('splitsentenceeeees', splitsentences)
|
|
|
|
for splitsentence in splitsentences:
|
|
|
|
#print('splitsentenceeeeeeeeeeee!!',splitsentence)
|
|
wordtoputfirst = 'nada'
|
|
for word in self.firstwordlist:
|
|
if word == splitsentence[0]:
|
|
wordtoputfirst = word
|
|
splitsentence.remove(word)
|
|
|
|
|
|
|
|
#print('get the tuples and triples to check..')
|
|
tuplesTocheck, triplesTocheck, quadruplesTocheck = self.gs.GetTuplesinSentence(splitsentence)
|
|
#print('done')
|
|
#print(tuplesTocheck, 'ole', triplesTocheck ,'aiai', quadruplesTocheck)
|
|
#print('1')
|
|
grammpiecessentence = self.gs.createTupleofGrammarpieces( splitsentence, tuplesTocheck, triplesTocheck, quadruplesTocheck)
|
|
|
|
#print('grammpiece',grammpiecessentence)
|
|
#print('2')
|
|
if len(grammpiecessentence) > 7:
|
|
print('A sentence is too long, too many permutations. \n piping wrong grammar..')
|
|
endsentence = ' '.join(grammpiecessentence)
|
|
|
|
else:
|
|
#print('genrating the permutations')
|
|
permutations = self.sgm.GeneratePermutationsOfSentence(grammpiecessentence)
|
|
#print('done')
|
|
#print(permutations)
|
|
#print('3')
|
|
firstwordwithverblist = ['deswegen', 'danach']
|
|
permutationstodelete = []
|
|
for permutation in permutations:
|
|
#print('4')
|
|
if permutation[0] in firstwordwithverblist:
|
|
#print('4.1')
|
|
count = 1
|
|
for word in self.nlp(permutation[1]):
|
|
#print('4.2')
|
|
if word.tag_[0] != 'V':
|
|
#print('4.3')
|
|
permutationstodelete.append(permutation)
|
|
break
|
|
else:
|
|
break
|
|
#for word in self.nlp(permutation[0]):
|
|
#print('4.2')
|
|
#if word.tag_[0] != 'V':
|
|
#print('4.3')
|
|
#permutationstodelete.append(permutation)
|
|
#break
|
|
#else:
|
|
#break
|
|
for delperm in permutationstodelete:
|
|
try:
|
|
permutations.remove(delperm)
|
|
except:
|
|
|
|
pass
|
|
#print('5')
|
|
|
|
sentencesToCheck = []
|
|
if wordtoputfirst in self.firstwordlist:
|
|
for sentence in permutations:
|
|
sentencesToCheck.append(wordtoputfirst + ' ' + ' '.join(sentence))
|
|
else:
|
|
for sentence in permutations:
|
|
sentencesToCheck.append(' '.join(sentence))
|
|
|
|
endsentence = self.sgm.GetBestSentenceFromSentencesAccordingToGrammar(sentencesToCheck, ' '.join(splitsentence))
|
|
#print('done')
|
|
#print('endsent',endsentence)
|
|
endsentences.append(endsentence)
|
|
except:
|
|
#print('there was an error')
|
|
wasNotInAnyList = 1
|
|
endsentences = []
|
|
todelete = []
|
|
for index in range(len(newpunctuationsindex)):
|
|
if newpunctuationsindex[index][0] == counter - 1:
|
|
todelete.append(index)
|
|
for todel in todelete[::-1]:
|
|
del newpunctuationsindex[todel]
|
|
|
|
|
|
if wasNotInAnyList == 1:
|
|
#print('was not in any list')
|
|
#print(oldsentence)
|
|
endsplisentences = []
|
|
splisentence = []
|
|
for word in oldsentence:
|
|
|
|
|
|
if word[-1] == ',':
|
|
splisentence.append(word[:-1])
|
|
|
|
if word == ',':
|
|
pass
|
|
if word[-1] != ',':
|
|
splisentence.append(word)
|
|
|
|
if word[-1] == ',' or word == ',':
|
|
|
|
endsplisentences.append(splisentence)
|
|
|
|
splisentence = []
|
|
|
|
endsplisentences.append(splisentence)
|
|
|
|
newpunctuationsindex.insert(0,[counter-1,punctuations[counter-1]])
|
|
|
|
#print('endsplisentences',endsplisentences)
|
|
for splsentence in endsplisentences:
|
|
|
|
endsentences.append(' '.join(splsentence))
|
|
|
|
|
|
|
|
|
|
|
|
'''
|
|
|
|
|
|
|
|
fsearch1 = self.fsearch1
|
|
spacyclass1 = 'word.tag_'
|
|
|
|
|
|
gs_sentence1 = gs.Sentence2GrammarSchema(' '.join(splitsentence), spacyclass1)
|
|
|
|
|
|
print('searchPatternMatch for tags')
|
|
bestmatches1 = fsearch1.searchPatternMatch(' '.join(gs_sentence1), 1)
|
|
print('done')
|
|
|
|
#print('oioi', bestmatches1)
|
|
|
|
#print(len(fsearch1.database))
|
|
right_gs_tupel1 = []
|
|
|
|
if len(bestmatches1) < 10:
|
|
bestndocs1 = len(bestmatches1)
|
|
else:
|
|
bestndocs1 = 10
|
|
|
|
for m in range(bestndocs1):
|
|
right_gs_tupel1.append(fsearch1.database[bestmatches1[m][0]])
|
|
|
|
|
|
statistically_correct_sentences1 = gs.Sentence2RightGrammarTupel(' '.join(splitsentence), gs_sentence1, right_gs_tupel1)
|
|
|
|
|
|
fsearch2 = self.fsearch2
|
|
|
|
spacyclass2 = 'word.dep_'
|
|
|
|
gs_sentence2 = gs.Sentence2GrammarSchema(' '.join(splitsentence), spacyclass2)
|
|
|
|
print('searchPatternMatch for deps')
|
|
bestmatches2 = fsearch2.searchPatternMatch(' '.join(gs_sentence2), 1)
|
|
print('done')
|
|
|
|
right_gs_tupel2 = []
|
|
|
|
|
|
if len(bestmatches2) < 10:
|
|
bestndocs2 = len(bestmatches2)
|
|
else:
|
|
bestndocs2 = 10
|
|
|
|
|
|
for m in range(bestndocs2):
|
|
right_gs_tupel2.append(fsearch2.database[bestmatches2[m][0]])
|
|
|
|
#print(' '.join(splitsentence))
|
|
|
|
statistically_correct_sentences2 = gs.Sentence2RightGrammarTupel(' '.join(splitsentence), gs_sentence2, right_gs_tupel2)
|
|
|
|
|
|
print(splitsentence)
|
|
|
|
|
|
Rightsentence = gs.GetBestgsAccordingRules(' '.join(splitsentence) , gs_sentence1, right_gs_tupel1, right_gs_tupel2, statistically_correct_sentences1, statistically_correct_sentences2, rules, generalrules)
|
|
|
|
|
|
'''
|
|
for endsentence in endsentences:
|
|
gramcorr_splitsentences.append(endsentence.split())
|
|
|
|
for index in newpunctuationsindex:
|
|
punctuations.insert(index[0], index[1])
|
|
|
|
return gramcorr_splitsentences, punctuations
|
|
|
|
|
|
|
|
def putAppendixesIntoOwnSentences(self, sentences, punctuations):
    """Move noun appendixes into sentences of their own.

    Every sentence is searched for the word sequences that
    ``self.gs.checkForAnnotationQuadruple`` reports for a fixed set of tag
    patterns (noun, preposition, article/name, noun).  When the leading noun
    is introduced by an article, a new sentence of the form
    ``<article> <noun> ist|hat die|hat eine <appendix>`` is built, the
    appendix is cut out of the original sentence and the new sentence is
    inserted directly behind it, together with a copy of its punctuation.

    NOTE(review): the indentation of the original block was lost; the nesting
    below is reconstructed from the control flow (``artword`` and
    ``longerWhatisnoun`` are initialised before the article search and read
    after it) -- confirm against the upstream file.

    Args:
        sentences: list of sentences, each a list of word strings.  It is
            modified in place.
        punctuations: list holding one punctuation entry per sentence.  It is
            modified in place.

    Returns:
        The tuple ``(sentences, punctuations)`` -- the same list objects that
        were passed in.  If rewriting fails half way, both lists are restored
        to their contents from before the rewrite.
    """
    gs = self.gs

    # NOTE(review): the first pattern is listed twice; the second entry was
    # possibly meant to start with 'NE' -- confirm.  Positions are handled
    # only once below, so the duplicate cannot produce duplicate output.
    quadruples = [
        ['NN', 'APPR', 'NE', 'NN'],
        ['NN', 'APPR', 'NE', 'NN'],
        ['NN', 'APPR', 'ART', 'NN'],
        ['NE', 'APPR', 'ART', 'NN'],
        ['NN', 'APPR', 'ART', 'NE'],
        ['NE', 'APPR', 'ART', 'NE'],
    ]
    prepositions = ('von', 'in', 'auf', 'ueber', 'unter', 'nach', 'mit')

    newsentences = []      # [insert position, list of words]
    newpunctuations = []   # [insert position, punctuation]
    Whatisofnouns = []     # [first index, end index, sentence index] to delete

    def followup_tagger():
        # The model used for the word behind the appendix is loaded once and
        # kept on the instance instead of being reloaded for every match.
        # NOTE(review): self.nlp may well be the same model; if so it can be
        # used here directly.
        tagger = getattr(self, '_appendix_tagger', None)
        if tagger is None:
            import spacy
            tagger = spacy.load('de_core_news_sm')
            self._appendix_tagger = tagger
        return tagger

    for hauptindex, sentence in enumerate(sentences):
        try:
            # Matches are collected per sentence; carrying them over to the
            # following sentences would apply them a second time there.
            matches = []
            for quadruple in quadruples:
                _, quadrupleInWords = gs.checkForAnnotationQuadruple(sentence, quadruple, 'word.tag_', 'None')
                matches.extend(quadrupleInWords)

            handled = set()
            for match in matches:
                for n in range(len(sentence) - 4):
                    if n in handled:
                        continue
                    if sentence[n] != match[0] or sentence[n + 1] != match[1] or sentence[n + 2] != match[2]:
                        continue
                    # The result depends only on the sentence and on n, so a
                    # position never has to be looked at twice.
                    handled.add(n)

                    artword = None
                    longerWhatisnoun = 0
                    Nounthatis = []
                    Whatisnoun = []
                    for m in range(2):
                        if n - m < 0:
                            # There is no word in front of sentence[0]; a
                            # negative index would wrap to the sentence end.
                            continue
                        for word in self.nlp(sentence[n - m]):
                            if word.tag_ != 'ART':
                                continue
                            Nounthatis = sentence[n - m:n + 1]
                            Whatisnoun = sentence[n + 1:n + 4]
                            # A noun directly behind the appendix is taken
                            # into the appendix as well.
                            for position, follower in enumerate(followup_tagger()(sentence[n + 4]), start=1):
                                if follower.tag_ == 'NN' or follower.tag_ == 'NE':
                                    if position == 1:
                                        Whatisnoun = sentence[n + 1:n + 5]
                                        longerWhatisnoun = 1
                                    if position == 2:
                                        Whatisnoun = sentence[n + 1:n + 4]
                            artword = word.text

                    die_article = artword in ('die', 'Die') and not sentence[n].endswith('n')
                    article_after_preposition = (
                        artword in ('der', 'einer', 'dieser')
                        and n >= 2
                        and sentence[n - 2] in prepositions
                    )
                    if not (die_article or article_after_preposition):
                        continue

                    if artword == 'der':
                        Nounthatis[0] = 'die'

                    if sentence[n + 1] == 'mit':
                        if sentence[n + 2] == 'den':
                            verb = ' hat die '
                        elif sentence[n + 2] == 'der':
                            verb = ' hat eine '
                        else:
                            # 'mit' followed by anything else is left alone.
                            continue
                        # Drop 'mit' and the article; the verb replaces them.
                        Whatisnoun = Whatisnoun[2:]
                    else:
                        verb = ' ist '

                    newsentence = ' '.join(Nounthatis) + verb + ' '.join(Whatisnoun)
                    punctuation = punctuations[hauptindex]
                    span_end = n + 4 if longerWhatisnoun == 0 else n + 5

                    newsentences.append([hauptindex + 1, newsentence.split()])
                    newpunctuations.append([hauptindex + 1, punctuation])
                    Whatisofnouns.append([n + 1, span_end, hauptindex])
        except Exception:
            print('Konnte nicht ' + str(sentence) + 'in Characterisierung pro Satz prozessieren..')

    # Real copies, so that a failure below can actually be rolled back.
    sentences_backup = [words[:] if isinstance(words, list) else words for words in sentences]
    punctuations_backup = list(punctuations)

    try:
        # Cut from back to front so the earlier spans of a sentence keep
        # their indexes.
        for start, end, sentenceindex in sorted(Whatisofnouns, key=lambda span: (span[2], span[0]), reverse=True):
            words = sentences[sentenceindex]
            if words[end - 1][-1] == ',':
                # The comma behind the appendix moves to the word before it.
                words[start - 1] = words[start - 1] + ','
            del words[start:end]

        # Insert from back to front so the collected positions stay valid.
        for position, words in newsentences[::-1]:
            sentences.insert(position, words)
        for position, punctuation in newpunctuations[::-1]:
            punctuations.insert(position, punctuation)

        for words in sentences:
            if words and words[-1].endswith(','):
                words[-1] = words[-1][:-1]
    except Exception:
        print('konnte nicht die gesammelten Characterisierungen prozessieren')
        sentences[:] = sentences_backup
        punctuations[:] = punctuations_backup

    return sentences, punctuations
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|