# split sentences
# The lists are still missing 'sondern' (and a few more things..)
# The following conjunctions need no sentence transformation:
# woraufhin, zudem, zumal, umso - desto
# 'sondern' is hard to resolve.. best to just cut at 'sondern' and run SentGlue afterwards
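# Illustrative example of the intended transformation (a sketch only; the exact
# output depends on the spacy parse and on the SentGlue ranking used below):
#   'Ich bleibe zu Hause, weil es regnet.'
#   -> 'Es regnet.' / 'Deswegen bleibe ich zu Hause.'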
class SentSeg(object):

    def __init__(self, language):
        self.language = language
        self.punktuation_list = ['.', '?', '!', ';', ':']
        self.wrappunktuation_list = [',', '-']
        # Conjunction lists, grouped by the type of clause they introduce.
        self.adversativ_list = ['wohingegen', 'Wohingegen', 'aber', 'Aber', 'wobei', 'Wobei', 'hingegen']
        self.final_list = ['damit', 'Damit', 'um', 'Um']
        self.kausal_list = ['weil', 'Weil', 'da', 'Da', 'denn', 'falls', 'Falls']
        self.konditional_list = ['wenn', 'Wenn', 'sobald', 'Sobald', 'als', 'falls']
        self.konsekutiv_list = ['dass', 'Dass']
        self.konzessiv_list = ['obwohl', 'Obwohl', 'obgleich', 'Obgleich', 'trotzdem', 'Trotzdem', 'wenngleich', 'doch']
        self.lokal_list = ['wo', 'Wo']
        self.temporal_list_vor = ['bevor', 'Bevor']
        self.temporal_list_nach = ['nachdem', 'Nachdem']
        self.instrumental_list = ['indem', 'Indem']
        self.indirectspeech_list = ['ob', 'Ob', 'wann', 'Wann', 'wer', 'Wer', 'wie', 'Wie', 'warum', 'Warum', 'weshalb', 'Weshalb', 'wieso', 'Wieso']
        self.firstwordlist = []
        #self.firstwordlist = ['wann', 'Wann', 'wer', 'Wer', 'wie', 'Wie', 'warum', 'Warum', 'weshalb', 'Weshalb', 'wieso', 'Wieso', 'dies', 'dann', 'jedoch', 'deswegen', 'trotzdem', 'danach', 'davor', 'wenn', 'sobald']
        self.full_list = (self.adversativ_list + self.final_list + self.kausal_list
                          + self.konditional_list + self.konsekutiv_list
                          + self.konzessiv_list + self.lokal_list
                          + self.temporal_list_nach + self.temporal_list_vor
                          + self.instrumental_list + self.indirectspeech_list)
    def ReadDoc2Sent(self, document):
        # Read a tokenized document line by line and split it into sentences
        # at sentence-final punctuation attached to a word.
        splitsentences = []
        splitsentence = []
        with open(document) as sentences:
            counter = 0
            for sentence in sentences:
                counter += 1
                if counter % 1000 == 0:
                    print(counter)
                words = sentence.split()
                for word in words:
                    splitsentence.append(word)
                    if (word[-1] in self.punktuation_list or word in self.punktuation_list) and len(word) > 2:
                        splitsentences.append([splitsentence])
                        splitsentence = []
        return splitsentences
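
    # Example (illustrative): an input file line 'Das ist ein Test. Noch ein Test.'
    # comes back as [[['Das', 'ist', 'ein', 'Test.']], [['Noch', 'ein', 'Test.']]].
    # Note the len(word) > 2 guard: a free-standing '.' does not close a sentence.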
    def AndOrSolver(self, sentences, punctuations):
        # Cut coordinations ('und', 'oder', 'jedoch', ...) between comma
        # segments that each carry their own subject: the cut points are marked
        # with an 'alohaseparator' suffix and the sentence is split there.
        for n in range(len(punctuations)):
            if punctuations[n] == ':' or punctuations[n] == '-':
                punctuations[n] = '.'
        counter = 0
        newsentences = []
        for sentence in sentences:
            newpunctuationsindexes = []
            utterancenumber = sentence[2]
            commainfo = sentence[1]
            commaornot = commainfo[0]
            sentence = sentence[0]
            counter += 1
            doc = self.nlp(' '.join(sentence))
            subjectcount = 0
            separationwords = []
            subjectcounts = []
            doccounter = 0
            subjectindex = []
            rcornot = 0
            for word in doc:
                doccounter += 1
                if word.dep_ == 'sb' or word.dep_ == 'ep':
                    subjectcount += 1
                    subjectindex.append(doccounter - 1)
                if word.dep_ == 'rc':
                    rcornot = 1
                if word.tag_ == '$,':
                    subjectcounts.append([subjectcount, doccounter - 2, subjectindex, rcornot])
                    subjectindex = []
                    subjectcount = 0
                    if len(sentence[doccounter - 2]) > 1:
                        doccounter -= 1
                if word.text in ('und', 'also', 'oder', 'schon', 'bald', 'doch', 'jedoch', 'sondern'):
                    separationwords.append(doccounter - 1)
            separationwordstocut = []
            listofownsentencessubjectindexes = []
            for n in range(len(subjectcounts) - 1):
                if subjectcounts[n][0] > 0 and subjectcounts[n + 1][0] > 0 and subjectcounts[n + 1][3] == 0:
                    listofownsentencessubjectindexes.append(subjectcounts[n])
                for m in range(len(separationwords)):
                    if subjectcounts[n][1] < separationwords[m] < subjectcounts[n + 1][1]:
                        if subjectcounts[n + 1][0] > 1:
                            if subjectcounts[n + 1][2][0] < separationwords[m] <= subjectcounts[n + 1][2][-1]:
                                separationwordstocut.append(separationwords[m])
            processed = 0
            if len(listofownsentencessubjectindexes) > 0:
                for n in range(len(listofownsentencessubjectindexes)):
                    sentence[listofownsentencessubjectindexes[n][1]] = sentence[listofownsentencessubjectindexes[n][1]] + 'alohaseparator'
                    newpunctuationsindexes.append([punctuations[counter - 1], counter - 1])
                processed = 1
            if len(separationwordstocut) > 0:
                for n in range(len(separationwordstocut)):
                    sentence[separationwordstocut[n] - 1] = sentence[separationwordstocut[n] - 1] + 'alohaseparator'
                    newpunctuationsindexes.append([punctuations[counter - 1], counter - 1])
                processed = 1
            if processed == 0:
                newsentences.append([sentence])
            if processed == 1:
                splitsentence = []
                for word in sentence:
                    splitsentence.append(word)
                    if word[-14:] == 'alohaseparator':
                        # strip the marker, plus a comma directly before it if present
                        if splitsentence[-1][-15] == ',':
                            splitsentence[-1] = splitsentence[-1][:-15]
                        else:
                            splitsentence[-1] = splitsentence[-1][:-14]
                        newsentences.append([splitsentence])
                        splitsentence = []
                newsentences.append([splitsentence])
            newpunctuationsindexes = newpunctuationsindexes[::-1]
            for n in range(len(newpunctuationsindexes)):
                punctuations.insert(newpunctuationsindexes[n][1], newpunctuationsindexes[n][0])
        return newsentences, punctuations
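
    # The expected input element for AndOrSolver is the annotated triple built
    # by the methods further down (a sketch of the shape, not enforced anywhere):
    #   [tokens, [n_commas, firstwords], n_subjects]
    # Only coordinations between comma segments that each carry their own
    # subject are cut, so 'Der Hund bellt und die Katze miaut.' stays in one piece.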
    def LoadBoWModelAndDatabaseOnesZeros(self):
        # Load the tag and dep hkl databases and generate/load their
        # bag-of-words models.
        import FASTsearch
        self.fsearch1 = FASTsearch.FASTsearch('GS_DB_word.tag_.hkl')
        self.fsearch1.Gen_BoW_Model(1000, 'word')
        self.fsearch1.Load_BoW_Model('bagofwordsGS_DB_word.tag_.pkl', 'DataBaseOneZerosGS_DB_word.tag_.hkl')
        self.fsearch2 = FASTsearch.FASTsearch('GS_DB_word.dep_.hkl')
        self.fsearch2.Gen_BoW_Model(1000, 'word')
        self.fsearch2.Load_BoW_Model('bagofwordsGS_DB_word.dep_.pkl', 'DataBaseOneZerosGS_DB_word.dep_.hkl')
    def LoadSentGlueSGDandGSUtils(self):
        # Initialize the GS utils, the SentGlue stochastic gradient descent
        # model and the spacy German model.
        import GS_Utils
        self.gs = GS_Utils.GS_Utils('de_core_news_sm')
        from SentGlue import SentGlueMach
        self.sgm = SentGlueMach('trainedSGD_twolabel.pkl', 'bagofwordstwolabel.pkl')
        self.sgm.initialize()
        import spacy
        self.nlp = spacy.load('de_core_news_sm')
        return 'done'
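
    # The two loaders above are meant to run once, before the NLP methods:
    # LoadSentGlueSGDandGSUtils() provides self.nlp, self.gs and self.sgm,
    # which AndOrSolver, EnumerationSolver and SplitCommatas rely on, while
    # LoadBoWModelAndDatabaseOnesZeros() provides self.fsearch1/self.fsearch2,
    # which only the disabled pattern-match block at the end of SplitCommatas uses.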
    def CommaSentenceOrNot(self, sentences):
        # Annotate every sentence with [number of commas, [first word of the
        # sentence, first word after each comma]].
        nlp = self.nlp
        commasentences = []
        for sentence in sentences:
            doc = nlp(' '.join(sentence[0]))
            n = 0
            firstone = 0
            token = []
            nextword = 0
            for word in doc:
                # word.pos_ is useful here for noun and verb, word.dep_ for
                # sb/pd, and possibly the tag
                if firstone == 0:
                    token.append(word.text)
                    firstone = 1
                if nextword == 1:
                    token.append(word.text)
                    nextword = 0
                if word.tag_ == '$,':
                    n += 1
                    nextword = 1
            sentence.append([n, token])
            commasentences.append(sentence)
        return commasentences
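
    # Example (illustrative, assuming spacy tags the comma as '$,'):
    #   [['Er', 'blieb,', 'weil', 'es', 'regnete.']]
    #   -> [['Er', 'blieb,', 'weil', 'es', 'regnete.'], [1, ['Er', 'weil']]]
    # i.e. one comma, plus the first word and the word after the comma.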
    def EnumerationSolver(self, sentences):
        # Resolve enumerations ('A, B und C ...'): the comma/'und' segments are
        # separated, the first or last segment (whichever has the more complete
        # SPO) is taken as the main sentence, and every other element is merged
        # into a candidate copy of it; SentGlue picks the most probable candidate.
        gs = self.gs
        nlp = self.nlp
        sgm = self.sgm
        enumerationsentences = []
        NOTenumerations = []
        for sentence in sentences:
            doc = nlp(' '.join(sentence[0]))
            n = 0
            enumeration = False
            splitsentence = []
            splitsentence_deps = []
            splitsentence_tags = []
            splitsentences = []
            splitsentences_deps = []
            splitsentences_tags = []
            for word in doc:
                # word.pos_ is useful here for noun and verb, word.dep_ for
                # sb/pd, and possibly the tag
                if word.tag_ == '$,':
                    n += 1
                if (word.text == 'und' or word.text == 'oder') and n >= 1:
                    enumeration = True
                    break
            output = []
            if enumeration == True:
                for word in doc:
                    if word.text != ',' and word.text != '.' and word.text != 'und':
                        splitsentence.append(word.text)
                        splitsentence_deps.append(word.dep_)
                        splitsentence_tags.append(word.tag_)
                    if word.text == ',' or word.text == 'und':
                        splitsentences.append(splitsentence)
                        splitsentences_deps.append(splitsentence_deps)
                        splitsentences_tags.append(splitsentence_tags)
                        splitsentence = []
                        splitsentence_deps = []
                        splitsentence_tags = []
                splitsentences.append(splitsentence)
                splitsentences_deps.append(splitsentence_deps)
                splitsentences_tags.append(splitsentence_tags)
                token = []
                enumerations = []
                enumerationsSPOs = []
                NOTenumerations = []
                for sentence in splitsentences:
                    token.append(sentence[0])
                    if sentence[0] not in self.full_list:
                        enumerations.append(sentence)
                        enumerationsSPOs.append(gs.checkSPO(sentence, 0))
                    else:
                        NOTenumerations.append(sentence)
                biggest = []
                for i in range(len(enumerationsSPOs)):
                    biggest.append([i, sum(enumerationsSPOs[i])])
                sortedbiggest = sorted(biggest[::-1], key=lambda tup: tup[1], reverse=True)
                for i in range(len(sortedbiggest)):
                    if sortedbiggest[i][0] == 0:
                        mainsentenceIndex = sortedbiggest[i][0]
                        lastornot = 0
                        break
                    if sortedbiggest[i][0] == len(biggest) - 1:
                        mainsentenceIndex = sortedbiggest[i][0]
                        lastornot = 1
                        break
                # TODO: For the case 'Er, sie und der Beamte LACHTEN den Clown aus'
                # the plural 'lachten' still has to be caught via the database of
                # cases, i.e. an enumeration with SPO 1 0 0 + plural has to become
                # singular depending on the articles.
                mainsentence = enumerations[mainsentenceIndex]
                probablemainsentences = []
                for i in range(len(enumerations)):
                    if i != mainsentenceIndex:
                        iprobablemainsentences = []
                        probablemainsentence = []
                        if lastornot == 0:
                            for j in range(1, len(mainsentence)):
                                probablemainsentence = mainsentence[0:j] + enumerations[i]
                                iprobablemainsentences.append(' '.join(probablemainsentence))
                        if lastornot == 1:
                            for j in range(1, len(mainsentence)):
                                probablemainsentence = enumerations[i] + mainsentence[-j:]
                                iprobablemainsentences.append(' '.join(probablemainsentence))
                        probablemainsentences.append(iprobablemainsentences)
                # Here we only check for presence, but in this case it is more
                # important that a tuple does not show up torn apart. CHANGE !!!!
                tuplesToCheck = []
                tuples = [['ART', 'NN'], ['APPR', 'NN'], ['ART', 'CARD']]
                for tupl in tuples:
                    checktupleindex, tupleInWords = gs.checkForAnnotationTuple(mainsentence, tupl, 'word.tag_', 'None')
                    if checktupleindex == 2:
                        tuplesToCheck.append([tupl, tupleInWords])
                triplesToCheck = []
                triples = [['ART', 'ADJA', 'NN'], ['APPR', 'ART', 'NN'], ['KOKOM', 'ART', 'NN']]
                for tripl in triples:
                    checktripleindex, tripleInWords = gs.checkForAnnotationTriple(mainsentence, tripl, 'word.tag_', 'None')
                    if checktripleindex == 3:
                        triplesToCheck.append([tripl, tripleInWords])
                for probsentences in probablemainsentences:
                    filteredprobsentences = []
                    for sentence in probsentences:
                        tuplchecked = 0
                        triplchecked = 0
                        for tupl in tuplesToCheck:
                            checkedsecondtime, tupleinWords = gs.checkForAnnotationTuple(sentence.split(), tupl[0], 'word.tag_', tupl[1])
                            if checkedsecondtime == 1:
                                tuplchecked = 0
                            if checkedsecondtime == 2:
                                tuplchecked = 1
                        for tripl in triplesToCheck:
                            checkedsecondtime, tripleinWords = gs.checkForAnnotationTriple(sentence.split(), tripl[0], 'word.tag_', tripl[1])
                            if checkedsecondtime == 1 or checkedsecondtime == 2:
                                triplchecked = 0
                            if checkedsecondtime == 3:
                                triplchecked = 1
                        if triplchecked == 1 or tuplchecked == 1:
                            filteredprobsentences.append(sentence)
                    if len(filteredprobsentences) == 0:
                        filteredprobsentences = probsentences
                    # There is still the problem here that these are lists of
                    # words instead of proper sentences..
                    probsMatrix = sgm.predictprobsOnSentenceList(filteredprobsentences, filteredprobsentences)
                    for i in range(len(probsMatrix)):
                        probsMatrix[i][0] = i
                    sortedprobsMatrix = sorted(probsMatrix[::-1], key=lambda tup: tup[1], reverse=True)
                    bestindex = sortedprobsMatrix[0][0]
                    probablemainsentence = filteredprobsentences[int(bestindex)]
                    enumerationsentences.append([probablemainsentence])
                enumerationsentences.append([' '.join(mainsentence)])
                for notenum in NOTenumerations:
                    enumerationsentences[-1].append(' '.join(notenum))
                    enumerationsentences[-1] = [', '.join(enumerationsentences[-1])]
            else:
                enumerationsentences.append([sentence])
        output.append(enumerationsentences)
        for n in range(len(output[0])):
            try:
                output[0][n] = [output[0][n][0].split()]
            except:
                output[0][n] = [output[0][n][0][0]]
        return output[0]
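
    # Example (illustrative; the merged variants are chosen by the SentGlue
    # ranking): 'Der Mann kauft Brot, Milch und Eier.' is cut at ','/'und', the
    # first segment becomes the main sentence, and each element is glued into a
    # candidate copy of it, yielding roughly 'Der Mann kauft Milch',
    # 'Der Mann kauft Eier' and 'Der Mann kauft Brot'.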
    def GetUtteranceNumber(self, sentences):
        # Append the number of utterances, i.e. subjects ('sb') and expletives
        # ('ep') found by the dependency parse, to every sentence.
        nlp = self.nlp
        uttersentences = []
        for sentence in sentences:
            doc = nlp(' '.join(sentence[0]))
            subjectcount = 0
            for word in doc:
                if word.dep_ == 'sb' or word.dep_ == 'ep':
                    subjectcount += 1
            sentence.append(subjectcount)
            uttersentences.append(sentence)
        return uttersentences
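
    # GetUtteranceNumber appends the subject count of the dependency parse,
    # e.g. 2 for 'Der Hund bellt, die Katze miaut.' (two 'sb' annotations).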
    def GetQuestionOrNot(self, sentences):
        # Append 1 to every sentence that contains a question mark, 0 otherwise.
        nlp = self.nlp
        uttersentences = []
        for sentence in sentences:
            doc = nlp(' '.join(sentence[0]))
            # reset per sentence; in the original the flag leaked from one
            # question into all following sentences
            questionmark = 0
            for word in doc:
                if word.text == '?':
                    questionmark = 1
            sentence.append(questionmark)
            uttersentences.append(sentence)
        return uttersentences
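
    # After CommaSentenceOrNot, GetUtteranceNumber and GetQuestionOrNot each
    # sentence entry has the shape (illustrative):
    #   [tokens, [n_commas, firstwords], n_subjects, question_flag]
    # which is the input format AndOrSolver, SplitSentencesIntoHauptNebenTuple
    # and SplitCommatas index into.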
    def SplitSentencesIntoHauptNebenTuple(self, sentences, punctuations):
        # Split sentences with two or more commas into main-clause/subordinate-
        # clause pairs ('Hauptsatz/Nebensatz'), based on SPO completeness and on
        # the 'cp'/'rc' dependency annotations of the comma segments.
        oldsplitsentences = []
        gs = self.gs
        nlp = self.nlp
        sentencesThatAreOutoutput = []
        outsentences = []
        for generalindex in range(len(sentences)):
            presentence = sentences[generalindex]
            splitsentence = []
            splitsentence_deps = []
            splitsentence_tags = []
            splitsentences = []
            splitsentences_deps = []
            splitsentences_tags = []
            commainfo = presentence[1]
            outputsentence = []
            token = commainfo[1]
            commaornot = commainfo[0]
            numberutterances = presentence[2]
            sentence = presentence[0]
            oldsentence = presentence[0]
            if commaornot >= 2:
                sentence[0] = sentence[0].title()
                doc = nlp(' '.join(sentence))
                for word in doc:
                    if word.text != ',' and word.text != '.':
                        splitsentence.append(word.text)
                        splitsentence_deps.append(word.dep_)
                        splitsentence_tags.append(word.tag_)
                    if word.text == ',':
                        splitsentences.append(splitsentence)
                        splitsentences_deps.append(splitsentence_deps)
                        splitsentences_tags.append(splitsentence_tags)
                        splitsentence = []
                        splitsentence_deps = []
                        splitsentence_tags = []
                splitsentences.append(splitsentence)
                splitsentences[0][0] = splitsentences[0][0].lower()
                splitsentences_deps.append(splitsentence_deps)
                splitsentences_tags.append(splitsentence_tags)
                oldsplitsentences = splitsentences
                spo = []
                for n in range(len(splitsentences)):
                    prespo = gs.checkSPO(splitsentences_deps[n], 1)
                    prespo.append(gs.checkForAnnotation(splitsentences[n], 'VVINF', 'word.tag_'))
                    prespo.append(gs.checkForAnnotation(splitsentences[n], 'VAFIN', 'word.tag_'))
                    prespo.append(gs.checkForAnnotation(splitsentences[n], 'VVFIN', 'word.tag_'))
                    prespo.append(gs.checkForAnnotation(splitsentences[n], 'VMFIN', 'word.tag_'))
                    spo.append(prespo)
                indexSPO = []
                lastm = len(splitsentences)
                for o in range(len(splitsentences)):
                    m = len(splitsentences) - 1 - o
                    for n in range(len(splitsentences)):
                        if m < n - 1 and n < lastm:
                            sb = spo[m][0] + spo[n][0]
                            Vafin = 1
                            if spo[m][3] == 1 or spo[n][3] == 1:
                                Vafin = spo[m][3] + spo[n][3]
                            Vvinf = 1
                            if spo[m][4] == 1 or spo[n][4] == 1:
                                Vvinf = spo[m][4] + spo[n][4]
                            Vvfin = 1
                            if spo[m][5] == 1 or spo[n][5] == 1:
                                Vvfin = spo[m][5] + spo[n][5]
                            Vmfin = 1
                            if spo[m][6] == 1 or spo[n][6] == 1:
                                # the original compared ('==') here instead of assigning
                                Vmfin = spo[m][6] + spo[n][6]
                            if sb == 1 and Vafin == 1 and Vvinf == 1 and (Vvfin == 1 or Vmfin == 1):
                                indexSPO.append([m, n])
                                lastm = m
                Hauptsentences = []
                for n in range(len(indexSPO)):
                    if indexSPO[n][0] > indexSPO[n][1]:
                        i = 1
                        j = 0
                    else:
                        i = 0
                        j = 1
                    Hauptsentences.append([splitsentences[indexSPO[n][i]] + splitsentences[indexSPO[n][j]], indexSPO[n][i], indexSPO[n][j]])
                HauptSentences = []
                for n in range(len(Hauptsentences)):
                    m = len(Hauptsentences) - 1 - n
                    HauptSentences.append(Hauptsentences[m])
                sentencesThatAreOut = []
                for n in range(len(HauptSentences)):
                    index = HauptSentences[n][1]
                    finish = 0
                    if n == len(HauptSentences) - 1:
                        stopindex = len(splitsentences)
                        finish = 1
                    else:
                        stopindex = HauptSentences[n + 1][1]
                    vvfinisthere = 0
                    if finish == 0:
                        if splitsentences_tags[stopindex][0] == 'VVFIN':
                            stopindex -= 1
                            vvfinisthere = 1
                    if splitsentences_tags[index][0] == 'VVFIN':
                        vvfinisthere = 1
                    if vvfinisthere == 1:
                        HNTuple = HauptSentences[n][0] + [','] + splitsentences[index - 1]
                        outputsentence.append(HNTuple)
                        sentencesThatAreOut.append(index - 1)
                        sentencesThatAreOut.append(Hauptsentences[n][1])
                        sentencesThatAreOut.append(Hauptsentences[n][2])
                    for m in range(index + 1, stopindex):
                        if m != HauptSentences[n][2]:
                            HNTuple = HauptSentences[n][0] + [','] + splitsentences[m]
                            outputsentence.append(HNTuple)
                            sentencesThatAreOut.append(m)
                            sentencesThatAreOut.append(Hauptsentences[n][1])
                            sentencesThatAreOut.append(Hauptsentences[n][2])
                sentencesThatAreOutoutput.append(sentencesThatAreOut)
                cpOrNots = []
                rcOrNots = []
                for splitsentence in splitsentences_deps:
                    cpOrNots.append(gs.checkForAnnotationInTokenizedSentence(splitsentence, 'cp'))
                    rcOrNots.append(gs.checkForAnnotationInTokenizedSentence(splitsentence, 'rc'))
                pairs = []
                for n in range(len(cpOrNots)):
                    index = len(cpOrNots) - 1 - n
                    done = 0
                    if rcOrNots[index] == 1:
                        pairs.append([index, index - 1])
                        done = 1
                    if done == 0 and cpOrNots[index] == 1:
                        try:
                            if splitsentences_tags[index + 1][0] == 'VVFIN':
                                pairs.append([index, index + 1])
                                done = 1
                        except:
                            pass
                        try:
                            if done == 0 and rcOrNots[index - 1] == 0:
                                pairs.append([index, index - 1])
                                done = 1
                        except:
                            pass
                        try:
                            if done == 0 and rcOrNots[index - 1] == 1:
                                if rcOrNots[index - 2] == 0:
                                    pairs.append([index, index - 2])
                        except:
                            pass
                for pair in pairs[::-1]:
                    if pair[0] not in set(sentencesThatAreOut) or pair[1] not in set(sentencesThatAreOut):
                        outputsentence.append(splitsentences[pair[1]] + [','] + splitsentences[pair[0]])
                sentences[generalindex][0] = outputsentence
            try:
                if type(sentences[generalindex][0][0]) == str:
                    sentences[generalindex][0] = [sentences[generalindex][0]]
            except:
                pass
            newgeneratedsentences = len(sentences[generalindex][0])
            if newgeneratedsentences > 1:
                for sentence in sentences[generalindex][0]:
                    punctuations.insert(generalindex, punctuations[generalindex])
                    outsentences.append(sentence)
                del punctuations[generalindex]
            if newgeneratedsentences == 1:
                if len(sentences[generalindex][0][0]) > 1:
                    outsentences.append(sentences[generalindex][0][0])
                else:
                    outsentences.append(oldsentence)
            if newgeneratedsentences == 0:
                outsentences.append(oldsentence)
        # attach free-standing commas to the word before them
        for outsentence in outsentences:
            todelete = []
            for n in range(len(outsentence)):
                if outsentence[n] == ',':
                    todelete.append(n)
                    outsentence[n - 1] = outsentence[n - 1] + ','
            for deleteindex in todelete[::-1]:
                del outsentence[deleteindex]
        for index in range(len(outsentences)):
            outsentences[index] = [outsentences[index]]
        # remove duplicates
        doubledsentences = []
        for o in range(len(outsentences)):
            sentence = outsentences[o][0]
            for m in range(len(outsentences)):
                if m != o:
                    count = 0
                    for n in range(len(sentence)):
                        if sentence[n] in outsentences[m][0] or sentence[n][:-1] in outsentences[m][0]:
                            count += 1
                    if count == len(sentence):
                        doubledsentences.append(sentence)
        tmp = set()
        for sentence in doubledsentences:
            tmp.add(tuple(sentence))
        doubledsentences = []
        for tup in tmp:
            doubledsentences.append([list(tup)])
        punctdeleteindexes = []
        for double in doubledsentences:
            if double in outsentences:
                punctdeleteindex = outsentences[::-1].index(double)
                del outsentences[len(outsentences) - 1 - punctdeleteindex]
                punctdeleteindexes.append(punctdeleteindex)
        for index in punctdeleteindexes[::-1]:
            del punctuations[len(outsentences) - 1 - index]
        for o in range(len(oldsplitsentences)):
            for m in range(len(outsentences)):
                counter = 0
                for n in range(len(oldsplitsentences[o])):
                    if oldsplitsentences[o][n] in outsentences[m][0] or oldsplitsentences[o][n] + ',' in outsentences[m][0]:
                        counter += 1
                if counter >= len(oldsplitsentences[o]):
                    break
                if m == len(outsentences) - 1 and counter < len(oldsplitsentences[o]):
                    # a comma segment got lost: re-insert it at a fitting position
                    if o == 0:
                        outsentences.insert(0, [oldsplitsentences[o]])
                        punctuations.insert(0, punctuations[0])
                    else:
                        newones = []
                        for i in range(len(outsentences)):
                            if outsentences[i][0][-1] == oldsplitsentences[o - 1][-1]:
                                if len(outsentences[i][0]) > 2 and len(oldsplitsentences[o - 1]) > 2:
                                    if outsentences[i][0][-2] == oldsplitsentences[o - 1][-2]:
                                        if outsentences[i][0][-3] == oldsplitsentences[o - 1][-3]:
                                            newones.append([i + 1, [oldsplitsentences[o]]])
                        for newone in newones[::-1]:
                            outsentences.insert(newone[0], newone[1])
                            punctuations.insert(newone[0], punctuations[newone[0] - 1])
        return outsentences, punctuations
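
    # Example (illustrative): for 'Der Mann, der dort wohnt, lacht laut.' the
    # method tries to recombine the comma segments into one main clause
    # ('der Mann lacht laut') plus its subordinate clause (', der dort wohnt'),
    # so that SplitCommatas below receives clean one-comma pairs.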
    # Note: the input here always has to be pairs of main clause/subordinate
    # clause, i.e. a further upstream class is needed.
    def SplitCommatas(self, Inputsentences, punctuations):
        # Split one-comma sentences into two standalone sentences, depending on
        # the conjunction type, and let SentGlue pick the best word order.
        gs = self.gs
        nlp = self.nlp
        gramcorr_splitsentences = []
        counter = 0
        newpunctuationsindex = []
        for Inputsentence in Inputsentences:
            counter += 1
            commainfo = Inputsentence[1]
            token = commainfo[1]
            commaornot = commainfo[0]
            numberutterances = Inputsentence[2]
            if commaornot == 0:
                gramcorr_splitsentences.append(Inputsentence[0])
            if commaornot > 1:
                gramcorr_splitsentences.append(Inputsentence[0])
            if commaornot == 1:
                oldsentence = Inputsentence[0]
                Inputsentence = [[Inputsentence[0]]]
                for sentence in Inputsentence[0]:
                    splitsentence = []
                    splitsentences = []
                    processed = 0
                    wasNotInAnyList = 0
                    try:
                        for n in range(len(token)):
                            if token[n] in self.final_list:
                                splitsentence = []
                                for word in sentence:
                                    if word != token[n]:
                                        if word[-1] == ',':
                                            splitsentence.append(word[:-1])
                                        if word[-1] != ',':
                                            splitsentence.append(word)
                                    if word[-1] == ',' or word == ',':
                                        splitsentences.append(splitsentence)
                                        splitsentence = []
                                splitsentences.append(splitsentence)
                                if n == 1:
                                    if token[n] == 'um' or token[n] == 'Um':
                                        splitsentences[n].insert(0, 'dies')
                                        splitsentences[n].insert(0, 'um')
                                    else:
                                        splitsentences[n].insert(0, 'dann')
                                if n == 0:
                                    if token[n] == 'um' or token[n] == 'Um':
                                        splitsentences[n].insert(0, 'dies')
                                        splitsentences[n].insert(0, 'um')
                                        splitsentences = splitsentences[::-1]
                                    else:
                                        splitsentences[n].insert(0, 'dann')
                                        splitsentences[0], splitsentences[1] = splitsentences[1], splitsentences[0]
                                generalrules = [['ADV', 'VAFIN'], ['ADV', 'VVFIN']]
                                processed = 1
                            if token[n] in self.adversativ_list:
                                splitsentence = []
                                for word in sentence:
                                    if word != token[n]:
                                        if word[-1] == ',':
                                            splitsentence.append(word[:-1])
                                        if word[-1] != ',':
                                            splitsentence.append(word)
                                    if word[-1] == ',' or word == ',':
                                        splitsentences.append(splitsentence)
                                        splitsentence = []
                                splitsentences.append(splitsentence)
                                splitsentences[n].append('jedoch')
                                generalrules = [['ADV', 'VAFIN'], ['ADV', 'VVFIN']]
                                processed = 1
                            if token[n] in self.kausal_list:
                                splitsentence = []
                                for word in sentence:
                                    if word != token[n]:
                                        if word[-1] == ',':
                                            splitsentence.append(word[:-1])
                                        if word[-1] != ',':
                                            splitsentence.append(word)
                                    if word[-1] == ',' or word == ',':
                                        splitsentences.append(splitsentence)
                                        splitsentence = []
                                splitsentences.append(splitsentence)
                                # Since 'deswegen' is attached to the other clause, the
                                # input to commasentences always has to be TWO sentences.
                                if n == 1:
                                    splitsentences[n - 1].insert(0, 'deswegen')
                                    splitsentences[0], splitsentences[1] = splitsentences[1], splitsentences[0]
                                if n == 0:
                                    splitsentences[n + 1].insert(0, 'deswegen')
                                generalrules = [['PROAV', 'VAFIN'], ['PROAV', 'VVFIN']]
                                processed = 1
                            # konsekutiv clauses are split according to
                            # https://www.deutschplus.net/pages/Konsekutivsatz
                            if token[n] in self.konsekutiv_list:
                                splitsentence = []
                                for word in sentence:
                                    if word != token[n]:
                                        if word[-1] == ',':
                                            splitsentence.append(word[:-1])
                                        if word[-1] != ',':
                                            splitsentence.append(word)
                                    if word[-1] == ',' or word == ',':
                                        splitsentences.append(splitsentence)
                                        splitsentence = []
                                splitsentences.append(splitsentence)
                                generalrules = [['KOUS', 'PPER']]
                                processed = 1
                            if token[n] in self.konditional_list:
                                splitsentence = []
                                for word in sentence:
                                    if word[-1] == ',':
                                        splitsentence.append(word[:-1])
                                    if word[-1] != ',':
                                        splitsentence.append(word)
                                    if word[-1] == ',' or word == ',':
                                        splitsentences.append(splitsentence)
                                        splitsentence = []
                                splitsentences.append(splitsentence)
                                if n == 1:
                                    spoCount = sum(gs.checkSPO(splitsentences[n], 0))
                                    if spoCount == 2:
                                        thereisanes = 0
                                        for word in splitsentences[n]:
                                            if word == 'es' or word == 'Es':
                                                thereisanes = 1
                                        if thereisanes == 0:
                                            splitsentences[n].append('es')
                                if n == 0:
                                    spoCount = sum(gs.checkSPO(splitsentences[n], 0))
                                    if spoCount == 2:
                                        thereisanes = 0
                                        for word in splitsentences[n]:
                                            if word == 'es' or word == 'Es':
                                                thereisanes = 1
                                        if thereisanes == 0:
                                            splitsentences[n].append('es')
                                    splitsentences[0], splitsentences[1] = splitsentences[1], splitsentences[0]
                                generalrules = [['KOUS', 'PPER']]
                                processed = 1
                            if token[n] in self.konzessiv_list:
                                splitsentence = []
                                for word in sentence:
                                    if word != token[n]:
                                        if word[-1] == ',':
                                            splitsentence.append(word[:-1])
                                        if word[-1] != ',':
                                            splitsentence.append(word)
                                    if word[-1] == ',' or word == ',':
                                        splitsentences.append(splitsentence)
                                        splitsentence = []
                                splitsentences.append(splitsentence)
                                if n == 1:
                                    splitsentences[n - 1].insert(0, 'trotzdem')
                                    splitsentences[0], splitsentences[1] = splitsentences[1], splitsentences[0]
                                if n == 0:
                                    splitsentences[n + 1].insert(0, 'trotzdem')
                                generalrules = [['PROAV', 'VAFIN'], ['PROAV', 'VVFIN']]
                                processed = 1
                            if token[n] in self.lokal_list:
                                splitsentence = []
                                for word in sentence:
                                    if word != token[n]:
                                        if word[-1] == ',':
                                            splitsentence.append(word[:-1])
                                        if word[-1] != ',':
                                            splitsentence.append(word)
                                    if word[-1] == ',' or word == ',':
                                        splitsentences.append(splitsentence)
                                        splitsentence = []
                                splitsentences.append(splitsentence)
                                if n == 1:
                                    splitsentences[n - 1].insert(0, 'dort')
                                    splitsentences[0], splitsentences[1] = splitsentences[1], splitsentences[0]
                                if n == 0:
                                    splitsentences[n + 1].insert(0, 'dort')
                                generalrules = [['PROAV', 'VAFIN'], ['PROAV', 'VVFIN']]
                                processed = 1
                            if token[n] in self.instrumental_list:
                                splitsentence = []
                                for word in sentence:
                                    if word != token[n]:
                                        if word[-1] == ',':
                                            splitsentence.append(word[:-1])
                                        if word[-1] != ',':
                                            splitsentence.append(word)
                                    if word[-1] == ',' or word == ',':
                                        splitsentences.append(splitsentence)
                                        splitsentence = []
                                splitsentences.append(splitsentence)
                                if n == 1:
                                    splitsentences[n - 1].insert(0, 'so')
                                    splitsentences[0], splitsentences[1] = splitsentences[1], splitsentences[0]
                                if n == 0:
                                    splitsentences[n + 1].insert(0, 'so')
                                generalrules = [['ADV', 'VAFIN'], ['ADV', 'VVFIN']]
                                processed = 1
                            if token[n] in self.temporal_list_vor:
                                splitsentence = []
                                for word in sentence:
                                    if word != token[n]:
                                        if word[-1] == ',':
                                            splitsentence.append(word[:-1])
                                        if word[-1] != ',':
                                            splitsentence.append(word)
                                    if word[-1] == ',' or word == ',':
                                        splitsentences.append(splitsentence)
                                        splitsentence = []
                                splitsentences.append(splitsentence)
                                if n == 1:
                                    splitsentences[n].insert(0, 'danach')
                                if n == 0:
                                    splitsentences[n].insert(0, 'danach')
                                    splitsentences[0], splitsentences[1] = splitsentences[1], splitsentences[0]
                                generalrules = [['PROAV', 'VAFIN'], ['PROAV', 'VVFIN']]
                                processed = 1
                            if token[n] in self.temporal_list_nach:
                                splitsentence = []
                                for word in sentence:
                                    if word != token[n]:
                                        if word[-1] == ',':
                                            splitsentence.append(word[:-1])
                                        if word[-1] != ',':
                                            splitsentence.append(word)
                                    if word[-1] == ',' or word == ',':
                                        splitsentences.append(splitsentence)
                                        splitsentence = []
                                splitsentences.append(splitsentence)
                                if n == 1:
                                    splitsentences[n].insert(0, 'davor')
                                if n == 0:
                                    splitsentences[n].insert(0, 'davor')
                                    splitsentences[0], splitsentences[1] = splitsentences[1], splitsentences[0]
                                generalrules = [['PROAV', 'VAFIN'], ['PROAV', 'VVFIN']]
                                processed = 1
                            if token[n] == 'der' or token[n] == 'welcher':
                                tokens = self.nlp(' '.join(sentence))
                                for word in tokens:
                                    if word.dep_ == 'rc':
                                        wordwithrc = word.text
                                rcORnot = gs.checkForAnnotation(sentence, 'rc', 'word.dep_')
                                oldsplitsentences = splitsentences
                                splitsentences = []
                                if rcORnot == 1:
                                    splitsentence = []
                                    for word in sentence:
                                        if word[-1] == ',':
                                            splitsentence.append(word[:-1])
                                        if word[-1] != ',':
                                            splitsentence.append(word)
                                        if word[-1] == ',' or word == ',':
                                            splitsentences.append(splitsentence)
                                            splitsentence = []
                                    splitsentences.append(splitsentence)
                                    # The swap is done here because otherwise spacy does
                                    # not read 'dieser' as PDS.. analogous in the other cases.
                                    if wordwithrc in splitsentences[n]:
                                        splitsentences[n][0] = 'dieser'
                                        verb = splitsentences[n][-1]
                                        splitsentences[n] = splitsentences[n][:-1]
                                        splitsentences[n].insert(1, verb)
                                    generalrules = [['PDS', 'VAFIN'], ['PDS', 'VVFIN']]
                                    processed = 1
                                else:
                                    splitsentences = oldsplitsentences
                                    splitsentence = []
                            if token[n] == 'die' or token[n] == 'welche':
                                tokens = self.nlp(' '.join(sentence))
                                for word in tokens:
                                    if word.dep_ == 'rc':
                                        wordwithrc = word.text
                                rcORnot = gs.checkForAnnotation(sentence, 'rc', 'word.dep_')
                                oldsplitsentences = splitsentences
                                splitsentences = []
                                if rcORnot == 1:
                                    splitsentence = []
                                    for word in sentence:
                                        if word[-1] == ',':
                                            splitsentence.append(word[:-1])
                                        if word[-1] != ',':
                                            splitsentence.append(word)
                                        if word[-1] == ',' or word == ',':
                                            splitsentences.append(splitsentence)
                                            splitsentence = []
                                    splitsentences.append(splitsentence)
                                    if wordwithrc in splitsentences[n]:
                                        splitsentences[n][0] = 'diese'
                                        verb = splitsentences[n][-1]
                                        splitsentences[n] = splitsentences[n][:-1]
                                        splitsentences[n].insert(1, verb)
                                    generalrules = [['PDS', 'VAFIN'], ['PDS', 'VVFIN']]
                                    processed = 1
                                else:
                                    splitsentences = oldsplitsentences
                                    splitsentence = []
                            if token[n] == 'dem':
                                tokens = self.nlp(' '.join(sentence))
                                for word in tokens:
                                    if word.dep_ == 'rc':
                                        wordwithrc = word.text
                                rcORnot = gs.checkForAnnotation(sentence, 'rc', 'word.dep_')
                                oldsplitsentences = splitsentences
                                splitsentences = []
                                if rcORnot == 1:
                                    splitsentence = []
                                    for word in sentence:
                                        if word[-1] == ',':
                                            splitsentence.append(word[:-1])
                                        if word[-1] != ',' and word[-1] != '.':
                                            splitsentence.append(word)
                                        if word[-1] == ',':
                                            splitsentences.append(splitsentence)
                                            splitsentence = []
                                    splitsentences.append(splitsentence)
                                    if wordwithrc in splitsentences[n]:
                                        splitsentences[n][0] = 'diesem'
                                        verb = splitsentences[n][-1]
                                        splitsentences[n] = splitsentences[n][:-1]
                                        splitsentences[n].insert(1, verb)
                                    generalrules = [['PDS', 'VAFIN'], ['PDS', 'VVFIN']]
                                    processed = 1
                                else:
                                    splitsentences = oldsplitsentences
                                    splitsentence = []
                            if token[n] == 'das' or token[n] == 'welches':
                                tokens = self.nlp(' '.join(sentence))
                                for word in tokens:
                                    if word.dep_ == 'rc':
                                        wordwithrc = word.text
                                rcORnot = gs.checkForAnnotation(sentence, 'rc', 'word.dep_')
                                oldsplitsentences = splitsentences
                                splitsentences = []
                                if rcORnot == 1:
                                    splitsentence = []
                                    for word in sentence:
                                        if word[-1] == ',':
                                            splitsentence.append(word[:-1])
                                        if word[-1] != ',':
                                            splitsentence.append(word)
                                        if word[-1] == ',' or word == ',':
                                            splitsentences.append(splitsentence)
                                            splitsentence = []
                                    splitsentences.append(splitsentence)
                                    if wordwithrc in splitsentences[n]:
                                        splitsentences[n][0] = 'dieses'
                                        verb = splitsentences[n][-1]
                                        splitsentences[n] = splitsentences[n][:-1]
                                        splitsentences[n].insert(1, verb)
                                    generalrules = [['PDS', 'VAFIN'], ['PDS', 'VVFIN']]
                                    processed = 1
                                else:
                                    splitsentences = oldsplitsentences
                                    splitsentence = []
                            if token[n] == 'dessen' or token[n] == 'wessen':
                                tokens = self.nlp(' '.join(sentence))
                                for word in tokens:
                                    if word.dep_ == 'rc':
                                        wordwithrc = word.text
                                rcORnot = gs.checkForAnnotation(sentence, 'rc', 'word.dep_')
                                oldsplitsentences = splitsentences
                                splitsentences = []
                                if rcORnot == 1:
                                    splitsentence = []
                                    for word in sentence:
                                        if word[-1] == ',':
                                            splitsentence.append(word[:-1])
                                        if word[-1] != ',':
                                            splitsentence.append(word)
                                        if word[-1] == ',' or word == ',':
                                            splitsentences.append(splitsentence)
                                            splitsentence = []
                                    splitsentences.append(splitsentence)
                                    if wordwithrc in splitsentences[n]:
                                        verb = splitsentences[n][-1]
                                        splitsentences[n] = splitsentences[n][:-1]
                                        splitsentences[n].insert(1, verb)
                                    generalrules = [['PDS', 'VAFIN'], ['PDS', 'VVFIN']]
                                    processed = 1
                                else:
                                    splitsentences = oldsplitsentences
                                    splitsentence = []
                            if token[n] == 'den' or token[n] == 'welchen':
                                tokens = self.nlp(' '.join(sentence))
                                for word in tokens:
                                    if word.dep_ == 'rc':
                                        wordwithrc = word.text
                                rcORnot = gs.checkForAnnotation(sentence, 'rc', 'word.dep_')
                                oldsplitsentences = splitsentences
                                splitsentences = []
                                if rcORnot == 1:
                                    splitsentence = []
                                    for word in sentence:
                                        if word[-1] == ',':
                                            splitsentence.append(word[:-1])
                                        if word[-1] != ',':
                                            splitsentence.append(word)
                                        if word[-1] == ',' or word == ',':
                                            splitsentences.append(splitsentence)
                                            splitsentence = []
                                    splitsentences.append(splitsentence)
                                    if wordwithrc in splitsentences[n]:
                                        splitsentences[n][0] = 'diesen'
                                        verb = splitsentences[n][-1]
                                        splitsentences[n] = splitsentences[n][:-1]
                                        splitsentences[n].insert(1, verb)
                                    generalrules = [['PDS', 'VAFIN'], ['PDS', 'VVFIN']]
                                    processed = 1
                                else:
                                    splitsentences = oldsplitsentences
                                    splitsentence = []
                            if token[n] == 'wem' or token[n] == 'Wem' or token[n] == 'welchem':
                                daORnot = gs.checkForAnnotation(sentence, 'da', 'word.dep_')
                                oaORnot = gs.checkForAnnotation(sentence, 'oa', 'word.dep_')
                                reORnot = gs.checkForAnnotation(sentence, 're', 'word.dep_')
                                oldsplitsentences = splitsentences
                                splitsentences = []
                                for word in sentence:
                                    if word[-1] == ',':
                                        splitsentence.append(word[:-1])
                                    if word[-1] != ',':
                                        splitsentence.append(word)
                                    if word[-1] == ',' or word == ',':
                                        splitsentences.append(splitsentence)
                                        splitsentence = []
                                splitsentences.append(splitsentence)
                                if n == 0:
                                    index = 1
                                if n == 1:
                                    index = 0
                                if reORnot == 1:
                                    pass
                                if daORnot == 1 and reORnot == 0:
                                    splitsentences[index].insert(1, 'das')
                                if oaORnot == 1 and reORnot == 0:
                                    splitsentences[index].insert(1, 'dem')
                                if n == 1:
                                    splitsentences[0], splitsentences[1] = splitsentences[1], splitsentences[0]
                                generalrules = [['PDS', 'VAFIN'], ['PDS', 'VVFIN']]
                                processed = 1
                            if token[n] in self.indirectspeech_list and token[1] not in self.konsekutiv_list:
                                reORnot = gs.checkForAnnotation(sentence, 're', 'word.dep_')
                                oldsplitsentences = splitsentences
                                splitsentences = []
                                splitsentence = []
                                for word in sentence:
                                    if word[-1] == ',':
                                        splitsentence.append(word[:-1])
                                    if word[-1] != ',':
                                        splitsentence.append(word)
                                    if word[-1] == ',' or word == ',':
                                        splitsentences.append(splitsentence)
                                        splitsentence = []
                                splitsentences.append(splitsentence)
                                if n == 0:
                                    index = 1
                                if n == 1:
                                    index = 0
                                if reORnot == 0:
                                    if splitsentences[index][0] != 'was':
                                        splitsentences[index].insert(1, 'das')
                                if n == 1:
                                    splitsentences[0], splitsentences[1] = splitsentences[1], splitsentences[0]
                                generalrules = [['PDS', 'VAFIN'], ['PDS', 'VVFIN']]
                                processed = 1
                            if processed == 0 and n == 1:
                                ZUVINFTupelORnot = gs.checkForAnnotationTuple(sentence, ['PTKZU', 'VVINF'], 'word.tag_', 'None')
                                if ZUVINFTupelORnot == 0:
                                    ZUVINFTupelORnot = gs.checkForAnnotationTuple(sentence, ['PTKZU', 'VAINF'], 'word.tag_', 'None')
                                if ZUVINFTupelORnot == 1:
                                    reORnot = gs.checkForAnnotation(sentence, 're', 'word.dep_')
                                    splitsentence = []
                                    for word in sentence:
                                        if word[-1] == ',':
                                            splitsentence.append(word[:-1])
                                        if word[-1] != ',':
                                            splitsentence.append(word)
                                        if word[-1] == ',' or word == ',':
                                            splitsentences.append(splitsentence)
                                            processed = 1
                                            splitsentence = []
                                    splitsentences.append(splitsentence)
                                    for m in range(2):
                                        ZUINForNOT = gs.checkForAnnotationTuple(splitsentences[m], ['PTKZU', 'VVINF'], 'word.tag_', 'None')
                                        if ZUINForNOT == 0:
                                            ZUINForNOT = gs.checkForAnnotationTuple(splitsentences[m], ['PTKZU', 'VAINF'], 'word.tag_', 'None')
                                        if ZUINForNOT == 1:
                                            r = m
                                            ZUINForNOT = 0
                                    if r == 0:
                                        index = 1
                                    if r == 1:
                                        index = 0
                                    objectORnot = gs.checkForAnnotation(splitsentences[index], 'oa', 'word.dep_')
                                    if reORnot == 0 and objectORnot == 0:
                                        splitsentences[index].insert(1, 'das')
                                    if r == 1:
                                        splitsentences[0], splitsentences[1] = splitsentences[1], splitsentences[0]
                                else:
                                    # the original compared ('==') here instead of assigning
                                    processed = 2
                    except:
                        wasNotInAnyList = 1
                    #rules = [['ART','ADJA','NN'], ['ART','ADJA','NE'], ['ART', 'NN'], ['ART', 'NE'], ['APPR','NN'], ['APPR','NE'], ['APPR', 'ART', 'NN'], ['APPR', 'ART', 'NE'], ['APPR','ART','NN','ADJA','NN'], ['APPR','ART','NN','ADJA','NE'], ['KOKOM', 'ART', 'NN'], ['KOKOM', 'ART', 'NE'], ['PPOSAT', 'NN'], ['PPOSAT', 'NE'], ['ADV', 'ADJD']]
                    endsentences = []
                    if (processed == 2 or processed == 0) and n == 1:
                        wasNotInAnyList = 1
                    try:
                        if wasNotInAnyList == 0:
                            newpunctuationsindex.insert(0, [counter - 1, punctuations[counter - 1]])
                            if len(splitsentences) > 2:
                                splitsentences = splitsentences[:2]
                            for splitsentence in splitsentences:
                                wordtoputfirst = 'nada'
                                for word in self.firstwordlist:
                                    if word == splitsentence[0]:
                                        wordtoputfirst = word
                                        splitsentence.remove(word)
                                tuplesTocheck, triplesTocheck, quadruplesTocheck = self.gs.GetTuplesinSentence(splitsentence)
                                grammpiecessentence = self.gs.createTupleofGrammarpieces(splitsentence, tuplesTocheck, triplesTocheck, quadruplesTocheck)
                                if len(grammpiecessentence) > 7:
                                    print('A sentence is too long, too many permutations. \n piping wrong grammar..')
                                    endsentence = ' '.join(grammpiecessentence)
                                else:
                                    permutations = self.sgm.GeneratePermutationsOfSentence(grammpiecessentence)
                                    # permutations starting with these words must be
                                    # followed directly by a verb
                                    firstwordwithverblist = ['deswegen', 'danach']
                                    permutationstodelete = []
                                    for permutation in permutations:
                                        if permutation[0] in firstwordwithverblist:
                                            for word in self.nlp(permutation[1]):
                                                if word.tag_[0] != 'V':
                                                    permutationstodelete.append(permutation)
                                                    break
                                                else:
                                                    break
                                    for delperm in permutationstodelete:
                                        try:
                                            permutations.remove(delperm)
                                        except:
                                            pass
                                    sentencesToCheck = []
                                    if wordtoputfirst in self.firstwordlist:
                                        for sentence in permutations:
                                            sentencesToCheck.append(wordtoputfirst + ' ' + ' '.join(sentence))
                                    else:
                                        for sentence in permutations:
                                            sentencesToCheck.append(' '.join(sentence))
                                    endsentence = self.sgm.GetBestSentenceFromSentencesAccordingToGrammar(sentencesToCheck, ' '.join(splitsentence))
                                endsentences.append(endsentence)
                    except:
                        wasNotInAnyList = 1
                        endsentences = []
                        todelete = []
                        for index in range(len(newpunctuationsindex)):
                            if newpunctuationsindex[index][0] == counter - 1:
                                todelete.append(index)
                        for todel in todelete[::-1]:
                            del newpunctuationsindex[todel]
                    if wasNotInAnyList == 1:
                        endsplisentences = []
                        splisentence = []
                        for word in oldsentence:
                            if word[-1] == ',':
                                splisentence.append(word[:-1])
                            if word[-1] != ',':
                                splisentence.append(word)
                            if word[-1] == ',' or word == ',':
                                endsplisentences.append(splisentence)
                                splisentence = []
                        endsplisentences.append(splisentence)
                        newpunctuationsindex.insert(0, [counter - 1, punctuations[counter - 1]])
                        for splsentence in endsplisentences:
                            endsentences.append(' '.join(splsentence))
                    # Disabled alternative: statistical grammar correction via the
                    # FASTsearch databases loaded in LoadBoWModelAndDatabaseOnesZeros.
                    '''
                    fsearch1 = self.fsearch1
                    spacyclass1 = 'word.tag_'
                    gs_sentence1 = gs.Sentence2GrammarSchema(' '.join(splitsentence), spacyclass1)
                    print('searchPatternMatch for tags')
                    bestmatches1 = fsearch1.searchPatternMatch(' '.join(gs_sentence1), 1)
                    print('done')
                    right_gs_tupel1 = []
                    if len(bestmatches1) < 10:
                        bestndocs1 = len(bestmatches1)
                    else:
                        bestndocs1 = 10
                    for m in range(bestndocs1):
                        right_gs_tupel1.append(fsearch1.database[bestmatches1[m][0]])
                    statistically_correct_sentences1 = gs.Sentence2RightGrammarTupel(' '.join(splitsentence), gs_sentence1, right_gs_tupel1)
                    fsearch2 = self.fsearch2
                    spacyclass2 = 'word.dep_'
                    gs_sentence2 = gs.Sentence2GrammarSchema(' '.join(splitsentence), spacyclass2)
                    print('searchPatternMatch for deps')
                    bestmatches2 = fsearch2.searchPatternMatch(' '.join(gs_sentence2), 1)
                    print('done')
                    right_gs_tupel2 = []
                    if len(bestmatches2) < 10:
                        bestndocs2 = len(bestmatches2)
                    else:
                        bestndocs2 = 10
                    for m in range(bestndocs2):
                        right_gs_tupel2.append(fsearch2.database[bestmatches2[m][0]])
                    statistically_correct_sentences2 = gs.Sentence2RightGrammarTupel(' '.join(splitsentence), gs_sentence2, right_gs_tupel2)
                    print(splitsentence)
                    Rightsentence = gs.GetBestgsAccordingRules(' '.join(splitsentence), gs_sentence1, right_gs_tupel1, right_gs_tupel2, statistically_correct_sentences1, statistically_correct_sentences2, rules, generalrules)
                    '''
                    for endsentence in endsentences:
                        gramcorr_splitsentences.append(endsentence.split())
        for index in newpunctuationsindex:
            punctuations.insert(index[0], index[1])
        return gramcorr_splitsentences, punctuations
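
    # Example (illustrative; the final word order is picked by the SentGlue
    # permutation ranking): for the kausal pair
    # 'Ich bleibe zu Hause, weil es regnet.' the comma split drops 'weil',
    # prefixes 'deswegen' to the other clause and returns two token lists,
    # roughly ['es', 'regnet.'] and ['deswegen', 'bleibe', 'ich', 'zu', 'Hause'].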
    def putAppendixesIntoOwnSentences(self, sentences, punctuations):
        # Move noun appendixes such as 'NN APPR ART NN' ('die Frau mit dem Hut')
        # out of the sentence into their own 'X ist/hat Y' sentences.
        gs = self.gs
        #triples = [['NN', 'ART', 'NN'], ['NE', 'ART', 'NN'], ['NN', 'ART', 'NN'], ['NE', 'ART', 'NE']]
        quadruples = [['NN', 'APPR', 'NE', 'NN'], ['NN', 'APPR', 'NE', 'NN'], ['NN', 'APPR', 'ART', 'NN'], ['NE', 'APPR', 'ART', 'NN'], ['NN', 'APPR', 'ART', 'NE'], ['NE', 'APPR', 'ART', 'NE']]
        quadruplestochange = []
        triplestochange = []
        newsentences = []
        newpunctuations = []
        Whatisofnouns = []
        oldsentences = sentences
        oldpunctuations = punctuations
        for hauptindex in range(len(sentences)):
            sentence = sentences[hauptindex]
            try:
                #for triple in triples:
                #    AnnoOrNot, tripleInWords = gs.checkForAnnotationTriple(sentence, triple, 'word.tag_', 'None')
                #    for tripleinwor in tripleInWords:
                #        triplestochange.append([triple, tripleinwor])
                for quadruple in quadruples:
                    AnnoOrNot, quadrupleInWords = gs.checkForAnnotationQuadruple(sentence, quadruple, 'word.tag_', 'None')
                    for quadrupleInWo in quadrupleInWords:
                        quadruplestochange.append([quadruple, quadrupleInWo])
                for quad in quadruplestochange:
                    for n in range(len(sentence) - 4):
                        if sentence[n] == quad[1][0]:
                            if sentence[n + 1] == quad[1][1]:
                                if sentence[n + 2] == quad[1][2]:
                                    artword = None
                                    longerWhatisnoun = 0
                                    for m in range(2):
                                        for word in self.nlp(sentence[n - m]):
                                            if word.tag_ == 'ART':
                                                Nounthatis = sentence[n - m:n + 1]
                                                # use the already loaded model instead of
                                                # reloading de_core_news_sm per match
                                                token3 = self.nlp(sentence[n + 4])
                                                counter = 0
                                                Whatisnoun = sentence[n + 1:n + 4]
                                                for wor in token3:
                                                    counter += 1
                                                    if wor.tag_ == 'NN' or wor.tag_ == 'NE':
                                                        if counter == 1:
                                                            Whatisnoun = sentence[n + 1:n + 5]
                                                            longerWhatisnoun = 1
                                                        if counter == 2:
                                                            Whatisnoun = sentence[n + 1:n + 4]
                                                artword = word.text
                                    if ((artword == 'die' or artword == 'Die') and sentence[n][-1] != 'n') or ((artword == 'der' or artword == 'einer' or artword == 'dieser') and (sentence[n - 2] in ['von', 'in', 'auf', 'ueber', 'unter', 'nach', 'mit'])):
                                        if artword == 'der':
                                            Nounthatis[0] = 'die'
                                        donothing = 0
                                        if sentence[n + 1] == 'mit':
                                            if sentence[n + 2] == 'den':
                                                verb = ' hat die '
                                                Whatisnoun = Whatisnoun[2:]
                                            if sentence[n + 2] == 'der':
                                                verb = ' hat eine '
                                                Whatisnoun = Whatisnoun[2:]
                                            if sentence[n + 2] != 'der' and sentence[n + 2] != 'den':
                                                donothing = 1
                                        else:
                                            verb = ' ist '
                                        if donothing == 0:
                                            newsentence = ' '.join(Nounthatis) + verb + ' '.join(Whatisnoun)
                                            newsentences.append([hauptindex + 1, newsentence.split()])
                                            newpunctuations.append([hauptindex + 1, punctuations[hauptindex]])
                                            if longerWhatisnoun == 0:
                                                Whatisofnouns.append([n + 1, n + 4, hauptindex])
                                            else:
                                                Whatisofnouns.append([n + 1, n + 5, hauptindex])
            except:
                print('Could not process ' + str(sentence) + ' in characterization per sentence..')
        try:
            for whatis in Whatisofnouns[::-1]:
                thereisacomma = 0
                if sentences[whatis[2]][whatis[1] - 1][-1] == ',':
                    thereisacomma = 1
                if thereisacomma == 1:
                    sentences[whatis[2]][whatis[0] - 1] = sentences[whatis[2]][whatis[0] - 1] + ','
                del sentences[whatis[2]][whatis[0]:whatis[1]]
            for newsent in newsentences[::-1]:
                sentences.insert(newsent[0], newsent[1])
            for newpunct in newpunctuations[::-1]:
                punctuations.insert(newpunct[0], newpunct[1])
            for sentence in sentences:
                if sentence[-1][-1] == ',':
                    sentence[-1] = sentence[-1][:-1]
        except:
            print('Could not process the collected characterizations')
            # note: oldsentences/oldpunctuations are references, not copies, so
            # this restoration does not undo mutations made above
            sentences = oldsentences
            punctuations = oldpunctuations
        return sentences, punctuations
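

# Minimal usage sketch (not part of the original file; the pipeline order is
# inferred from the data shapes the methods consume). It assumes the pickled
# models and .hkl databases referenced in the loaders exist next to this file
# and that 'input.txt' (hypothetical name) contains tokenized German text; the
# initial punctuation list (one mark per sentence) is likewise an assumption.
if __name__ == '__main__':
    seg = SentSeg('german')
    seg.LoadSentGlueSGDandGSUtils()  # provides seg.nlp, seg.gs and seg.sgm
    sentences = seg.ReadDoc2Sent('input.txt')
    sentences = seg.CommaSentenceOrNot(sentences)
    sentences = seg.GetUtteranceNumber(sentences)
    sentences = seg.GetQuestionOrNot(sentences)
    punctuations = ['.' for _ in sentences]
    sentences, punctuations = seg.AndOrSolver(sentences, punctuations)
    print(sentences, punctuations)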