# split sentences
# The lists are still missing 'sondern' (and a few more things..)
# The following conjunctions need no sentence transformation:
# woraufhin, zudem, zumal, umso - desto
# 'sondern' is hard to resolve.. best to just cut at 'sondern' and run SentGlue afterwards
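# Illustrative example of the intended transformation (a sketch only; the exact
# output depends on the spacy parse and on the SentGlue ranking used below):
#   'Ich bleibe zu Hause, weil es regnet.'
#   -> 'Es regnet.' / 'Deswegen bleibe ich zu Hause.'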
class SentSeg(object):

    def __init__(self, language):
        self.language = language
        self.punktuation_list = ['.', '?', '!', ';', ':']
        self.wrappunktuation_list = [',', '-']
        # Conjunction lists, grouped by the type of clause they introduce.
        self.adversativ_list = ['wohingegen', 'Wohingegen', 'aber', 'Aber', 'wobei', 'Wobei', 'hingegen']
        self.final_list = ['damit', 'Damit', 'um', 'Um']
        self.kausal_list = ['weil', 'Weil', 'da', 'Da', 'denn', 'falls', 'Falls']
        self.konditional_list = ['wenn', 'Wenn', 'sobald', 'Sobald', 'als', 'falls']
        self.konsekutiv_list = ['dass', 'Dass']
        self.konzessiv_list = ['obwohl', 'Obwohl', 'obgleich', 'Obgleich', 'trotzdem', 'Trotzdem', 'wenngleich', 'doch']
        self.lokal_list = ['wo', 'Wo']
        self.temporal_list_vor = ['bevor', 'Bevor']
        self.temporal_list_nach = ['nachdem', 'Nachdem']
        self.instrumental_list = ['indem', 'Indem']
        self.indirectspeech_list = ['ob', 'Ob', 'wann', 'Wann', 'wer', 'Wer', 'wie', 'Wie', 'warum', 'Warum', 'weshalb', 'Weshalb', 'wieso', 'Wieso']
        self.firstwordlist = []
        #self.firstwordlist = ['wann', 'Wann', 'wer', 'Wer', 'wie', 'Wie', 'warum', 'Warum', 'weshalb', 'Weshalb', 'wieso', 'Wieso', 'dies', 'dann', 'jedoch', 'deswegen', 'trotzdem', 'danach', 'davor', 'wenn', 'sobald']
        self.full_list = (self.adversativ_list + self.final_list + self.kausal_list
                          + self.konditional_list + self.konsekutiv_list
                          + self.konzessiv_list + self.lokal_list
                          + self.temporal_list_nach + self.temporal_list_vor
                          + self.instrumental_list + self.indirectspeech_list)
    def ReadDoc2Sent(self, document):
        # Read a tokenized document line by line and split it into sentences
        # at sentence-final punctuation attached to a word.
        splitsentences = []
        splitsentence = []
        with open(document) as sentences:
            counter = 0
            for sentence in sentences:
                counter += 1
                if counter % 1000 == 0:
                    print(counter)
                words = sentence.split()
                for word in words:
                    splitsentence.append(word)
                    if (word[-1] in self.punktuation_list or word in self.punktuation_list) and len(word) > 2:
                        splitsentences.append([splitsentence])
                        splitsentence = []
        return splitsentences
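
    # Example (illustrative): an input file line 'Das ist ein Test. Noch ein Test.'
    # comes back as [[['Das', 'ist', 'ein', 'Test.']], [['Noch', 'ein', 'Test.']]].
    # Note the len(word) > 2 guard: a free-standing '.' does not close a sentence.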
    def AndOrSolver(self, sentences, punctuations):
        # Cut coordinations ('und', 'oder', 'jedoch', ...) between comma
        # segments that each carry their own subject: the cut points are marked
        # with an 'alohaseparator' suffix and the sentence is split there.
        for n in range(len(punctuations)):
            if punctuations[n] == ':' or punctuations[n] == '-':
                punctuations[n] = '.'
        counter = 0
        newsentences = []
        for sentence in sentences:
            newpunctuationsindexes = []
            utterancenumber = sentence[2]
            commainfo = sentence[1]
            commaornot = commainfo[0]
            sentence = sentence[0]
            counter += 1
            doc = self.nlp(' '.join(sentence))
            subjectcount = 0
            separationwords = []
            subjectcounts = []
            doccounter = 0
            subjectindex = []
            rcornot = 0
            for word in doc:
                doccounter += 1
                if word.dep_ == 'sb' or word.dep_ == 'ep':
                    subjectcount += 1
                    subjectindex.append(doccounter - 1)
                if word.dep_ == 'rc':
                    rcornot = 1
                if word.tag_ == '$,':
                    subjectcounts.append([subjectcount, doccounter - 2, subjectindex, rcornot])
                    subjectindex = []
                    subjectcount = 0
                    if len(sentence[doccounter - 2]) > 1:
                        doccounter -= 1
                if word.text in ('und', 'also', 'oder', 'schon', 'bald', 'doch', 'jedoch', 'sondern'):
                    separationwords.append(doccounter - 1)
            separationwordstocut = []
            listofownsentencessubjectindexes = []
            for n in range(len(subjectcounts) - 1):
                if subjectcounts[n][0] > 0 and subjectcounts[n + 1][0] > 0 and subjectcounts[n + 1][3] == 0:
                    listofownsentencessubjectindexes.append(subjectcounts[n])
                for m in range(len(separationwords)):
                    if subjectcounts[n][1] < separationwords[m] < subjectcounts[n + 1][1]:
                        if subjectcounts[n + 1][0] > 1:
                            if subjectcounts[n + 1][2][0] < separationwords[m] <= subjectcounts[n + 1][2][-1]:
                                separationwordstocut.append(separationwords[m])
            processed = 0
            if len(listofownsentencessubjectindexes) > 0:
                for n in range(len(listofownsentencessubjectindexes)):
                    sentence[listofownsentencessubjectindexes[n][1]] = sentence[listofownsentencessubjectindexes[n][1]] + 'alohaseparator'
                    newpunctuationsindexes.append([punctuations[counter - 1], counter - 1])
                processed = 1
            if len(separationwordstocut) > 0:
                for n in range(len(separationwordstocut)):
                    sentence[separationwordstocut[n] - 1] = sentence[separationwordstocut[n] - 1] + 'alohaseparator'
                    newpunctuationsindexes.append([punctuations[counter - 1], counter - 1])
                processed = 1
            if processed == 0:
                newsentences.append([sentence])
            if processed == 1:
                splitsentence = []
                for word in sentence:
                    splitsentence.append(word)
                    if word[-14:] == 'alohaseparator':
                        # strip the marker, plus a comma directly before it if present
                        if splitsentence[-1][-15] == ',':
                            splitsentence[-1] = splitsentence[-1][:-15]
                        else:
                            splitsentence[-1] = splitsentence[-1][:-14]
                        newsentences.append([splitsentence])
                        splitsentence = []
                newsentences.append([splitsentence])
            newpunctuationsindexes = newpunctuationsindexes[::-1]
            for n in range(len(newpunctuationsindexes)):
                punctuations.insert(newpunctuationsindexes[n][1], newpunctuationsindexes[n][0])
        return newsentences, punctuations
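
    # The expected input element for AndOrSolver is the annotated triple built
    # by the methods further down (a sketch of the shape, not enforced anywhere):
    #   [tokens, [n_commas, firstwords], n_subjects]
    # Only coordinations between comma segments that each carry their own
    # subject are cut, so 'Der Hund bellt und die Katze miaut.' stays in one piece.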
    def LoadBoWModelAndDatabaseOnesZeros(self):
        # Load the tag and dep hkl databases and generate/load their
        # bag-of-words models.
        import FASTsearch
        self.fsearch1 = FASTsearch.FASTsearch('GS_DB_word.tag_.hkl')
        self.fsearch1.Gen_BoW_Model(1000, 'word')
        self.fsearch1.Load_BoW_Model('bagofwordsGS_DB_word.tag_.pkl', 'DataBaseOneZerosGS_DB_word.tag_.hkl')
        self.fsearch2 = FASTsearch.FASTsearch('GS_DB_word.dep_.hkl')
        self.fsearch2.Gen_BoW_Model(1000, 'word')
        self.fsearch2.Load_BoW_Model('bagofwordsGS_DB_word.dep_.pkl', 'DataBaseOneZerosGS_DB_word.dep_.hkl')
    def LoadSentGlueSGDandGSUtils(self):
        # Initialize the GS utils, the SentGlue stochastic gradient descent
        # model and the spacy German model.
        import GS_Utils
        self.gs = GS_Utils.GS_Utils('de_core_news_sm')
        from SentGlue import SentGlueMach
        self.sgm = SentGlueMach('trainedSGD_twolabel.pkl', 'bagofwordstwolabel.pkl')
        self.sgm.initialize()
        import spacy
        self.nlp = spacy.load('de_core_news_sm')
        return 'done'
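
    # The two loaders above are meant to run once, before the NLP methods:
    # LoadSentGlueSGDandGSUtils() provides self.nlp, self.gs and self.sgm,
    # which AndOrSolver, EnumerationSolver and SplitCommatas rely on, while
    # LoadBoWModelAndDatabaseOnesZeros() provides self.fsearch1/self.fsearch2,
    # which only the disabled pattern-match block at the end of SplitCommatas uses.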
    def CommaSentenceOrNot(self, sentences):
        # Annotate every sentence with [number of commas, [first word of the
        # sentence, first word after each comma]].
        nlp = self.nlp
        commasentences = []
        for sentence in sentences:
            doc = nlp(' '.join(sentence[0]))
            n = 0
            firstone = 0
            token = []
            nextword = 0
            for word in doc:
                # word.pos_ is useful here for noun and verb, word.dep_ for
                # sb/pd, and possibly the tag
                if firstone == 0:
                    token.append(word.text)
                    firstone = 1
                if nextword == 1:
                    token.append(word.text)
                    nextword = 0
                if word.tag_ == '$,':
                    n += 1
                    nextword = 1
            sentence.append([n, token])
            commasentences.append(sentence)
        return commasentences
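
    # Example (illustrative, assuming spacy tags the comma as '$,'):
    #   [['Er', 'blieb,', 'weil', 'es', 'regnete.']]
    #   -> [['Er', 'blieb,', 'weil', 'es', 'regnete.'], [1, ['Er', 'weil']]]
    # i.e. one comma, plus the first word and the word after the comma.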
    def EnumerationSolver(self, sentences):
        # Resolve enumerations ('A, B und C ...'): the comma/'und' segments are
        # separated, the first or last segment (whichever has the more complete
        # SPO) is taken as the main sentence, and every other element is merged
        # into a candidate copy of it; SentGlue picks the most probable candidate.
        gs = self.gs
        nlp = self.nlp
        sgm = self.sgm
        enumerationsentences = []
        NOTenumerations = []
        for sentence in sentences:
            doc = nlp(' '.join(sentence[0]))
            n = 0
            enumeration = False
            splitsentence = []
            splitsentence_deps = []
            splitsentence_tags = []
            splitsentences = []
            splitsentences_deps = []
            splitsentences_tags = []
            for word in doc:
                # word.pos_ is useful here for noun and verb, word.dep_ for
                # sb/pd, and possibly the tag
                if word.tag_ == '$,':
                    n += 1
                if (word.text == 'und' or word.text == 'oder') and n >= 1:
                    enumeration = True
                    break
            output = []
            if enumeration == True:
                for word in doc:
                    if word.text != ',' and word.text != '.' and word.text != 'und':
                        splitsentence.append(word.text)
                        splitsentence_deps.append(word.dep_)
                        splitsentence_tags.append(word.tag_)
                    if word.text == ',' or word.text == 'und':
                        splitsentences.append(splitsentence)
                        splitsentences_deps.append(splitsentence_deps)
                        splitsentences_tags.append(splitsentence_tags)
                        splitsentence = []
                        splitsentence_deps = []
                        splitsentence_tags = []
                splitsentences.append(splitsentence)
                splitsentences_deps.append(splitsentence_deps)
                splitsentences_tags.append(splitsentence_tags)
                token = []
                enumerations = []
                enumerationsSPOs = []
                NOTenumerations = []
                for sentence in splitsentences:
                    token.append(sentence[0])
                    if sentence[0] not in self.full_list:
                        enumerations.append(sentence)
                        enumerationsSPOs.append(gs.checkSPO(sentence, 0))
                    else:
                        NOTenumerations.append(sentence)
                biggest = []
                for i in range(len(enumerationsSPOs)):
                    biggest.append([i, sum(enumerationsSPOs[i])])
                sortedbiggest = sorted(biggest[::-1], key=lambda tup: tup[1], reverse=True)
                for i in range(len(sortedbiggest)):
                    if sortedbiggest[i][0] == 0:
                        mainsentenceIndex = sortedbiggest[i][0]
                        lastornot = 0
                        break
                    if sortedbiggest[i][0] == len(biggest) - 1:
                        mainsentenceIndex = sortedbiggest[i][0]
                        lastornot = 1
                        break
                # TODO: For the case 'Er, sie und der Beamte LACHTEN den Clown aus'
                # the plural 'lachten' still has to be caught via the database of
                # cases, i.e. an enumeration with SPO 1 0 0 + plural has to become
                # singular depending on the articles.
                mainsentence = enumerations[mainsentenceIndex]
                probablemainsentences = []
                for i in range(len(enumerations)):
                    if i != mainsentenceIndex:
                        iprobablemainsentences = []
                        probablemainsentence = []
                        if lastornot == 0:
                            for j in range(1, len(mainsentence)):
                                probablemainsentence = mainsentence[0:j] + enumerations[i]
                                iprobablemainsentences.append(' '.join(probablemainsentence))
                        if lastornot == 1:
                            for j in range(1, len(mainsentence)):
                                probablemainsentence = enumerations[i] + mainsentence[-j:]
                                iprobablemainsentences.append(' '.join(probablemainsentence))
                        probablemainsentences.append(iprobablemainsentences)
                # Here we only check for presence, but in this case it is more
                # important that a tuple does not show up torn apart. CHANGE !!!!
                tuplesToCheck = []
                tuples = [['ART', 'NN'], ['APPR', 'NN'], ['ART', 'CARD']]
                for tupl in tuples:
                    checktupleindex, tupleInWords = gs.checkForAnnotationTuple(mainsentence, tupl, 'word.tag_', 'None')
                    if checktupleindex == 2:
                        tuplesToCheck.append([tupl, tupleInWords])
                triplesToCheck = []
                triples = [['ART', 'ADJA', 'NN'], ['APPR', 'ART', 'NN'], ['KOKOM', 'ART', 'NN']]
                for tripl in triples:
                    checktripleindex, tripleInWords = gs.checkForAnnotationTriple(mainsentence, tripl, 'word.tag_', 'None')
                    if checktripleindex == 3:
                        triplesToCheck.append([tripl, tripleInWords])
                for probsentences in probablemainsentences:
                    filteredprobsentences = []
                    for sentence in probsentences:
                        tuplchecked = 0
                        triplchecked = 0
                        for tupl in tuplesToCheck:
                            checkedsecondtime, tupleinWords = gs.checkForAnnotationTuple(sentence.split(), tupl[0], 'word.tag_', tupl[1])
                            if checkedsecondtime == 1:
                                tuplchecked = 0
                            if checkedsecondtime == 2:
                                tuplchecked = 1
                        for tripl in triplesToCheck:
                            checkedsecondtime, tripleinWords = gs.checkForAnnotationTriple(sentence.split(), tripl[0], 'word.tag_', tripl[1])
                            if checkedsecondtime == 1 or checkedsecondtime == 2:
                                triplchecked = 0
                            if checkedsecondtime == 3:
                                triplchecked = 1
                        if triplchecked == 1 or tuplchecked == 1:
                            filteredprobsentences.append(sentence)
                    if len(filteredprobsentences) == 0:
                        filteredprobsentences = probsentences
                    # There is still the problem here that these are lists of
                    # words instead of proper sentences..
                    probsMatrix = sgm.predictprobsOnSentenceList(filteredprobsentences, filteredprobsentences)
                    for i in range(len(probsMatrix)):
                        probsMatrix[i][0] = i
                    sortedprobsMatrix = sorted(probsMatrix[::-1], key=lambda tup: tup[1], reverse=True)
                    bestindex = sortedprobsMatrix[0][0]
                    probablemainsentence = filteredprobsentences[int(bestindex)]
                    enumerationsentences.append([probablemainsentence])
                enumerationsentences.append([' '.join(mainsentence)])
                for notenum in NOTenumerations:
                    enumerationsentences[-1].append(' '.join(notenum))
                    enumerationsentences[-1] = [', '.join(enumerationsentences[-1])]
            else:
                enumerationsentences.append([sentence])
        output.append(enumerationsentences)
        for n in range(len(output[0])):
            try:
                output[0][n] = [output[0][n][0].split()]
            except:
                output[0][n] = [output[0][n][0][0]]
        return output[0]
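
    # Example (illustrative; the merged variants are chosen by the SentGlue
    # ranking): 'Der Mann kauft Brot, Milch und Eier.' is cut at ','/'und', the
    # first segment becomes the main sentence, and each element is glued into a
    # candidate copy of it, yielding roughly 'Der Mann kauft Milch',
    # 'Der Mann kauft Eier' and 'Der Mann kauft Brot'.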
    def GetUtteranceNumber(self, sentences):
        # Append the number of utterances, i.e. subjects ('sb') and expletives
        # ('ep') found by the dependency parse, to every sentence.
        nlp = self.nlp
        uttersentences = []
        for sentence in sentences:
            doc = nlp(' '.join(sentence[0]))
            subjectcount = 0
            for word in doc:
                if word.dep_ == 'sb' or word.dep_ == 'ep':
                    subjectcount += 1
            sentence.append(subjectcount)
            uttersentences.append(sentence)
        return uttersentences
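
    # GetUtteranceNumber appends the subject count of the dependency parse,
    # e.g. 2 for 'Der Hund bellt, die Katze miaut.' (two 'sb' annotations).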
    def GetQuestionOrNot(self, sentences):
        # Append 1 to every sentence that contains a question mark, 0 otherwise.
        nlp = self.nlp
        uttersentences = []
        for sentence in sentences:
            doc = nlp(' '.join(sentence[0]))
            # reset per sentence; in the original the flag leaked from one
            # question into all following sentences
            questionmark = 0
            for word in doc:
                if word.text == '?':
                    questionmark = 1
            sentence.append(questionmark)
            uttersentences.append(sentence)
        return uttersentences
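
    # After CommaSentenceOrNot, GetUtteranceNumber and GetQuestionOrNot each
    # sentence entry has the shape (illustrative):
    #   [tokens, [n_commas, firstwords], n_subjects, question_flag]
    # which is the input format AndOrSolver, SplitSentencesIntoHauptNebenTuple
    # and SplitCommatas index into.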
    def SplitSentencesIntoHauptNebenTuple(self, sentences, punctuations):
        # Split sentences with two or more commas into main-clause/subordinate-
        # clause pairs ('Hauptsatz/Nebensatz'), based on SPO completeness and on
        # the 'cp'/'rc' dependency annotations of the comma segments.
        oldsplitsentences = []
        gs = self.gs
        nlp = self.nlp
        sentencesThatAreOutoutput = []
        outsentences = []
        for generalindex in range(len(sentences)):
            presentence = sentences[generalindex]
            splitsentence = []
            splitsentence_deps = []
            splitsentence_tags = []
            splitsentences = []
            splitsentences_deps = []
            splitsentences_tags = []
            commainfo = presentence[1]
            outputsentence = []
            token = commainfo[1]
            commaornot = commainfo[0]
            numberutterances = presentence[2]
            sentence = presentence[0]
            oldsentence = presentence[0]
            if commaornot >= 2:
                sentence[0] = sentence[0].title()
                doc = nlp(' '.join(sentence))
                for word in doc:
                    if word.text != ',' and word.text != '.':
                        splitsentence.append(word.text)
                        splitsentence_deps.append(word.dep_)
                        splitsentence_tags.append(word.tag_)
                    if word.text == ',':
                        splitsentences.append(splitsentence)
                        splitsentences_deps.append(splitsentence_deps)
                        splitsentences_tags.append(splitsentence_tags)
                        splitsentence = []
                        splitsentence_deps = []
                        splitsentence_tags = []
                splitsentences.append(splitsentence)
                splitsentences[0][0] = splitsentences[0][0].lower()
                splitsentences_deps.append(splitsentence_deps)
                splitsentences_tags.append(splitsentence_tags)
                oldsplitsentences = splitsentences
                spo = []
                for n in range(len(splitsentences)):
                    prespo = gs.checkSPO(splitsentences_deps[n], 1)
                    prespo.append(gs.checkForAnnotation(splitsentences[n], 'VVINF', 'word.tag_'))
                    prespo.append(gs.checkForAnnotation(splitsentences[n], 'VAFIN', 'word.tag_'))
                    prespo.append(gs.checkForAnnotation(splitsentences[n], 'VVFIN', 'word.tag_'))
                    prespo.append(gs.checkForAnnotation(splitsentences[n], 'VMFIN', 'word.tag_'))
                    spo.append(prespo)
                indexSPO = []
                lastm = len(splitsentences)
                for o in range(len(splitsentences)):
                    m = len(splitsentences) - 1 - o
                    for n in range(len(splitsentences)):
                        if m < n - 1 and n < lastm:
                            sb = spo[m][0] + spo[n][0]
                            Vafin = 1
                            if spo[m][3] == 1 or spo[n][3] == 1:
                                Vafin = spo[m][3] + spo[n][3]
                            Vvinf = 1
                            if spo[m][4] == 1 or spo[n][4] == 1:
                                Vvinf = spo[m][4] + spo[n][4]
                            Vvfin = 1
                            if spo[m][5] == 1 or spo[n][5] == 1:
                                Vvfin = spo[m][5] + spo[n][5]
                            Vmfin = 1
                            if spo[m][6] == 1 or spo[n][6] == 1:
                                # the original compared ('==') here instead of assigning
                                Vmfin = spo[m][6] + spo[n][6]
                            if sb == 1 and Vafin == 1 and Vvinf == 1 and (Vvfin == 1 or Vmfin == 1):
                                indexSPO.append([m, n])
                                lastm = m
                Hauptsentences = []
                for n in range(len(indexSPO)):
                    if indexSPO[n][0] > indexSPO[n][1]:
                        i = 1
                        j = 0
                    else:
                        i = 0
                        j = 1
                    Hauptsentences.append([splitsentences[indexSPO[n][i]] + splitsentences[indexSPO[n][j]], indexSPO[n][i], indexSPO[n][j]])
                HauptSentences = []
                for n in range(len(Hauptsentences)):
                    m = len(Hauptsentences) - 1 - n
                    HauptSentences.append(Hauptsentences[m])
                sentencesThatAreOut = []
                for n in range(len(HauptSentences)):
                    index = HauptSentences[n][1]
                    finish = 0
                    if n == len(HauptSentences) - 1:
                        stopindex = len(splitsentences)
                        finish = 1
                    else:
                        stopindex = HauptSentences[n + 1][1]
                    vvfinisthere = 0
                    if finish == 0:
                        if splitsentences_tags[stopindex][0] == 'VVFIN':
                            stopindex -= 1
                            vvfinisthere = 1
                    if splitsentences_tags[index][0] == 'VVFIN':
                        vvfinisthere = 1
                    if vvfinisthere == 1:
                        HNTuple = HauptSentences[n][0] + [','] + splitsentences[index - 1]
                        outputsentence.append(HNTuple)
                        sentencesThatAreOut.append(index - 1)
                        sentencesThatAreOut.append(Hauptsentences[n][1])
                        sentencesThatAreOut.append(Hauptsentences[n][2])
                    for m in range(index + 1, stopindex):
                        if m != HauptSentences[n][2]:
                            HNTuple = HauptSentences[n][0] + [','] + splitsentences[m]
                            outputsentence.append(HNTuple)
                            sentencesThatAreOut.append(m)
                            sentencesThatAreOut.append(Hauptsentences[n][1])
                            sentencesThatAreOut.append(Hauptsentences[n][2])
                sentencesThatAreOutoutput.append(sentencesThatAreOut)
                cpOrNots = []
                rcOrNots = []
                for splitsentence in splitsentences_deps:
                    cpOrNots.append(gs.checkForAnnotationInTokenizedSentence(splitsentence, 'cp'))
                    rcOrNots.append(gs.checkForAnnotationInTokenizedSentence(splitsentence, 'rc'))
                pairs = []
                for n in range(len(cpOrNots)):
                    index = len(cpOrNots) - 1 - n
                    done = 0
                    if rcOrNots[index] == 1:
                        pairs.append([index, index - 1])
                        done = 1
                    if done == 0 and cpOrNots[index] == 1:
                        try:
                            if splitsentences_tags[index + 1][0] == 'VVFIN':
                                pairs.append([index, index + 1])
                                done = 1
                        except:
                            pass
                        try:
                            if done == 0 and rcOrNots[index - 1] == 0:
                                pairs.append([index, index - 1])
                                done = 1
                        except:
                            pass
                        try:
                            if done == 0 and rcOrNots[index - 1] == 1:
                                if rcOrNots[index - 2] == 0:
                                    pairs.append([index, index - 2])
                        except:
                            pass
                for pair in pairs[::-1]:
                    if pair[0] not in set(sentencesThatAreOut) or pair[1] not in set(sentencesThatAreOut):
                        outputsentence.append(splitsentences[pair[1]] + [','] + splitsentences[pair[0]])
                sentences[generalindex][0] = outputsentence
            try:
                if type(sentences[generalindex][0][0]) == str:
                    sentences[generalindex][0] = [sentences[generalindex][0]]
            except:
                pass
            newgeneratedsentences = len(sentences[generalindex][0])
            if newgeneratedsentences > 1:
                for sentence in sentences[generalindex][0]:
                    punctuations.insert(generalindex, punctuations[generalindex])
                    outsentences.append(sentence)
                del punctuations[generalindex]
            if newgeneratedsentences == 1:
                if len(sentences[generalindex][0][0]) > 1:
                    outsentences.append(sentences[generalindex][0][0])
                else:
                    outsentences.append(oldsentence)
            if newgeneratedsentences == 0:
                outsentences.append(oldsentence)
        # attach free-standing commas to the word before them
        for outsentence in outsentences:
            todelete = []
            for n in range(len(outsentence)):
                if outsentence[n] == ',':
                    todelete.append(n)
                    outsentence[n - 1] = outsentence[n - 1] + ','
            for deleteindex in todelete[::-1]:
                del outsentence[deleteindex]
        for index in range(len(outsentences)):
            outsentences[index] = [outsentences[index]]
        # remove duplicates
        doubledsentences = []
        for o in range(len(outsentences)):
            sentence = outsentences[o][0]
            for m in range(len(outsentences)):
                if m != o:
                    count = 0
                    for n in range(len(sentence)):
                        if sentence[n] in outsentences[m][0] or sentence[n][:-1] in outsentences[m][0]:
                            count += 1
                    if count == len(sentence):
                        doubledsentences.append(sentence)
        tmp = set()
        for sentence in doubledsentences:
            tmp.add(tuple(sentence))
        doubledsentences = []
        for tup in tmp:
            doubledsentences.append([list(tup)])
        punctdeleteindexes = []
        for double in doubledsentences:
            if double in outsentences:
                punctdeleteindex = outsentences[::-1].index(double)
                del outsentences[len(outsentences) - 1 - punctdeleteindex]
                punctdeleteindexes.append(punctdeleteindex)
        for index in punctdeleteindexes[::-1]:
            del punctuations[len(outsentences) - 1 - index]
        for o in range(len(oldsplitsentences)):
            for m in range(len(outsentences)):
                counter = 0
                for n in range(len(oldsplitsentences[o])):
                    if oldsplitsentences[o][n] in outsentences[m][0] or oldsplitsentences[o][n] + ',' in outsentences[m][0]:
                        counter += 1
                if counter >= len(oldsplitsentences[o]):
                    break
                if m == len(outsentences) - 1 and counter < len(oldsplitsentences[o]):
                    # a comma segment got lost: re-insert it at a fitting position
                    if o == 0:
                        outsentences.insert(0, [oldsplitsentences[o]])
                        punctuations.insert(0, punctuations[0])
                    else:
                        newones = []
                        for i in range(len(outsentences)):
                            if outsentences[i][0][-1] == oldsplitsentences[o - 1][-1]:
                                if len(outsentences[i][0]) > 2 and len(oldsplitsentences[o - 1]) > 2:
                                    if outsentences[i][0][-2] == oldsplitsentences[o - 1][-2]:
                                        if outsentences[i][0][-3] == oldsplitsentences[o - 1][-3]:
                                            newones.append([i + 1, [oldsplitsentences[o]]])
                        for newone in newones[::-1]:
                            outsentences.insert(newone[0], newone[1])
                            punctuations.insert(newone[0], punctuations[newone[0] - 1])
        return outsentences, punctuations
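
    # Example (illustrative): for 'Der Mann, der dort wohnt, lacht laut.' the
    # method tries to recombine the comma segments into one main clause
    # ('der Mann lacht laut') plus its subordinate clause (', der dort wohnt'),
    # so that SplitCommatas below receives clean one-comma pairs.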
    # Note: the input here always has to be pairs of main clause/subordinate
    # clause, i.e. a further upstream class is needed.
    def SplitCommatas(self, Inputsentences, punctuations):
        # Split one-comma sentences into two standalone sentences, depending on
        # the conjunction type, and let SentGlue pick the best word order.
        gs = self.gs
        nlp = self.nlp
        gramcorr_splitsentences = []
        counter = 0
        newpunctuationsindex = []
        for Inputsentence in Inputsentences:
            counter += 1
            commainfo = Inputsentence[1]
            token = commainfo[1]
            commaornot = commainfo[0]
            numberutterances = Inputsentence[2]
            if commaornot == 0:
                gramcorr_splitsentences.append(Inputsentence[0])
            if commaornot > 1:
                gramcorr_splitsentences.append(Inputsentence[0])
            if commaornot == 1:
                oldsentence = Inputsentence[0]
                Inputsentence = [[Inputsentence[0]]]
                for sentence in Inputsentence[0]:
                    splitsentence = []
                    splitsentences = []
                    processed = 0
                    wasNotInAnyList = 0
                    try:
                        for n in range(len(token)):
                            if token[n] in self.final_list:
                                splitsentence = []
                                for word in sentence:
                                    if word != token[n]:
                                        if word[-1] == ',':
                                            splitsentence.append(word[:-1])
                                        if word[-1] != ',':
                                            splitsentence.append(word)
                                    if word[-1] == ',' or word == ',':
                                        splitsentences.append(splitsentence)
                                        splitsentence = []
                                splitsentences.append(splitsentence)
                                if n == 1:
                                    if token[n] == 'um' or token[n] == 'Um':
                                        splitsentences[n].insert(0, 'dies')
                                        splitsentences[n].insert(0, 'um')
                                    else:
                                        splitsentences[n].insert(0, 'dann')
                                if n == 0:
                                    if token[n] == 'um' or token[n] == 'Um':
                                        splitsentences[n].insert(0, 'dies')
                                        splitsentences[n].insert(0, 'um')
                                        splitsentences = splitsentences[::-1]
                                    else:
                                        splitsentences[n].insert(0, 'dann')
                                        splitsentences[0], splitsentences[1] = splitsentences[1], splitsentences[0]
                                generalrules = [['ADV', 'VAFIN'], ['ADV', 'VVFIN']]
                                processed = 1
                            if token[n] in self.adversativ_list:
                                splitsentence = []
                                for word in sentence:
                                    if word != token[n]:
                                        if word[-1] == ',':
                                            splitsentence.append(word[:-1])
                                        if word[-1] != ',':
                                            splitsentence.append(word)
                                    if word[-1] == ',' or word == ',':
                                        splitsentences.append(splitsentence)
                                        splitsentence = []
                                splitsentences.append(splitsentence)
                                splitsentences[n].append('jedoch')
                                generalrules = [['ADV', 'VAFIN'], ['ADV', 'VVFIN']]
                                processed = 1
                            if token[n] in self.kausal_list:
                                splitsentence = []
                                for word in sentence:
                                    if word != token[n]:
                                        if word[-1] == ',':
                                            splitsentence.append(word[:-1])
                                        if word[-1] != ',':
                                            splitsentence.append(word)
                                    if word[-1] == ',' or word == ',':
                                        splitsentences.append(splitsentence)
                                        splitsentence = []
                                splitsentences.append(splitsentence)
                                # Since 'deswegen' is attached to the other clause, the
                                # input to commasentences always has to be TWO sentences.
                                if n == 1:
                                    splitsentences[n - 1].insert(0, 'deswegen')
                                    splitsentences[0], splitsentences[1] = splitsentences[1], splitsentences[0]
                                if n == 0:
                                    splitsentences[n + 1].insert(0, 'deswegen')
                                generalrules = [['PROAV', 'VAFIN'], ['PROAV', 'VVFIN']]
                                processed = 1
                            # konsekutiv clauses are split according to
                            # https://www.deutschplus.net/pages/Konsekutivsatz
                            if token[n] in self.konsekutiv_list:
                                splitsentence = []
                                for word in sentence:
                                    if word != token[n]:
                                        if word[-1] == ',':
                                            splitsentence.append(word[:-1])
                                        if word[-1] != ',':
                                            splitsentence.append(word)
                                    if word[-1] == ',' or word == ',':
                                        splitsentences.append(splitsentence)
                                        splitsentence = []
                                splitsentences.append(splitsentence)
                                generalrules = [['KOUS', 'PPER']]
                                processed = 1
                            if token[n] in self.konditional_list:
                                splitsentence = []
                                for word in sentence:
                                    if word[-1] == ',':
                                        splitsentence.append(word[:-1])
                                    if word[-1] != ',':
                                        splitsentence.append(word)
                                    if word[-1] == ',' or word == ',':
                                        splitsentences.append(splitsentence)
                                        splitsentence = []
                                splitsentences.append(splitsentence)
                                if n == 1:
                                    spoCount = sum(gs.checkSPO(splitsentences[n], 0))
                                    if spoCount == 2:
                                        thereisanes = 0
                                        for word in splitsentences[n]:
                                            if word == 'es' or word == 'Es':
                                                thereisanes = 1
                                        if thereisanes == 0:
                                            splitsentences[n].append('es')
                                if n == 0:
                                    spoCount = sum(gs.checkSPO(splitsentences[n], 0))
                                    if spoCount == 2:
                                        thereisanes = 0
                                        for word in splitsentences[n]:
                                            if word == 'es' or word == 'Es':
                                                thereisanes = 1
                                        if thereisanes == 0:
                                            splitsentences[n].append('es')
                                    splitsentences[0], splitsentences[1] = splitsentences[1], splitsentences[0]
                                generalrules = [['KOUS', 'PPER']]
                                processed = 1
                            if token[n] in self.konzessiv_list:
                                splitsentence = []
                                for word in sentence:
                                    if word != token[n]:
                                        if word[-1] == ',':
                                            splitsentence.append(word[:-1])
                                        if word[-1] != ',':
                                            splitsentence.append(word)
                                    if word[-1] == ',' or word == ',':
                                        splitsentences.append(splitsentence)
                                        splitsentence = []
                                splitsentences.append(splitsentence)
                                if n == 1:
                                    splitsentences[n - 1].insert(0, 'trotzdem')
                                    splitsentences[0], splitsentences[1] = splitsentences[1], splitsentences[0]
                                if n == 0:
                                    splitsentences[n + 1].insert(0, 'trotzdem')
                                generalrules = [['PROAV', 'VAFIN'], ['PROAV', 'VVFIN']]
                                processed = 1
                            if token[n] in self.lokal_list:
                                splitsentence = []
                                for word in sentence:
                                    if word != token[n]:
                                        if word[-1] == ',':
                                            splitsentence.append(word[:-1])
                                        if word[-1] != ',':
                                            splitsentence.append(word)
                                    if word[-1] == ',' or word == ',':
                                        splitsentences.append(splitsentence)
                                        splitsentence = []
                                splitsentences.append(splitsentence)
                                if n == 1:
                                    splitsentences[n - 1].insert(0, 'dort')
                                    splitsentences[0], splitsentences[1] = splitsentences[1], splitsentences[0]
                                if n == 0:
                                    splitsentences[n + 1].insert(0, 'dort')
                                generalrules = [['PROAV', 'VAFIN'], ['PROAV', 'VVFIN']]
                                processed = 1
                            if token[n] in self.instrumental_list:
                                splitsentence = []
                                for word in sentence:
                                    if word != token[n]:
                                        if word[-1] == ',':
                                            splitsentence.append(word[:-1])
                                        if word[-1] != ',':
                                            splitsentence.append(word)
                                    if word[-1] == ',' or word == ',':
                                        splitsentences.append(splitsentence)
                                        splitsentence = []
                                splitsentences.append(splitsentence)
                                if n == 1:
                                    splitsentences[n - 1].insert(0, 'so')
                                    splitsentences[0], splitsentences[1] = splitsentences[1], splitsentences[0]
                                if n == 0:
                                    splitsentences[n + 1].insert(0, 'so')
                                generalrules = [['ADV', 'VAFIN'], ['ADV', 'VVFIN']]
                                processed = 1
                            if token[n] in self.temporal_list_vor:
                                splitsentence = []
                                for word in sentence:
                                    if word != token[n]:
                                        if word[-1] == ',':
                                            splitsentence.append(word[:-1])
                                        if word[-1] != ',':
                                            splitsentence.append(word)
                                    if word[-1] == ',' or word == ',':
                                        splitsentences.append(splitsentence)
                                        splitsentence = []
                                splitsentences.append(splitsentence)
                                if n == 1:
                                    splitsentences[n].insert(0, 'danach')
                                if n == 0:
                                    splitsentences[n].insert(0, 'danach')
                                    splitsentences[0], splitsentences[1] = splitsentences[1], splitsentences[0]
                                generalrules = [['PROAV', 'VAFIN'], ['PROAV', 'VVFIN']]
                                processed = 1
                            if token[n] in self.temporal_list_nach:
                                splitsentence = []
                                for word in sentence:
                                    if word != token[n]:
                                        if word[-1] == ',':
                                            splitsentence.append(word[:-1])
                                        if word[-1] != ',':
                                            splitsentence.append(word)
                                    if word[-1] == ',' or word == ',':
                                        splitsentences.append(splitsentence)
                                        splitsentence = []
                                splitsentences.append(splitsentence)
                                if n == 1:
                                    splitsentences[n].insert(0, 'davor')
                                if n == 0:
                                    splitsentences[n].insert(0, 'davor')
                                    splitsentences[0], splitsentences[1] = splitsentences[1], splitsentences[0]
                                generalrules = [['PROAV', 'VAFIN'], ['PROAV', 'VVFIN']]
                                processed = 1
                            if token[n] == 'der' or token[n] == 'welcher':
                                tokens = self.nlp(' '.join(sentence))
                                for word in tokens:
                                    if word.dep_ == 'rc':
                                        wordwithrc = word.text
                                rcORnot = gs.checkForAnnotation(sentence, 'rc', 'word.dep_')
                                oldsplitsentences = splitsentences
                                splitsentences = []
                                if rcORnot == 1:
                                    splitsentence = []
                                    for word in sentence:
                                        if word[-1] == ',':
                                            splitsentence.append(word[:-1])
                                        if word[-1] != ',':
                                            splitsentence.append(word)
                                        if word[-1] == ',' or word == ',':
                                            splitsentences.append(splitsentence)
                                            splitsentence = []
                                    splitsentences.append(splitsentence)
                                    # The swap is done here because otherwise spacy does
                                    # not read 'dieser' as PDS.. analogous in the other cases.
                                    if wordwithrc in splitsentences[n]:
                                        splitsentences[n][0] = 'dieser'
                                        verb = splitsentences[n][-1]
                                        splitsentences[n] = splitsentences[n][:-1]
                                        splitsentences[n].insert(1, verb)
                                    generalrules = [['PDS', 'VAFIN'], ['PDS', 'VVFIN']]
                                    processed = 1
                                else:
                                    splitsentences = oldsplitsentences
                                    splitsentence = []
                            if token[n] == 'die' or token[n] == 'welche':
                                tokens = self.nlp(' '.join(sentence))
                                for word in tokens:
                                    if word.dep_ == 'rc':
                                        wordwithrc = word.text
                                rcORnot = gs.checkForAnnotation(sentence, 'rc', 'word.dep_')
                                oldsplitsentences = splitsentences
                                splitsentences = []
                                if rcORnot == 1:
                                    splitsentence = []
                                    for word in sentence:
                                        if word[-1] == ',':
                                            splitsentence.append(word[:-1])
                                        if word[-1] != ',':
                                            splitsentence.append(word)
                                        if word[-1] == ',' or word == ',':
                                            splitsentences.append(splitsentence)
                                            splitsentence = []
                                    splitsentences.append(splitsentence)
                                    if wordwithrc in splitsentences[n]:
                                        splitsentences[n][0] = 'diese'
                                        verb = splitsentences[n][-1]
                                        splitsentences[n] = splitsentences[n][:-1]
                                        splitsentences[n].insert(1, verb)
                                    generalrules = [['PDS', 'VAFIN'], ['PDS', 'VVFIN']]
                                    processed = 1
                                else:
                                    splitsentences = oldsplitsentences
                                    splitsentence = []
                            if token[n] == 'dem':
                                tokens = self.nlp(' '.join(sentence))
                                for word in tokens:
                                    if word.dep_ == 'rc':
                                        wordwithrc = word.text
                                rcORnot = gs.checkForAnnotation(sentence, 'rc', 'word.dep_')
                                oldsplitsentences = splitsentences
                                splitsentences = []
                                if rcORnot == 1:
                                    splitsentence = []
                                    for word in sentence:
                                        if word[-1] == ',':
                                            splitsentence.append(word[:-1])
                                        if word[-1] != ',' and word[-1] != '.':
                                            splitsentence.append(word)
                                        if word[-1] == ',':
                                            splitsentences.append(splitsentence)
                                            splitsentence = []
                                    splitsentences.append(splitsentence)
                                    if wordwithrc in splitsentences[n]:
                                        splitsentences[n][0] = 'diesem'
                                        verb = splitsentences[n][-1]
                                        splitsentences[n] = splitsentences[n][:-1]
                                        splitsentences[n].insert(1, verb)
                                    generalrules = [['PDS', 'VAFIN'], ['PDS', 'VVFIN']]
                                    processed = 1
                                else:
                                    splitsentences = oldsplitsentences
                                    splitsentence = []
                            if token[n] == 'das' or token[n] == 'welches':
                                tokens = self.nlp(' '.join(sentence))
                                for word in tokens:
                                    if word.dep_ == 'rc':
                                        wordwithrc = word.text
                                rcORnot = gs.checkForAnnotation(sentence, 'rc', 'word.dep_')
                                oldsplitsentences = splitsentences
                                splitsentences = []
                                if rcORnot == 1:
                                    splitsentence = []
                                    for word in sentence:
                                        if word[-1] == ',':
                                            splitsentence.append(word[:-1])
                                        if word[-1] != ',':
                                            splitsentence.append(word)
                                        if word[-1] == ',' or word == ',':
                                            splitsentences.append(splitsentence)
                                            splitsentence = []
                                    splitsentences.append(splitsentence)
                                    if wordwithrc in splitsentences[n]:
                                        splitsentences[n][0] = 'dieses'
                                        verb = splitsentences[n][-1]
                                        splitsentences[n] = splitsentences[n][:-1]
                                        splitsentences[n].insert(1, verb)
                                    generalrules = [['PDS', 'VAFIN'], ['PDS', 'VVFIN']]
                                    processed = 1
                                else:
                                    splitsentences = oldsplitsentences
                                    splitsentence = []
                            if token[n] == 'dessen' or token[n] == 'wessen':
                                tokens = self.nlp(' '.join(sentence))
                                for word in tokens:
                                    if word.dep_ == 'rc':
                                        wordwithrc = word.text
                                rcORnot = gs.checkForAnnotation(sentence, 'rc', 'word.dep_')
                                oldsplitsentences = splitsentences
                                splitsentences = []
                                if rcORnot == 1:
                                    splitsentence = []
                                    for word in sentence:
                                        if word[-1] == ',':
                                            splitsentence.append(word[:-1])
                                        if word[-1] != ',':
                                            splitsentence.append(word)
                                        if word[-1] == ',' or word == ',':
                                            splitsentences.append(splitsentence)
                                            splitsentence = []
                                    splitsentences.append(splitsentence)
                                    if wordwithrc in splitsentences[n]:
                                        verb = splitsentences[n][-1]
                                        splitsentences[n] = splitsentences[n][:-1]
                                        splitsentences[n].insert(1, verb)
                                    generalrules = [['PDS', 'VAFIN'], ['PDS', 'VVFIN']]
                                    processed = 1
                                else:
                                    splitsentences = oldsplitsentences
                                    splitsentence = []
                            if token[n] == 'den' or token[n] == 'welchen':
                                tokens = self.nlp(' '.join(sentence))
                                for word in tokens:
                                    if word.dep_ == 'rc':
                                        wordwithrc = word.text
                                rcORnot = gs.checkForAnnotation(sentence, 'rc', 'word.dep_')
                                oldsplitsentences = splitsentences
                                splitsentences = []
                                if rcORnot == 1:
                                    splitsentence = []
                                    for word in sentence:
                                        if word[-1] == ',':
                                            splitsentence.append(word[:-1])
                                        if word[-1] != ',':
                                            splitsentence.append(word)
                                        if word[-1] == ',' or word == ',':
                                            splitsentences.append(splitsentence)
                                            splitsentence = []
                                    splitsentences.append(splitsentence)
                                    if wordwithrc in splitsentences[n]:
                                        splitsentences[n][0] = 'diesen'
                                        verb = splitsentences[n][-1]
                                        splitsentences[n] = splitsentences[n][:-1]
                                        splitsentences[n].insert(1, verb)
                                    generalrules = [['PDS', 'VAFIN'], ['PDS', 'VVFIN']]
                                    processed = 1
                                else:
                                    splitsentences = oldsplitsentences
                                    splitsentence = []
                            if token[n] == 'wem' or token[n] == 'Wem' or token[n] == 'welchem':
                                daORnot = gs.checkForAnnotation(sentence, 'da', 'word.dep_')
                                oaORnot = gs.checkForAnnotation(sentence, 'oa', 'word.dep_')
                                reORnot = gs.checkForAnnotation(sentence, 're', 'word.dep_')
                                oldsplitsentences = splitsentences
                                splitsentences = []
                                for word in sentence:
                                    if word[-1] == ',':
                                        splitsentence.append(word[:-1])
                                    if word[-1] != ',':
                                        splitsentence.append(word)
                                    if word[-1] == ',' or word == ',':
                                        splitsentences.append(splitsentence)
                                        splitsentence = []
                                splitsentences.append(splitsentence)
                                if n == 0:
                                    index = 1
                                if n == 1:
                                    index = 0
                                if reORnot == 1:
                                    pass
                                if daORnot == 1 and reORnot == 0:
                                    splitsentences[index].insert(1, 'das')
                                if oaORnot == 1 and reORnot == 0:
                                    splitsentences[index].insert(1, 'dem')
                                if n == 1:
                                    splitsentences[0], splitsentences[1] = splitsentences[1], splitsentences[0]
                                generalrules = [['PDS', 'VAFIN'], ['PDS', 'VVFIN']]
                                processed = 1
                            if token[n] in self.indirectspeech_list and token[1] not in self.konsekutiv_list:
                                reORnot = gs.checkForAnnotation(sentence, 're', 'word.dep_')
                                oldsplitsentences = splitsentences
                                splitsentences = []
                                splitsentence = []
                                for word in sentence:
                                    if word[-1] == ',':
                                        splitsentence.append(word[:-1])
                                    if word[-1] != ',':
                                        splitsentence.append(word)
                                    if word[-1] == ',' or word == ',':
                                        splitsentences.append(splitsentence)
                                        splitsentence = []
                                splitsentences.append(splitsentence)
                                if n == 0:
                                    index = 1
                                if n == 1:
                                    index = 0
                                if reORnot == 0:
                                    if splitsentences[index][0] != 'was':
                                        splitsentences[index].insert(1, 'das')
                                if n == 1:
                                    splitsentences[0], splitsentences[1] = splitsentences[1], splitsentences[0]
                                generalrules = [['PDS', 'VAFIN'], ['PDS', 'VVFIN']]
                                processed = 1
                            if processed == 0 and n == 1:
                                ZUVINFTupelORnot = gs.checkForAnnotationTuple(sentence, ['PTKZU', 'VVINF'], 'word.tag_', 'None')
                                if ZUVINFTupelORnot == 0:
                                    ZUVINFTupelORnot = gs.checkForAnnotationTuple(sentence, ['PTKZU', 'VAINF'], 'word.tag_', 'None')
                                if ZUVINFTupelORnot == 1:
                                    reORnot = gs.checkForAnnotation(sentence, 're', 'word.dep_')
                                    splitsentence = []
                                    for word in sentence:
                                        if word[-1] == ',':
                                            splitsentence.append(word[:-1])
                                        if word[-1] != ',':
                                            splitsentence.append(word)
                                        if word[-1] == ',' or word == ',':
                                            splitsentences.append(splitsentence)
                                            processed = 1
                                            splitsentence = []
                                    splitsentences.append(splitsentence)
                                    for m in range(2):
                                        ZUINForNOT = gs.checkForAnnotationTuple(splitsentences[m], ['PTKZU', 'VVINF'], 'word.tag_', 'None')
                                        if ZUINForNOT == 0:
                                            ZUINForNOT = gs.checkForAnnotationTuple(splitsentences[m], ['PTKZU', 'VAINF'], 'word.tag_', 'None')
                                        if ZUINForNOT == 1:
                                            r = m
                                            ZUINForNOT = 0
                                    if r == 0:
                                        index = 1
                                    if r == 1:
                                        index = 0
                                    objectORnot = gs.checkForAnnotation(splitsentences[index], 'oa', 'word.dep_')
                                    if reORnot == 0 and objectORnot == 0:
                                        splitsentences[index].insert(1, 'das')
                                    if r == 1:
                                        splitsentences[0], splitsentences[1] = splitsentences[1], splitsentences[0]
                                else:
                                    # the original compared ('==') here instead of assigning
                                    processed = 2
                    except:
                        wasNotInAnyList = 1
                    #rules = [['ART','ADJA','NN'], ['ART','ADJA','NE'], ['ART', 'NN'], ['ART', 'NE'], ['APPR','NN'], ['APPR','NE'], ['APPR', 'ART', 'NN'], ['APPR', 'ART', 'NE'], ['APPR','ART','NN','ADJA','NN'], ['APPR','ART','NN','ADJA','NE'], ['KOKOM', 'ART', 'NN'], ['KOKOM', 'ART', 'NE'], ['PPOSAT', 'NN'], ['PPOSAT', 'NE'], ['ADV', 'ADJD']]
                    endsentences = []
                    if (processed == 2 or processed == 0) and n == 1:
                        wasNotInAnyList = 1
                    try:
                        if wasNotInAnyList == 0:
                            newpunctuationsindex.insert(0, [counter - 1, punctuations[counter - 1]])
                            if len(splitsentences) > 2:
                                splitsentences = splitsentences[:2]
                            for splitsentence in splitsentences:
                                wordtoputfirst = 'nada'
                                for word in self.firstwordlist:
                                    if word == splitsentence[0]:
                                        wordtoputfirst = word
                                        splitsentence.remove(word)
                                tuplesTocheck, triplesTocheck, quadruplesTocheck = self.gs.GetTuplesinSentence(splitsentence)
                                grammpiecessentence = self.gs.createTupleofGrammarpieces(splitsentence, tuplesTocheck, triplesTocheck, quadruplesTocheck)
                                if len(grammpiecessentence) > 7:
                                    print('A sentence is too long, too many permutations. \n piping wrong grammar..')
                                    endsentence = ' '.join(grammpiecessentence)
                                else:
                                    permutations = self.sgm.GeneratePermutationsOfSentence(grammpiecessentence)
                                    # permutations starting with these words must be
                                    # followed directly by a verb
                                    firstwordwithverblist = ['deswegen', 'danach']
                                    permutationstodelete = []
                                    for permutation in permutations:
                                        if permutation[0] in firstwordwithverblist:
                                            for word in self.nlp(permutation[1]):
                                                if word.tag_[0] != 'V':
                                                    permutationstodelete.append(permutation)
                                                    break
                                                else:
                                                    break
                                    for delperm in permutationstodelete:
                                        try:
                                            permutations.remove(delperm)
                                        except:
                                            pass
                                    sentencesToCheck = []
                                    if wordtoputfirst in self.firstwordlist:
                                        for sentence in permutations:
                                            sentencesToCheck.append(wordtoputfirst + ' ' + ' '.join(sentence))
                                    else:
                                        for sentence in permutations:
                                            sentencesToCheck.append(' '.join(sentence))
                                    endsentence = self.sgm.GetBestSentenceFromSentencesAccordingToGrammar(sentencesToCheck, ' '.join(splitsentence))
                                endsentences.append(endsentence)
                    except:
                        wasNotInAnyList = 1
                        endsentences = []
                        todelete = []
                        for index in range(len(newpunctuationsindex)):
                            if newpunctuationsindex[index][0] == counter - 1:
                                todelete.append(index)
                        for todel in todelete[::-1]:
                            del newpunctuationsindex[todel]
                    if wasNotInAnyList == 1:
                        endsplisentences = []
                        splisentence = []
                        for word in oldsentence:
                            if word[-1] == ',':
                                splisentence.append(word[:-1])
                            if word[-1] != ',':
                                splisentence.append(word)
                            if word[-1] == ',' or word == ',':
                                endsplisentences.append(splisentence)
                                splisentence = []
                        endsplisentences.append(splisentence)
                        newpunctuationsindex.insert(0, [counter - 1, punctuations[counter - 1]])
                        for splsentence in endsplisentences:
                            endsentences.append(' '.join(splsentence))
                    # Disabled alternative: statistical grammar correction via the
                    # FASTsearch databases loaded in LoadBoWModelAndDatabaseOnesZeros.
                    '''
                    fsearch1 = self.fsearch1
                    spacyclass1 = 'word.tag_'
                    gs_sentence1 = gs.Sentence2GrammarSchema(' '.join(splitsentence), spacyclass1)
                    print('searchPatternMatch for tags')
                    bestmatches1 = fsearch1.searchPatternMatch(' '.join(gs_sentence1), 1)
                    print('done')
                    right_gs_tupel1 = []
                    if len(bestmatches1) < 10:
                        bestndocs1 = len(bestmatches1)
                    else:
                        bestndocs1 = 10
                    for m in range(bestndocs1):
                        right_gs_tupel1.append(fsearch1.database[bestmatches1[m][0]])
                    statistically_correct_sentences1 = gs.Sentence2RightGrammarTupel(' '.join(splitsentence), gs_sentence1, right_gs_tupel1)
                    fsearch2 = self.fsearch2
                    spacyclass2 = 'word.dep_'
                    gs_sentence2 = gs.Sentence2GrammarSchema(' '.join(splitsentence), spacyclass2)
                    print('searchPatternMatch for deps')
                    bestmatches2 = fsearch2.searchPatternMatch(' '.join(gs_sentence2), 1)
                    print('done')
                    right_gs_tupel2 = []
                    if len(bestmatches2) < 10:
                        bestndocs2 = len(bestmatches2)
                    else:
                        bestndocs2 = 10
                    for m in range(bestndocs2):
                        right_gs_tupel2.append(fsearch2.database[bestmatches2[m][0]])
                    statistically_correct_sentences2 = gs.Sentence2RightGrammarTupel(' '.join(splitsentence), gs_sentence2, right_gs_tupel2)
                    print(splitsentence)
                    Rightsentence = gs.GetBestgsAccordingRules(' '.join(splitsentence), gs_sentence1, right_gs_tupel1, right_gs_tupel2, statistically_correct_sentences1, statistically_correct_sentences2, rules, generalrules)
                    '''
                    for endsentence in endsentences:
                        gramcorr_splitsentences.append(endsentence.split())
        for index in newpunctuationsindex:
            punctuations.insert(index[0], index[1])
        return gramcorr_splitsentences, punctuations
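
    # Example (illustrative; the final word order is picked by the SentGlue
    # permutation ranking): for the kausal pair
    # 'Ich bleibe zu Hause, weil es regnet.' the comma split drops 'weil',
    # prefixes 'deswegen' to the other clause and returns two token lists,
    # roughly ['es', 'regnet.'] and ['deswegen', 'bleibe', 'ich', 'zu', 'Hause'].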
    def putAppendixesIntoOwnSentences(self, sentences, punctuations):
        # Move noun appendixes such as 'NN APPR ART NN' ('die Frau mit dem Hut')
        # out of the sentence into their own 'X ist/hat Y' sentences.
        gs = self.gs
        #triples = [['NN', 'ART', 'NN'], ['NE', 'ART', 'NN'], ['NN', 'ART', 'NN'], ['NE', 'ART', 'NE']]
        quadruples = [['NN', 'APPR', 'NE', 'NN'], ['NN', 'APPR', 'NE', 'NN'], ['NN', 'APPR', 'ART', 'NN'], ['NE', 'APPR', 'ART', 'NN'], ['NN', 'APPR', 'ART', 'NE'], ['NE', 'APPR', 'ART', 'NE']]
        quadruplestochange = []
        triplestochange = []
        newsentences = []
        newpunctuations = []
        Whatisofnouns = []
        oldsentences = sentences
        oldpunctuations = punctuations
        for hauptindex in range(len(sentences)):
            sentence = sentences[hauptindex]
            try:
                #for triple in triples:
                #    AnnoOrNot, tripleInWords = gs.checkForAnnotationTriple(sentence, triple, 'word.tag_', 'None')
                #    for tripleinwor in tripleInWords:
                #        triplestochange.append([triple, tripleinwor])
                for quadruple in quadruples:
                    AnnoOrNot, quadrupleInWords = gs.checkForAnnotationQuadruple(sentence, quadruple, 'word.tag_', 'None')
                    for quadrupleInWo in quadrupleInWords:
                        quadruplestochange.append([quadruple, quadrupleInWo])
                for quad in quadruplestochange:
                    for n in range(len(sentence) - 4):
                        if sentence[n] == quad[1][0]:
                            if sentence[n + 1] == quad[1][1]:
                                if sentence[n + 2] == quad[1][2]:
                                    artword = None
                                    longerWhatisnoun = 0
                                    for m in range(2):
                                        for word in self.nlp(sentence[n - m]):
                                            if word.tag_ == 'ART':
                                                Nounthatis = sentence[n - m:n + 1]
                                                # use the already loaded model instead of
                                                # reloading de_core_news_sm per match
                                                token3 = self.nlp(sentence[n + 4])
                                                counter = 0
                                                Whatisnoun = sentence[n + 1:n + 4]
                                                for wor in token3:
                                                    counter += 1
                                                    if wor.tag_ == 'NN' or wor.tag_ == 'NE':
                                                        if counter == 1:
                                                            Whatisnoun = sentence[n + 1:n + 5]
                                                            longerWhatisnoun = 1
                                                        if counter == 2:
                                                            Whatisnoun = sentence[n + 1:n + 4]
                                                artword = word.text
                                    if ((artword == 'die' or artword == 'Die') and sentence[n][-1] != 'n') or ((artword == 'der' or artword == 'einer' or artword == 'dieser') and (sentence[n - 2] in ['von', 'in', 'auf', 'ueber', 'unter', 'nach', 'mit'])):
                                        if artword == 'der':
                                            Nounthatis[0] = 'die'
                                        donothing = 0
                                        if sentence[n + 1] == 'mit':
                                            if sentence[n + 2] == 'den':
                                                verb = ' hat die '
                                                Whatisnoun = Whatisnoun[2:]
                                            if sentence[n + 2] == 'der':
                                                verb = ' hat eine '
                                                Whatisnoun = Whatisnoun[2:]
                                            if sentence[n + 2] != 'der' and sentence[n + 2] != 'den':
                                                donothing = 1
                                        else:
                                            verb = ' ist '
                                        if donothing == 0:
                                            newsentence = ' '.join(Nounthatis) + verb + ' '.join(Whatisnoun)
                                            newsentences.append([hauptindex + 1, newsentence.split()])
                                            newpunctuations.append([hauptindex + 1, punctuations[hauptindex]])
                                            if longerWhatisnoun == 0:
                                                Whatisofnouns.append([n + 1, n + 4, hauptindex])
                                            else:
                                                Whatisofnouns.append([n + 1, n + 5, hauptindex])
            except:
                print('Could not process ' + str(sentence) + ' in characterization per sentence..')
        try:
            for whatis in Whatisofnouns[::-1]:
                thereisacomma = 0
                if sentences[whatis[2]][whatis[1] - 1][-1] == ',':
                    thereisacomma = 1
                if thereisacomma == 1:
                    sentences[whatis[2]][whatis[0] - 1] = sentences[whatis[2]][whatis[0] - 1] + ','
                del sentences[whatis[2]][whatis[0]:whatis[1]]
            for newsent in newsentences[::-1]:
                sentences.insert(newsent[0], newsent[1])
            for newpunct in newpunctuations[::-1]:
                punctuations.insert(newpunct[0], newpunct[1])
            for sentence in sentences:
                if sentence[-1][-1] == ',':
                    sentence[-1] = sentence[-1][:-1]
        except:
            print('Could not process the collected characterizations')
            # note: oldsentences/oldpunctuations are references, not copies, so
            # this restoration does not undo mutations made above
            sentences = oldsentences
            punctuations = oldpunctuations
        return sentences, punctuations
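

# Minimal usage sketch (not part of the original file; the pipeline order is
# inferred from the data shapes the methods consume). It assumes the pickled
# models and .hkl databases referenced in the loaders exist next to this file
# and that 'input.txt' (hypothetical name) contains tokenized German text; the
# initial punctuation list (one mark per sentence) is likewise an assumption.
if __name__ == '__main__':
    seg = SentSeg('german')
    seg.LoadSentGlueSGDandGSUtils()  # provides seg.nlp, seg.gs and seg.sgm
    sentences = seg.ReadDoc2Sent('input.txt')
    sentences = seg.CommaSentenceOrNot(sentences)
    sentences = seg.GetUtteranceNumber(sentences)
    sentences = seg.GetQuestionOrNot(sentences)
    punctuations = ['.' for _ in sentences]
    sentences, punctuations = seg.AndOrSolver(sentences, punctuations)
    print(sentences, punctuations)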