-
- # split sentences
-
- # the lists are still missing 'sondern' (and a few more things..)
-
-
- # The following conjunctions need no sentence transformation:
- # woraufhin, zudem, zumal, umso - desto,
-
- # 'sondern' is hard to solve.. best to remove 'sondern' and glue afterwards with SentGlue
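-
- # A minimal sketch of the idea above (a hypothetical helper, not wired into
- # the class below): drop the conjunction itself and leave rejoining the two
- # clause halves to a SentGlue-style step afterwards.
- def drop_sondern(words):
-     # e.g. ['Er', 'kam', 'nicht,', 'sondern', 'blieb'] -> ['Er', 'kam', 'nicht,', 'blieb']
-     # tokens with attached punctuation ('sondern,') are deliberately left alone
-     return [w for w in words if w.lower() != 'sondern']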
-
-
-
- class SentSeg(object):
-
- def __init__(self, language):
-
- self.language = language
-
- self.punktuation_list = ['.', '?', '!', ';', ':']
-
- self.wrappunktuation_list = [',', '-']
-
- self.adversativ_list = ['wohingegen', 'Wohingegen', 'aber', 'Aber', 'wobei', 'Wobei', 'hingegen']
-
- self.final_list = ['damit','Damit', 'um', 'Um']
-
- self.kausal_list = ['weil', 'Weil', 'da', 'Da', 'denn', 'falls', 'Falls' ]
-
- self.konditional_list = ['wenn', 'Wenn', 'sobald', 'Sobald', 'als', 'falls']
-
- self.konsekutiv_list = ['dass', 'Dass']
-
- self.konzessiv_list = ['obwohl', 'Obwohl', 'obgleich', 'Obgleich', 'trotzdem', 'Trotzdem', 'wenngleich', 'doch']
-
- self.lokal_list = ['wo', 'Wo']
-
- self.temporal_list_vor = ['bevor', 'Bevor']
-
- self.temporal_list_nach = ['nachdem', 'Nachdem']
-
- self.instrumental_list = ['indem', 'Indem']
-
- self.indirectspeech_list = ['ob', 'Ob', 'wann', 'Wann', 'wer', 'Wer', 'wie', 'Wie', 'warum', 'Warum', 'weshalb', 'Weshalb', 'wieso', 'Wieso']
- self.firstwordlist = []
- #self.firstwordlist = ['wann', 'Wann', 'wer', 'Wer', 'wie', 'Wie', 'warum', 'Warum', 'weshalb', 'Weshalb', 'wieso', 'Wieso', 'dies', 'dann', 'jedoch', 'deswegen', 'trotzdem', 'danach', 'davor', 'wenn', 'sobald']
-
- self.full_list = self.adversativ_list + self.final_list + self.kausal_list + self.konditional_list + self.konsekutiv_list + self.konzessiv_list + self.lokal_list + self.temporal_list_nach + self.temporal_list_vor + self.instrumental_list + self.indirectspeech_list
-
- def ReadDoc2Sent(self, document):
-
- splitsentences = []
- splitsentence = []
-
- with open(document) as sentences:
- counter = 0
- for sentence in sentences:
-
- counter += 1
- if counter % 1000 == 0:
- print(counter)
-
- words = sentence.split()
-
-
-
- for word in words:
-
- splitsentence.append(word)
-
-
-
-
-
-
- if (word[-1] in self.punktuation_list or word in self.punktuation_list) and len(word) > 2:
-
- splitsentences.append([splitsentence])
-
- splitsentence = []
-
- return splitsentences
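-
- # Usage sketch for ReadDoc2Sent ('input.txt' is a placeholder; sentence-final
- # punctuation stays attached to the last word):
- #
- #   seg = SentSeg('de')
- #   sentences = seg.ReadDoc2Sent('input.txt')
- #   # -> [[['Der', 'Hund', 'bellt.']], [['Es', 'regnet.']], ...]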
-
-
-
-
-
- def AndOrSolver(self, sentences, punctuations):
-
- for n in range(len(punctuations)):
- if punctuations[n] == ':' or punctuations[n] == '-':
- punctuations[n] = '.'
-
-
- #print(sentences, punctuations)
-
- splitsentences = []
-
- counter = 0
-
- newsentences = []
- for sentence in sentences:
- newpunctuationsindexes = []
- utterancenumber = sentence[2]
- commainfo = sentence[1]
- commaornot = commainfo[0]
- sentence = sentence[0]
-
-
- counter += 1
- doc = self.nlp(' '.join(sentence))
-
- subjectcount = 0
- separationwords = []
- subjectcounts = []
- doccounter = 0
- subjectindex = []
- rcornot = 0
- for word in doc:
- doccounter += 1
- if word.dep_ == 'sb' or word.dep_ == 'ep':
- subjectcount += 1
- subjectindex.append(doccounter - 1)
- if word.dep_ == 'rc':
- rcornot = 1
-
-
- if word.tag_ == '$,':
-
- subjectcounts.append([subjectcount, doccounter - 2, subjectindex, rcornot])
- subjectindex = []
- subjectcount = 0
- #print('aleaole',sentence[doccounter - 2])
- if len(sentence[doccounter - 2]) > 1:
-
- doccounter -= 1
-
- if word.text in ('und', 'also', 'oder', 'schon', 'bald', 'doch', 'jedoch', 'sondern'):
- separationwords.append(doccounter - 1)
-
- #print('separationwords', separationwords)
- #print('subjectcounts', subjectcounts)
-
-
-
- separationwordstocut = []
- listofownsentencessubjectindexes = []
- for n in range(len(subjectcounts) - 1):
- if subjectcounts[n][0] > 0 and subjectcounts[n + 1][0] > 0 and subjectcounts[n + 1][3] == 0:
- listofownsentencessubjectindexes.append(subjectcounts[n])
- for m in range(len(separationwords)):
- if subjectcounts[n][1] < separationwords[m] < subjectcounts[n + 1][1]:
- #print(subjectcounts[n + 1], separationwords[m])
- if subjectcounts[n + 1][0] > 1:
- if subjectcounts[n + 1][2][0] < separationwords[m] <= subjectcounts[n + 1][2][-1]:
- separationwordstocut.append(separationwords[m])
-
- processed = 0
-
- #print('oioioi')
- #print(listofownsentencessubjectindexes)
- #print(separationwordstocut)
-
- if len(listofownsentencessubjectindexes) > 0:
- for n in range(len(listofownsentencessubjectindexes)):
-
- sentence[listofownsentencessubjectindexes[n][1]] = sentence[listofownsentencessubjectindexes[n][1]] + 'alohaseparator'
- newpunctuationsindexes.append([punctuations[counter - 1], counter - 1])
- #print('a new punctuation1')
- processed = 1
- if len(separationwordstocut) > 0:
- for n in range(len(separationwordstocut)):
- sentence[separationwordstocut[n] - 1] = sentence[separationwordstocut[n] - 1] + 'alohaseparator'
- #print('a new punctuation2')
- newpunctuationsindexes.append([punctuations[counter - 1], counter - 1])
- processed = 1
-
- if processed == 0:
- newsentences.append([sentence])
-
- if processed == 1:
- #print(sentence)
- splitsentence = []
- for word in sentence:
- splitsentence.append(word)
- if word[-14:] == 'alohaseparator':
- if splitsentence[-1][-15] == ',':
- splitsentence[-1] = splitsentence[-1][:-15]
- else:
- splitsentence[-1] = splitsentence[-1][:-14]
- newsentences.append([splitsentence])
- splitsentence = []
- newsentences.append([splitsentence])
-
- #print(newpunctuationsindexes)
- newpunctuationsindexes = newpunctuationsindexes[::-1]
- for n in range(len(newpunctuationsindexes)):
- punctuations.insert(newpunctuationsindexes[n][1], newpunctuationsindexes[n][0])
-
- #print(newsentences, punctuations)
- return newsentences, punctuations
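-
- # Sketch of what AndOrSolver does: around coordinating words ('und', 'oder',
- # 'jedoch', ...) it checks, via the spaCy 'sb'/'ep' dependencies collected at
- # the commas, whether both sides carry their own subject, tags the cut points
- # with an 'alohaseparator' marker, splits there, and duplicates the sentence
- # punctuation for every new piece. LoadSentGlueSGDandGSUtils() must have been
- # called first, since it sets self.nlp.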
-
-
-
- def LoadBoWModelAndDatabaseOnesZeros(self):
-
-
- import FASTsearch
-
- #print('loading the tag hkl db..')
- self.fsearch1 = FASTsearch.FASTsearch('GS_DB_word.tag_.hkl')
- #print('done')
-
- #print('generating BoW Model..')
- self.fsearch1.Gen_BoW_Model(1000, "word")
- #print('done')
-
- #print('loading the bow model')
- self.fsearch1.Load_BoW_Model('bagofwordsGS_DB_word.tag_.pkl', 'DataBaseOneZerosGS_DB_word.tag_.hkl')
- #print('done')
-
- #print('loading the dep hkl db..')
- self.fsearch2 = FASTsearch.FASTsearch('GS_DB_word.dep_.hkl')
- #print('done')
-
- #print('generating BoW Model..')
- self.fsearch2.Gen_BoW_Model(1000, "word")
- #print('done')
-
- #print('loading the bow model')
- self.fsearch2.Load_BoW_Model('bagofwordsGS_DB_word.dep_.pkl', 'DataBaseOneZerosGS_DB_word.dep_.hkl')
- #print('done')
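-
- # Usage sketch (the .hkl/.pkl files named above are precomputed artifacts of
- # the project-local FASTsearch module and must exist in the working directory):
- #
- #   seg = SentSeg('de')
- #   seg.LoadBoWModelAndDatabaseOnesZeros()   # sets self.fsearch1 / self.fsearch2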
-
- def LoadSentGlueSGDandGSUtils(self):
-
- import GS_Utils
- #print('initializing the gs utils..')
- self.gs = GS_Utils.GS_Utils('de_core_news_sm')
- #print('done')
-
-
- from SentGlue import SentGlueMach
- #print('loading the Stochastic Gradient models..')
- self.sgm = SentGlueMach('trainedSGD_twolabel.pkl', 'bagofwordstwolabel.pkl')
- #print('done')
- #print('initializing the SGM..')
- self.sgm.initialize()
- #print('done')
-
- #print('importing spacy..')
- import spacy
- #print('done')
-
- #print('importing german model..')
- self.nlp = spacy.load('de_core_news_sm')
- #print('done')
-
- return 'done'
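-
- # Typical initialization before using the splitting methods below (a sketch;
- # the pickle filenames are the ones hard-coded above):
- #
- #   seg = SentSeg('de')
- #   seg.LoadSentGlueSGDandGSUtils()   # sets self.gs, self.sgm and self.nlp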
-
- def CommaSentenceOrNot(self, sentences):
-
- nlp = self.nlp
-
- commasentences = []
- counter = 0
-
- #print('creating array of comma or not..')
- for sentence in sentences:
-
- doc = nlp(' '.join(sentence[0]))
-
- #print(doc)
- counter += 1
- #if counter % 100 == 0:
- #print(counter)
-
-
- n = 0
- firstone = 0
- token = []
- nextword = 0
- for word in doc:
- #print(word.tag_)
- # word.pos_ is suitable here for noun and verb, word.dep_ for sb/pd, and possibly tag
-
- if firstone == 0:
- token.append(word.text)
-
- firstone = 1
-
-
- if nextword == 1:
- token.append(word.text)
-
- nextword = 0
-
- if word.tag_ == '$,':
- n += 1
- nextword = 1
-
- sentence.append([n, token])
-
- commasentences.append(sentence)
-
- #print('done')
- return commasentences
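-
- # Sketch: CommaSentenceOrNot appends [comma_count, token] to every sentence,
- # where token holds the first word plus the word right after each comma
- # (self.nlp must be loaded):
- #
- #   sents = seg.CommaSentenceOrNot([[['Ich', 'gehe', ',', 'weil', 'es', 'regnet', '.']]])
- #   # sents[0][1] == [1, ['Ich', 'weil']]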
-
- def EnumerationSolver(self, sentences):
-
-
- gs = self.gs
-
-
- nlp = self.nlp
-
- sgm = self.sgm
-
-
- enumerationsentences = []
- counter = 0
- NOTenumerations = []
- #print('processing enumerations..')
- for sentence in sentences:
-
- doc = nlp(' '.join(sentence[0]))
-
- #print(doc)
- counter += 1
- #if counter % 100 == 0:
- #print(counter)
-
- n = 0
- firstone = 0
- token = []
- nextword = 0
- enumeration = False
-
- splitsentence = []
- splitsentence_deps = []
- splitsentence_tags = []
- splitsentences = []
- splitsentences_deps = []
- splitsentences_tags = []
-
-
-
- for word in doc:
- #print(word.tag_)
- # word.pos_ is suitable here for noun and verb, word.dep_ for sb/pd, and possibly tag
-
-
-
- nextword = 0
-
- if word.tag_ == '$,':
- n += 1
- nextword = 1
-
- if (word.text == 'und' or word.text == 'oder') and n >= 1:
- enumeration = True
- break
-
-
- output = []
- if enumeration == True:
-
- for word in doc:
-
- #print(word.text)
-
- if word.text != ',' and word.text != '.' and word.text != 'und':
-
- splitsentence.append(word.text)
- splitsentence_deps.append(word.dep_)
- splitsentence_tags.append(word.tag_)
-
- if word.text == ',' or word.text == 'und':
-
- #print('oi')
-
- splitsentences.append(splitsentence)
- splitsentences_deps.append(splitsentence_deps)
- splitsentences_tags.append(splitsentence_tags)
- splitsentence = []
- splitsentence_deps = []
- splitsentence_tags = []
-
- splitsentences.append(splitsentence)
- splitsentences_deps.append(splitsentence_deps)
- splitsentences_tags.append(splitsentence_tags)
-
- #print( 'splitsentences', splitsentences)
-
- token = []
- enumerations = []
- enumerationsSPOs = []
- NOTenumerations = []
-
- for sentence in splitsentences:
- token.append(sentence[0])
-
-
- if sentence[0] not in self.full_list:
- enumerations.append(sentence)
- enumerationsSPOs.append(gs.checkSPO(sentence, 0))
- else:
- NOTenumerations.append(sentence)
-
- #print(enumerationsSPOs)
-
-
- #print('enumerations', enumerations)
- biggest = []
- for i in range(len(enumerationsSPOs)):
- biggest.append([i, sum(enumerationsSPOs[i])])
-
-
- sortedbiggest = sorted(biggest[::-1], key=lambda tup: tup[1], reverse=True)
-
- for i in range(len(sortedbiggest)):
- if sortedbiggest[i][0] == 0:
- mainsentenceIndex = sortedbiggest[i][0]
- lastornot = 0
- break
-
- if sortedbiggest[i][0] == len(biggest) - 1:
- mainsentenceIndex = sortedbiggest[i][0]
- lastornot = 1
- break
-
-
- # The case 'Er, sie und der Beamte LACHTEN den Clown aus' still needs to be caught here using the database of cases, i.e. for an enumeration with SPO 1 0 0 the plural verb must become singular, depending on the articles.
- #print('enumerations', enumerations)
- mainsentence = enumerations[mainsentenceIndex]
- #print('main', mainsentence)
- probablemainsentences = []
- for i in range(len(enumerations)):
- if i != mainsentenceIndex:
- iprobablemainsentences = []
- probablemainsentence = []
- if lastornot == 0:
- for j in range(1, len(mainsentence)):
- probablemainsentence = mainsentence[0:j] + enumerations[i]
- #print(probablemainsentence)
- iprobablemainsentences.append(' '.join(probablemainsentence))
- if lastornot == 1:
- for j in range(1, len(mainsentence)):
- probablemainsentence = enumerations[i] + mainsentence[-j:]
- iprobablemainsentences.append(' '.join(probablemainsentence))
- probablemainsentences.append(iprobablemainsentences)
-
-
- # here we still check for the annotation, but in this case it is more important that a tuple does not show up torn apart. CHANGE !!!!
-
- #print('probablemainsentences', probablemainsentences)
- tuplesToCheck = []
- tuples = [['ART', 'NN'], ['APPR','NN'], ['ART', 'CARD']]
- for tupl in tuples:
-
-
- checktupleindex, tupleInWords = gs.checkForAnnotationTuple(mainsentence, tupl , 'word.tag_', 'None')
- if checktupleindex == 2:
- tuplesToCheck.append([tupl, tupleInWords])
- triplesToCheck = []
- triples = [['ART','ADJA','NN'], ['APPR', 'ART', 'NN'], ['KOKOM', 'ART', 'NN']]
- for tripl in triples:
- checktripleindex, tripleInWords = gs.checkForAnnotationTriple(mainsentence, tripl, 'word.tag_', 'None')
- if checktripleindex == 3:
- triplesToCheck.append([tripl, tripleInWords])
-
- #print('tuples to check', tuplesToCheck)
- #print('triples to check', triplesToCheck)
- #print('probablemainsentences', probablemainsentences)
- for probsentences in probablemainsentences:
-
- checktripleindexes = []
- checktupleindexes = []
- #print(probsentences)
- filteredprobsentences = []
- for sentence in probsentences:
- tuplchecked = 0
- triplchecked = 0
- #print('sentence and tuples to check', sentence, tuplesToCheck)
- for tupl in tuplesToCheck:
-
- checkedsecondtime, tupleinWords = gs.checkForAnnotationTuple(sentence.split(), tupl[0], 'word.tag_', tupl[1])
-
- #print(sentence, checkedsecondtime)
- if checkedsecondtime == 1:
-
- tuplchecked = 0
- if checkedsecondtime == 2:
-
- tuplchecked = 1
-
- for tripl in triplesToCheck:
- checkedsecondtime, tripleinWords = gs.checkForAnnotationTriple(sentence.split(), tripl[0], 'word.tag_', tripl[1])
- if checkedsecondtime == 1 or checkedsecondtime == 2:
-
- triplchecked = 0
- if checkedsecondtime == 3:
-
- triplchecked = 1
-
-
-
-
- if triplchecked == 1 or tuplchecked == 1:
- filteredprobsentences.append(sentence)
-
- #print('filteredprobsentences', filteredprobsentences)
- if len(filteredprobsentences) == 0:
- filteredprobsentences = probsentences
- # there is still the problem here that these are lists of words instead of proper sentences..
- #print('filteredprobsentences', filteredprobsentences)
- probsMatrix = sgm.predictprobsOnSentenceList(filteredprobsentences, filteredprobsentences)
-
- #print(probsMatrix)
-
- for i in range(len(probsMatrix)):
- probsMatrix[i][0] = i
-
- #print(probsMatrix)
-
- sortedprobsMatrix = sorted(probsMatrix[::-1], key=lambda tup: tup[1], reverse=True)
-
- #print(sortedprobsMatrix)
-
- bestindex = sortedprobsMatrix[0][0]
-
- #print(bestindex)
- #print('probablemainsentences', filteredprobsentences)
- probablemainsentence = filteredprobsentences[int(bestindex)]
- #print('oi', probablemainsentence)
-
- #print('probablemainsentence', probablemainsentence)
- enumerationsentences.append([probablemainsentence])
-
-
- enumerationsentences.append([' '.join(mainsentence)])
-
- for notenum in NOTenumerations:
- #print(enumerationsentences)
- #print(enumerationsentences[-1])
- #print('enum no1', enumerationsentences)
- #print('notenum', notenum)
- enumerationsentences[-1].append(' '.join(notenum))
- #print('enumsentences',enumerationsentences[-1])
- enumerationsentences[-1] = [', '.join(enumerationsentences[-1])]
-
-
- else:
- enumerationsentences.append([sentence])
-
-
-
- output.append(enumerationsentences)
-
-
- for n in range(len(output[0])):
- #print('out',output[0][n])
- try:
- output[0][n] = [output[0][n][0].split()]
- except:
- output[0][n] = [output[0][n][0][0]]
-
-
- #print('done')
- return output[0]
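-
- # Sketch: for an enumeration the clause with the strongest SPO evidence is
- # taken as the main sentence and every other item is expanded into its own
- # candidate sentence, ranked by the SentGlue SGD model (self.sgm):
- #
- #   out = seg.EnumerationSolver([[['Anna', ',', 'Ben', 'und', 'Clara', 'lachen', '.']]])
- #   # yields roughly 'Anna lachen' / 'Ben lachen' / 'Clara lachen'; the
- #   # plural/singular repair is still open, see the note further up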
-
-
- def GetUtteranceNumber(self, sentences):
-
- nlp = self.nlp
-
- uttersentences = []
-
- for sentence in sentences:
-
- doc = nlp(' '.join(sentence[0]))
-
- subjectcount = 0
-
- for word in doc:
-
- if word.dep_ == 'sb' or word.dep_ == 'ep':
- subjectcount += 1
-
- sentence.append(subjectcount)
- uttersentences.append(sentence)
-
- return uttersentences
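-
- # Sketch: counts subject/expletive dependencies ('sb'/'ep') per sentence and
- # appends the count; a coordinated two-clause sentence typically gets 2
- # (depending on the spaCy parse):
- #
- #   sents = seg.GetUtteranceNumber([[['Anna', 'singt', 'und', 'Ben', 'tanzt']]])
- #   # sents[0][-1] == 2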
-
- def GetQuestionOrNot(self, sentences):
-
- nlp = self.nlp
-
- uttersentences = []
- for sentence in sentences:
- # reset per sentence so one question mark does not flag all later sentences
- questionmark = 0
-
- doc = nlp(' '.join(sentence[0]))
-
-
- count = 0
- for word in doc:
-
-
- count += 1
-
- if word.text == '?':
- questionmark = 1
-
- sentence.append(questionmark)
- uttersentences.append(sentence)
-
- return uttersentences
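-
- # Sketch: appends 1 if the sentence contains a question mark, else 0:
-
- #   sents = seg.GetQuestionOrNot([[['Kommst', 'du', '?']], [['Ja', '.']]])
- #   # sents[0][-1] == 1, sents[1][-1] == 0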
-
- def SplitSentencesIntoHauptNebenTuple(self, sentences, punctuations):
-
-
-
- oldsplitsentences = []
- #print('hauptneben inputsentences', sentences)
-
- gs = self.gs
-
- #print('importing spacy..')
- import spacy
- #print('done')
-
- nlp = self.nlp
-
- outputsentences = []
- sentencesThatAreOutoutput = []
- outsentences = []
- for generalindex in range(len(sentences)):
- presentence = sentences[generalindex]
-
- splitsentence = []
- splitsentence_deps = []
- splitsentence_tags = []
- splitsentences = []
- splitsentences_deps = []
- splitsentences_tags = []
- commainfo = presentence[1]
- outputsentence = []
-
-
- token = commainfo[1]
-
- commaornot = commainfo[0]
-
- numberutterances = presentence[2]
-
- sentence = presentence[0]
-
- oldsentence = presentence[0]
-
- #print(commaornot)
- if commaornot >= 2:
- #print('nla')
-
- sentence[0] = sentence[0].title()
-
- doc = nlp(' '.join(sentence))
-
-
- for word in doc:
-
- #print(word.text)
-
- if word.text != ',' and word.text != '.':
-
- splitsentence.append(word.text)
- splitsentence_deps.append(word.dep_)
- splitsentence_tags.append(word.tag_)
-
- if word.text == ',':
-
- #print('oi')
-
- splitsentences.append(splitsentence)
- splitsentences_deps.append(splitsentence_deps)
- splitsentences_tags.append(splitsentence_tags)
- splitsentence = []
- splitsentence_deps = []
- splitsentence_tags = []
-
-
- splitsentences.append(splitsentence)
- splitsentences[0][0] = splitsentences[0][0].lower()
- splitsentences_deps.append(splitsentence_deps)
- splitsentences_tags.append(splitsentence_tags)
- oldsplitsentences = splitsentences
- #print(splitsentences)
- #print(splitsentences_tags)
- #print(splitsentences_deps)
- spo = []
-
- for n in range(len(splitsentences)):
- prespo = []
- prespo = gs.checkSPO(splitsentences_deps[n], 1)
- prespo.append( gs.checkForAnnotation(splitsentences[n], 'VVINF', 'word.tag_'))
- prespo.append(gs.checkForAnnotation(splitsentences[n], 'VAFIN', 'word.tag_'))
- prespo.append(gs.checkForAnnotation(splitsentences[n], 'VVFIN', 'word.tag_'))
- prespo.append(gs.checkForAnnotation(splitsentences[n], 'VMFIN', 'word.tag_'))
-
-
-
-
-
- spo.append(prespo)
- #print(splitsentences_deps)
- #print(splitsentences)
- #print(spo)
-
- indexSPO = []
- lastm = len(splitsentences)
- for o in range(len(splitsentences)):
-
- m = len(splitsentences) - 1 - o
- for n in range(len(splitsentences)):
-
-
-
- if m < n - 1 and n < lastm:
-
- #print('spo s',spo[m], spo[n])
- sb = spo[m][0] + spo[n][0]
- Vafin = 1
- if spo[m][3] == 1 or spo[n][3] == 1:
- Vafin = spo[m][3] + spo[n][3]
- Vvinf = 1
- if spo[m][4] == 1 or spo[n][4] == 1:
- Vvinf = spo[m][4] + spo[n][4]
- Vvfin = 1
- if spo[m][5] == 1 or spo[n][5] == 1:
- Vvfin = spo[m][5] + spo[n][5]
- Vmfin = 1
- if spo[m][6] == 1 or spo[n][6] == 1:
- Vmfin = spo[m][6] + spo[n][6]
- #wrapped = 0
- #for n in range(len(indexSPO)):
- #if n == indexSPO[n][0] + 1 and n == indexSPO[n][1] - 1:
- #wrapped = 1
- #print(sb, Vafin, Vvinf, Vvfin, Vmfin, 'm n', m, n)
- if sb == 1 and Vafin == 1 and Vvinf == 1 and (Vvfin == 1 or Vmfin == 1):
-
- indexSPO.append([m,n])
- #print([m,n])
- lastm = m
- #print('lastm',lastm)
-
-
-
- #print(splitsentences)
- Hauptsentences = []
- for n in range(len(indexSPO)):
- if indexSPO[n][0] > indexSPO[n][1]:
- i = 1
- j = 0
- else:
- i = 0
- j = 1
- Hauptsentences.append([splitsentences[indexSPO[n][i]] + splitsentences[indexSPO[n][j]] , indexSPO[n][i], indexSPO[n][j] ])
-
- HauptSentences = []
- for n in range(len(Hauptsentences)):
- m = len(Hauptsentences) - 1 - n
- HauptSentences.append(Hauptsentences[m])
-
- #print('Hauptsentences', Hauptsentences)
- #print('HauptSentences', HauptSentences)
- sentencesThatAreOut =[]
-
- for n in range(len(HauptSentences)):
- index = HauptSentences[n][1]
- finish = 0
- #print('Oi',HauptSentences[n])
- if n == len(HauptSentences) - 1:
-
- #print('lenHauptsentences', len(HauptSentences))
-
- stopindex = len(splitsentences)
- finish = 1
- else:
- stopindex = HauptSentences[n + 1][1]
- #print('stopindex', stopindex)
- vvfinisthere = 0
- if finish == 0:
- if splitsentences_tags[stopindex][0] == 'VVFIN':
- stopindex -= 1
- vvfinisthere = 1
-
- if splitsentences_tags[index][0] == 'VVFIN':
- vvfinisthere = 1
-
- if vvfinisthere == 1:
-
-
- HNTuple = HauptSentences[n][0] + [','] + splitsentences[index - 1]
- outputsentence.append(HNTuple)
- sentencesThatAreOut.append(index - 1)
- sentencesThatAreOut.append(Hauptsentences[n][1])
- sentencesThatAreOut.append(Hauptsentences[n][2])
-
- for m in range(index + 1, stopindex ):
- if m != HauptSentences[n][2]:
- HNTuple = HauptSentences[n][0] + [','] + splitsentences[m]
- #print('check', HauptSentences[n], n)
- #print('check', splitsentences[m], m)
- #print('double', HNTuple)
- outputsentence.append(HNTuple)
-
-
-
- sentencesThatAreOut.append(m)
- sentencesThatAreOut.append(Hauptsentences[n][1])
- sentencesThatAreOut.append(Hauptsentences[n][2])
-
- sentencesThatAreOutoutput.append(sentencesThatAreOut)
-
-
- cpOrNots = []
- rcOrNots = []
- for splitsentence in splitsentences_deps:
- cpOrNot = gs.checkForAnnotationInTokenizedSentence(splitsentence, 'cp')
- cpOrNots.append(cpOrNot)
- rcOrNot = gs.checkForAnnotationInTokenizedSentence(splitsentence, 'rc')
- rcOrNots.append(rcOrNot)
-
- #print('Laenge splitsentences', len(splitsentences))
- #print('laenge cpOrNots', len(cpOrNots))
- #print(cpOrNots)
- #print('rc or nots', rcOrNots)
- pairs = []
- for n in range(len(cpOrNots)):
- index = len(cpOrNots) - 1 - n
- done = 0
- if rcOrNots[index] == 1:
- pairs.append([index, index - 1])
- done = 1
-
-
- if done == 0 and cpOrNots[index] == 1:
- try:
- if splitsentences_tags[index + 1][0] == 'VVFIN':
- pairs.append([index, index + 1])
- done = 1
- except:
- pass
- try:
- if done == 0 and rcOrNots[index - 1] == 0:
- pairs.append([index, index - 1])
- done = 1
- except:
- pass
- try:
- if done == 0 and rcOrNots[index - 1] == 1:
- if rcOrNots[index - 2] == 0:
- pairs.append([index, index - 2])
- except:
- pass
-
- for pair in pairs[::-1]:
- if pair[0] not in set(sentencesThatAreOut) or pair[1] not in set(sentencesThatAreOut):
- outputsentence.append(splitsentences[pair[1]] + [','] + splitsentences[pair[0]])
- #print('hnhn',sentences)
- sentences[generalindex][0] = outputsentence
-
- #print('outputsentence hntuple',outputsentence)
- #outputsentences.append([outputsentence , i])
-
- #print('Oio', outputsentences)
- #print(sentencesThatAreOutoutput)
- #print(splitsentences)
- #print('oioioioioioioio',sentences)
-
- #print(sentences[0][0])
-
-
- #print('oioi',sentences[n])
- #print('malatesta', sentences[n][0][0])
- #print('generalindex sentences index 0', sentences[generalindex][0])
- try:
- if type(sentences[generalindex][0][0]) == str:
- sentences[generalindex][0] = [sentences[generalindex][0]]
- except:
- pass
- #print('generalindex sentences index 0', sentences[generalindex][0])
- #print('oldsentence', oldsentence)
- newgeneratedsentences = len(sentences[generalindex][0])
- if newgeneratedsentences > 1:
- #print('goti t')
- for sentence in sentences[generalindex][0]:
- punctuations.insert(generalindex, punctuations[generalindex])
- outsentences.append(sentence)
- del punctuations[generalindex]
- if newgeneratedsentences == 1:
- if len(sentences[generalindex][0][0]) > 1:
- outsentences.append(sentences[generalindex][0][0])
- else:
- outsentences.append(oldsentence)
- if newgeneratedsentences == 0:
- #print('case oldsentence', oldsentence)
- outsentences.append(oldsentence)
- #print('oioi', sentences[n])
- # attach free-standing commas to the word before
- #print('theoutsentences', outsentences)
- for outsentence in outsentences:
- todelete = []
- for n in range(len(outsentence)):
- if outsentence[n] == ',':
- todelete.append(n)
- outsentence[n-1] = outsentence[n-1] + ','
- for deleteindex in todelete[::-1]:
- del outsentence[deleteindex]
-
- for index in range(len(outsentences)):
- outsentences[index] = [outsentences[index]]
- #print('theoutsentences', outsentences)
-
- # removing duplicates
- doubledsentences = []
- for o in range(len(outsentences)):
- sentence = outsentences[o][0]
- for m in range(len(outsentences)):
- if m != o:
- count = 0
- for n in range(len(sentence)):
- if sentence[n] in outsentences[m][0] or sentence[n][:-1] in outsentences[m][0]:
- count += 1
- if count == len(sentence):
- doubledsentences.append(sentence)
- punctdeleteindex = []
- tmp = set()
- for sentence in doubledsentences:
- tmp.add(tuple(sentence))
- #print(list(tmp))
- doubledsentences = []
- for tup in tmp:
- doubledsentences.append([list(tup)])
- #print('doubledsentences',doubledsentences)
- punctdeleteindexes = []
- for double in doubledsentences:
- if double in outsentences:
- punctdeleteindex = outsentences[::-1].index(double)
- del outsentences[len(outsentences) - 1 - punctdeleteindex]
- punctdeleteindexes.append(punctdeleteindex)
-
- for index in punctdeleteindexes[::-1]:
- del punctuations[len(outsentences) - 1 - index]
-
- #print('oldsplit',oldsplitsentences)
- #print('outsents',outsentences)
-
- for o in range(len(oldsplitsentences)):
- for m in range(len(outsentences)):
- counter = 0
- for n in range(len(oldsplitsentences[o])):
- if oldsplitsentences[o][n] in outsentences[m][0] or oldsplitsentences[o][n] + ',' in outsentences[m][0]:
- counter += 1
- if counter >= len(oldsplitsentences[o]):
- break
- if m == len(outsentences) - 1 and counter < len(oldsplitsentences[o]):
- if o == 0:
- outsentences.insert(0,[oldsplitsentences[o]])
- punctuations.insert(0, punctuations[0])
- else:
- newones = []
- for i in range(len(outsentences)):
- if outsentences[i][0][-1] == oldsplitsentences[o - 1][-1]:
- if len(outsentences[i][0]) > 2 and len(oldsplitsentences[o - 1]) > 2:
- if outsentences[i][0][-2] == oldsplitsentences[o - 1][-2]:
- if outsentences[i][0][-3] == oldsplitsentences[o - 1][-3]:
- newones.append([i + 1, [oldsplitsentences[o]]])
- for newone in newones[::-1]:
- #print(newones)
- outsentences.insert(newone[0], newone[1])
- punctuations.insert(newone[0], punctuations[newone[0] - 1])
-
-
-
-
-
- #print('outsentences at the very end ', outsentences, punctuations)
- return outsentences, punctuations
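-
- # Sketch: for sentences with two or more commas this method finds the main
- # clause via subject + finite-verb evidence (the SPO checks above) and pairs
- # it with each dependent clause, so roughly:
-
- #   'Er sagt, dass sie kommt, weil es regnet.'
- #   -> 'Er sagt, dass sie kommt' and 'Er sagt, weil es regnet'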
-
-
- # Note: the input here must always be pairs of main clause/subordinate clause, i.e. a further upstream class is required.
-
- def SplitCommatas(self, Inputsentences, punctuations):
-
- gs = self.gs
-
- nlp = self.nlp
-
- gramcorr_splitsentences = []
- counter = 0
- newpunctuationsindex = []
- for Inputsentence in Inputsentences:
-
- counter += 1
-
-
- commainfo = Inputsentence[1]
-
-
- token = commainfo[1]
-
- commaornot = commainfo[0]
-
- numberutterances = Inputsentence[2]
-
-
- if commaornot == 0:
- gramcorr_splitsentences.append(Inputsentence[0])
-
- if commaornot > 1:
- gramcorr_splitsentences.append(Inputsentence[0])
-
- if commaornot == 1:
- oldsentence = Inputsentence[0]
- Inputsentence = [[Inputsentence[0]]]
-
-
-
-
- for sentence in Inputsentence[0]:
-
- splitsentence = []
-
- splitsentences = []
-
-
-
-
- processed = 0
- wasNotInAnyList = 0
- try:
- for n in range(len(token)):
-
- if token[n] in self.final_list:
- splitsentence = []
- for word in sentence:
-
- if word != token[n]:
-
- if word[-1] == ',':
- splitsentence.append(word[:-1])
-
-
- if word[-1] != ',':
- splitsentence.append(word)
-
- if word[-1] == ',' or word == ',':
-
- splitsentences.append(splitsentence)
-
- splitsentence = []
-
- splitsentences.append(splitsentence)
-
- if n == 1:
-
-
- if token[n] == 'um' or token[n] == 'Um':
-
- splitsentences[n].insert(0,'dies')
- splitsentences[n].insert(0,'um')
- else:
- splitsentences[n].insert(0,'dann')
-
-
-
- if n == 0:
-
- if token[n] == 'um' or token[n] == 'Um':
- splitsentences[n].insert(0,'dies')
- splitsentences[n].insert(0,'um')
- splitsentences = splitsentences[::-1]
- else:
- splitsentences[n].insert(0,'dann')
-
- splitsentences[0], splitsentences[1] = splitsentences[1] , splitsentences[0]
-
- generalrules = [['ADV','VAFIN'], ['ADV', 'VVFIN']]
- processed = 1
-
- if token[n] in self.adversativ_list:
- splitsentence = []
- for word in sentence:
-
- if word != token[n]:
-
- if word[-1] == ',':
- splitsentence.append(word[:-1])
- if word == ',':
- pass
- if word[-1] != ',':
- splitsentence.append(word)
-
- if word[-1] == ',' or word == ',':
-
- splitsentences.append(splitsentence)
-
- splitsentence = []
-
- splitsentences.append(splitsentence)
-
- splitsentences[n].append('jedoch')
-
-
- generalrules = [['ADV','VAFIN'], ['ADV', 'VVFIN']]
- processed = 1
-
- if token[n] in self.kausal_list:
- splitsentence = []
- for word in sentence:
-
- if word != token[n]:
-
- if word[-1] == ',':
- splitsentence.append(word[:-1])
- if word == ',':
- pass
- if word[-1] != ',':
- splitsentence.append(word)
-
- if word[-1] == ',' or word == ',':
-
- splitsentences.append(splitsentence)
-
- splitsentence = []
-
- splitsentences.append(splitsentence)
-
- # Since 'deswegen' is attached to the other clause, the input to commasentences must always be TWO sentences.
- #print('splitsentences in kausal', splitsentences)
- if n == 1:
- splitsentences[n - 1].insert(0,'deswegen')
- splitsentences[0], splitsentences[1] = splitsentences[1] , splitsentences[0]
-
-
-
- if n == 0:
- splitsentences[n + 1].insert(0,'deswegen')
-
-
-
-
- #print('splitsentences in kausal', splitsentences)
-
-
- generalrules = [['PROAV','VAFIN'], ['PROAV', 'VVFIN']]
- processed = 1
-
- # from here on consecutive clauses (Konsekutivsätze) are handled; they have to be split according to https://www.deutschplus.net/pages/Konsekutivsatz
- if token[n] in self.konsekutiv_list:
- #print('oi konsekutiv')
- splitsentence = []
- for word in sentence:
-
- if word != token[n]:
-
- if word[-1] == ',':
- splitsentence.append(word[:-1])
- if word == ',':
- pass
- if word[-1] != ',':
- splitsentence.append(word)
-
- if word[-1] == ',' or word == ',':
-
- splitsentences.append(splitsentence)
-
- splitsentence = []
-
- splitsentences.append(splitsentence)
-
- generalrules = [['KOUS','PPER']]
- processed = 1
-
-
- if token[n] in self.konditional_list:
- splitsentence = []
- for word in sentence:
-
-
- if word[-1] == ',':
- splitsentence.append(word[:-1])
- if word == ',':
- pass
- if word[-1] != ',':
- splitsentence.append(word)
-
- if word[-1] == ',' or word == ',':
-
- splitsentences.append(splitsentence)
-
- splitsentence = []
-
- splitsentences.append(splitsentence)
-
-
- if n == 1:
-
- spoCount = gs.checkSPO(splitsentences[n], 0)
-
- spoCount = sum(spoCount)
-
- if spoCount == 2:
- thereisanes = 0
- for word in splitsentences[n]:
- if word == 'es' or word == 'Es':
- thereisanes = 1
- if thereisanes == 0:
- splitsentences[n].append('es')
-
-
- if n == 0:
-
-
- spoCount = gs.checkSPO(splitsentences[n], 0)
-
- spoCount = sum(spoCount)
-
- if spoCount == 2:
-
- thereisanes = 0
- for word in splitsentences[n]:
- if word == 'es' or word == 'Es':
- thereisanes = 1
- if thereisanes == 0:
- splitsentences[n].append('es')
-
- splitsentences[0], splitsentences[1] = splitsentences[1] , splitsentences[0]
-
- generalrules = [['KOUS','PPER']]
- processed = 1
-
- if token[n] in self.konzessiv_list:
- splitsentence = []
- for word in sentence:
-
- if word != token[n]:
-
- if word[-1] == ',':
- splitsentence.append(word[:-1])
- if word == ',':
- pass
- if word[-1] != ',':
- splitsentence.append(word)
-
- if word[-1] == ',' or word == ',':
-
- splitsentences.append(splitsentence)
-
- splitsentence = []
-
- splitsentences.append(splitsentence)
-
-
- if n == 1:
- splitsentences[n - 1].insert(0,'trotzdem')
- splitsentences[0], splitsentences[1] = splitsentences[1] , splitsentences[0]
-
-
-
- if n == 0:
- splitsentences[n + 1].insert(0,'trotzdem')
-
-
- generalrules = [['PROAV','VAFIN'], ['PROAV', 'VVFIN']]
- processed = 1
-
- if token[n] in self.lokal_list:
- #print('lokal ole ole ')
- splitsentence = []
- for word in sentence:
-
- if word != token[n]:
-
- if word[-1] == ',':
- splitsentence.append(word[:-1])
- if word == ',':
- pass
- if word[-1] != ',':
- splitsentence.append(word)
-
- if word[-1] == ',' or word == ',':
-
- splitsentences.append(splitsentence)
-
- splitsentence = []
-
- splitsentences.append(splitsentence)
-
-
- if n == 1:
- splitsentences[n - 1].insert(0,'dort')
- splitsentences[0], splitsentences[1] = splitsentences[1] , splitsentences[0]
-
-
-
- if n == 0:
- splitsentences[n + 1].insert(0,'dort')
-
-
- generalrules = [['PROAV','VAFIN'], ['PROAV', 'VVFIN']]
- processed = 1
-
- if token[n] in self.instrumental_list:
- splitsentence = []
- for word in sentence:
-
- if word != token[n]:
-
- if word[-1] == ',':
- splitsentence.append(word[:-1])
- if word == ',':
- pass
- if word[-1] != ',':
- splitsentence.append(word)
-
- if word[-1] == ',' or word == ',':
-
- splitsentences.append(splitsentence)
-
- splitsentence = []
-
- splitsentences.append(splitsentence)
-
-
- if n == 1:
- splitsentences[n - 1].insert(0,'so')
- splitsentences[0], splitsentences[1] = splitsentences[1] , splitsentences[0]
-
-
-
- if n == 0:
- splitsentences[n + 1].insert(0,'so')
-
-
- generalrules = [['ADV','VAFIN'], ['ADV', 'VVFIN']]
- processed = 1
-
- if token[n] in self.temporal_list_vor:
- splitsentence = []
- for word in sentence:
-
- if word != token[n]:
-
- if word[-1] == ',':
- splitsentence.append(word[:-1])
- if word == ',':
- pass
- if word[-1] != ',':
- splitsentence.append(word)
-
- if word[-1] == ',' or word == ',':
-
- splitsentences.append(splitsentence)
-
- splitsentence = []
-
- splitsentences.append(splitsentence)
-
-
- if n == 1:
- splitsentences[n].insert(0,'danach')
-
-
-
-
- if n == 0:
- splitsentences[n].insert(0,'danach')
- splitsentences[0], splitsentences[1] = splitsentences[1] , splitsentences[0]
-
- generalrules = [['PROAV','VAFIN'], ['PROAV', 'VVFIN']]
- processed = 1
-
- if token[n] in self.temporal_list_nach:
- splitsentence = []
- for word in sentence:
-
- if word != token[n]:
-
- if word[-1] == ',':
- splitsentence.append(word[:-1])
- if word == ',':
- pass
- if word[-1] != ',':
- splitsentence.append(word)
-
- if word[-1] == ',' or word == ',':
-
- splitsentences.append(splitsentence)
-
- splitsentence = []
-
- splitsentences.append(splitsentence)
-
-
- if n == 1:
- splitsentences[n].insert(0,'davor')
-
-
-
-
- if n == 0:
- splitsentences[n].insert(0,'davor')
- splitsentences[0], splitsentences[1] = splitsentences[1] , splitsentences[0]
-
- generalrules = [['PROAV','VAFIN'], ['PROAV', 'VVFIN']]
- processed = 1
-
- #print(token[n])
- if token[n] == 'der' or token[n] == 'welcher':
-
- tokens = self.nlp(' '.join(sentence))
- for word in tokens:
- if word.dep_ == 'rc':
- wordwithrc = word.text
-
- rcORnot = gs.checkForAnnotation(sentence, 'rc', 'word.dep_')
-
- oldsplitsentences = splitsentences
- splitsentences = []
-
- if rcORnot == 1:
- splitsentence = []
- for word in sentence:
-
-
-
- if word[-1] == ',':
- splitsentence.append(word[:-1])
- if word == ',':
- pass
- if word[-1] != ',':
- splitsentence.append(word)
-
- if word[-1] == ',' or word == ',':
-
- splitsentences.append(splitsentence)
-
- splitsentence = []
-
- splitsentences.append(splitsentence)
-
- # the swap is done here because otherwise spacy does not read 'dieser' as PDS.. analogous in the other cases.
-
- if wordwithrc in splitsentences[n]:
-
- splitsentences[n][0] = 'dieser'
-
- verb = splitsentences[n][-1]
-
- splitsentences[n] = splitsentences[n][:-1]
- splitsentences[n].insert(1, verb)
-
-
- #print('Vorsicht', splitsentences)
-
- generalrules = [['PDS','VAFIN'], ['PDS', 'VVFIN']]
- processed = 1
- else:
- splitsentences = oldsplitsentences
- splitsentence = []
-
- if token[n] == 'die' or token[n] == 'welche':
-
-
- tokens = self.nlp(' '.join(sentence))
- for word in tokens:
- if word.dep_ == 'rc':
- wordwithrc = word.text
-
-
-
- rcORnot = gs.checkForAnnotation(sentence, 'rc', 'word.dep_')
-
-
- oldsplitsentences = splitsentences
- splitsentences = []
-
-
- if rcORnot == 1:
- #print('it went to rcornot in case die')
-
-
- splitsentence = []
- for word in sentence:
-
-
-
- if word[-1] == ',':
- splitsentence.append(word[:-1])
- if word == ',':
- pass
- if word[-1] != ',':
- splitsentence.append(word)
-
- if word[-1] == ',' or word == ',':
-
- splitsentences.append(splitsentence)
-
- splitsentence = []
-
- splitsentences.append(splitsentence)
-
- if wordwithrc in splitsentences[n]:
- #print('wordwithrc was in sentence')
- #print(wordwithrc)
- #print(splitsentences[n])
- #print('wordwithrcend')
- splitsentences[n][0] = 'diese'
-
- verb = splitsentences[n][-1]
-
- splitsentences[n] = splitsentences[n][:-1]
- splitsentences[n].insert(1, verb)
-
-
-
- generalrules = [['PDS','VAFIN'], ['PDS', 'VVFIN']]
- processed = 1
- else:
-
- splitsentences = oldsplitsentences
- splitsentence = []
-
- if token[n] == 'dem':
-
- tokens = self.nlp(' '.join(sentence))
- for word in tokens:
- if word.dep_ == 'rc':
- wordwithrc = word.text
-
- rcORnot = gs.checkForAnnotation(sentence, 'rc', 'word.dep_')
-
- oldsplitsentences = splitsentences
- splitsentences = []
-
- if rcORnot == 1:
- splitsentence = []
- for word in sentence:
-
-
-
- if word[-1] == ',':
- splitsentence.append(word[:-1])
- if word == ',':
- pass
- if word[-1] != ',' and word[-1] != '.':
- splitsentence.append(word)
-
- if word[-1] == ',':
-
- splitsentences.append(splitsentence)
-
- splitsentence = []
-
- splitsentences.append(splitsentence)
-
-
- if wordwithrc in splitsentences[n]:
-
- splitsentences[n][0] = 'diesem'
-
- verb = splitsentences[n][-1]
-
- splitsentences[n] = splitsentences[n][:-1]
- splitsentences[n].insert(1, verb)
-
-
-
- generalrules = [['PDS','VAFIN'], ['PDS', 'VVFIN']]
- processed = 1
- else:
- splitsentences = oldsplitsentences
- splitsentence = []
-
- if token[n] == 'das' or token[n] == 'welches':
-
- tokens = self.nlp(' '.join(sentence))
- for word in tokens:
- if word.dep_ == 'rc':
- wordwithrc = word.text
-
-
- rcORnot = gs.checkForAnnotation(sentence, 'rc', 'word.dep_')
-
- #print('Oeeee',rcORnot)
- oldsplitsentences = splitsentences
- splitsentences = []
- if rcORnot == 1:
- splitsentence = []
- for word in sentence:
-
-
-
- if word[-1] == ',':
- splitsentence.append(word[:-1])
- if word == ',':
- pass
- if word[-1] != ',':
- splitsentence.append(word)
-
- if word[-1] == ',' or word == ',':
-
- splitsentences.append(splitsentence)
-
- splitsentence = []
-
- splitsentences.append(splitsentence)
- #print('splitsentence in das rc', splitsentences)
- if wordwithrc in splitsentences[n]:
-
- splitsentences[n][0] = 'dieses'
-
- verb = splitsentences[n][-1]
- #print('verb',verb)
- splitsentences[n] = splitsentences[n][:-1]
- splitsentences[n].insert(1, verb)
-
- generalrules = [['PDS','VAFIN'], ['PDS', 'VVFIN']]
- processed = 1
- else:
- splitsentences = oldsplitsentences
- splitsentence = []
-
- if token[n] == 'dessen' or token[n] == 'wessen':
-
- tokens = self.nlp(' '.join(sentence))
- for word in tokens:
- if word.dep_ == 'rc':
- wordwithrc = word.text
-
-
- rcORnot = gs.checkForAnnotation(sentence, 'rc', 'word.dep_')
-
- oldsplitsentences = splitsentences
- splitsentences = []
-
- if rcORnot == 1:
- splitsentence = []
- for word in sentence:
-
-
-
-
- if word[-1] == ',':
- splitsentence.append(word[:-1])
- if word == ',':
- pass
- if word[-1] != ',':
- splitsentence.append(word)
-
- if word[-1] == ',' or word == ',':
-
- splitsentences.append(splitsentence)
-
- splitsentence = []
-
- splitsentences.append(splitsentence)
-
- if wordwithrc in splitsentences[n]:
- verb = splitsentences[n][-1]
-
- splitsentences[n] = splitsentences[n][:-1]
- splitsentences[n].insert(1, verb)
-
-
-
- generalrules = [['PDS','VAFIN'], ['PDS', 'VVFIN']]
- processed = 1
- else:
- splitsentences = oldsplitsentences
- splitsentence = []
-
- if token[n] == 'den' or token[n] == 'welchen':
-
- tokens = self.nlp(' '.join(sentence))
- for word in tokens:
- if word.dep_ == 'rc':
- wordwithrc = word.text
-
-
- rcORnot = gs.checkForAnnotation(sentence, 'rc', 'word.dep_')
-
- oldsplitsentences = splitsentences
- splitsentences = []
-
- if rcORnot == 1:
- splitsentence = []
- for word in sentence:
-
-
-
- if word[-1] == ',':
- splitsentence.append(word[:-1])
- if word == ',':
- pass
- if word[-1] != ',':
- splitsentence.append(word)
-
- if word[-1] == ',' or word == ',':
-
- splitsentences.append(splitsentence)
-
- splitsentence = []
-
- splitsentences.append(splitsentence)
-
-
- if wordwithrc in splitsentences[n]:
-
- splitsentences[n][0] = 'diesen'
-
- verb = splitsentences[n][-1]
-
- splitsentences[n] = splitsentences[n][:-1]
- splitsentences[n].insert(1, verb)
-
-
- generalrules = [['PDS','VAFIN'], ['PDS', 'VVFIN']]
- processed = 1
- else:
- splitsentences = oldsplitsentences
- splitsentence = []
-
-
- if token[n] == 'wem' or token[n] == 'Wem' or token[n] == 'welchem':
-
- daORnot = gs.checkForAnnotation(sentence, 'da', 'word.dep_')
-
- oaORnot = gs.checkForAnnotation(sentence, 'oa', 'word.dep_')
-
- reORnot = gs.checkForAnnotation(sentence, 're', 'word.dep_')
-
- oldsplitsentences = splitsentences
- splitsentences = []
-
- for word in sentence:
-
-
-
- if word[-1] == ',':
- splitsentence.append(word[:-1])
- if word == ',':
- pass
- if word[-1] != ',':
- splitsentence.append(word)
-
- if word[-1] == ',' or word == ',':
-
- splitsentences.append(splitsentence)
-
- splitsentence = []
-
- splitsentences.append(splitsentence)
-
-
- if n == 0:
- index = 1
- if n == 1:
- index = 0
-
- if reORnot == 1:
- pass
- if daORnot == 1 and reORnot == 0:
- splitsentences[index].insert(1, 'das')
-
- if oaORnot == 1 and reORnot == 0:
- splitsentences[index].insert(1, 'dem')
-
- if n == 1:
- splitsentences[0], splitsentences[1] = splitsentences[1] , splitsentences[0]
-
-
- generalrules = [['PDS','VAFIN'], ['PDS', 'VVFIN']]
- processed = 1
-
- if token[n] in self.indirectspeech_list and token[1] not in self.konsekutiv_list:
-
-
- reORnot = gs.checkForAnnotation(sentence, 're', 'word.dep_')
- oldsplitsentences = splitsentences
- splitsentences = []
- splitsentence = []
- for word in sentence:
-
-
- if word[-1] == ',':
- splitsentence.append(word[:-1])
- if word == ',':
- pass
- if word[-1] != ',':
- splitsentence.append(word)
-
- if word[-1] == ',' or word == ',':
-
- splitsentences.append(splitsentence)
-
- splitsentence = []
-
- splitsentences.append(splitsentence)
-
-
- if n == 0:
- index = 1
- if n == 1:
- index = 0
-
- if reORnot == 0:
- if splitsentences[index][0] != 'was':
- splitsentences[index].insert(1, 'das')
-
- if n == 1:
- splitsentences[0], splitsentences[1] = splitsentences[1] , splitsentences[0]
-
-
- generalrules = [['PDS','VAFIN'], ['PDS', 'VVFIN']]
- processed = 1
-
- if processed == 0 and n == 1:
-
- ZUVINFTupelORnot = gs.checkForAnnotationTuple(sentence, ['PTKZU', 'VVINF'], 'word.tag_', 'None')
- if ZUVINFTupelORnot == 0:
- ZUVINFTupelORnot = gs.checkForAnnotationTuple(sentence, ['PTKZU', 'VAINF'], 'word.tag_', 'None')
-
- if ZUVINFTupelORnot == 1:
-
- reORnot = gs.checkForAnnotation(sentence, 're', 'word.dep_')
- splitsentence = []
- for word in sentence:
-
-
- if word[-1] == ',':
- splitsentence.append(word[:-1])
- if word == ',':
- pass
- if word[-1] != ',' :
- splitsentence.append(word)
-
- if word[-1] == ',' or word == ',':
-
- splitsentences.append(splitsentence)
- processed = 1
- splitsentence = []
-
- splitsentences.append(splitsentence)
-
- for m in range(2):
- ZUINForNOT = gs.checkForAnnotationTuple(splitsentences[m], ['PTKZU', 'VVINF'], 'word.tag_','None')
-
- if ZUINForNOT == 0:
- ZUINForNOT = gs.checkForAnnotationTuple(splitsentences[m], ['PTKZU', 'VAINF'], 'word.tag_','None')
-
-
- if ZUINForNOT == 1:
- r = m
- ZUINForNOT = 0
-
-
-
- if r == 0:
- index = 1
- if r == 1:
- index = 0
-
- objectORnot = gs.checkForAnnotation(splitsentences[index] , 'oa', 'word.dep_')
-
- if reORnot == 0 and objectORnot == 0:
- splitsentences[index].insert(1, 'das')
-
- if r == 1:
- splitsentences[0], splitsentences[1] = splitsentences[1] , splitsentences[0]
- else:
- processed = 2
-
-
- except:
- wasNotInAnyList = 1
-
-
- #rules = [['ART','ADJA','NN'], ['ART','ADJA','NE'], ['ART', 'NN'], ['ART', 'NE'], ['APPR','NN'], ['APPR','NE'], ['APPR', 'ART', 'NN'], ['APPR', 'ART', 'NE'], ['APPR','ART','NN','ADJA','NN'], ['APPR','ART','NN','ADJA','NE'], ['KOKOM', 'ART', 'NN'], ['KOKOM', 'ART', 'NE'], ['PPOSAT', 'NN'], ['PPOSAT', 'NE'], ['ADV', 'ADJD']]
-
- #print('B',splitsentences)
- endsentences = []
- if (processed == 2 or processed == 0) and n == 1:
- wasNotInAnyList = 1
-
-
- try:
- if wasNotInAnyList == 0:
- newpunctuationsindex.insert(0,[counter-1,punctuations[counter-1]])
- #print('splitsentencee', splitsentences)
- if len(splitsentences) > 2:
- splitsentences = splitsentences[:2]
-
- #print('splitsentenceeeees', splitsentences)
-
- for splitsentence in splitsentences:
-
- #print('splitsentenceeeeeeeeeeee!!',splitsentence)
- wordtoputfirst = 'nada'
- for word in self.firstwordlist:
- if word == splitsentence[0]:
- wordtoputfirst = word
- splitsentence.remove(word)
-
-
-
- #print('get the tuples and triples to check..')
- tuplesTocheck, triplesTocheck, quadruplesTocheck = self.gs.GetTuplesinSentence(splitsentence)
- #print('done')
- #print(tuplesTocheck, 'ole', triplesTocheck ,'aiai', quadruplesTocheck)
- #print('1')
- grammpiecessentence = self.gs.createTupleofGrammarpieces( splitsentence, tuplesTocheck, triplesTocheck, quadruplesTocheck)
-
- #print('grammpiece',grammpiecessentence)
- #print('2')
- if len(grammpiecessentence) > 7:
- print('A sentence is too long, too many permutations. \n passing the grammar through uncorrected..')
- endsentence = ' '.join(grammpiecessentence)
-
- else:
- #print('genrating the permutations')
- permutations = self.sgm.GeneratePermutationsOfSentence(grammpiecessentence)
- #print('done')
- #print(permutations)
- #print('3')
- firstwordwithverblist = ['deswegen', 'danach']
- permutationstodelete = []
- for permutation in permutations:
- #print('4')
- if permutation[0] in firstwordwithverblist:
- #print('4.1')
- count = 1
- for word in self.nlp(permutation[1]):
- #print('4.2')
- if word.tag_[0] != 'V':
- #print('4.3')
- permutationstodelete.append(permutation)
- break
- else:
- break
- #for word in self.nlp(permutation[0]):
- #print('4.2')
- #if word.tag_[0] != 'V':
- #print('4.3')
- #permutationstodelete.append(permutation)
- #break
- #else:
- #break
- for delperm in permutationstodelete:
- try:
- permutations.remove(delperm)
- except:
-
- pass
- #print('5')
-
- sentencesToCheck = []
- if wordtoputfirst in self.firstwordlist:
- for sentence in permutations:
- sentencesToCheck.append(wordtoputfirst + ' ' + ' '.join(sentence))
- else:
- for sentence in permutations:
- sentencesToCheck.append(' '.join(sentence))
-
- endsentence = self.sgm.GetBestSentenceFromSentencesAccordingToGrammar(sentencesToCheck, ' '.join(splitsentence))
- #print('done')
- #print('endsent',endsentence)
- endsentences.append(endsentence)
- except:
- #print('there was an error')
- wasNotInAnyList = 1
- endsentences = []
- todelete = []
- for index in range(len(newpunctuationsindex)):
- if newpunctuationsindex[index][0] == counter - 1:
- todelete.append(index)
- for todel in todelete[::-1]:
- del newpunctuationsindex[todel]
-
-
- if wasNotInAnyList == 1:
- #print('was not in any list')
- #print(oldsentence)
- endsplisentences = []
- splisentence = []
- for word in oldsentence:
-
-
- if word[-1] == ',':
- splisentence.append(word[:-1])
-
- if word == ',':
- pass
- if word[-1] != ',':
- splisentence.append(word)
-
- if word[-1] == ',' or word == ',':
-
- endsplisentences.append(splisentence)
-
- splisentence = []
-
- endsplisentences.append(splisentence)
-
- newpunctuationsindex.insert(0,[counter-1,punctuations[counter-1]])
-
- #print('endsplisentences',endsplisentences)
- for splsentence in endsplisentences:
-
- endsentences.append(' '.join(splsentence))
-
-
-
-
-
- '''
-
-
-
- fsearch1 = self.fsearch1
- spacyclass1 = 'word.tag_'
-
-
- gs_sentence1 = gs.Sentence2GrammarSchema(' '.join(splitsentence), spacyclass1)
-
-
- print('searchPatternMatch for tags')
- bestmatches1 = fsearch1.searchPatternMatch(' '.join(gs_sentence1), 1)
- print('done')
-
- #print('oioi', bestmatches1)
-
- #print(len(fsearch1.database))
- right_gs_tupel1 = []
-
- if len(bestmatches1) < 10:
- bestndocs1 = len(bestmatches1)
- else:
- bestndocs1 = 10
-
- for m in range(bestndocs1):
- right_gs_tupel1.append(fsearch1.database[bestmatches1[m][0]])
-
-
- statistically_correct_sentences1 = gs.Sentence2RightGrammarTupel(' '.join(splitsentence), gs_sentence1, right_gs_tupel1)
-
-
- fsearch2 = self.fsearch2
-
- spacyclass2 = 'word.dep_'
-
- gs_sentence2 = gs.Sentence2GrammarSchema(' '.join(splitsentence), spacyclass2)
-
- print('searchPatternMatch for deps')
- bestmatches2 = fsearch2.searchPatternMatch(' '.join(gs_sentence2), 1)
- print('done')
-
- right_gs_tupel2 = []
-
-
- if len(bestmatches2) < 10:
- bestndocs2 = len(bestmatches2)
- else:
- bestndocs2 = 10
-
-
- for m in range(bestndocs2):
- right_gs_tupel2.append(fsearch2.database[bestmatches2[m][0]])
-
- #print(' '.join(splitsentence))
-
- statistically_correct_sentences2 = gs.Sentence2RightGrammarTupel(' '.join(splitsentence), gs_sentence2, right_gs_tupel2)
-
-
- print(splitsentence)
-
-
- Rightsentence = gs.GetBestgsAccordingRules(' '.join(splitsentence) , gs_sentence1, right_gs_tupel1, right_gs_tupel2, statistically_correct_sentences1, statistically_correct_sentences2, rules, generalrules)
-
-
- '''
- for endsentence in endsentences:
- gramcorr_splitsentences.append(endsentence.split())
-
- for index in newpunctuationsindex:
- punctuations.insert(index[0], index[1])
-
- return gramcorr_splitsentences, punctuations
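-
- # Sketch: a single-comma sentence with a known conjunction is cut at the
- # comma, the conjunction dropped, a connective adverb inserted, and the word
- # order of each half repaired via permutation scoring, roughly:
-
- #   'Ich bleibe, weil es regnet.' -> 'Es regnet. Deswegen bleibe ich.'
- #   (illustrative; the actual pick depends on the SGD grammar model)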
-
-
-
- def putAppendixesIntoOwnSentences(self, sentences, punctuations):
-
- gs = self.gs
- #triples = [['NN', 'ART', 'NN'], ['NE', 'ART', 'NN'], ['NN', 'ART', 'NN'], ['NE', 'ART', 'NE']]
- quadruples = [['NN', 'APPR', 'NE', 'NN'], ['NN', 'APPR', 'NE', 'NN'], ['NN', 'APPR', 'ART', 'NN'], ['NE', 'APPR', 'ART', 'NN'], ['NN', 'APPR', 'ART', 'NE'], ['NE', 'APPR', 'ART', 'NE']]
- quadruplestochange = []
- triplestochange = []
- newsentences = []
- newpunctuations = []
- Whatisofnouns = []
- oldsentences = sentences
- oldpunctuations = punctuations
- for hauptindex in range(len(sentences)):
-
- sentence = sentences[hauptindex]
- try:
- #for triple in triples:
- # AnnoOrNot, tripleInWords = gs.checkForAnnotationTriple(sentence, triple, 'word.tag_', 'None')
- # for tripleinwor in tripleInWords:
- # triplestochange.append([triple, tripleinwor])
-
- for quadruple in quadruples:
- AnnoOrNot, quadrupleInWords = gs.checkForAnnotationQuadruple(sentence, quadruple, 'word.tag_', 'None')
- #print('quadinwords', quadrupleInWords)
- #print('ANNOORNOT', AnnoOrNot)
- for quadrupleInWo in quadrupleInWords:
- quadruplestochange.append([quadruple, quadrupleInWo])
-
- #print('quadstochange',quadruplestochange)
- for quad in quadruplestochange:
- for n in range(len(sentence) - 4):
- if sentence[n] == quad[1][0]:
- if sentence[n + 1] == quad[1][1]:
- if sentence[n + 2] == quad[1][2]:
- artword = None
- longerWhatisnoun = 0
- for m in range(2):
- for word in self.nlp(sentence[n - m]):
- if word.tag_ == 'ART':
- Nounthatis = sentence[n - m:n + 1]
- # note: re-importing and reloading the model for every match is costly;
- # self.nlp already holds the same model
- import spacy
- nlp = spacy.load('de_core_news_sm')
- token3 = nlp(sentence[n+4])
- counter = 0
- Whatisnoun = sentence[n + 1:n + 4]
- for wor in token3:
- counter += 1
- if wor.tag_ == 'NN' or wor.tag_ == 'NE':
- if counter == 1:
- Whatisnoun = sentence[n + 1:n + 5]
- longerWhatisnoun = 1
- if counter == 2:
- Whatisnoun = sentence[n + 1:n + 4]
-
-
-
- artword = word.text
- #print(sentence[n - 1],'oi')
- if ((artword == 'die' or artword == 'Die') and sentence[n][-1] != 'n') or ((artword == 'der' or artword == 'einer' or artword == 'dieser') and (sentence[n - 2] in ['von', 'in', 'auf', 'ueber', 'unter', 'nach', 'mit'])):
-
- if artword == 'der':
- Nounthatis[0] = 'die'
-
- donothing = 0
- if sentence[n + 1] == 'mit':
- if sentence[n + 2] == 'den':
- verb = ' hat die '
- Whatisnoun = Whatisnoun[2:]
- if sentence[n + 2] == 'der':
- verb = ' hat eine '
- Whatisnoun = Whatisnoun[2:]
- if sentence[n + 2] != 'der' and sentence[n + 2] != 'den':
- donothing = 1
- else:
- verb = ' ist '
- if donothing == 0:
- newsentence = ' '.join(Nounthatis) + verb + ' '.join(Whatisnoun)
-
-
- newsentences.append([hauptindex + 1, newsentence.split()])
- newpunctuations.append([hauptindex + 1, punctuations[hauptindex]])
- if longerWhatisnoun == 0:
- Whatisofnouns.append([n + 1, n + 4, hauptindex])
- else:
- Whatisofnouns.append([n + 1, n + 5, hauptindex])
- except:
- print('Could not process ' + str(sentence) + ' in the per-sentence characterization..')
- try:
- for whatis in Whatisofnouns[::-1]:
- thereisacomma = 0
- #print(sentences[whatis[2]][whatis[1] - 1])
- if sentences[whatis[2]][whatis[1] - 1][-1] == ',':
-
- thereisacomma = 1
- if thereisacomma == 1:
- #print(sentences[whatis[2]][whatis[0] - 1])
- sentences[whatis[2]][whatis[0] - 1] = sentences[whatis[2]][whatis[0] - 1] + ','
- del sentences[whatis[2]][whatis[0]:whatis[1]]
- for newsent in newsentences[::-1]:
- sentences.insert(newsent[0], newsent[1])
- for newpunct in newpunctuations[::-1]:
- punctuations.insert(newpunct[0], newpunct[1])
- for sentence in sentences:
- if sentence[-1][-1] == ',':
- sentence[-1] = sentence[-1][:-1]
- except:
- print('could not process the collected characterizations')
- sentences = oldsentences
- punctuations = oldpunctuations
-
-
-
- return sentences, punctuations
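-
- # Sketch: appositions matching tag patterns such as NN APPR ART NN are lifted
- # into their own sentence, roughly:
-
- #   'Sie spricht von der Frau mit den Büchern.'
- #   -> 'Sie spricht von der Frau.' + 'die Frau hat die Büchern.'
- #   (illustrative; articles and endings are carried over verbatim)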
-
-
-
-
-
-
-
-