# split sentences
#
# The lists below are still missing 'sondern' (and a few other things..).
# The following conjunctions need no sentence transformation:
# woraufhin, zudem, zumal, umso - desto.
# 'sondern' is hard to handle: best to drop it and afterwards run SentGlue.


class SentSeg(object):

    def __init__(self, language):
        self.language = language
        # sentence-final punctuation; ':' and '-' are later mapped to '.'
        self.punktuation_list = ['.', '?', '!', ';', ':']
        self.wrappunktuation_list = [',', '-']
        # German conjunctions, grouped by the clause type they introduce
        self.adversativ_list = ['wohingegen', 'Wohingegen', 'aber', 'Aber', 'wobei', 'Wobei', 'hingegen']
        self.final_list = ['damit', 'Damit', 'um', 'Um']
        self.kausal_list = ['weil', 'Weil', 'da', 'Da', 'denn', 'falls', 'Falls']
        self.konditional_list = ['wenn', 'Wenn', 'sobald', 'Sobald', 'als', 'falls']
        self.konsekutiv_list = ['dass', 'Dass']
        self.konzessiv_list = ['obwohl', 'Obwohl', 'obgleich', 'Obgleich', 'trotzdem', 'Trotzdem', 'wenngleich', 'doch']
        self.lokal_list = ['wo', 'Wo']
        self.temporal_list_vor = ['bevor', 'Bevor']
        self.temporal_list_nach = ['nachdem', 'Nachdem']
        self.instrumental_list = ['indem', 'Indem']
        self.indirectspeech_list = ['ob', 'Ob', 'wann', 'Wann', 'wer', 'Wer', 'wie', 'Wie', 'warum', 'Warum', 'weshalb', 'Weshalb', 'wieso', 'Wieso']
        self.firstwordlist = []
        #self.firstwordlist = ['wann', 'Wann', 'wer', 'Wer', 'wie', 'Wie', 'warum', 'Warum', 'weshalb', 'Weshalb', 'wieso', 'Wieso', 'dies', 'dann', 'jedoch', 'deswegen', 'trotzdem', 'danach', 'davor', 'wenn', 'sobald']
        self.full_list = (self.adversativ_list + self.final_list + self.kausal_list
                          + self.konditional_list + self.konsekutiv_list + self.konzessiv_list
                          + self.lokal_list + self.temporal_list_nach + self.temporal_list_vor
                          + self.instrumental_list + self.indirectspeech_list)

    def ReadDoc2Sent(self, document):
        # Read a plain-text document and group its whitespace tokens into
        # sentences, closing a sentence at a token that ends in sentence-final
        # punctuation. Note that len(word) > 2 means a bare '.' token or a
        # two-character token such as 'a.' does not close a sentence.
        splitsentences = []
        splitsentence = []
        with open(document) as sentences:
            counter = 0
            for sentence in sentences:
                counter += 1
                if counter % 1000 == 0:
                    print(counter)
                words = sentence.split()
                for word in words:
                    splitsentence.append(word)
                    if (word[-1] in self.punktuation_list or word in self.punktuation_list) and len(word) > 2:
                        splitsentences.append([splitsentence])
                        splitsentence = []
        return splitsentences
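    # --- Illustration (added sketch, not part of the original pipeline) -----
    # A minimal, self-contained rendering of the grouping rule used in
    # ReadDoc2Sent above, with the file I/O left out; the input lines are
    # hypothetical.
    @staticmethod
    def _demo_read_lines(lines):
        punktuation_list = ['.', '?', '!', ';', ':']
        splitsentences, splitsentence = [], []
        for line in lines:
            for word in line.split():
                splitsentence.append(word)
                # a sentence ends at a token ending in final punctuation
                if (word[-1] in punktuation_list or word in punktuation_list) and len(word) > 2:
                    splitsentences.append([splitsentence])
                    splitsentence = []
        return splitsentences

    # SentSeg._demo_read_lines(['Das ist gut.', 'Geht das?'])
    # -> [[['Das', 'ist', 'gut.']], [['Geht', 'das?']]]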
    def AndOrSolver(self, sentences, punctuations):
        # Split coordinated main clauses ('und', 'oder', ...) into separate
        # sentences: clause boundaries are marked in-place with the string
        # 'alohaseparator' and cut apart afterwards.
        for n in range(len(punctuations)):
            if punctuations[n] == ':' or punctuations[n] == '-':
                punctuations[n] = '.'
        splitsentences = []
        counter = 0
        newsentences = []
        for sentence in sentences:
            newpunctuationsindexes = []
            utterancenumber = sentence[2]
            commainfo = sentence[1]
            commaornot = commainfo[0]
            sentence = sentence[0]
            counter += 1
            doc = self.nlp(' '.join(sentence))
            subjectcount = 0
            separationwords = []
            subjectcounts = []
            doccounter = 0
            subjectindex = []
            rcornot = 0
            for word in doc:
                doccounter += 1
                if word.dep_ == 'sb' or word.dep_ == 'ep':
                    subjectcount += 1
                    subjectindex.append(doccounter - 1)
                if word.dep_ == 'rc':
                    rcornot = 1
                if word.tag_ == '$,':
                    subjectcounts.append([subjectcount, doccounter - 2, subjectindex, rcornot])
                    subjectindex = []
                    subjectcount = 0
                    if len(sentence[doccounter - 2]) > 1:
                        doccounter -= 1
                if word.text in ('und', 'also', 'oder', 'schon', 'bald', 'doch', 'jedoch', 'sondern'):
                    separationwords.append(doccounter - 1)
            separationwordstocut = []
            listofownsentencessubjectindexes = []
            for n in range(len(subjectcounts) - 1):
                if subjectcounts[n][0] > 0 and subjectcounts[n + 1][0] > 0 and subjectcounts[n + 1][3] == 0:
                    listofownsentencessubjectindexes.append(subjectcounts[n])
                for m in range(len(separationwords)):
                    if subjectcounts[n][1] < separationwords[m] < subjectcounts[n + 1][1]:
                        if subjectcounts[n + 1][0] > 1:
                            if subjectcounts[n + 1][2][0] < separationwords[m] <= subjectcounts[n + 1][2][-1]:
                                separationwordstocut.append(separationwords[m])
            processed = 0
            if len(listofownsentencessubjectindexes) > 0:
                for n in range(len(listofownsentencessubjectindexes)):
                    sentence[listofownsentencessubjectindexes[n][1]] += 'alohaseparator'
                    newpunctuationsindexes.append([punctuations[counter - 1], counter - 1])
                processed = 1
            if len(separationwordstocut) > 0:
                for n in range(len(separationwordstocut)):
                    sentence[separationwordstocut[n] - 1] += 'alohaseparator'
                    newpunctuationsindexes.append([punctuations[counter - 1], counter - 1])
                processed = 1
            if processed == 0:
                newsentences.append([sentence])
            if processed == 1:
                splitsentence = []
                for word in sentence:
                    splitsentence.append(word)
                    if word[-14:] == 'alohaseparator':
                        # strip the marker, plus a comma directly in front of it
                        if splitsentence[-1][-15] == ',':
                            splitsentence[-1] = splitsentence[-1][:-15]
                        else:
                            splitsentence[-1] = splitsentence[-1][:-14]
                        newsentences.append([splitsentence])
                        splitsentence = []
                newsentences.append([splitsentence])
            newpunctuationsindexes = newpunctuationsindexes[::-1]
            for n in range(len(newpunctuationsindexes)):
                punctuations.insert(newpunctuationsindexes[n][1], newpunctuationsindexes[n][0])
        return newsentences, punctuations
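    # --- Illustration (added sketch, not part of the original pipeline) -----
    # How the 'alohaseparator' marker is cut out again, on hypothetical input;
    # this mirrors the stripping loop at the end of AndOrSolver above.
    @staticmethod
    def _demo_split_on_marker(words):
        splitsentences, splitsentence = [], []
        for word in words:
            splitsentence.append(word)
            if word[-14:] == 'alohaseparator':
                if splitsentence[-1][-15] == ',':
                    splitsentence[-1] = splitsentence[-1][:-15]
                else:
                    splitsentence[-1] = splitsentence[-1][:-14]
                splitsentences.append([splitsentence])
                splitsentence = []
        splitsentences.append([splitsentence])
        return splitsentences

    # SentSeg._demo_split_on_marker(['Er', 'schlief,alohaseparator', 'sie', 'las.'])
    # -> [[['Er', 'schlief']], [['sie', 'las.']]]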
    def LoadBoWModelAndDatabaseOnesZeros(self):
        # FASTsearch indexes over the grammar-schema databases: one keyed by
        # word.tag_, one by word.dep_ (used by the disabled correction path
        # in SplitCommatas).
        import FASTsearch
        self.fsearch1 = FASTsearch.FASTsearch('GS_DB_word.tag_.hkl')
        self.fsearch1.Gen_BoW_Model(1000, "word")
        self.fsearch1.Load_BoW_Model('bagofwordsGS_DB_word.tag_.pkl', 'DataBaseOneZerosGS_DB_word.tag_.hkl')
        self.fsearch2 = FASTsearch.FASTsearch('GS_DB_word.dep_.hkl')
        self.fsearch2.Gen_BoW_Model(1000, "word")
        self.fsearch2.Load_BoW_Model('bagofwordsGS_DB_word.dep_.pkl', 'DataBaseOneZerosGS_DB_word.dep_.hkl')

    def LoadSentGlueSGDandGSUtils(self):
        # Load the GS utilities, the SentGlue stochastic gradient models and
        # the spaCy German model; self.nlp is needed by most methods below.
        import GS_Utils
        self.gs = GS_Utils.GS_Utils('de_core_news_sm')
        from SentGlue import SentGlueMach
        self.sgm = SentGlueMach('trainedSGD_twolabel.pkl', 'bagofwordstwolabel.pkl')
        self.sgm.initialize()
        import spacy
        self.nlp = spacy.load('de_core_news_sm')
        return 'done'

    def CommaSentenceOrNot(self, sentences):
        # Append [comma count, token] to every sentence record, where token
        # holds the first word plus each word that directly follows a comma.
        # word.pos_ is useful here for noun and verb, word.dep_ for sb/pd,
        # and possibly word.tag_.
        nlp = self.nlp
        commasentences = []
        counter = 0
        for sentence in sentences:
            doc = nlp(' '.join(sentence[0]))
            counter += 1
            n = 0
            firstone = 0
            token = []
            nextword = 0
            for word in doc:
                if firstone == 0:
                    token.append(word.text)
                    firstone = 1
                if nextword == 1:
                    token.append(word.text)
                    nextword = 0
                if word.tag_ == '$,':
                    n += 1
                    nextword = 1
            sentence.append([n, token])
            commasentences.append(sentence)
        return commasentences
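    # --- Illustration (added sketch, not part of the original pipeline) -----
    # A spaCy-free rendering of the bookkeeping in CommaSentenceOrNot: the
    # comma count plus the first word and each word following a comma, which
    # is exactly the 'token' list the conjunction handlers in SplitCommatas
    # later inspect. Here a plain ',' token stands in for the '$,' tag.
    @staticmethod
    def _demo_comma_info(tokens):
        n, token, nextword, firstone = 0, [], 0, 0
        for t in tokens:
            if firstone == 0:
                token.append(t)
                firstone = 1
            if nextword == 1:
                token.append(t)
                nextword = 0
            if t == ',':
                n += 1
                nextword = 1
        return [n, token]

    # SentSeg._demo_comma_info(['Er', 'lacht', ',', 'weil', 'sie', 'singt'])
    # -> [1, ['Er', 'weil']]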
    def EnumerationSolver(self, sentences):
        # Split enumerations ('A, B und C ...') into separate full sentences:
        # the member with the highest SPO score is taken as the main sentence,
        # every other member is merged with pieces of it, and the best merge
        # is chosen by the SentGlue model.
        gs = self.gs
        nlp = self.nlp
        sgm = self.sgm
        enumerationsentences = []
        counter = 0
        NOTenumerations = []
        for sentence in sentences:
            doc = nlp(' '.join(sentence[0]))
            counter += 1
            n = 0
            firstone = 0
            token = []
            nextword = 0
            enumeration = False
            splitsentence = []
            splitsentence_deps = []
            splitsentence_tags = []
            splitsentences = []
            splitsentences_deps = []
            splitsentences_tags = []
            for word in doc:
                nextword = 0
                if word.tag_ == '$,':
                    n += 1
                    nextword = 1
                if (word.text == 'und' or word.text == 'oder') and n >= 1:
                    enumeration = True
                    break
            output = []
            if enumeration:
                # split on commas and 'und' into enumeration members
                for word in doc:
                    if word.text != ',' and word.text != '.' and word.text != 'und':
                        splitsentence.append(word.text)
                        splitsentence_deps.append(word.dep_)
                        splitsentence_tags.append(word.tag_)
                    if word.text == ',' or word.text == 'und':
                        splitsentences.append(splitsentence)
                        splitsentences_deps.append(splitsentence_deps)
                        splitsentences_tags.append(splitsentence_tags)
                        splitsentence = []
                        splitsentence_deps = []
                        splitsentence_tags = []
                splitsentences.append(splitsentence)
                splitsentences_deps.append(splitsentence_deps)
                splitsentences_tags.append(splitsentence_tags)
                token = []
                enumerations = []
                enumerationsSPOs = []
                NOTenumerations = []
                for sentence in splitsentences:
                    token.append(sentence[0])
                    if sentence[0] not in self.full_list:
                        enumerations.append(sentence)
                        enumerationsSPOs.append(gs.checkSPO(sentence, 0))
                    else:
                        NOTenumerations.append(sentence)
                biggest = []
                for i in range(len(enumerationsSPOs)):
                    biggest.append([i, sum(enumerationsSPOs[i])])
                sortedbiggest = sorted(biggest[::-1], key=lambda tup: tup[1], reverse=True)
                for i in range(len(sortedbiggest)):
                    if sortedbiggest[i][0] == 0:
                        mainsentenceIndex = sortedbiggest[i][0]
                        lastornot = 0
                        break
                    if sortedbiggest[i][0] == len(biggest) - 1:
                        mainsentenceIndex = sortedbiggest[i][0]
                        lastornot = 1
                        break
                # TODO: for the case 'Er, sie und der Beamte LACHTEN den Clown
                # aus', the plural 'lachten' still has to be caught via the
                # database of cases, i.e. with enumeration in the SPO 1 0 0 the
                # plural must become singular depending on the articles.
                mainsentence = enumerations[mainsentenceIndex]
                probablemainsentences = []
                for i in range(len(enumerations)):
                    if i != mainsentenceIndex:
                        iprobablemainsentences = []
                        probablemainsentence = []
                        if lastornot == 0:
                            for j in range(1, len(mainsentence)):
                                probablemainsentence = mainsentence[0:j] + enumerations[i]
                                iprobablemainsentences.append(' '.join(probablemainsentence))
                        if lastornot == 1:
                            for j in range(1, len(mainsentence)):
                                probablemainsentence = enumerations[i] + mainsentence[-j:]
                                iprobablemainsentences.append(' '.join(probablemainsentence))
                        probablemainsentences.append(iprobablemainsentences)
                # Presence is checked here, but what matters more in this case
                # is that a tuple does not show up torn apart. CHANGE THIS !!!!
                tuplesToCheck = []
                tuples = [['ART', 'NN'], ['APPR', 'NN'], ['ART', 'CARD']]
                for tupl in tuples:
                    checktupleindex, tupleInWords = gs.checkForAnnotationTuple(mainsentence, tupl, 'word.tag_', 'None')
                    if checktupleindex == 2:
                        tuplesToCheck.append([tupl, tupleInWords])
                triplesToCheck = []
                triples = [['ART', 'ADJA', 'NN'], ['APPR', 'ART', 'NN'], ['KOKOM', 'ART', 'NN']]
                for tripl in triples:
                    checktripleindex, tripleInWords = gs.checkForAnnotationTriple(mainsentence, tripl, 'word.tag_', 'None')
                    if checktripleindex == 3:
                        triplesToCheck.append([tripl, tripleInWords])
                for probsentences in probablemainsentences:
                    checktripleindexes = []
                    checktupleindexes = []
                    filteredprobsentences = []
                    for sentence in probsentences:
                        tuplchecked = 0
                        triplchecked = 0
                        for tupl in tuplesToCheck:
                            checkedsecondtime, tupleinWords = gs.checkForAnnotationTuple(sentence.split(), tupl[0], 'word.tag_', tupl[1])
                            if checkedsecondtime == 1:
                                tuplchecked = 0
                            if checkedsecondtime == 2:
                                tuplchecked = 1
                        for tripl in triplesToCheck:
                            checkedsecondtime, tripleinWords = gs.checkForAnnotationTriple(sentence.split(), tripl[0], 'word.tag_', tripl[1])
                            if checkedsecondtime == 1 or checkedsecondtime == 2:
                                triplchecked = 0
                            if checkedsecondtime == 3:
                                triplchecked = 1
                        if triplchecked == 1 or tuplchecked == 1:
                            filteredprobsentences.append(sentence)
                    if len(filteredprobsentences) == 0:
                        filteredprobsentences = probsentences
                    # there is still the problem that these are lists of words
                    # instead of proper sentences..
                    probsMatrix = sgm.predictprobsOnSentenceList(filteredprobsentences, filteredprobsentences)
                    for i in range(len(probsMatrix)):
                        probsMatrix[i][0] = i
                    sortedprobsMatrix = sorted(probsMatrix[::-1], key=lambda tup: tup[1], reverse=True)
                    bestindex = sortedprobsMatrix[0][0]
                    probablemainsentence = filteredprobsentences[int(bestindex)]
                    enumerationsentences.append([probablemainsentence])
                enumerationsentences.append([' '.join(mainsentence)])
                for notenum in NOTenumerations:
                    enumerationsentences[-1].append(' '.join(notenum))
                    enumerationsentences[-1] = [', '.join(enumerationsentences[-1])]
            else:
                enumerationsentences.append([sentence])
            output.append(enumerationsentences)
        for n in range(len(output[0])):
            try:
                output[0][n] = [output[0][n][0].split()]
            except:
                output[0][n] = [output[0][n][0][0]]
        return output[0]

    def GetUtteranceNumber(self, sentences):
        # Append the number of subjects (deps 'sb'/'ep') to every sentence.
        nlp = self.nlp
        uttersentences = []
        for sentence in sentences:
            doc = nlp(' '.join(sentence[0]))
            subjectcount = 0
            for word in doc:
                if word.dep_ == 'sb' or word.dep_ == 'ep':
                    subjectcount += 1
            sentence.append(subjectcount)
            uttersentences.append(sentence)
        return uttersentences

    def GetQuestionOrNot(self, sentences):
        # Append a question flag to every sentence.
        nlp = self.nlp
        uttersentences = []
        for sentence in sentences:
            questionmark = 0  # reset per sentence; was carried over between sentences before
            doc = nlp(' '.join(sentence[0]))
            count = 0
            for word in doc:
                count += 1
                if word.text == '?':
                    questionmark = 1
            sentence.append(questionmark)
            uttersentences.append(sentence)
        return uttersentences
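    # --- Illustration (added sketch, not part of the original pipeline) -----
    # The list-shaped record that the annotation passes above build up and
    # that AndOrSolver/SplitSentencesIntoHauptNebenTuple/SplitCommatas read
    # back via fixed indexes; the concrete values are hypothetical.
    @staticmethod
    def _demo_record_shape():
        record = [['Er', 'lacht', ',', 'weil', 'sie', 'singt.']]  # [0] token list
        record.append([1, ['Er', 'weil']])  # [1] CommaSentenceOrNot: [comma count, token]
        record.append(2)                    # [2] GetUtteranceNumber: subject count
        record.append(0)                    # [3] GetQuestionOrNot: question flag
        return record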
    def SplitSentencesIntoHauptNebenTuple(self, sentences, punctuations):
        # Split sentences with two or more commas into main-clause/subordinate-
        # clause tuples ('Haupt-/Nebensatz'), based on SPO counts and verb tags
        # of the comma-separated parts.
        oldsplitsentences = []
        gs = self.gs
        nlp = self.nlp
        outputsentences = []
        sentencesThatAreOutoutput = []
        outsentences = []
        for generalindex in range(len(sentences)):
            presentence = sentences[generalindex]
            splitsentence = []
            splitsentence_deps = []
            splitsentence_tags = []
            splitsentences = []
            splitsentences_deps = []
            splitsentences_tags = []
            commainfo = presentence[1]
            outputsentence = []
            token = commainfo[1]
            commaornot = commainfo[0]
            numberutterances = presentence[2]
            sentence = presentence[0]
            oldsentence = presentence[0]
            if commaornot >= 2:
                sentence[0] = sentence[0].title()
                doc = nlp(' '.join(sentence))
                for word in doc:
                    if word.text != ',' and word.text != '.':
                        splitsentence.append(word.text)
                        splitsentence_deps.append(word.dep_)
                        splitsentence_tags.append(word.tag_)
                    if word.text == ',':
                        splitsentences.append(splitsentence)
                        splitsentences_deps.append(splitsentence_deps)
                        splitsentences_tags.append(splitsentence_tags)
                        splitsentence = []
                        splitsentence_deps = []
                        splitsentence_tags = []
                splitsentences.append(splitsentence)
                splitsentences[0][0] = splitsentences[0][0].lower()
                splitsentences_deps.append(splitsentence_deps)
                splitsentences_tags.append(splitsentence_tags)
                oldsplitsentences = splitsentences
                spo = []
                for n in range(len(splitsentences)):
                    prespo = gs.checkSPO(splitsentences_deps[n], 1)
                    prespo.append(gs.checkForAnnotation(splitsentences[n], 'VVINF', 'word.tag_'))
                    prespo.append(gs.checkForAnnotation(splitsentences[n], 'VAFIN', 'word.tag_'))
                    prespo.append(gs.checkForAnnotation(splitsentences[n], 'VVFIN', 'word.tag_'))
                    prespo.append(gs.checkForAnnotation(splitsentences[n], 'VMFIN', 'word.tag_'))
                    spo.append(prespo)
                indexSPO = []
                lastm = len(splitsentences)
                for o in range(len(splitsentences)):
                    m = len(splitsentences) - 1 - o
                    for n in range(len(splitsentences)):
                        if m < n - 1 and n < lastm:
                            sb = spo[m][0] + spo[n][0]
                            Vafin = 1
                            if spo[m][3] == 1 or spo[n][3] == 1:
                                Vafin = spo[m][3] + spo[n][3]
                            Vvinf = 1
                            if spo[m][4] == 1 or spo[n][4] == 1:
                                Vvinf = spo[m][4] + spo[n][4]
                            Vvfin = 1
                            if spo[m][5] == 1 or spo[n][5] == 1:
                                Vvfin = spo[m][5] + spo[n][5]
                            Vmfin = 1
                            if spo[m][6] == 1 or spo[n][6] == 1:
                                Vmfin = spo[m][6] + spo[n][6]  # was '==', a no-op comparison
                            if sb == 1 and Vafin == 1 and Vvinf == 1 and (Vvfin == 1 or Vmfin == 1):
                                indexSPO.append([m, n])
                                lastm = m
                Hauptsentences = []
                for n in range(len(indexSPO)):
                    if indexSPO[n][0] > indexSPO[n][1]:
                        i = 1
                        j = 0
                    else:
                        i = 0
                        j = 1
                    Hauptsentences.append([splitsentences[indexSPO[n][i]] + splitsentences[indexSPO[n][j]], indexSPO[n][i], indexSPO[n][j]])
                HauptSentences = []
                for n in range(len(Hauptsentences)):
                    m = len(Hauptsentences) - 1 - n
                    HauptSentences.append(Hauptsentences[m])
                sentencesThatAreOut = []
                for n in range(len(HauptSentences)):
                    index = HauptSentences[n][1]
                    finish = 0
                    if n == len(HauptSentences) - 1:
                        stopindex = len(splitsentences)
                        finish = 1
                    else:
                        stopindex = HauptSentences[n + 1][1]
                    vvfinisthere = 0
                    if finish == 0:
                        if splitsentences_tags[stopindex][0] == 'VVFIN':
                            stopindex -= 1
                            vvfinisthere = 1
                    if splitsentences_tags[index][0] == 'VVFIN':
                        vvfinisthere = 1
                    if vvfinisthere == 1:
                        HNTuple = HauptSentences[n][0] + [','] + splitsentences[index - 1]
                        outputsentence.append(HNTuple)
                        sentencesThatAreOut.append(index - 1)
                        sentencesThatAreOut.append(Hauptsentences[n][1])
                        sentencesThatAreOut.append(Hauptsentences[n][2])
                    for m in range(index + 1, stopindex):
                        if m != HauptSentences[n][2]:
                            HNTuple = HauptSentences[n][0] + [','] + splitsentences[m]
                            outputsentence.append(HNTuple)
                            sentencesThatAreOut.append(m)
                            sentencesThatAreOut.append(Hauptsentences[n][1])
                            sentencesThatAreOut.append(Hauptsentences[n][2])
                sentencesThatAreOutoutput.append(sentencesThatAreOut)
                cpOrNots = []
                rcOrNots = []
                for splitsentence in splitsentences_deps:
                    cpOrNot = gs.checkForAnnotationInTokenizedSentence(splitsentence, 'cp')
                    cpOrNots.append(cpOrNot)
                    rcOrNot = gs.checkForAnnotationInTokenizedSentence(splitsentence, 'rc')
                    rcOrNots.append(rcOrNot)
                pairs = []
                for n in range(len(cpOrNots)):
                    index = len(cpOrNots) - 1 - n
                    done = 0
                    if rcOrNots[index] == 1:
                        pairs.append([index, index - 1])
                        done = 1
                    if done == 0 and cpOrNots[index] == 1:
                        try:
                            if splitsentences_tags[index + 1][0] == 'VVFIN':
                                pairs.append([index, index + 1])
                                done = 1
                        except:
                            pass
                        try:
                            if done == 0 and rcOrNots[index - 1] == 0:
                                pairs.append([index, index - 1])
                                done = 1
                        except:
                            pass
                        try:
                            if done == 0 and rcOrNots[index - 1] == 1:
                                if rcOrNots[index - 2] == 0:
                                    pairs.append([index, index - 2])
                        except:
                            pass
                for pair in pairs[::-1]:
                    if pair[0] not in set(sentencesThatAreOut) or pair[1] not in set(sentencesThatAreOut):
                        outputsentence.append(splitsentences[pair[1]] + [','] + splitsentences[pair[0]])
                sentences[generalindex][0] = outputsentence
            try:
                if type(sentences[generalindex][0][0]) == str:
                    sentences[generalindex][0] = [sentences[generalindex][0]]
            except:
                pass
            newgeneratedsentences = len(sentences[generalindex][0])
            if newgeneratedsentences > 1:
                for sentence in sentences[generalindex][0]:
                    punctuations.insert(generalindex, punctuations[generalindex])
                    outsentences.append(sentence)
                del punctuations[generalindex]
            if newgeneratedsentences == 1:
                if len(sentences[generalindex][0][0]) > 1:
                    outsentences.append(sentences[generalindex][0][0])
                else:
                    outsentences.append(oldsentence)
            if newgeneratedsentences == 0:
                outsentences.append(oldsentence)
        # connect free-standing commas with the word before
        for outsentence in outsentences:
            todelete = []
            for n in range(len(outsentence)):
                if outsentence[n] == ',':
                    todelete.append(n)
                    outsentence[n - 1] = outsentence[n - 1] + ','
            for deleteindex in todelete[::-1]:
                del outsentence[deleteindex]
        for index in range(len(outsentences)):
            outsentences[index] = [outsentences[index]]
        # removing doubles
        doubledsentences = []
        for o in range(len(outsentences)):
            sentence = outsentences[o][0]
            for m in range(len(outsentences)):
                if m != o:
                    count = 0
                    for n in range(len(sentence)):
                        if sentence[n] in outsentences[m][0] or sentence[n][:-1] in outsentences[m][0]:
                            count += 1
                    if count == len(sentence):
                        doubledsentences.append(sentence)
        punctdeleteindex = []
        tmp = set()
        for sentence in doubledsentences:
            tmp.add(tuple(sentence))
        doubledsentences = []
        for tup in tmp:
            doubledsentences.append([list(tup)])
        punctdeleteindexes = []
        for double in doubledsentences:
            if double in outsentences:
                punctdeleteindex = outsentences[::-1].index(double)
                del outsentences[len(outsentences) - 1 - punctdeleteindex]
                punctdeleteindexes.append(punctdeleteindex)
        for index in punctdeleteindexes[::-1]:
            del punctuations[len(outsentences) - 1 - index]
        # reinsert comma parts that did not make it into any output sentence
        for o in range(len(oldsplitsentences)):
            for m in range(len(outsentences)):
                counter = 0
                for n in range(len(oldsplitsentences[o])):
                    if oldsplitsentences[o][n] in outsentences[m][0] or oldsplitsentences[o][n] + ',' in outsentences[m][0]:
                        counter += 1
                if counter >= len(oldsplitsentences[o]):
                    break
                if m == len(outsentences) - 1 and counter < len(oldsplitsentences[o]):
                    if o == 0:
                        outsentences.insert(0, [oldsplitsentences[o]])
                        punctuations.insert(0, punctuations[0])
                    else:
                        newones = []
                        for i in range(len(outsentences)):
                            if outsentences[i][0][-1] == oldsplitsentences[o - 1][-1]:
                                if len(outsentences[i][0]) > 2 and len(oldsplitsentences[o - 1]) > 2:
                                    if outsentences[i][0][-2] == oldsplitsentences[o - 1][-2]:
                                        if outsentences[i][0][-3] == oldsplitsentences[o - 1][-3]:
                                            newones.append([i + 1, [oldsplitsentences[o]]])
                        for newone in newones[::-1]:
                            outsentences.insert(newone[0], newone[1])
                            punctuations.insert(newone[0], punctuations[newone[0] - 1])
        return outsentences, punctuations
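    # --- Illustration (added sketch, not part of the original pipeline) -----
    # What the kausal branch of SplitCommatas below does to a pair of clauses,
    # on a hypothetical example: 'Er bleibt zuhause, weil es regnet.' The
    # conjunction is dropped, 'deswegen' is prepended to the main clause, and
    # the clauses are swapped; word order is repaired later by the
    # permutation/grammar step.
    @staticmethod
    def _demo_kausal_rewrite():
        splitsentences = [['er', 'bleibt', 'zuhause'], ['es', 'regnet']]  # 'weil' already removed
        n = 1  # the conjunction headed the second clause
        splitsentences[n - 1].insert(0, 'deswegen')
        splitsentences[0], splitsentences[1] = splitsentences[1], splitsentences[0]
        return splitsentences  # [['es', 'regnet'], ['deswegen', 'er', 'bleibt', 'zuhause']]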
    # Note: the input here must always come in pairs of main clause and
    # subordinate clause, i.e. a further upstream class is needed.
    def SplitCommatas(self, Inputsentences, punctuations):
        # Rewrite one-comma sentences into two independent sentences,
        # depending on which conjunction list the comma token belongs to;
        # word order is then repaired via permutations scored by SentGlue.
        gs = self.gs
        nlp = self.nlp
        gramcorr_splitsentences = []
        counter = 0
        newpunctuationsindex = []
        for Inputsentence in Inputsentences:
            counter += 1
            commainfo = Inputsentence[1]
            token = commainfo[1]
            commaornot = commainfo[0]
            numberutterances = Inputsentence[2]
            if commaornot == 0:
                gramcorr_splitsentences.append(Inputsentence[0])
            if commaornot > 1:
                gramcorr_splitsentences.append(Inputsentence[0])
            if commaornot == 1:
                oldsentence = Inputsentence[0]
                Inputsentence = [[Inputsentence[0]]]
                for sentence in Inputsentence[0]:
                    splitsentence = []
                    splitsentences = []
                    processed = 0
                    wasNotInAnyList = 0
                    try:
                        for n in range(len(token)):
                            if token[n] in self.final_list:
                                splitsentence = []
                                for word in sentence:
                                    if word != token[n]:
                                        if word[-1] == ',':
                                            splitsentence.append(word[:-1])
                                        if word[-1] != ',':
                                            splitsentence.append(word)
                                    if word[-1] == ',' or word == ',':
                                        splitsentences.append(splitsentence)
                                        splitsentence = []
                                splitsentences.append(splitsentence)
                                if n == 1:
                                    if token[n] == 'um' or token[n] == 'Um':
                                        splitsentences[n].insert(0, 'dies')
                                        splitsentences[n].insert(0, 'um')
                                    else:
                                        splitsentences[n].insert(0, 'dann')
                                if n == 0:
                                    if token[n] == 'um' or token[n] == 'Um':
                                        splitsentences[n].insert(0, 'dies')
                                        splitsentences[n].insert(0, 'um')
                                        splitsentences = splitsentences[::-1]
                                    else:
                                        splitsentences[n].insert(0, 'dann')
                                        splitsentences[0], splitsentences[1] = splitsentences[1], splitsentences[0]
                                generalrules = [['ADV', 'VAFIN'], ['ADV', 'VVFIN']]
                                processed = 1
                            if token[n] in self.adversativ_list:
                                splitsentence = []
                                for word in sentence:
                                    if word != token[n]:
                                        if word[-1] == ',':
                                            splitsentence.append(word[:-1])
                                        if word == ',':
                                            pass
                                        if word[-1] != ',':
                                            splitsentence.append(word)
                                    if word[-1] == ',' or word == ',':
                                        splitsentences.append(splitsentence)
                                        splitsentence = []
                                splitsentences.append(splitsentence)
                                splitsentences[n].append('jedoch')
                                generalrules = [['ADV', 'VAFIN'], ['ADV', 'VVFIN']]
                                processed = 1
                            if token[n] in self.kausal_list:
                                splitsentence = []
                                for word in sentence:
                                    if word != token[n]:
                                        if word[-1] == ',':
                                            splitsentence.append(word[:-1])
                                        if word == ',':
                                            pass
                                        if word[-1] != ',':
                                            splitsentence.append(word)
                                    if word[-1] == ',' or word == ',':
                                        splitsentences.append(splitsentence)
                                        splitsentence = []
                                splitsentences.append(splitsentence)
                                # Since 'deswegen' is attached to the other clause, the
                                # input to commasentences must always be TWO sentences.
                                if n == 1:
                                    splitsentences[n - 1].insert(0, 'deswegen')
                                    splitsentences[0], splitsentences[1] = splitsentences[1], splitsentences[0]
                                if n == 0:
                                    splitsentences[n + 1].insert(0, 'deswegen')
                                generalrules = [['PROAV', 'VAFIN'], ['PROAV', 'VVFIN']]
                                processed = 1
                            # konsekutiv clauses are split according to
                            # https://www.deutschplus.net/pages/Konsekutivsatz
                            if token[n] in self.konsekutiv_list:
                                splitsentence = []
                                for word in sentence:
                                    if word != token[n]:
                                        if word[-1] == ',':
                                            splitsentence.append(word[:-1])
                                        if word == ',':
                                            pass
                                        if word[-1] != ',':
                                            splitsentence.append(word)
                                    if word[-1] == ',' or word == ',':
                                        splitsentences.append(splitsentence)
                                        splitsentence = []
                                splitsentences.append(splitsentence)
                                generalrules = [['KOUS', 'PPER']]
                                processed = 1
                            if token[n] in self.konditional_list:
                                splitsentence = []
                                for word in sentence:
                                    if word[-1] == ',':
                                        splitsentence.append(word[:-1])
                                    if word == ',':
                                        pass
                                    if word[-1] != ',':
                                        splitsentence.append(word)
                                    if word[-1] == ',' or word == ',':
                                        splitsentences.append(splitsentence)
                                        splitsentence = []
                                splitsentences.append(splitsentence)
                                if n == 1:
                                    spoCount = gs.checkSPO(splitsentences[n], 0)
                                    spoCount = sum(spoCount)
                                    if spoCount == 2:
                                        thereisanes = 0
                                        for word in splitsentences[n]:
                                            if word == 'es' or word == 'Es':
                                                thereisanes = 1
                                        if thereisanes == 0:
                                            splitsentences[n].append('es')
                                if n == 0:
                                    spoCount = gs.checkSPO(splitsentences[n], 0)
                                    spoCount = sum(spoCount)
                                    if spoCount == 2:
                                        thereisanes = 0
                                        for word in splitsentences[n]:
                                            if word == 'es' or word == 'Es':
                                                thereisanes = 1
                                        if thereisanes == 0:
                                            splitsentences[n].append('es')
                                    splitsentences[0], splitsentences[1] = splitsentences[1], splitsentences[0]
                                generalrules = [['KOUS', 'PPER']]
                                processed = 1
                            if token[n] in self.konzessiv_list:
                                splitsentence = []
                                for word in sentence:
                                    if word != token[n]:
                                        if word[-1] == ',':
                                            splitsentence.append(word[:-1])
                                        if word == ',':
                                            pass
                                        if word[-1] != ',':
                                            splitsentence.append(word)
                                    if word[-1] == ',' or word == ',':
                                        splitsentences.append(splitsentence)
                                        splitsentence = []
                                splitsentences.append(splitsentence)
                                if n == 1:
                                    splitsentences[n - 1].insert(0, 'trotzdem')
                                    splitsentences[0], splitsentences[1] = splitsentences[1], splitsentences[0]
                                if n == 0:
                                    splitsentences[n + 1].insert(0, 'trotzdem')
                                generalrules = [['PROAV', 'VAFIN'], ['PROAV', 'VVFIN']]
                                processed = 1
                            if token[n] in self.lokal_list:
                                splitsentence = []
                                for word in sentence:
                                    if word != token[n]:
                                        if word[-1] == ',':
                                            splitsentence.append(word[:-1])
                                        if word == ',':
                                            pass
                                        if word[-1] != ',':
                                            splitsentence.append(word)
                                    if word[-1] == ',' or word == ',':
                                        splitsentences.append(splitsentence)
                                        splitsentence = []
                                splitsentences.append(splitsentence)
                                if n == 1:
                                    splitsentences[n - 1].insert(0, 'dort')
                                    splitsentences[0], splitsentences[1] = splitsentences[1], splitsentences[0]
                                if n == 0:
                                    splitsentences[n + 1].insert(0, 'dort')
                                generalrules = [['PROAV', 'VAFIN'], ['PROAV', 'VVFIN']]
                                processed = 1
                            if token[n] in self.instrumental_list:
                                splitsentence = []
                                for word in sentence:
                                    if word != token[n]:
                                        if word[-1] == ',':
                                            splitsentence.append(word[:-1])
                                        if word == ',':
                                            pass
                                        if word[-1] != ',':
                                            splitsentence.append(word)
                                    if word[-1] == ',' or word == ',':
                                        splitsentences.append(splitsentence)
                                        splitsentence = []
                                splitsentences.append(splitsentence)
                                if n == 1:
                                    splitsentences[n - 1].insert(0, 'so')
                                    splitsentences[0], splitsentences[1] = splitsentences[1], splitsentences[0]
                                if n == 0:
                                    splitsentences[n + 1].insert(0, 'so')
                                generalrules = [['ADV', 'VAFIN'], ['ADV', 'VVFIN']]
                                processed = 1
                            if token[n] in self.temporal_list_vor:
                                splitsentence = []
                                for word in sentence:
                                    if word != token[n]:
                                        if word[-1] == ',':
                                            splitsentence.append(word[:-1])
                                        if word == ',':
                                            pass
                                        if word[-1] != ',':
                                            splitsentence.append(word)
                                    if word[-1] == ',' or word == ',':
                                        splitsentences.append(splitsentence)
                                        splitsentence = []
                                splitsentences.append(splitsentence)
                                if n == 1:
                                    splitsentences[n].insert(0, 'danach')
                                if n == 0:
                                    splitsentences[n].insert(0, 'danach')
                                    splitsentences[0], splitsentences[1] = splitsentences[1], splitsentences[0]
                                generalrules = [['PROAV', 'VAFIN'], ['PROAV', 'VVFIN']]
                                processed = 1
                            if token[n] in self.temporal_list_nach:
                                splitsentence = []
                                for word in sentence:
                                    if word != token[n]:
                                        if word[-1] == ',':
                                            splitsentence.append(word[:-1])
                                        if word == ',':
                                            pass
                                        if word[-1] != ',':
                                            splitsentence.append(word)
                                    if word[-1] == ',' or word == ',':
                                        splitsentences.append(splitsentence)
                                        splitsentence = []
                                splitsentences.append(splitsentence)
                                if n == 1:
                                    splitsentences[n].insert(0, 'davor')
                                if n == 0:
                                    splitsentences[n].insert(0, 'davor')
                                    splitsentences[0], splitsentences[1] = splitsentences[1], splitsentences[0]
                                generalrules = [['PROAV', 'VAFIN'], ['PROAV', 'VVFIN']]
                                processed = 1
                            if token[n] == 'der' or token[n] == 'welcher':
                                tokens = self.nlp(' '.join(sentence))
                                for word in tokens:
                                    if word.dep_ == 'rc':
                                        wordwithrc = word.text
                                rcORnot = gs.checkForAnnotation(sentence, 'rc', 'word.dep_')
                                oldsplitsentences = splitsentences
                                splitsentences = []
                                if rcORnot == 1:
                                    splitsentence = []
                                    for word in sentence:
                                        if word[-1] == ',':
                                            splitsentence.append(word[:-1])
                                        if word == ',':
                                            pass
                                        if word[-1] != ',':
                                            splitsentence.append(word)
                                        if word[-1] == ',' or word == ',':
                                            splitsentences.append(splitsentence)
                                            splitsentence = []
                                    splitsentences.append(splitsentence)
                                    # the swap is performed here because otherwise spaCy
                                    # does not read this as PDS.. analogous in the other cases.
                                    if wordwithrc in splitsentences[n]:
                                        splitsentences[n][0] = 'dieser'
                                        verb = splitsentences[n][-1]
                                        splitsentences[n] = splitsentences[n][:-1]
                                        splitsentences[n].insert(1, verb)
                                    generalrules = [['PDS', 'VAFIN'], ['PDS', 'VVFIN']]
                                    processed = 1
                                else:
                                    splitsentences = oldsplitsentences
                                    splitsentence = []
                            if token[n] == 'die' or token[n] == 'welche':
                                tokens = self.nlp(' '.join(sentence))
                                for word in tokens:
                                    if word.dep_ == 'rc':
                                        wordwithrc = word.text
                                rcORnot = gs.checkForAnnotation(sentence, 'rc', 'word.dep_')
                                oldsplitsentences = splitsentences
                                splitsentences = []
                                if rcORnot == 1:
                                    splitsentence = []
                                    for word in sentence:
                                        if word[-1] == ',':
                                            splitsentence.append(word[:-1])
                                        if word == ',':
                                            pass
                                        if word[-1] != ',':
                                            splitsentence.append(word)
                                        if word[-1] == ',' or word == ',':
                                            splitsentences.append(splitsentence)
                                            splitsentence = []
                                    splitsentences.append(splitsentence)
                                    if wordwithrc in splitsentences[n]:
                                        splitsentences[n][0] = 'diese'
                                        verb = splitsentences[n][-1]
                                        splitsentences[n] = splitsentences[n][:-1]
                                        splitsentences[n].insert(1, verb)
                                    generalrules = [['PDS', 'VAFIN'], ['PDS', 'VVFIN']]
                                    processed = 1
                                else:
                                    splitsentences = oldsplitsentences
                                    splitsentence = []
                            if token[n] == 'dem':
                                tokens = self.nlp(' '.join(sentence))
                                for word in tokens:
                                    if word.dep_ == 'rc':
                                        wordwithrc = word.text
                                rcORnot = gs.checkForAnnotation(sentence, 'rc', 'word.dep_')
                                oldsplitsentences = splitsentences
                                splitsentences = []
                                if rcORnot == 1:
                                    splitsentence = []
                                    for word in sentence:
                                        if word[-1] == ',':
                                            splitsentence.append(word[:-1])
                                        if word == ',':
                                            pass
                                        if word[-1] != ',' and word[-1] != '.':
                                            splitsentence.append(word)
                                        if word[-1] == ',':
                                            splitsentences.append(splitsentence)
                                            splitsentence = []
                                    splitsentences.append(splitsentence)
                                    if wordwithrc in splitsentences[n]:
                                        splitsentences[n][0] = 'diesem'
                                        verb = splitsentences[n][-1]
                                        splitsentences[n] = splitsentences[n][:-1]
                                        splitsentences[n].insert(1, verb)
                                    generalrules = [['PDS', 'VAFIN'], ['PDS', 'VVFIN']]
                                    processed = 1
                                else:
                                    splitsentences = oldsplitsentences
                                    splitsentence = []
                            if token[n] == 'das' or token[n] == 'welches':
                                tokens = self.nlp(' '.join(sentence))
                                for word in tokens:
                                    if word.dep_ == 'rc':
                                        wordwithrc = word.text
                                rcORnot = gs.checkForAnnotation(sentence, 'rc', 'word.dep_')
                                oldsplitsentences = splitsentences
                                splitsentences = []
                                if rcORnot == 1:
                                    splitsentence = []
                                    for word in sentence:
                                        if word[-1] == ',':
                                            splitsentence.append(word[:-1])
                                        if word == ',':
                                            pass
                                        if word[-1] != ',':
                                            splitsentence.append(word)
                                        if word[-1] == ',' or word == ',':
                                            splitsentences.append(splitsentence)
                                            splitsentence = []
                                    splitsentences.append(splitsentence)
                                    if wordwithrc in splitsentences[n]:
                                        splitsentences[n][0] = 'dieses'
                                        verb = splitsentences[n][-1]
                                        splitsentences[n] = splitsentences[n][:-1]
                                        splitsentences[n].insert(1, verb)
                                    generalrules = [['PDS', 'VAFIN'], ['PDS', 'VVFIN']]
                                    processed = 1
                                else:
                                    splitsentences = oldsplitsentences
                                    splitsentence = []
                            if token[n] == 'dessen' or token[n] == 'wessen':
                                tokens = self.nlp(' '.join(sentence))
                                for word in tokens:
                                    if word.dep_ == 'rc':
                                        wordwithrc = word.text
                                rcORnot = gs.checkForAnnotation(sentence, 'rc', 'word.dep_')
                                oldsplitsentences = splitsentences
                                splitsentences = []
                                if rcORnot == 1:
                                    splitsentence = []
                                    for word in sentence:
                                        if word[-1] == ',':
                                            splitsentence.append(word[:-1])
                                        if word == ',':
                                            pass
                                        if word[-1] != ',':
                                            splitsentence.append(word)
                                        if word[-1] == ',' or word == ',':
                                            splitsentences.append(splitsentence)
                                            splitsentence = []
                                    splitsentences.append(splitsentence)
                                    if wordwithrc in splitsentences[n]:
                                        verb = splitsentences[n][-1]
                                        splitsentences[n] = splitsentences[n][:-1]
                                        splitsentences[n].insert(1, verb)
                                    generalrules = [['PDS', 'VAFIN'], ['PDS', 'VVFIN']]
                                    processed = 1
                                else:
                                    splitsentences = oldsplitsentences
                                    splitsentence = []
                            if token[n] == 'den' or token[n] == 'welchen':
                                tokens = self.nlp(' '.join(sentence))
                                for word in tokens:
                                    if word.dep_ == 'rc':
                                        wordwithrc = word.text
                                rcORnot = gs.checkForAnnotation(sentence, 'rc', 'word.dep_')
                                oldsplitsentences = splitsentences
                                splitsentences = []
                                if rcORnot == 1:
                                    splitsentence = []
                                    for word in sentence:
                                        if word[-1] == ',':
                                            splitsentence.append(word[:-1])
                                        if word == ',':
                                            pass
                                        if word[-1] != ',':
                                            splitsentence.append(word)
                                        if word[-1] == ',' or word == ',':
                                            splitsentences.append(splitsentence)
                                            splitsentence = []
                                    splitsentences.append(splitsentence)
                                    if wordwithrc in splitsentences[n]:
                                        splitsentences[n][0] = 'diesen'
                                        verb = splitsentences[n][-1]
                                        splitsentences[n] = splitsentences[n][:-1]
                                        splitsentences[n].insert(1, verb)
                                    generalrules = [['PDS', 'VAFIN'], ['PDS', 'VVFIN']]
                                    processed = 1
                                else:
                                    splitsentences = oldsplitsentences
                                    splitsentence = []
                            if token[n] == 'wem' or token[n] == 'Wem' or token[n] == 'welchem':
                                daORnot = gs.checkForAnnotation(sentence, 'da', 'word.dep_')
                                oaORnot = gs.checkForAnnotation(sentence, 'oa', 'word.dep_')
                                reORnot = gs.checkForAnnotation(sentence, 're', 'word.dep_')
                                oldsplitsentences = splitsentences
                                splitsentences = []
                                for word in sentence:
                                    if word[-1] == ',':
                                        splitsentence.append(word[:-1])
                                    if word == ',':
                                        pass
                                    if word[-1] != ',':
                                        splitsentence.append(word)
                                    if word[-1] == ',' or word == ',':
                                        splitsentences.append(splitsentence)
                                        splitsentence = []
                                splitsentences.append(splitsentence)
                                if n == 0:
                                    index = 1
                                if n == 1:
                                    index = 0
                                if reORnot == 1:
                                    pass
                                if daORnot == 1 and reORnot == 0:
                                    splitsentences[index].insert(1, 'das')
                                if oaORnot == 1 and reORnot == 0:
                                    splitsentences[index].insert(1, 'dem')
                                if n == 1:
                                    splitsentences[0], splitsentences[1] = splitsentences[1], splitsentences[0]
                                generalrules = [['PDS', 'VAFIN'], ['PDS', 'VVFIN']]
                                processed = 1
                            if token[n] in self.indirectspeech_list and token[1] not in self.konsekutiv_list:
                                reORnot = gs.checkForAnnotation(sentence, 're', 'word.dep_')
                                oldsplitsentences = splitsentences
                                splitsentences = []
                                splitsentence = []
                                for word in sentence:
                                    if word[-1] == ',':
                                        splitsentence.append(word[:-1])
                                    if word == ',':
                                        pass
                                    if word[-1] != ',':
                                        splitsentence.append(word)
                                    if word[-1] == ',' or word == ',':
                                        splitsentences.append(splitsentence)
                                        splitsentence = []
                                splitsentences.append(splitsentence)
                                if n == 0:
                                    index = 1
                                if n == 1:
                                    index = 0
                                if reORnot == 0:
                                    if splitsentences[index][0] != 'was':
                                        splitsentences[index].insert(1, 'das')
                                if n == 1:
                                    splitsentences[0], splitsentences[1] = splitsentences[1], splitsentences[0]
                                generalrules = [['PDS', 'VAFIN'], ['PDS', 'VVFIN']]
                                processed = 1
                            if processed == 0 and n == 1:
                                ZUVINFTupelORnot = gs.checkForAnnotationTuple(sentence, ['PTKZU', 'VVINF'], 'word.tag_', 'None')
                                if ZUVINFTupelORnot == 0:
                                    ZUVINFTupelORnot = gs.checkForAnnotationTuple(sentence, ['PTKZU', 'VAINF'], 'word.tag_', 'None')
                                if ZUVINFTupelORnot == 1:
                                    reORnot = gs.checkForAnnotation(sentence, 're', 'word.dep_')
                                    splitsentence = []
                                    for word in sentence:
                                        if word[-1] == ',':
                                            splitsentence.append(word[:-1])
                                        if word == ',':
                                            pass
                                        if word[-1] != ',':
                                            splitsentence.append(word)
                                        if word[-1] == ',' or word == ',':
                                            splitsentences.append(splitsentence)
                                            processed = 1
                                            splitsentence = []
                                    splitsentences.append(splitsentence)
                                    for m in range(2):
                                        ZUINForNOT = gs.checkForAnnotationTuple(splitsentences[m], ['PTKZU', 'VVINF'], 'word.tag_', 'None')
                                        if ZUINForNOT == 0:
                                            ZUINForNOT = gs.checkForAnnotationTuple(splitsentences[m], ['PTKZU', 'VAINF'], 'word.tag_', 'None')
                                        if ZUINForNOT == 1:
                                            r = m
                                            ZUINForNOT = 0
                                    if r == 0:
                                        index = 1
                                    if r == 1:
                                        index = 0
                                    objectORnot = gs.checkForAnnotation(splitsentences[index], 'oa', 'word.dep_')
                                    if reORnot == 0 and objectORnot == 0:
                                        splitsentences[index].insert(1, 'das')
                                    if r == 1:
                                        splitsentences[0], splitsentences[1] = splitsentences[1], splitsentences[0]
                                else:
                                    processed = 2  # was 'processed == 2', a no-op comparison
                    except:
                        wasNotInAnyList = 1
                    #rules = [['ART','ADJA','NN'], ['ART','ADJA','NE'], ['ART', 'NN'], ['ART', 'NE'], ['APPR','NN'], ['APPR','NE'], ['APPR', 'ART', 'NN'], ['APPR', 'ART', 'NE'], ['APPR','ART','NN','ADJA','NN'], ['APPR','ART','NN','ADJA','NE'], ['KOKOM', 'ART', 'NN'], ['KOKOM', 'ART', 'NE'], ['PPOSAT', 'NN'], ['PPOSAT', 'NE'], ['ADV', 'ADJD']]
                    endsentences = []
                    if (processed == 2 or processed == 0) and n == 1:
                        wasNotInAnyList = 1
                    try:
                        if wasNotInAnyList == 0:
                            newpunctuationsindex.insert(0, [counter - 1, punctuations[counter - 1]])
                            if len(splitsentences) > 2:
                                splitsentences = splitsentences[:2]
                            for splitsentence in splitsentences:
                                wordtoputfirst = 'nada'
                                for word in self.firstwordlist:
                                    if word == splitsentence[0]:
                                        wordtoputfirst = word
                                        splitsentence.remove(word)
                                tuplesTocheck, triplesTocheck, quadruplesTocheck = self.gs.GetTuplesinSentence(splitsentence)
                                grammpiecessentence = self.gs.createTupleofGrammarpieces(splitsentence, tuplesTocheck, triplesTocheck, quadruplesTocheck)
                                if len(grammpiecessentence) > 7:
                                    print('A sentence is too long, too many permutations.\npiping wrong grammar..')
                                    endsentence = ' '.join(grammpiecessentence)
                                else:
                                    permutations = self.sgm.GeneratePermutationsOfSentence(grammpiecessentence)
                                    # permutations starting with these adverbs must be
                                    # followed by a verb, otherwise they are discarded
                                    firstwordwithverblist = ['deswegen', 'danach']
                                    permutationstodelete = []
                                    for permutation in permutations:
                                        if permutation[0] in firstwordwithverblist:
                                            count = 1
                                            for word in self.nlp(permutation[1]):
                                                if word.tag_[0] != 'V':
                                                    permutationstodelete.append(permutation)
                                                    break
                                                else:
                                                    break
                                    for delperm in permutationstodelete:
                                        try:
                                            permutations.remove(delperm)
                                        except:
                                            pass
                                    sentencesToCheck = []
                                    if wordtoputfirst in self.firstwordlist:
                                        for sentence in permutations:
                                            sentencesToCheck.append(wordtoputfirst + ' ' + ' '.join(sentence))
                                    else:
                                        for sentence in permutations:
                                            sentencesToCheck.append(' '.join(sentence))
                                    endsentence = self.sgm.GetBestSentenceFromSentencesAccordingToGrammar(sentencesToCheck, ' '.join(splitsentence))
                                endsentences.append(endsentence)
                    except:
                        wasNotInAnyList = 1
                        endsentences = []
                        todelete = []
                        for index in range(len(newpunctuationsindex)):
                            if newpunctuationsindex[index][0] == counter - 1:
                                todelete.append(index)
                        for todel in todelete[::-1]:
                            del newpunctuationsindex[todel]
                    if wasNotInAnyList == 1:
                        # fall back to a plain split on commas of the old sentence
                        endsplisentences = []
                        splisentence = []
                        for word in oldsentence:
                            if word[-1] == ',':
                                splisentence.append(word[:-1])
                            if word == ',':
                                pass
                            if word[-1] != ',':
                                splisentence.append(word)
                            if word[-1] == ',' or word == ',':
                                endsplisentences.append(splisentence)
                                splisentence = []
                        endsplisentences.append(splisentence)
                        newpunctuationsindex.insert(0, [counter - 1, punctuations[counter - 1]])
                        for splsentence in endsplisentences:
                            endsentences.append(' '.join(splsentence))
                    # disabled alternative: FASTsearch-based grammar-schema correction
                    '''
                    fsearch1 = self.fsearch1
                    spacyclass1 = 'word.tag_'
                    gs_sentence1 = gs.Sentence2GrammarSchema(' '.join(splitsentence), spacyclass1)
                    print('searchPatternMatch for tags')
                    bestmatches1 = fsearch1.searchPatternMatch(' '.join(gs_sentence1), 1)
                    print('done')
                    right_gs_tupel1 = []
                    if len(bestmatches1) < 10:
                        bestndocs1 = len(bestmatches1)
                    else:
                        bestndocs1 = 10
                    for m in range(bestndocs1):
                        right_gs_tupel1.append(fsearch1.database[bestmatches1[m][0]])
                    statistically_correct_sentences1 = gs.Sentence2RightGrammarTupel(' '.join(splitsentence), gs_sentence1, right_gs_tupel1)
                    fsearch2 = self.fsearch2
                    spacyclass2 = 'word.dep_'
                    gs_sentence2 = gs.Sentence2GrammarSchema(' '.join(splitsentence), spacyclass2)
                    print('searchPatternMatch for deps')
                    bestmatches2 = fsearch2.searchPatternMatch(' '.join(gs_sentence2), 1)
                    print('done')
                    right_gs_tupel2 = []
                    if len(bestmatches2) < 10:
                        bestndocs2 = len(bestmatches2)
                    else:
                        bestndocs2 = 10
                    for m in range(bestndocs2):
                        right_gs_tupel2.append(fsearch2.database[bestmatches2[m][0]])
                    statistically_correct_sentences2 = gs.Sentence2RightGrammarTupel(' '.join(splitsentence), gs_sentence2, right_gs_tupel2)
                    print(splitsentence)
                    Rightsentence = gs.GetBestgsAccordingRules(' '.join(splitsentence), gs_sentence1, right_gs_tupel1, right_gs_tupel2, statistically_correct_sentences1, statistically_correct_sentences2, rules, generalrules)
                    '''
                    for endsentence in endsentences:
                        gramcorr_splitsentences.append(endsentence.split())
        for index in newpunctuationsindex:
            punctuations.insert(index[0], index[1])
        return gramcorr_splitsentences, punctuations
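    # --- Illustration (added sketch, not part of the original pipeline) -----
    # The permutation filter used inside SplitCommatas, isolated: permutations
    # that start with 'deswegen' or 'danach' are only kept when a verb follows.
    # The is_verb callable stands in for the spaCy tag check (word.tag_[0] == 'V').
    @staticmethod
    def _demo_filter_permutations(permutations, is_verb):
        firstwordwithverblist = ['deswegen', 'danach']
        kept = []
        for permutation in permutations:
            if permutation[0] in firstwordwithverblist and not is_verb(permutation[1]):
                continue
            kept.append(permutation)
        return kept

    # SentSeg._demo_filter_permutations(
    #     [['deswegen', 'bleibt', 'er'], ['deswegen', 'er', 'bleibt']],
    #     lambda w: w == 'bleibt')
    # -> [['deswegen', 'bleibt', 'er']]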
    def putAppendixesIntoOwnSentences(self, sentences, punctuations):
        # Put appendixes such as 'NN mit der NN' into their own sentences:
        # the noun phrase is restated with 'ist' or, for 'mit den/der',
        # with 'hat die/eine', and the appendix is cut from the original.
        gs = self.gs
        #triples = [['NN', 'ART', 'NN'], ['NE', 'ART', 'NN'], ['NN', 'ART', 'NN'], ['NE', 'ART', 'NE']]
        quadruples = [['NN', 'APPR', 'NE', 'NN'], ['NN', 'APPR', 'NE', 'NN'], ['NN', 'APPR', 'ART', 'NN'], ['NE', 'APPR', 'ART', 'NN'], ['NN', 'APPR', 'ART', 'NE'], ['NE', 'APPR', 'ART', 'NE']]
        quadruplestochange = []
        triplestochange = []
        newsentences = []
        newpunctuations = []
        Whatisofnouns = []
        oldsentences = sentences
        oldpunctuations = punctuations
        for hauptindex in range(len(sentences)):
            sentence = sentences[hauptindex]
            try:
                #for triple in triples:
                #    AnnoOrNot, tripleInWords = gs.checkForAnnotationTriple(sentence, triple, 'word.tag_', 'None')
                #    for tripleinwor in tripleInWords:
                #        triplestochange.append([triple, tripleinwor])
                for quadruple in quadruples:
                    AnnoOrNot, quadrupleInWords = gs.checkForAnnotationQuadruple(sentence, quadruple, 'word.tag_', 'None')
                    for quadrupleInWo in quadrupleInWords:
                        quadruplestochange.append([quadruple, quadrupleInWo])
                for quad in quadruplestochange:
                    for n in range(len(sentence) - 4):
                        if sentence[n] == quad[1][0]:
                            if sentence[n + 1] == quad[1][1]:
                                if sentence[n + 2] == quad[1][2]:
                                    artword = None
                                    longerWhatisnoun = 0
                                    for m in range(2):
                                        for word in self.nlp(sentence[n - m]):
                                            if word.tag_ == 'ART':
                                                Nounthatis = sentence[n - m:n + 1]
                                                # reuse the loaded model instead of reloading it here
                                                token3 = self.nlp(sentence[n + 4])
                                                counter = 0
                                                Whatisnoun = sentence[n + 1:n + 4]
                                                for wor in token3:
                                                    counter += 1
                                                    if wor.tag_ == 'NN' or wor.tag_ == 'NE':
                                                        if counter == 1:
                                                            Whatisnoun = sentence[n + 1:n + 5]
                                                            longerWhatisnoun = 1
                                                        if counter == 2:
                                                            Whatisnoun = sentence[n + 1:n + 4]
                                                artword = word.text
                                    if ((artword == 'die' or artword == 'Die') and sentence[n][-1] != 'n') or ((artword == 'der' or artword == 'einer' or artword == 'dieser') and (sentence[n - 2] in ['von', 'in', 'auf', 'ueber', 'unter', 'nach', 'mit'])):
                                        if artword == 'der':
                                            Nounthatis[0] = 'die'
                                        donothing = 0
                                        if sentence[n + 1] == 'mit':
                                            if sentence[n + 2] == 'den':
                                                verb = ' hat die '
                                                Whatisnoun = Whatisnoun[2:]
                                            if sentence[n + 2] == 'der':
                                                verb = ' hat eine '
                                                Whatisnoun = Whatisnoun[2:]
                                            if sentence[n + 2] != 'der' and sentence[n + 2] != 'den':
                                                donothing = 1
                                        else:
                                            verb = ' ist '
                                        if donothing == 0:
                                            newsentence = ' '.join(Nounthatis) + verb + ' '.join(Whatisnoun)
                                            newsentences.append([hauptindex + 1, newsentence.split()])
                                            newpunctuations.append([hauptindex + 1, punctuations[hauptindex]])
                                            if longerWhatisnoun == 0:
                                                Whatisofnouns.append([n + 1, n + 4, hauptindex])
                                            else:
                                                Whatisofnouns.append([n + 1, n + 5, hauptindex])
            except:
                print('Could not process ' + str(sentence) + ' for per-sentence characterization..')
        try:
            for whatis in Whatisofnouns[::-1]:
                thereisacomma = 0
                if sentences[whatis[2]][whatis[1] - 1][-1] == ',':
                    thereisacomma = 1
                if thereisacomma == 1:
                    sentences[whatis[2]][whatis[0] - 1] = sentences[whatis[2]][whatis[0] - 1] + ','
                del sentences[whatis[2]][whatis[0]:whatis[1]]
            for newsent in newsentences[::-1]:
                sentences.insert(newsent[0], newsent[1])
            for newpunct in newpunctuations[::-1]:
                punctuations.insert(newpunct[0], newpunct[1])
            for sentence in sentences:
                if sentence[-1][-1] == ',':
                    sentence[-1] = sentence[-1][:-1]
        except:
            print('could not process the collected characterizations')
            sentences = oldsentences
            punctuations = oldpunctuations
        return sentences, punctuations
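
# --- Possible usage (added sketch, not from the original source) -------------
# The call order follows the dependencies visible above: the loaders attach
# self.nlp/self.gs/self.sgm, the annotation passes append the metadata record
# that AndOrSolver and the splitting passes read back. The file name and the
# need to re-annotate between passes are assumptions.
if __name__ == '__main__':
    seg = SentSeg('german')
    seg.LoadSentGlueSGDandGSUtils()          # spaCy, GS_Utils, SentGlue models
    seg.LoadBoWModelAndDatabaseOnesZeros()   # only needed by the disabled FASTsearch path
    sentences = seg.ReadDoc2Sent('input.txt')
    punctuations = ['.'] * len(sentences)
    sentences = seg.CommaSentenceOrNot(sentences)
    sentences = seg.GetUtteranceNumber(sentences)
    sentences = seg.GetQuestionOrNot(sentences)
    sentences, punctuations = seg.AndOrSolver(sentences, punctuations)
    # downstream passes expect re-annotated input of the same record shape,
    # e.g. CommaSentenceOrNot again before SplitSentencesIntoHauptNebenTuple,
    # SplitCommatas and putAppendixesIntoOwnSentences.
    for sentence, punct in zip(sentences, punctuations):
        print(' '.join(sentence[0]), punct)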