# Class implementing grammar-schema (GS) utilities and search helpers.
import resource


class GS_Utils(object):
    """Helpers that tag sentences with a spaCy pipeline and regroup their words.

    A "grammar schema" in this class is the list of per-token annotation
    strings (for example ``word.tag_`` or ``word.dep_``) produced by
    :meth:`Sentence2GrammarSchema`.  The remaining methods compare, score and
    regroup word lists with the help of such schemas.
    """

    # Two-, three- and four-tag sequences whose words are kept together by
    # GetTuplesinSentence / createTupleofGrammarpieces.
    # NOTE(review): these look like German STTS part-of-speech tags - confirm
    # against the spaCy model that is actually loaded.
    _TUPLE_PATTERNS = (
        ('ART', 'NE'), ('ART', 'NN'), ('APPR', 'NN'), ('APPR', 'ADJD'),
        ('APPR', 'NE'), ('ART', 'CARD'), ('APPR', 'CARD'), ('PPOSAT', 'NN'),
        ('PPOSAT', 'NE'), ('ADV', 'ADJD'), ('ADV', 'ADV'), ('ADV', 'PTKVZ'),
        ('PTKNEG', 'ADV'), ('ADJA', 'NN'), ('ADJA', 'NE'), ('ADV', 'PIS'),
        ('ADJA', 'PIS'), ('ADJD', 'PIS'), ('APPRART', 'NN'), ('APPRART', 'NE'),
        ('PDAT', 'NE'), ('PDAT', 'NN'), ('PWAT', 'NE'), ('PWAT', 'NN'),
        ('PIAT', 'NE'), ('PIAT', 'NN'), ('PROAV', 'ADJD'), ('PDS', 'NE'),
        ('PDS', 'NN'), ('NE', 'NE'), ('CARD', 'NE'), ('CARD', 'NN'),
    )
    _TRIPLE_PATTERNS = (
        ('APPR', 'ART', 'NN'), ('APPR', 'PDAT', 'NN'), ('APPR', 'PDS', 'NN'),
        ('ART', 'ADJA', 'NN'), ('ART', 'ADJA', 'NE'), ('APPR', 'ART', 'NE'),
        ('KOKOM', 'ART', 'NN'), ('KOKOM', 'ART', 'NE'), ('APPR', 'PIAT', 'NN'),
        ('APPR', 'ADJA', 'NN'), ('APPR', 'ADJA', 'NE'),
        ('APPRART', 'NN', 'CARD'), ('APPRART', 'NE', 'CARD'),
        ('APPRART', 'NN', 'NE'), ('CARD', 'KON', 'CARD'),
        ('APPR', 'ADV', 'CARD'), ('ADJD', 'KOKOM', 'CARD'),
        ('APPR', 'NE', 'NE'), ('NN', 'KON', 'NN'), ('NE', 'NN', 'NE'),
        ('APPR', 'NE', 'NN'), ('APPR', 'CARD', 'NN'), ('APPR', 'CARD', 'NE'),
    )
    _QUADRUPLE_PATTERNS = (
        ('KOKOM', 'ADV', 'ADJA', 'NN'), ('KOKOM', 'ADV', 'ADJA', 'NE'),
        ('APPR', 'ADV', 'ADJA', 'NE'), ('APPR', 'ADV', 'ADJA', 'NN'),
        ('ART', 'NN', 'APPR', 'NE'), ('APPR', 'NE', 'NN', 'NE'),
        ('APPR', 'ART', 'ADJA', 'NN'), ('ART', 'ADJD', 'ADJA', 'NN'),
    )

    def __init__(self, language):
        """Load the spaCy pipeline named ``language`` and keep it as ``self.nlp``."""
        # Imported lazily so the module can be imported without spaCy installed.
        import spacy
        self.nlp = spacy.load(language)
        self.oi = 'oi'

    def Sentence2GrammarSchema(self, sentence, spacyclass):
        """Return the per-token annotations of ``sentence``.

        Args:
            sentence: Text handed to ``self.nlp``.
            spacyclass: Expression naming the token attribute, written as
                ``'word.<attr>'`` (callers in this class use ``'word.tag_'``
                and ``'word.dep_'``).  Original author's note: ``word.pos_``
                suits noun/verb, ``word.dep_`` suits sb/pd, possibly ``tag``.

        Returns:
            A list with one annotation per token.  Annotations of length 0
            or 1 are dropped, so the result can be shorter than the token
            sequence.
        """
        doc = self.nlp(sentence)

        # The common 'word.<attr>' form is resolved with getattr instead of
        # eval: same value, no code execution, evaluated once per token.
        attr_name = None
        if isinstance(spacyclass, str) and spacyclass.startswith('word.'):
            candidate = spacyclass[len('word.'):]
            if candidate.isidentifier():
                attr_name = candidate

        schema = []
        for word in doc:
            if attr_name is not None:
                value = getattr(word, attr_name)
            else:
                # NOTE(security): kept only for backward compatibility with
                # expressions that are not a plain 'word.<attr>'.  eval runs
                # arbitrary code - never pass untrusted input as spacyclass.
                value = eval(spacyclass)
            if len(value) > 1:
                schema.append(value)
        return schema

    def Sentence2RightGrammarTupel(self, sentence, gs_sentence, right_gs_tupel):
        """Reorder the words of ``sentence`` along each target schema.

        Args:
            sentence: Whitespace-separated sentence.
            gs_sentence: Annotation list positionally aligned with
                ``sentence.split()``.
            right_gs_tupel: Iterable of whitespace-separated annotation
                strings, one per target ordering.

        Returns:
            One word list per target schema.  Each annotation of the target
            picks the first not-yet-used word carrying that annotation;
            annotations without a free word are skipped.
        """
        words = sentence.split()
        reordered = []
        for target in right_gs_tupel:
            picked = []
            used = set()
            for annotation in target.split():
                for position in range(len(gs_sentence)):
                    if annotation == gs_sentence[position] and position not in used:
                        picked.append(words[position])
                        used.add(position)
                        break  # only the first free word is taken
            reordered.append(picked)
        return reordered

    # Gets the best grammar scheme from both, depending on which one appears
    # the most in both, and whether rules are still present.
    def GetBestgsAccordingRules(self, sentence, gs_sentence1, right_gs_tupel1, right_gs_tupel2, grammcorr_sentences1, grammcorr_sentences2, rules, specialrules):
        """Score the candidates in ``grammcorr_sentences1`` and return the best.

        Scoring, per candidate:
          * +1 for every position at which it agrees with a candidate from
            ``grammcorr_sentences2`` (compared up to the shorter length);
          * +40 * rule length for every place where the words of a rule
            (length 2-5) found in ``gs_sentence1`` appear consecutively;
          * + candidate length for each special-rule hit (length 2-3).

        Args:
            sentence: Whitespace-separated sentence aligned with
                ``gs_sentence1``.
            gs_sentence1: Annotation list of ``sentence``.
            right_gs_tupel1: Indexed as ``[candidate][position]`` by the
                special-rule check.
            right_gs_tupel2: Unused; kept for interface compatibility.
            grammcorr_sentences1: Candidate word lists to choose from.
            grammcorr_sentences2: Word lists the candidates are compared to.
            rules: Annotation sequences; only lengths 2 to 5 are considered.
            specialrules: Annotation sequences; only lengths 2 and 3 are
                considered.

        Returns:
            The highest scoring candidate; on a tie the candidate with the
            highest index wins.  An empty list if there are no candidates.
        """
        if len(grammcorr_sentences1) == 0:
            return []

        equals = [
            sum(1
                for other in grammcorr_sentences2
                for left, right in zip(candidate, other)
                if left == right)
            for candidate in grammcorr_sentences1
        ]

        for rule in rules:
            size = len(rule)
            if size not in (2, 3, 4, 5):
                continue
            # Words of the last place where the rule occurs in the schema.
            rule_words = None
            for start in range(len(gs_sentence1) - size + 1):
                if all(rule[i] == gs_sentence1[start + i] for i in range(size)):
                    rule_words = sentence.split()[start:start + size]
            if rule_words is None:
                continue
            bonus = 40 * len(rule_words)
            for index, candidate in enumerate(grammcorr_sentences1):
                for start in range(len(candidate) - size + 1):
                    if list(candidate[start:start + size]) == rule_words:
                        equals[index] += bonus

        for index, candidate in enumerate(grammcorr_sentences1):
            for special in specialrules:
                size = len(special)
                if size not in (2, 3):
                    continue
                for pos in range(len(candidate) - size + 1):
                    # NOTE(review): this steps through *candidates*
                    # (index + i) at a fixed position, exactly as the
                    # original did.  It looks like [index][pos + i] may have
                    # been intended - confirm with the caller before changing.
                    if all(right_gs_tupel1[index + i][pos] == special[i]
                           for i in range(size)):
                        equals[index] += len(candidate)

        # Highest score wins; ties go to the highest index.
        best = max(range(len(equals)), key=lambda i: (equals[i], i))
        return grammcorr_sentences1[best]

    def checkSPO(self, splitsentence, convertedornot):
        """Report which of three label groups occur in a sentence.

        Args:
            splitsentence: List of words (``convertedornot == 0``) or an
                already computed annotation list (``convertedornot == 1``).
            convertedornot: 0 to derive ``word.dep_`` annotations first,
                1 to use ``splitsentence`` as given.

        Returns:
            ``[a, b, c]`` with 1/0 flags: ``a`` for 'sb'/'ep'/'ph',
            ``b`` for 'ROOT'/'pd', ``c`` for 'oa'/'og'/'oc'/'op'/'mo'.

        Raises:
            ValueError: If ``convertedornot`` is neither 0 nor 1.
        """
        if convertedornot == 0:
            schema = self.Sentence2GrammarSchema(' '.join(splitsentence), 'word.dep_')
        elif convertedornot == 1:
            schema = splitsentence
        else:
            raise ValueError('convertedornot must be 0 or 1, got %r' % (convertedornot,))

        spoCount = [0, 0, 0]
        for label in schema:
            if label in ('sb', 'ep', 'ph'):
                spoCount[0] = 1
            if label in ('ROOT', 'pd'):
                spoCount[1] = 1
            if label in ('oa', 'og', 'oc', 'op', 'mo'):
                spoCount[2] = 1
        return spoCount

    def checkForAnnotation(self, splitsentence, token, spacyclass):
        """Return 1 if annotation ``token`` occurs in the sentence, else 0."""
        schema = self.Sentence2GrammarSchema(' '.join(splitsentence), spacyclass)
        return 1 if any(label == token for label in schema) else 0

    def checkForAnnotationInTokenizedSentence(self, splitsentence, token):
        """Return 1 if ``token`` equals an element of ``splitsentence``, else 0."""
        return 1 if any(label == token for label in splitsentence) else 0

    @staticmethod
    def _match_ngram(gs_sentence, splitsentence, token, wordfilter, size):
        """Find runs of ``size`` annotations equal to ``token`` in a schema.

        ``wordfilter`` is either the string ``'None'`` (no restriction) or a
        sequence of words the matched positions must also equal.

        Returns ``(depth, hits)``: ``hits`` holds the word slices of all
        complete matches; ``depth`` is how many leading elements matched at
        the last start position whose first element matched (0 if none).
        """
        depth = 0
        hits = []
        for start in range(len(gs_sentence) - size + 1):
            for offset in range(size):
                position = start + offset
                if not gs_sentence[position] == token[offset]:
                    break
                if not (splitsentence[position] == wordfilter[offset] or wordfilter == 'None'):
                    break
                depth = offset + 1
            else:
                hits.append(splitsentence[start:start + size])
        return depth, hits

    def checkForAnnotationTuple(self, splitsentence, token, spacyclass, tupleinwords):
        """Search for two consecutive annotations ``token`` in a sentence.

        Args:
            splitsentence: List of words.
            token: Two annotations to look for.
            spacyclass: Attribute expression for Sentence2GrammarSchema.
            tupleinwords: ``'None'`` or the two words that must match too.

        Returns:
            ``(depth, hits)`` as described in :meth:`_match_ngram`;
            ``depth == 2`` means the last candidate position matched fully.
        """
        schema = self.Sentence2GrammarSchema(' '.join(splitsentence), spacyclass)
        return self._match_ngram(schema, splitsentence, token, tupleinwords, 2)

    def checkForAnnotationTriple(self, splitsentence, token, spacyclass, tripleinwords):
        """Three-annotation variant of :meth:`checkForAnnotationTuple`."""
        schema = self.Sentence2GrammarSchema(' '.join(splitsentence), spacyclass)
        return self._match_ngram(schema, splitsentence, token, tripleinwords, 3)

    def checkForAnnotationQuadruple(self, splitsentence, token, spacyclass, quadrupleinwords):
        """Four-annotation variant of :meth:`checkForAnnotationTuple`."""
        schema = self.Sentence2GrammarSchema(' '.join(splitsentence), spacyclass)
        return self._match_ngram(schema, splitsentence, token, quadrupleinwords, 4)

    # input ['this', 'is', 'a', 'sentence']
    def GetTuplesinSentence(self, mainsentence):
        """Collect all known 2-, 3- and 4-tag patterns present in a sentence.

        Args:
            mainsentence: List of words.

        Returns:
            ``(tuples, triples, quadruples)``; every entry is
            ``[pattern, words]`` with ``pattern`` the matched tag list and
            ``words`` the corresponding slice of ``mainsentence``.
        """
        # Tag the sentence once; every pattern is matched against the same
        # schema instead of re-running the pipeline per pattern.
        schema = self.Sentence2GrammarSchema(' '.join(mainsentence), 'word.tag_')

        def collect(patterns, size):
            found = []
            for pattern in patterns:
                _, hits = self._match_ngram(schema, mainsentence, pattern, 'None', size)
                for words in hits:
                    found.append([list(pattern), words])
            return found

        tuplesToCheck = collect(self._TUPLE_PATTERNS, 2)
        triplesToCheck = collect(self._TRIPLE_PATTERNS, 3)
        quadruplesToCheck = collect(self._QUADRUPLE_PATTERNS, 4)
        return tuplesToCheck, triplesToCheck, quadruplesToCheck

    @staticmethod
    def _is_covered(small_words, big_words, small_size, big_size):
        """Return True if the first ``small_size`` words occur as a run in ``big_words``."""
        return any(
            all(small_words[i] == big_words[offset + i] for i in range(small_size))
            for offset in range(big_size - small_size + 1)
        )

    @staticmethod
    def _merge_ngrams(sentence, entries, size):
        """Replace, in place, each run of an entry's words by one joined string."""
        for entry in entries:
            words = entry[1]
            position = 0
            # The bound is re-evaluated because merging shortens the list.
            while position <= len(sentence) - size:
                if all(sentence[position + i] == words[i] for i in range(size)):
                    sentence[position:position + size] = [' '.join(words)]
                position += 1

    def createTupleofGrammarpieces(self, sentence, tuplesToCheck, triplesToCheck, quadruplesToCheck):
        """Group the words of ``sentence`` into multi-word grammar pieces.

        Smaller n-grams contained in a larger one are dropped, bracketed
        text is taken out, the remaining n-grams are merged into single
        space-joined elements (largest first), and each bracketed text is
        appended to the element ending with the word that preceded it (or
        added at the end if there is no such element).

        Args:
            sentence: List of words; modified in place.
            tuplesToCheck: ``[pattern, words]`` entries; modified in place.
            triplesToCheck: ``[pattern, words]`` entries; modified in place.
            quadruplesToCheck: ``[pattern, words]`` entries.

        Returns:
            The same ``sentence`` list after regrouping.
        """
        covered = self._is_covered

        # Filtering (instead of queueing removals) drops each covered entry
        # once, even when several larger n-grams contain it.
        tuplesToCheck[:] = [
            tupl for tupl in tuplesToCheck
            if not any(covered(tupl[1], tripl[1], 2, 3) for tripl in triplesToCheck)
        ]
        tuplesToCheck[:] = [
            tupl for tupl in tuplesToCheck
            if not any(covered(tupl[1], quad[1], 2, 4) for quad in quadruplesToCheck)
        ]
        triplesToCheck[:] = [
            tripl for tripl in triplesToCheck
            if not any(covered(tripl[1], quad[1], 3, 4) for quad in quadruplesToCheck)
        ]

        # Collect bracketed spans together with the word in front of them.
        bracketinfos = []
        for start in range(len(sentence)):
            if not sentence[start].startswith('('):
                continue
            collected = []
            for end in range(start, len(sentence)):
                collected.append(sentence[end])
                if sentence[end].endswith(')'):
                    # No preceding word when the bracket opens the sentence.
                    wordbefore = sentence[start - 1] if start > 0 else None
                    bracketinfos.append([collected, wordbefore])
                    break

        for bracketinfo in bracketinfos:
            for word in bracketinfo[0]:
                sentence.remove(word)

        self._merge_ngrams(sentence, quadruplesToCheck, 4)
        self._merge_ngrams(sentence, triplesToCheck, 3)
        self._merge_ngrams(sentence, tuplesToCheck, 2)

        for bracketinfo in bracketinfos:
            brackettext = ' '.join(bracketinfo[0])
            for position in range(len(sentence)):
                parts = sentence[position].split()
                if parts and bracketinfo[1] == parts[-1]:
                    sentence[position] = sentence[position] + brackettext
                    break
            else:
                sentence.append(brackettext)
        return sentence

    # The following method is too computationally expensive.
    def filterpermutationsaccordingtotuples(self, sentences, tuplesToCheck, triplesToCheck):
        """Keep the sentences in which a required tuple or triple is intact.

        Args:
            sentences: Iterable of word sequences.
            tuplesToCheck: ``[pattern, words]`` entries of size two.
            triplesToCheck: ``[pattern, words]`` entries of size three.

        Returns:
            The sentences for which the last tuple with any match matched
            completely, or the last triple with any match matched completely.
        """
        if not tuplesToCheck and not triplesToCheck:
            return []

        filteredprobsentences = []
        for sentence in sentences:
            # One tagging pass per sentence, shared by all checks below.
            schema = self.Sentence2GrammarSchema(' '.join(sentence), 'word.tag_')

            tuplchecked = 0
            for tupl in tuplesToCheck:
                depth, _ = self._match_ngram(schema, sentence, tupl[0], tupl[1], 2)
                if depth == 1:
                    tuplchecked = 0
                elif depth == 2:
                    tuplchecked = 1

            triplchecked = 0
            for tripl in triplesToCheck:
                depth, _ = self._match_ngram(schema, sentence, tripl[0], tripl[1], 3)
                if depth in (1, 2):
                    triplchecked = 0
                elif depth == 3:
                    triplchecked = 1

            if tuplchecked == 1 or triplchecked == 1:
                filteredprobsentences.append(sentence)
        return filteredprobsentences