|
|
-
- # class to implement GS utils and Search
-
- import resource
-
-
-
class GS_Utils(object):
    """Grammar-schema (GS) helpers built on top of a spaCy pipeline.

    A "grammar schema" here is the list of per-token annotation strings
    (e.g. ``word.tag_`` or ``word.dep_``) that spaCy assigns to a sentence.
    The methods convert sentences into such schemas, reorder words according
    to candidate schemas, score candidates against rules, and glue words that
    belong together (tag n-grams, bracketed remarks) into single chunks.

    NOTE(review): the tag names ('ART', 'NN', ...) and dependency labels
    ('sb', 'oa', ...) look like the German STTS / TIGER label sets -- confirm
    against the spaCy model that is actually loaded.
    """

    # Tag n-grams whose words are treated as one unit. The order is
    # significant: results of GetTuplesinSentence are grouped in this order.
    _TAG_TUPLES = (
        ('ART', 'NE'), ('ART', 'NN'), ('APPR', 'NN'), ('APPR', 'ADJD'),
        ('APPR', 'NE'), ('ART', 'CARD'), ('APPR', 'CARD'), ('PPOSAT', 'NN'),
        ('PPOSAT', 'NE'), ('ADV', 'ADJD'), ('ADV', 'ADV'), ('ADV', 'PTKVZ'),
        ('PTKNEG', 'ADV'), ('ADJA', 'NN'), ('ADJA', 'NE'), ('ADV', 'PIS'),
        ('ADJA', 'PIS'), ('ADJD', 'PIS'), ('APPRART', 'NN'),
        ('APPRART', 'NE'), ('PDAT', 'NE'), ('PDAT', 'NN'), ('PWAT', 'NE'),
        ('PWAT', 'NN'), ('PIAT', 'NE'), ('PIAT', 'NN'), ('PROAV', 'ADJD'),
        ('PDS', 'NE'), ('PDS', 'NN'), ('NE', 'NE'), ('CARD', 'NE'),
        ('CARD', 'NN'),
    )
    _TAG_TRIPLES = (
        ('APPR', 'ART', 'NN'), ('APPR', 'PDAT', 'NN'), ('APPR', 'PDS', 'NN'),
        ('ART', 'ADJA', 'NN'), ('ART', 'ADJA', 'NE'), ('APPR', 'ART', 'NE'),
        ('KOKOM', 'ART', 'NN'), ('KOKOM', 'ART', 'NE'),
        ('APPR', 'PIAT', 'NN'), ('APPR', 'ADJA', 'NN'),
        ('APPR', 'ADJA', 'NE'), ('APPRART', 'NN', 'CARD'),
        ('APPRART', 'NE', 'CARD'), ('APPRART', 'NN', 'NE'),
        ('CARD', 'KON', 'CARD'), ('APPR', 'ADV', 'CARD'),
        ('ADJD', 'KOKOM', 'CARD'), ('APPR', 'NE', 'NE'), ('NN', 'KON', 'NN'),
        ('NE', 'NN', 'NE'), ('APPR', 'NE', 'NN'), ('APPR', 'CARD', 'NN'),
        ('APPR', 'CARD', 'NE'),
    )
    _TAG_QUADRUPLES = (
        ('KOKOM', 'ADV', 'ADJA', 'NN'), ('KOKOM', 'ADV', 'ADJA', 'NE'),
        ('APPR', 'ADV', 'ADJA', 'NE'), ('APPR', 'ADV', 'ADJA', 'NN'),
        ('ART', 'NN', 'APPR', 'NE'), ('APPR', 'NE', 'NN', 'NE'),
        ('APPR', 'ART', 'ADJA', 'NN'), ('ART', 'ADJD', 'ADJA', 'NN'),
    )

    # Dependency labels counted as subject / predicate / object by checkSPO.
    _SUBJECT_LABELS = ('sb', 'ep', 'ph')
    _PREDICATE_LABELS = ('ROOT', 'pd')
    _OBJECT_LABELS = ('oa', 'og', 'oc', 'op', 'mo')

    def __init__(self, language):
        """Load the spaCy pipeline named ``language`` into ``self.nlp``."""
        # Imported lazily so that merely importing this module does not
        # require spaCy to be installed.
        import spacy
        self.nlp = spacy.load(language)
        # Not read anywhere inside this class; kept for external users.
        self.oi = 'oi'

    # ------------------------------------------------------------------
    # private helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _token_attribute_path(spacyclass):
        """Return the attribute names of a ``word.a.b`` selector, else None."""
        parts = spacyclass.split('.')
        if (len(parts) > 1 and parts[0] == 'word'
                and all(part.isidentifier() for part in parts[1:])):
            return parts[1:]
        return None

    @staticmethod
    def _match_tag_ngram(tags, splitsentence, token, words, size):
        """Slide a ``size``-wide window over ``tags`` looking for ``token``.

        ``words`` is either the string ``'None'`` (no constraint on the
        words) or a sequence with the exact words required at each position.

        Returns ``(status, windows)``. ``windows`` holds the word slices of
        all complete matches. ``status`` is the number of leading positions
        that matched in the LAST window whose first position matched (0 if
        none did) -- so a later partial match overrides an earlier complete
        one, which is the behaviour callers of the public methods rely on.
        """
        any_words = words == 'None'
        word_count = len(splitsentence)
        status = 0
        windows = []
        for start in range(len(tags) - size + 1):
            matched = 0
            for offset in range(size):
                position = start + offset
                # A tag without a corresponding word (tokenisation produced
                # more tokens than whitespace splitting) counts as mismatch.
                if tags[position] != token[offset] or position >= word_count:
                    break
                if not any_words and splitsentence[position] != words[offset]:
                    break
                matched += 1
            if matched:
                status = matched
                if matched == size:
                    windows.append(splitsentence[start:start + size])
        return status, windows

    @staticmethod
    def _is_covered(words, longer_entries):
        """Tell whether ``words`` is a contiguous part of any longer n-gram."""
        words = list(words)
        size = len(words)
        for entry in longer_entries:
            longer = list(entry[1])
            for start in range(len(longer) - size + 1):
                if longer[start:start + size] == words:
                    return True
        return False

    @staticmethod
    def _merge_ngram(sentence, words, size):
        """Replace every occurrence of ``words`` in ``sentence`` (in place)
        by one space-joined element."""
        words = list(words)
        if len(words) != size:
            return
        position = 0
        # The bound is re-evaluated every step because merging shortens the
        # list; a precomputed range would run past the end.
        while position + size <= len(sentence):
            if sentence[position:position + size] == words:
                sentence[position:position + size] = [' '.join(words)]
            position += 1

    def _last_ngram_verdict(self, tags, sentence, entries, size):
        """Return True if the last entry that matched at all matched fully."""
        verdict = False
        for entry in entries:
            status, _ = self._match_tag_ngram(
                tags, sentence, entry[0], entry[1], size)
            if status == size:
                verdict = True
            elif status > 0:
                verdict = False
        return verdict

    # ------------------------------------------------------------------
    # public API
    # ------------------------------------------------------------------

    def Sentence2GrammarSchema(self, sentence, spacyclass):
        """Return the per-token annotation strings of ``sentence``.

        ``spacyclass`` selects the annotation as an expression over the name
        ``word``, e.g. ``'word.tag_'``. ``word.pos_`` is suitable for noun
        and verb, ``word.dep_`` for sb/pd, and possibly ``tag``.
        Annotations of length 0 or 1 are dropped, so the result can be
        shorter than the number of tokens.
        """
        doc = self.nlp(sentence)
        attribute_path = self._token_attribute_path(spacyclass)

        GsDBsentence = []
        for word in doc:
            if attribute_path is not None:
                label = word
                for attribute in attribute_path:
                    label = getattr(label, attribute)
            else:
                # SECURITY: anything that is not a plain ``word.<attr>``
                # selector is still evaluated as Python for backward
                # compatibility. Never pass untrusted input as spacyclass.
                label = eval(spacyclass)
            if len(label) > 1:
                GsDBsentence.append(label)

        return GsDBsentence

    def Sentence2RightGrammarTupel(self, sentence, gs_sentence, right_gs_tupel):
        """Reorder the words of ``sentence`` according to candidate schemas.

        ``gs_sentence`` holds the annotation of each whitespace-separated
        word of ``sentence``; every element of ``right_gs_tupel`` is a
        space-separated string of annotations. For each annotation of a
        candidate the first not-yet-used word carrying it is taken;
        annotations without a free word are skipped.

        Returns one word list per candidate schema.
        """
        # zip() ignores annotations that have no corresponding word.
        tagged_words = list(zip(gs_sentence, sentence.split()))

        grammcorr_sentences = []
        for schema in right_gs_tupel:
            used_positions = set()
            reordered = []
            for wanted in schema.split():
                for position, (tag, word) in enumerate(tagged_words):
                    if tag == wanted and position not in used_positions:
                        reordered.append(word)
                        used_positions.add(position)
                        break
            grammcorr_sentences.append(reordered)

        return grammcorr_sentences

    # gets the best grammar scheme from both, depending on which one appears
    # the most in both, and whether rules are still present.
    def GetBestgsAccordingRules(self, sentence, gs_sentence1, right_gs_tupel1, right_gs_tupel2, grammcorr_sentences1, grammcorr_sentences2, rules, specialrules):
        """Pick the best candidate from ``grammcorr_sentences1``.

        Scoring of each candidate:

        * +1 for every position at which it agrees with a sentence from
          ``grammcorr_sentences2`` (compared up to the shorter length);
        * +40 * len(rule) every time the words that realise a rule in the
          input sentence appear consecutively in the candidate. A rule is a
          sequence of 2 to 5 annotations looked up in ``gs_sentence1``; the
          last occurrence in the input determines the words;
        * +len(candidate) every time a special rule (2 or 3 annotations)
          occurs consecutively in the candidate's own schema
          ``right_gs_tupel1[i]``.

        On equal scores the candidate with the highest index wins.
        ``right_gs_tupel2`` is accepted for interface compatibility but not
        used. Returns ``[]`` when there are no candidates.
        """
        if not grammcorr_sentences1:
            return []

        scores = [
            sum(
                sum(1 for left, right in zip(candidate, other) if left == right)
                for other in grammcorr_sentences2
            )
            for candidate in grammcorr_sentences1
        ]

        input_words = sentence.split()
        for rule in rules:
            size = len(rule)
            if not 2 <= size <= 5:
                continue
            rule = list(rule)

            # Words realising the rule in the input; the last hit wins.
            rule_words = None
            for start in range(len(gs_sentence1) - size + 1):
                if list(gs_sentence1[start:start + size]) == rule:
                    window = input_words[start:start + size]
                    if len(window) == size:
                        rule_words = window
            if rule_words is None:
                continue

            for index, candidate in enumerate(grammcorr_sentences1):
                for start in range(len(candidate) - size + 1):
                    if list(candidate[start:start + size]) == rule_words:
                        scores[index] += 40 * size

        for index, candidate in enumerate(grammcorr_sentences1):
            if index >= len(right_gs_tupel1):
                break
            schema = right_gs_tupel1[index]
            # Schemas are space-separated strings elsewhere in this class;
            # an already tokenised sequence is accepted as well.
            tags = schema.split() if isinstance(schema, str) else list(schema)
            limit = min(len(candidate), len(tags))
            for special in specialrules:
                size = len(special)
                if size not in (2, 3):
                    continue
                special = list(special)
                for start in range(limit - size + 1):
                    if tags[start:start + size] == special:
                        scores[index] += len(candidate)

        best = max(range(len(scores)), key=lambda i: (scores[i], i))
        return grammcorr_sentences1[best]

    def checkSPO(self, splitsentence, convertedornot):
        """Report which of subject, predicate and object are present.

        With ``convertedornot == 0`` ``splitsentence`` is a list of words
        that is annotated via ``word.dep_`` first; with ``1`` it already is
        a list of dependency labels.

        Returns ``[subject, predicate, object]`` with each entry 0 or 1.
        Raises ``ValueError`` for any other ``convertedornot``.
        """
        if convertedornot == 0:
            labels = self.Sentence2GrammarSchema(' '.join(splitsentence), 'word.dep_')
        elif convertedornot == 1:
            labels = splitsentence
        else:
            raise ValueError(
                'convertedornot must be 0 or 1, got %r' % (convertedornot,))

        groups = (self._SUBJECT_LABELS, self._PREDICATE_LABELS,
                  self._OBJECT_LABELS)
        return [int(any(label in group for label in labels))
                for group in groups]

    def checkForAnnotation(self, splitsentence, token, spacyclass):
        """Return 1 if annotation ``token`` occurs in the sentence, else 0."""
        annotations = self.Sentence2GrammarSchema(' '.join(splitsentence), spacyclass)
        return int(any(annotation == token for annotation in annotations))

    def checkForAnnotationInTokenizedSentence(self, splitsentence, token):
        """Return 1 if ``token`` occurs in the already annotated list."""
        return int(any(annotation == token for annotation in splitsentence))

    def checkForAnnotationTuple(self, splitsentence, token, spacyclass, tupleinwords):
        """Search the sentence for two consecutive annotations ``token``.

        ``tupleinwords`` is ``'None'`` or the two words required at the
        matching positions. Returns ``(status, matches)``: ``matches`` lists
        the word pairs of all full matches; ``status`` is 0, 1 or 2 and
        describes the last window whose first position matched (2 = full).
        """
        annotations = self.Sentence2GrammarSchema(' '.join(splitsentence), spacyclass)
        return self._match_tag_ngram(annotations, splitsentence, token, tupleinwords, 2)

    def checkForAnnotationTriple(self, splitsentence, token, spacyclass, tripleinwords):
        """Three-annotation variant of ``checkForAnnotationTuple``; the
        status ranges from 0 to 3 (3 = full match)."""
        annotations = self.Sentence2GrammarSchema(' '.join(splitsentence), spacyclass)
        return self._match_tag_ngram(annotations, splitsentence, token, tripleinwords, 3)

    def checkForAnnotationQuadruple(self, splitsentence, token, spacyclass, quadrupleinwords):
        """Four-annotation variant of ``checkForAnnotationTuple``; the
        status ranges from 0 to 4 (4 = full match)."""
        annotations = self.Sentence2GrammarSchema(' '.join(splitsentence), spacyclass)
        return self._match_tag_ngram(annotations, splitsentence, token, quadrupleinwords, 4)

    # input ['this', 'is', 'a', 'sentence']
    def GetTuplesinSentence(self, mainsentence):
        """Find all known tag 2-, 3- and 4-grams in ``mainsentence``.

        Returns three lists (tuples, triples, quadruples). Each entry is
        ``[tag_pattern, words]``; entries are grouped by pattern in the
        order of the pattern tables and by position within a pattern.
        """
        # Tag the sentence once instead of once per pattern.
        tags = self.Sentence2GrammarSchema(' '.join(mainsentence), 'word.tag_')

        found = []
        for patterns in (self._TAG_TUPLES, self._TAG_TRIPLES, self._TAG_QUADRUPLES):
            hits = []
            for pattern in patterns:
                _, windows = self._match_tag_ngram(
                    tags, mainsentence, pattern, 'None', len(pattern))
                # A fresh list per hit so callers cannot mutate the tables.
                hits.extend([list(pattern), window] for window in windows)
            found.append(hits)

        tuplesToCheck, triplesToCheck, quadruplesToCheck = found
        return tuplesToCheck, triplesToCheck, quadruplesToCheck

    def createTupleofGrammarpieces(self, sentence, tuplesToCheck, triplesToCheck, quadruplesToCheck):
        """Glue words that belong together into single list elements.

        Steps, all performed in place on the given lists:

        1. Entries of ``tuplesToCheck`` whose words are part of a triple or
           quadruple are dropped, likewise triples that are part of a
           quadruple.
        2. Bracketed remarks (from a word starting with ``(`` to the next
           word ending with ``)``) are taken out of ``sentence``.
        3. The words of every remaining quadruple, triple and tuple are
           merged into one space-joined element.
        4. Each bracketed remark is appended to the first element whose last
           word is the word that preceded the remark, or added as an
           element of its own at the end if there is none.

        Returns ``sentence`` (the same list object).
        """
        # Step 1. Filtering instead of list.remove(): an entry covered by
        # several longer n-grams must be dropped only once.
        tuplesToCheck[:] = [entry for entry in tuplesToCheck
                            if not self._is_covered(entry[1], triplesToCheck)]
        tuplesToCheck[:] = [entry for entry in tuplesToCheck
                            if not self._is_covered(entry[1], quadruplesToCheck)]
        triplesToCheck[:] = [entry for entry in triplesToCheck
                             if not self._is_covered(entry[1], quadruplesToCheck)]

        # Step 2.
        bracketinfos = []
        bracket_positions = set()
        start = 0
        while start < len(sentence):
            end = None
            if sentence[start] and sentence[start][0] == '(':
                for probe in range(start, len(sentence)):
                    if sentence[probe] and sentence[probe][-1] == ')':
                        end = probe
                        break
            if end is None:
                # Not an opening bracket, or the bracket is never closed.
                start += 1
                continue
            # A remark at the very beginning has no preceding word.
            wordbeforebracketinfo = sentence[start - 1] if start > 0 else None
            bracketinfos.append([list(sentence[start:end + 1]), wordbeforebracketinfo])
            bracket_positions.update(range(start, end + 1))
            start = end + 1

        # Removal by position: removing by value would delete an earlier
        # identical word outside the brackets.
        sentence[:] = [word for position, word in enumerate(sentence)
                       if position not in bracket_positions]

        # Step 3: longest n-grams first.
        for entries, size in ((quadruplesToCheck, 4), (triplesToCheck, 3), (tuplesToCheck, 2)):
            for entry in entries:
                self._merge_ngram(sentence, entry[1], size)

        # Step 4.
        for bracket_words, word_before in bracketinfos:
            bracket_text = ' '.join(bracket_words)
            for position, chunk in enumerate(sentence):
                pieces = chunk.split()
                if pieces and pieces[-1] == word_before:
                    # NOTE(review): no space is inserted between the chunk
                    # and the remark ("word(remark)") -- kept as is, confirm
                    # whether that is intended.
                    sentence[position] = chunk + bracket_text
                    break
            else:
                sentence.append(bracket_text)

        return sentence

    # The following method is too computationally expensive.
    def filterpermutationsaccordingtotuples(self, sentences, tuplesToCheck, triplesToCheck):
        """Keep the sentences that still contain the given n-grams.

        ``sentences`` are word sequences; the entries of ``tuplesToCheck``
        and ``triplesToCheck`` are ``[tag_pattern, words]``. A sentence is
        kept if, among the tuples (or among the triples), the last entry
        that matched at least its first position matched completely.
        """
        if not tuplesToCheck and not triplesToCheck:
            return []

        filteredprobsentences = []
        for sentence in sentences:
            # One tagging run per sentence instead of one per n-gram.
            tags = self.Sentence2GrammarSchema(' '.join(sentence), 'word.tag_')
            tuple_ok = self._last_ngram_verdict(tags, sentence, tuplesToCheck, 2)
            triple_ok = self._last_ngram_verdict(tags, sentence, triplesToCheck, 3)
            if tuple_ok or triple_ok:
                filteredprobsentences.append(sentence)

        return filteredprobsentences
-
-
|