basabuuka_prototyp/Prototyp/GS_Utils.py
2020-08-16 19:36:44 +02:00

521 lines
24 KiB
Python

# class to implement GS utils and Search
import resource
class GS_Utils(object):
    """Grammar-schema (GS) helpers built on top of a spaCy pipeline.

    A "grammar schema" is the list of per-token annotations (for example
    ``word.tag_`` or ``word.dep_``) of a sentence.  The methods translate
    sentences into such schemas, look for fixed annotation sequences in them,
    and merge the words of matching sequences into single "grammar pieces".
    """

    # Annotation sequences searched for by GetTuplesinSentence.
    _TUPLE_PATTERNS = (
        ('ART', 'NE'), ('ART', 'NN'), ('APPR', 'NN'), ('APPR', 'ADJD'),
        ('APPR', 'NE'), ('ART', 'CARD'), ('APPR', 'CARD'), ('PPOSAT', 'NN'),
        ('PPOSAT', 'NE'), ('ADV', 'ADJD'), ('ADV', 'ADV'), ('ADV', 'PTKVZ'),
        ('PTKNEG', 'ADV'), ('ADJA', 'NN'), ('ADJA', 'NE'), ('ADV', 'PIS'),
        ('ADJA', 'PIS'), ('ADJD', 'PIS'), ('APPRART', 'NN'), ('APPRART', 'NE'),
        ('PDAT', 'NE'), ('PDAT', 'NN'), ('PWAT', 'NE'), ('PWAT', 'NN'),
        ('PIAT', 'NE'), ('PIAT', 'NN'), ('PROAV', 'ADJD'), ('PDS', 'NE'),
        ('PDS', 'NN'), ('NE', 'NE'), ('CARD', 'NE'), ('CARD', 'NN'),
    )
    _TRIPLE_PATTERNS = (
        ('APPR', 'ART', 'NN'), ('APPR', 'PDAT', 'NN'), ('APPR', 'PDS', 'NN'),
        ('ART', 'ADJA', 'NN'), ('ART', 'ADJA', 'NE'), ('APPR', 'ART', 'NE'),
        ('KOKOM', 'ART', 'NN'), ('KOKOM', 'ART', 'NE'), ('APPR', 'PIAT', 'NN'),
        ('APPR', 'ADJA', 'NN'), ('APPR', 'ADJA', 'NE'),
        ('APPRART', 'NN', 'CARD'), ('APPRART', 'NE', 'CARD'),
        ('APPRART', 'NN', 'NE'), ('CARD', 'KON', 'CARD'),
        ('APPR', 'ADV', 'CARD'), ('ADJD', 'KOKOM', 'CARD'),
        ('APPR', 'NE', 'NE'), ('NN', 'KON', 'NN'), ('NE', 'NN', 'NE'),
        ('APPR', 'NE', 'NN'), ('APPR', 'CARD', 'NN'), ('APPR', 'CARD', 'NE'),
    )
    _QUADRUPLE_PATTERNS = (
        ('KOKOM', 'ADV', 'ADJA', 'NN'), ('KOKOM', 'ADV', 'ADJA', 'NE'),
        ('APPR', 'ADV', 'ADJA', 'NE'), ('APPR', 'ADV', 'ADJA', 'NN'),
        ('ART', 'NN', 'APPR', 'NE'), ('APPR', 'NE', 'NN', 'NE'),
        ('APPR', 'ART', 'ADJA', 'NN'), ('ART', 'ADJD', 'ADJA', 'NN'),
    )

    # Rule lengths that GetBestgsAccordingRules evaluates; other lengths are
    # ignored, exactly as in the previous hand-unrolled implementation.
    _RULE_SIZES = (2, 3, 4, 5)
    _SPECIAL_RULE_SIZES = (2, 3)

    def __init__(self, language):
        """Load the spaCy pipeline named ``language`` into ``self.nlp``."""
        # Imported lazily so the class can be imported without spaCy present.
        import spacy
        self.nlp = spacy.load(language)
        self.oi = 'oi'

    def Sentence2GrammarSchema(self, sentence, spacyclass):
        """Return the annotations of ``sentence`` selected by ``spacyclass``.

        ``spacyclass`` is an expression over the loop variable ``word``, such
        as ``'word.tag_'``, ``'word.pos_'`` or ``'word.dep_'`` (``word.pos_``
        is suitable for noun and verb, ``word.dep_`` for sb/pd, and possibly
        ``word.tag_``).  Annotations of length 0 or 1 are dropped, so the
        result can be shorter than the number of tokens.
        """
        # Plain "word.<attribute>" expressions are resolved with getattr.
        attribute = None
        if isinstance(spacyclass, str):
            prefix, _, name = spacyclass.partition('.')
            if prefix == 'word' and name.isidentifier():
                attribute = name

        doc = self.nlp(sentence)
        annotations = []
        for word in doc:
            if attribute is not None:
                value = getattr(word, attribute)
            else:
                # NOTE(review): any other expression is still evaluated with
                # eval() for backward compatibility.  Never pass untrusted
                # input as ``spacyclass``.
                value = eval(spacyclass)
            if len(value) > 1:
                annotations.append(value)
        return annotations

    def Sentence2RightGrammarTupel(self, sentence, gs_sentence, right_gs_tupel):
        """Reorder the words of ``sentence`` according to target schemas.

        ``gs_sentence`` holds one annotation per word of ``sentence``.  Each
        entry of ``right_gs_tupel`` is a whitespace-separated annotation
        string; for every annotation in it, the first not-yet-used word with
        that annotation is taken.  Returns one word list per target schema.
        """
        words = sentence.split()
        # Only positions that exist in both sequences can be paired.
        usable = min(len(words), len(gs_sentence))
        reordered_sentences = []
        for schema in right_gs_tupel:
            used = set()
            reordered = []
            for annotation in schema.split():
                for index in range(usable):
                    if index not in used and gs_sentence[index] == annotation:
                        reordered.append(words[index])
                        used.add(index)
                        break
            reordered_sentences.append(reordered)
        return reordered_sentences

    def GetBestgsAccordingRules(self, sentence, gs_sentence1, right_gs_tupel1, right_gs_tupel2, grammcorr_sentences1, grammcorr_sentences2, rules, specialrules):
        """Pick the best candidate from ``grammcorr_sentences1``.

        Scoring of every candidate:

        * +1 for each position at which it agrees with a candidate of
          ``grammcorr_sentences2``;
        * +40 * len(rule) for each occurrence of the words matched by a rule
          (an annotation sequence of length 2-5 found in ``gs_sentence1``;
          the last occurrence in the input determines the words);
        * +len(candidate) for each occurrence of a special rule (length 2-3)
          in the candidate's own annotation sequence ``right_gs_tupel1[i]``.

        The highest score wins; ties go to the candidate with the highest
        index.  Returns ``[]`` when there are no candidates.
        ``right_gs_tupel2`` is accepted for interface compatibility only.
        """
        if not grammcorr_sentences1:
            return []

        words = sentence.split()

        # Position-wise agreement with the second candidate set.
        scores = []
        for candidate in grammcorr_sentences1:
            agreement = 0
            for other in grammcorr_sentences2:
                agreement += sum(1 for a, b in zip(candidate, other) if a == b)
            scores.append(agreement)

        # Rules: find the annotation sequence in the input, then reward
        # candidates that keep the corresponding words adjacent.
        for rule in rules:
            size = len(rule)
            if size not in self._RULE_SIZES:
                continue
            wanted = list(rule)
            matched_words = None
            for start in range(len(gs_sentence1) - size + 1):
                if list(gs_sentence1[start:start + size]) == wanted:
                    window = words[start:start + size]
                    # Skip windows that run past the end of the word list.
                    if len(window) == size:
                        matched_words = window
            if matched_words is None:
                continue
            bonus = 40 * size
            for index, candidate in enumerate(grammcorr_sentences1):
                for start in range(len(candidate) - size + 1):
                    if list(candidate[start:start + size]) == matched_words:
                        scores[index] += bonus

        # Special rules are matched against the candidate's own annotations.
        # The previous code compared position p of candidate n with position
        # p of candidate n+1, which looks like swapped indices.
        # NOTE(review): the same-candidate window is my reading of the
        # intent; please confirm.
        for index, candidate in enumerate(grammcorr_sentences1):
            if index >= len(right_gs_tupel1):
                break
            annotations = right_gs_tupel1[index]
            if isinstance(annotations, str):
                annotations = annotations.split()
            for special in specialrules:
                size = len(special)
                if size not in self._SPECIAL_RULE_SIZES:
                    continue
                wanted = list(special)
                limit = min(len(candidate), len(annotations)) - size + 1
                for start in range(limit):
                    if list(annotations[start:start + size]) == wanted:
                        scores[index] += len(candidate)

        best = max(range(len(scores)), key=lambda i: (scores[i], i))
        return grammcorr_sentences1[best]

    def checkSPO(self, splitsentence, convertedornot):
        """Report whether subject, predicate and object labels are present.

        With ``convertedornot == 0`` the words in ``splitsentence`` are
        joined and converted to ``word.dep_`` labels first; with
        ``convertedornot == 1`` ``splitsentence`` already is the label list.
        Returns ``[subject, predicate, object]`` with 1 for present and 0 for
        absent.  Raises ValueError for any other ``convertedornot``.
        """
        if convertedornot == 0:
            labels = self.Sentence2GrammarSchema(' '.join(splitsentence), 'word.dep_')
        elif convertedornot == 1:
            labels = splitsentence
        else:
            raise ValueError('convertedornot must be 0 or 1, got %r' % (convertedornot,))

        subject_labels = ('sb', 'ep', 'ph')
        predicate_labels = ('ROOT', 'pd')
        object_labels = ('oa', 'og', 'oc', 'op', 'mo')
        spoCount = [0, 0, 0]
        for label in labels:
            if label in subject_labels:
                spoCount[0] = 1
            if label in predicate_labels:
                spoCount[1] = 1
            if label in object_labels:
                spoCount[2] = 1
        return spoCount

    def checkForAnnotation(self, splitsentence, token, spacyclass):
        """Return 1 if annotation ``token`` occurs in the sentence, else 0."""
        annotations = self.Sentence2GrammarSchema(' '.join(splitsentence), spacyclass)
        return 1 if token in annotations else 0

    def checkForAnnotationInTokenizedSentence(self, splitsentence, token):
        """Return 1 if ``token`` occurs in the given annotation list, else 0."""
        return 1 if token in splitsentence else 0

    @staticmethod
    def _match_windows(annotations, splitsentence, token, words, size):
        """Match ``token[:size]`` against every window of ``annotations``.

        If ``words`` is ``None`` or the string ``'None'`` only annotations
        are compared; otherwise the word at each position must also equal the
        corresponding entry of ``words``.  Returns ``(depth, found)``:
        ``depth`` is the longest matched prefix over all windows (``size``
        means a full match) and ``found`` lists the word slices of all full
        matches.
        """
        unconstrained = words is None or words == 'None'
        best_depth = 0
        found = []
        # Only windows that exist in both sequences can be compared; the
        # annotation list may differ in length from the word list.
        last_start = min(len(annotations), len(splitsentence)) - size
        for start in range(last_start + 1):
            depth = 0
            for offset in range(size):
                position = start + offset
                if annotations[position] != token[offset]:
                    break
                if not unconstrained and splitsentence[position] != words[offset]:
                    break
                depth += 1
            if depth == size:
                found.append(splitsentence[start:start + size])
            if depth > best_depth:
                best_depth = depth
        return best_depth, found

    def checkForAnnotationTuple(self, splitsentence, token, spacyclass, tupleinwords):
        """Search ``splitsentence`` for the annotation pair ``token``.

        ``tupleinwords`` optionally pins the two words (pass ``'None'`` for
        no constraint).  Returns ``(depth, matches)`` where ``depth`` is
        0, 1 or 2 (2 = both positions matched somewhere) and ``matches`` is
        the list of matching word pairs.
        """
        annotations = self.Sentence2GrammarSchema(' '.join(splitsentence), spacyclass)
        return self._match_windows(annotations, splitsentence, token, tupleinwords, 2)

    def checkForAnnotationTriple(self, splitsentence, token, spacyclass, tripleinwords):
        """Like checkForAnnotationTuple, for annotation sequences of length 3."""
        annotations = self.Sentence2GrammarSchema(' '.join(splitsentence), spacyclass)
        return self._match_windows(annotations, splitsentence, token, tripleinwords, 3)

    def checkForAnnotationQuadruple(self, splitsentence, token, spacyclass, quadrupleinwords):
        """Like checkForAnnotationTuple, for annotation sequences of length 4."""
        annotations = self.Sentence2GrammarSchema(' '.join(splitsentence), spacyclass)
        return self._match_windows(annotations, splitsentence, token, quadrupleinwords, 4)

    def GetTuplesinSentence(self, mainsentence):
        """Find all known tag pairs, triples and quadruples in a sentence.

        ``mainsentence`` is a list of words, e.g.
        ``['this', 'is', 'a', 'sentence']``.  Returns three lists
        ``(tuples, triples, quadruples)`` whose entries are
        ``[pattern, words]`` pairs.
        """
        # Tag the sentence once instead of once per pattern.
        annotations = self.Sentence2GrammarSchema(' '.join(mainsentence), 'word.tag_')

        def collect(patterns, size):
            hits = []
            for pattern in patterns:
                _, matches = self._match_windows(annotations, mainsentence, pattern, 'None', size)
                for words in matches:
                    # Fresh list per hit so callers cannot alter the constants.
                    hits.append([list(pattern), words])
            return hits

        tuplesToCheck = collect(self._TUPLE_PATTERNS, 2)
        triplesToCheck = collect(self._TRIPLE_PATTERNS, 3)
        quadruplesToCheck = collect(self._QUADRUPLE_PATTERNS, 4)
        return tuplesToCheck, triplesToCheck, quadruplesToCheck

    @staticmethod
    def _drop_contained(smaller, larger, size):
        """Remove, in place, entries of ``smaller`` covered by ``larger``.

        An entry is covered when its first ``size`` words appear as a
        contiguous run inside the words of any entry of ``larger``.
        """
        def covered(words):
            head = list(words[:size])
            for entry in larger:
                bigger = entry[1]
                for offset in range(len(bigger) - size + 1):
                    if list(bigger[offset:offset + size]) == head:
                        return True
            return False

        smaller[:] = [entry for entry in smaller if not covered(entry[1])]

    @staticmethod
    def _merge_sequences(sentence, entries, size):
        """Replace, in place, each run of an entry's words by one joined string."""
        for entry in entries:
            words = list(entry[1])
            head = words[:size]
            if len(head) < size:
                continue
            position = 0
            # The bound is re-evaluated each step because merging shrinks
            # the list.
            while position + size <= len(sentence):
                if sentence[position:position + size] == head:
                    sentence[position:position + size] = [' '.join(words)]
                position += 1

    def createTupleofGrammarpieces(self, sentence, tuplesToCheck, triplesToCheck, quadruplesToCheck):
        """Merge matched word sequences of ``sentence`` into single pieces.

        ``sentence`` is a list of words; the ``...ToCheck`` arguments are the
        lists produced by GetTuplesinSentence.  Shorter sequences that are
        part of a longer one are discarded, bracketed text is taken out,
        the remaining sequences are merged (longest first), and each bracket
        text is glued onto the piece ending with the word that preceded it,
        or appended at the end if there is no such piece.

        ``sentence``, ``tuplesToCheck`` and ``triplesToCheck`` are modified
        in place; the modified ``sentence`` list is also returned.
        """
        self._drop_contained(tuplesToCheck, triplesToCheck, 2)
        self._drop_contained(tuplesToCheck, quadruplesToCheck, 2)
        self._drop_contained(triplesToCheck, quadruplesToCheck, 3)

        # Locate bracket groups: from a word starting with '(' up to the
        # first word ending with ')'.  Unclosed brackets are left alone.
        brackets = []
        index = 0
        while index < len(sentence):
            if sentence[index].startswith('('):
                close = None
                for candidate in range(index, len(sentence)):
                    if sentence[candidate].endswith(')'):
                        close = candidate
                        break
                if close is not None:
                    # A bracket opening the sentence has no preceding word.
                    word_before = sentence[index - 1] if index > 0 else None
                    brackets.append((index, close, sentence[index:close + 1], word_before))
                    index = close + 1
                    continue
            index += 1

        # Delete by position, back to front, so earlier spans stay valid.
        for start, close, _, _ in reversed(brackets):
            del sentence[start:close + 1]

        self._merge_sequences(sentence, quadruplesToCheck, 4)
        self._merge_sequences(sentence, triplesToCheck, 3)
        self._merge_sequences(sentence, tuplesToCheck, 2)

        for _, _, bracket_words, word_before in brackets:
            bracket_text = ' '.join(bracket_words)
            for position, piece in enumerate(sentence):
                parts = piece.split()
                if parts and parts[-1] == word_before:
                    # NOTE(review): the bracket text is concatenated without
                    # a separating space, as before; confirm this is intended.
                    sentence[position] = piece + bracket_text
                    break
            else:
                sentence.append(bracket_text)
        return sentence

    def _last_decisive_match(self, annotations, sentence, entries, size):
        """Return True if the last entry with any match matched completely."""
        confirmed = False
        for entry in entries:
            depth, _ = self._match_windows(annotations, sentence, entry[0], entry[1], size)
            if depth == size:
                confirmed = True
            elif depth > 0:
                confirmed = False
        return confirmed

    # The following method is too computationally expensive.
    def filterpermutationsaccordingtotuples(self, sentences, tuplesToCheck, triplesToCheck):
        """Keep the sentences that still contain the given tuples or triples.

        Each entry of ``tuplesToCheck`` / ``triplesToCheck`` is a
        ``[pattern, words]`` pair.  A sentence is kept when, for the tuples
        or for the triples, the last entry that matched at all matched
        completely (annotations and words).
        """
        if not tuplesToCheck and not triplesToCheck:
            return []

        filteredprobsentences = []
        for sentence in sentences:
            # Tag each sentence once instead of once per pattern.
            annotations = self.Sentence2GrammarSchema(' '.join(sentence), 'word.tag_')
            tuple_ok = self._last_decisive_match(annotations, sentence, tuplesToCheck, 2)
            triple_ok = self._last_decisive_match(annotations, sentence, triplesToCheck, 3)
            if tuple_ok or triple_ok:
                filteredprobsentences.append(sentence)
        return filteredprobsentences