# alpcentaur / basabuuka_prototyp


# class to implement GS utils and Search
import resource


class GS_Utils(object):        def __init__(self, language):                                #print('loading spacy..')        import spacy        self.nlp = spacy.load(language)        #print('done')        self.oi = 'oi'                    def Sentence2GrammarSchema(self, sentence, spacyclass):                doc = self.nlp(sentence)                        #print(doc)                GsDBsentence = []                for word in doc:                        # es eignet sich hierbei word.pos_  fuer noun und verb, word.dep_ fuer sb pd, und evtl tag             if len(eval(spacyclass)) > 1:                GsDBsentence.append(eval(spacyclass))                        return GsDBsentence        def Sentence2RightGrammarTupel(self, sentence, gs_sentence, right_gs_tupel):        grammcorr_sentences = []                #print(sentence)        #print(gs_sentence)        #print(right_gs_tupel)                sentence = sentence.split()                for elements in right_gs_tupel:            grammcor_sentence = []                        usedwordslist = []            usedwords = set(usedwordslist)                        for element in elements.split():                ok = 0                #print('1')                for n in range(len(gs_sentence)):                    #print(element)                    #print(gs_sentence)                                        if element == gs_sentence[n] and n not in usedwords:                        if ok == 0:                            #print('bla', sentence[n])                                                        grammcor_sentence.append(sentence[n])                                                        usedwordslist.append(n)                            usedwords = set(usedwordslist)                                                        ok = 1            grammcorr_sentences.append(grammcor_sentence)                return grammcorr_sentences

    # gets the best grammar scheme from both, depending on which one appears the most in both, and whether rules are still present.     def GetBestgsAccordingRules(self, sentence, gs_sentence1, right_gs_tupel1, right_gs_tupel2, grammcorr_sentences1 , grammcorr_sentences2, rules , specialrules):        equals = []        for n in range(len(grammcorr_sentences1)):            equalcount = 0            for l in range(len(grammcorr_sentences2)):                                if len(grammcorr_sentences1[n]) <= len(grammcorr_sentences2[l]):                    for m in range(len(grammcorr_sentences1[n])):                        if grammcorr_sentences1[n][m] == grammcorr_sentences2[l][m]:                            equalcount += 1                else:                    for m in range(len(grammcorr_sentences2[l])):                        if grammcorr_sentences1[n][m] == grammcorr_sentences2[l][m]:                            equalcount += 1            equals.append(equalcount)                # from here check the if a rule is in the input, if yes then check it for grammar schemes and raise scores.                
newrules = []        for n in range(len(rules)):            newrules.append([])                ruleapplicable = []        for m in range(len(rules)):                        ruleapplicable.append(False)                              if len(rules[m]) == 2:                for n in range(len(gs_sentence1)-1):                    if rules[m][0] == gs_sentence1[n]:                        if rules[m][1] == gs_sentence1[n+1]:                            ruleapplicable[m] = True                            newrules[m] = sentence.split()[n:n+2]                                                                                                for n in range(len(grammcorr_sentences1)):                    if ruleapplicable[m] == True:                        for p in range(len(grammcorr_sentences1[n])-1):                                                        if grammcorr_sentences1[n][p] == newrules[m][0] and grammcorr_sentences1[n][p+1] == newrules[m][1]:                                equals[n] += 40 * len(newrules[m])                            else:                                pass                                                            if len(rules[m]) == 3:                for n in range(len(gs_sentence1)-2):                    if rules[m][0] == gs_sentence1[n]:                        if rules[m][1] == gs_sentence1[n+1]:                            if rules[m][2] == gs_sentence1[n+2]:                                ruleapplicable[m] = True                                newrules[m] = sentence.split()[n:n+3]                                for n in range(len(grammcorr_sentences1)):                    if ruleapplicable[m] == True:                        for p in range(len(grammcorr_sentences1[n])-2):                            if grammcorr_sentences1[n][p] == newrules[m][0]:                                if grammcorr_sentences1[n][p+1] == newrules[m][1]:                                    if grammcorr_sentences1[n][p+2] == newrules[m][2]:                                        
equals[n] += 40 * len(newrules[m])                                                    if len(rules[m]) == 4:                for n in range(len(gs_sentence1)-3):                    if rules[m][0] == gs_sentence1[n]:                        if rules[m][1] == gs_sentence1[n+1]:                            if rules[m][2] == gs_sentence1[n+2]:                                if rules[m][3] == gs_sentence1[n+3]:                                    ruleapplicable[m] = True                                    newrules[m] = sentence.split()[n:n+4]                                                    for n in range(len(grammcorr_sentences1)):                    if ruleapplicable[m] == True:                        for p in range(len(grammcorr_sentences1[n])-3):                            if grammcorr_sentences1[n][p] == newrules[m][0]:                                 if grammcorr_sentences1[n][p+1] == newrules[m][1]:                                    if grammcorr_sentences1[n][p+2] == newrules[m][2]:                                        if grammcorr_sentences1[n][p+3] == newrules[m][3]:                                            equals[n] += 40 * len(newrules[m])            if len(rules[m]) == 5:                for n in range(len(gs_sentence1)-4):                    if rules[m][0] == gs_sentence1[n]:                        if rules[m][1] == gs_sentence1[n+1]:                            if rules[m][2] == gs_sentence1[n+2]:                                if rules[m][3] == gs_sentence1[n+3]:                                    if rules[m][4] == gs_sentence1[n+4]:                                        ruleapplicable[m] = True                                        newrules[m] = sentence.split()[n:n+5]                                                    for n in range(len(grammcorr_sentences1)):                    if ruleapplicable[m] == True:                        for p in range(len(grammcorr_sentences1[n])-4):                            if grammcorr_sentences1[n][p] == 
newrules[m][0]:                                 if grammcorr_sentences1[n][p+1] == newrules[m][1]:                                    if grammcorr_sentences1[n][p+2] == newrules[m][2]:                                        if grammcorr_sentences1[n][p+3] == newrules[m][3]:                                            if grammcorr_sentences1[n][p+4] == newrules[m][4]:                                                equals[n] += 40 * len(newrules[m])                    #print('the found rules from input:',newrules)                                        for n in range(len(grammcorr_sentences1)):            for m in range(len(specialrules)):                if len(specialrules[m]) == 2:                    for p in range(len(grammcorr_sentences1[n])-1):                        if right_gs_tupel1[n][p] == specialrules[m][0] and right_gs_tupel1[n+1][p] == specialrules[m][1]:                            equals[n] +=  len(grammcorr_sentences1[n])                        else:                            pass                if len(specialrules[m]) == 3:                    for p in range(len(grammcorr_sentences1[n])-2):                        if right_gs_tupel1[n][p] == specialrules[m][0] and right_gs_tupel1[n+1][p] == specialrules[m][1] and right_gs_tupel1[n+2][p] == specialrules[m][2]:                            equals[n] += len(grammcorr_sentences1[n])                        else:                            pass                #for n in range(len(grammcorr_sentences1)):            #if len(sentence.split()) == grammcorr_sentences1[n]:                #equals[n] += 50                indexedequals = []        for n in range(len(equals)):            indexedequals.append([n,equals[n]])                indexedequals_sorted = sorted(indexedequals[::-1], key=lambda tup: tup[1], reverse=True)                            return grammcorr_sentences1[indexedequals_sorted[0][0]]                    def checkSPO(self, splitsentence, convertedornot):                if convertedornot == 0:          
  gs_sentenceSPOProof = self.Sentence2GrammarSchema(' '.join(splitsentence), 'word.dep_')                if convertedornot == 1:            gs_sentenceSPOProof = splitsentence                spoCount = [0,0,0]                for word in gs_sentenceSPOProof:            if word == 'sb' or word == 'ep' or word == 'ph':                spoCount[0] = 1            if word == 'ROOT' or word == 'pd':                spoCount[1] = 1            if word == 'oa' or word == 'og' or word == 'oc' or word == 'op' or word == 'mo':                spoCount[2] = 1                return spoCount        def checkForAnnotation(self, splitsentence, token, spacyclass):                gs_sentence_RC_Proof = self.Sentence2GrammarSchema(' '.join(splitsentence), spacyclass)                AnnoORnot = 0        for word in gs_sentence_RC_Proof:            if word == token:                AnnoORnot = 1                return AnnoORnot        def checkForAnnotationInTokenizedSentence(self, splitsentence, token):                gs_sentence_RC_Proof = splitsentence                AnnoORnot = 0        for word in gs_sentence_RC_Proof:            if word == token:                AnnoORnot = 1                return AnnoORnot            def checkForAnnotationTuple(self, splitsentence, token, spacyclass, tupleinwords):        #self.spacyclass = spacyclass        gs_sentence_RC_Proof = self.Sentence2GrammarSchema(' '.join(splitsentence), spacyclass)        AnnotationtupleInwords = []        AnnoORnot = 0        #print(gs_sentence_RC_Proof)        for n in range(len(gs_sentence_RC_Proof) - 1):            if gs_sentence_RC_Proof[n] == token[0] and (splitsentence[n] == tupleinwords[0] or tupleinwords == 'None'):                #print('oioioiAYE')                #print(gs_sentence_RC_Proof)                AnnoORnot = 1                if gs_sentence_RC_Proof[n + 1] == token[1] and (splitsentence[n+1] == tupleinwords[1] or tupleinwords == 'None'):                    AnnoORnot = 2                    
AnnotationtupleInwords.append(splitsentence[n:n+2])                    #print(token)                        return AnnoORnot, AnnotationtupleInwords        def checkForAnnotationTriple(self, splitsentence, token, spacyclass, tripleinwords):        #self.spacyclass = spacyclass        gs_sentence_RC_Proof = self.Sentence2GrammarSchema(' '.join(splitsentence), spacyclass)                #print('gssentencercprooof', gs_sentence_RC_Proof)                AnnoORnot = 0        AnnotationtripleInwords = []        for n in range(len(gs_sentence_RC_Proof) - 2):            if gs_sentence_RC_Proof[n] == token[0] and (splitsentence[n] == tripleinwords[0] or tripleinwords == 'None'):                AnnoORnot = 1                if gs_sentence_RC_Proof[n + 1] == token[1] and (splitsentence[n+1] == tripleinwords[1] or tripleinwords == 'None'):                    AnnoORnot = 2                    if gs_sentence_RC_Proof[n + 2] == token[2] and (splitsentence[n+2] == tripleinwords[2] or tripleinwords == 'None'):                        AnnoORnot = 3                        AnnotationtripleInwords.append(splitsentence[n:n+3])                        return AnnoORnot, AnnotationtripleInwords        def checkForAnnotationQuadruple(self, splitsentence, token, spacyclass, quadrupleinwords):        #self.spacyclass = spacyclass        gs_sentence_RC_Proof = self.Sentence2GrammarSchema(' '.join(splitsentence), spacyclass)                #print('gssentencercprooof', gs_sentence_RC_Proof)        #print('quadrupleinwords',quadrupleinwords)        #print('token', token)        AnnoORnot = 0        AnnotationquadrupleInwords = []        for n in range(len(gs_sentence_RC_Proof) - 3):            if gs_sentence_RC_Proof[n] == token[0] and (splitsentence[n] == quadrupleinwords[0] or quadrupleinwords == 'None'):                AnnoORnot = 1                if gs_sentence_RC_Proof[n + 1] == token[1] and (splitsentence[n+1] == quadrupleinwords[1] or quadrupleinwords == 'None'):                    AnnoORnot = 
2                    if gs_sentence_RC_Proof[n + 2] == token[2] and (splitsentence[n+2] == quadrupleinwords[2] or quadrupleinwords == 'None'):                        AnnoORnot = 3                        if gs_sentence_RC_Proof[n + 3] == token[3] and (splitsentence[n+3] == quadrupleinwords[3] or quadrupleinwords == 'None'):                            AnnoORnot = 4                            AnnotationquadrupleInwords.append(splitsentence[n:n+4])                #print('AnnotationquadrupleInwords', AnnotationquadrupleInwords)                return AnnoORnot, AnnotationquadrupleInwords            #input ['this', 'is', 'a', 'sentence']        def GetTuplesinSentence(self,mainsentence):                tuplesToCheck = []        tuples = [['ART', 'NE'], ['ART', 'NN'], ['APPR','NN'], ['APPR','ADJD'], ['APPR','NE'], ['ART', 'CARD'], ['APPR', 'CARD'], ['PPOSAT', 'NN'], ['PPOSAT', 'NE'], ['ADV', 'ADJD'],['ADV', 'ADV'], ['ADV', 'PTKVZ'], ['PTKNEG', 'ADV'], ['ADJA','NN'], ['ADJA','NE'], ['ADV','PIS'], ['ADJA','PIS'], ['ADJD','PIS'], ['APPRART', 'NN'], ['APPRART', 'NE'], ['PDAT', 'NE'], ['PDAT', 'NN'], ['PWAT', 'NE'], ['PWAT', 'NN'], ['PIAT', 'NE'], ['PIAT', 'NN'], ['PROAV', 'ADJD'],['PDS', 'NE'], ['PDS', 'NN'], ['NE', 'NE'], ['CARD', 'NE'], ['CARD', 'NN'] ]        #print('beginning of gettuplesinsentence')        #print('inkb',resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)        for tupl in tuples:            #print('checking another tuple')            #print('inkb',resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)                        checktupleindex, tupleInWords = self.checkForAnnotationTuple(mainsentence, tupl , 'word.tag_', 'None')            if len(tupleInWords) > 0:                for tup in tupleInWords:                    tuplesToCheck.append([tupl, tup])                    #print('oi a tuple was found')        #print('after the loop')        #print('inkb',resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)                #print('no going to the triples')      
  triplesToCheck = []        triples = [['APPR', 'ART', 'NN'],['APPR', 'PDAT', 'NN'], ['APPR', 'PDS', 'NN'], ['ART','ADJA','NN'], ['ART','ADJA','NE'], ['APPR', 'ART', 'NE'], ['KOKOM', 'ART', 'NN'], ['KOKOM', 'ART', 'NE'], ['APPR', 'PIAT', 'NN'], ['APPR', 'ADJA', 'NN'], ['APPR', 'ADJA', 'NE'], ['APPRART', 'NN', 'CARD'], ['APPRART', 'NE', 'CARD'], ['APPRART', 'NN', 'NE'], ['CARD', 'KON', 'CARD'], ['APPR', 'ADV', 'CARD'], ['ADJD', 'KOKOM', 'CARD'], ['APPR', 'NE', 'NE'], ['NN', 'KON', 'NN'], ['NE', 'NN', 'NE'], ['APPR', 'NE', 'NN'], ['APPR', 'CARD', 'NN'], ['APPR', 'CARD', 'NE']]        for tripl in triples:            #print('checking next triple')            checktripleindex, tripleInWords = self.checkForAnnotationTriple(mainsentence, tripl, 'word.tag_', 'None')            if len(tripleInWords) > 0:                for trip in tripleInWords:                    triplesToCheck.append([tripl, trip])                       #print('oi a triple was found')                quadruplesToCheck = []        quadruples = [['KOKOM', 'ADV', 'ADJA', 'NN'], ['KOKOM', 'ADV', 'ADJA', 'NE'], ['APPR', 'ADV', 'ADJA', 'NE'], ['APPR', 'ADV', 'ADJA', 'NN'], ['ART', 'NN', 'APPR', 'NE'], ['APPR', 'NE', 'NN', 'NE'], ['APPR', 'ART', 'ADJA', 'NN'], ['ART', 'ADJD', 'ADJA', 'NN']]                for quadrupl in quadruples:            #print('checking next triple')            checkquadrupleindex, quadrupleInWords = self.checkForAnnotationQuadruple(mainsentence, quadrupl, 'word.tag_', 'None')            if len(quadrupleInWords) > 0:                for quad in quadrupleInWords:                    quadruplesToCheck.append([quadrupl, quad])                                #print('gettuples insentences is done')        return tuplesToCheck, triplesToCheck, quadruplesToCheck            def createTupleofGrammarpieces(self, sentence, tuplesToCheck, triplesToCheck, quadruplesToCheck):        #print('going in crate tuple of grammar pieces')        tuplestoremove = []        for tupl in tuplesToCheck:            
for tripl in triplesToCheck:                if (tupl[1][0] == tripl[1][0] and tupl[1][1] == tripl[1][1]) or (tupl[1][0] == tripl[1][1] and tupl[1][1] == tripl[1][2]):                    tuplestoremove.append(tupl)        for tupletoremove in tuplestoremove:            tuplesToCheck.remove(tupletoremove)                #print('in between0', sentence, quadruplesToCheck, tuplesToCheck, triplesToCheck)                tuplestoremove = []                for tupl in tuplesToCheck:            for quad in quadruplesToCheck:                #print('I got here')                #print(tupl, quad)                #print(tupl[1][0], tupl[1][1], quad[1][2], quad[1][3])                if (tupl[1][0] == quad[1][0] and tupl[1][1] == quad[1][1]) or (tupl[1][0] == quad[1][1] and tupl[1][1] == quad[1][2]) or (tupl[1][0] == quad[1][2] and tupl[1][1] == quad[1][3]):                    #print('and I got here', tupl)                    tuplestoremove.append(tupl)        for tupletoremove in tuplestoremove:            tuplesToCheck.remove(tupletoremove)                #print('and until here?')                triplestoremove = []        for tripl in triplesToCheck:            for quad in quadruplesToCheck:                if (tripl[1][0] == quad[1][0] and tripl[1][1] == quad[1][1] and tripl[1][2] == quad[1][2]) or (tripl[1][0] == quad[1][1] and tripl[1][1] == quad[1][2] and tripl[1][2] == quad[1][3]):                    triplestoremove.append(tripl)        for tripltoremove in triplestoremove:            triplesToCheck.remove(tripltoremove)                bracketinfo = []        bracketinfos = []        bracketindex = 0                #print('in between1', sentence, quadruplesToCheck)                for n in range(len(sentence)):                        if sentence[n] != '':                if sentence[n] == '(' or sentence[n][0] == '(':                    for m in range(n ,len(sentence)):                        bracketinfo.append(sentence[m])                        if sentence[m] == ')' or 
sentence[m][-1] == ')':                                                        wordbeforebracketinfo = None                            try:                                wordbeforebracketinfo = sentence[n-1]                            except:                                pass                            bracketinfos.append([bracketinfo, wordbeforebracketinfo])                            bracketinfo = []                            break                #print('in between2', sentence, quadruplesToCheck)        #print('bracketinfo',bracketinfos)        #print('sentence',sentence)        for bracketinfo in bracketinfos:            for word in bracketinfo[0]:                sentence.remove(word)                                   #print('in between3', sentence, quadruplesToCheck)                if len(quadruplesToCheck) != 0:            for n in range(len(quadruplesToCheck)):                for m in range(len(sentence) - 3):                    if sentence[m] == quadruplesToCheck[n][1][0]:                        if sentence[m + 1] == quadruplesToCheck[n][1][1]:                            if sentence[m + 2] == quadruplesToCheck[n][1][2]:                                if sentence[m + 3] == quadruplesToCheck[n][1][3]:                                    del sentence[m + 3]                                    del sentence[m + 2]                                    del sentence[m + 1]                                    del sentence[m]                                    sentence.insert(m,' '.join(quadruplesToCheck[n][1]))                                                    if len(triplesToCheck) != 0:            for n in range(len(triplesToCheck)):                for m in range(len(sentence) - 2):                    if sentence[m] == triplesToCheck[n][1][0]:                        if sentence[m + 1] == triplesToCheck[n][1][1]:                            if sentence[m + 2] == triplesToCheck[n][1][2]:                                del sentence[m + 2]                                
del sentence[m + 1]                                del sentence[m]                                sentence.insert(m,' '.join(triplesToCheck[n][1]))                                    if len(tuplesToCheck) != 0:            for n in range(len(tuplesToCheck)):                for m in range(len(sentence) - 1):                    if sentence[m] == tuplesToCheck[n][1][0]:                        if sentence[m + 1] == tuplesToCheck[n][1][1]:                            del sentence[m + 1]                            del sentence[m]                            sentence.insert(m,' '.join(tuplesToCheck[n][1]))                for bracketinfo in bracketinfos:            bracketinfowasthere = 0            for n in range(len(sentence)):                sentencensplit = sentence[n].split()                if bracketinfo[1] == sentencensplit[-1]:                    sentence[n] = sentence[n] + ' '.join(bracketinfo[0])                    bracketinfowasthere = 1                    break            if bracketinfowasthere == 0:                sentence.append(' '.join(bracketinfo[0]))        #print('sentence in gs create tuple of grammar pieces', sentence)                #print('thesentencein create tuple of grammarpieces ',sentence)        return sentence        # die folgende Klasse ist zu rechenaufwendig    def filterpermutationsaccordingtotuples(self, sentences, tuplesToCheck, triplesToCheck):                filteredprobsentences = []        for sentence in sentences:                                                tuplchecked = 0            triplchecked = 0            #print('sentence and tuples to check', sentence, tuplesToCheck)            for tupl in tuplesToCheck:                #print(list(sentence))                  checkedsecondtime, tupleinWords = self.checkForAnnotationTuple(sentence, tupl[0], 'word.tag_', tupl[1])                                #print(checkedsecondtime)                if checkedsecondtime == 1:                                                tuplchecked = 0       
                         if checkedsecondtime == 2:                                        tuplchecked = 1                                        for tripl in triplesToCheck:                #print(sentence)                checkedsecondtime, tripleinWords = self.checkForAnnotationTriple(sentence, tripl[0], 'word.tag_', tripl[1])                if checkedsecondtime == 1 or checkedsecondtime == 2:                                                    triplchecked = 0                                if checkedsecondtime == 3:                                        triplchecked = 1                                                if tuplchecked == 1 or triplchecked == 1:                filteredprobsentences.append(sentence)                    return filteredprobsentences