# alpcentaur / basabuuka_prototyp



								# class to implement GS utils and Search


								import resource


								class GS_Utils(object):


								    def __init__(self, language):
								        """Create the helper and load the spaCy pipeline it works with.

								        :param language: value passed unchanged to ``spacy.load``.
								        """
								        # spaCy is imported here, when an instance is built, not at module import.
								        from spacy import load as load_pipeline

								        self.oi = 'oi'
								        self.nlp = load_pipeline(language)


								    def Sentence2GrammarSchema(self, sentence, spacyclass):


								        doc = self.nlp(sentence)


								        #print(doc)


								        GsDBsentence = []


								        for word in doc:


								            # es eignet sich hierbei word.pos_  fuer noun und verb, word.dep_ fuer sb pd, und evtl tag

								            if len(eval(spacyclass)) > 1:

								                GsDBsentence.append(eval(spacyclass))


								        return GsDBsentence


								    def Sentence2RightGrammarTupel(self, sentence, gs_sentence, right_gs_tupel):

								        grammcorr_sentences = []


								        #print(sentence)

								        #print(gs_sentence)

								        #print(right_gs_tupel)


								        sentence = sentence.split()


								        for elements in right_gs_tupel:

								            grammcor_sentence = []


								            usedwordslist = []

								            usedwords = set(usedwordslist)


								            for element in elements.split():

								                ok = 0

								                #print('1')

								                for n in range(len(gs_sentence)):

								                    #print(element)

								                    #print(gs_sentence)


								                    if element == gs_sentence[n] and n not in usedwords:

								                        if ok == 0:

								                            #print('bla', sentence[n])


								                            grammcor_sentence.append(sentence[n])


								                            usedwordslist.append(n)

								                            usedwords = set(usedwordslist)


								                            ok = 1

								            grammcorr_sentences.append(grammcor_sentence)


								        return grammcorr_sentences


								    # gets the best grammar scheme from both, depending on which one appears the most in both, and whether rules are still present.

								    def GetBestgsAccordingRules(self, sentence, gs_sentence1, right_gs_tupel1, right_gs_tupel2, grammcorr_sentences1 , grammcorr_sentences2, rules , specialrules):

								        equals = []

								        for n in range(len(grammcorr_sentences1)):

								            equalcount = 0

								            for l in range(len(grammcorr_sentences2)):


								                if len(grammcorr_sentences1[n]) <= len(grammcorr_sentences2[l]):

								                    for m in range(len(grammcorr_sentences1[n])):

								                        if grammcorr_sentences1[n][m] == grammcorr_sentences2[l][m]:

								                            equalcount += 1

								                else:

								                    for m in range(len(grammcorr_sentences2[l])):

								                        if grammcorr_sentences1[n][m] == grammcorr_sentences2[l][m]:

								                            equalcount += 1

								            equals.append(equalcount)


								        # from here check the if a rule is in the input, if yes then check it for grammar schemes and raise scores.


								        newrules = []

								        for n in range(len(rules)):

								            newrules.append([])


								        ruleapplicable = []

								        for m in range(len(rules)):


								            ruleapplicable.append(False)


								            if len(rules[m]) == 2:

								                for n in range(len(gs_sentence1)-1):

								                    if rules[m][0] == gs_sentence1[n]:

								                        if rules[m][1] == gs_sentence1[n+1]:

								                            ruleapplicable[m] = True

								                            newrules[m] = sentence.split()[n:n+2]


								                for n in range(len(grammcorr_sentences1)):

								                    if ruleapplicable[m] == True:

								                        for p in range(len(grammcorr_sentences1[n])-1):


								                            if grammcorr_sentences1[n][p] == newrules[m][0] and grammcorr_sentences1[n][p+1] == newrules[m][1]:

								                                equals[n] += 40 * len(newrules[m])

								                            else:

								                                pass


								            if len(rules[m]) == 3:

								                for n in range(len(gs_sentence1)-2):

								                    if rules[m][0] == gs_sentence1[n]:

								                        if rules[m][1] == gs_sentence1[n+1]:

								                            if rules[m][2] == gs_sentence1[n+2]:

								                                ruleapplicable[m] = True

								                                newrules[m] = sentence.split()[n:n+3]


								                for n in range(len(grammcorr_sentences1)):

								                    if ruleapplicable[m] == True:

								                        for p in range(len(grammcorr_sentences1[n])-2):

								                            if grammcorr_sentences1[n][p] == newrules[m][0]:

								                                if grammcorr_sentences1[n][p+1] == newrules[m][1]:

								                                    if grammcorr_sentences1[n][p+2] == newrules[m][2]:

								                                        equals[n] += 40 * len(newrules[m])


								            if len(rules[m]) == 4:

								                for n in range(len(gs_sentence1)-3):

								                    if rules[m][0] == gs_sentence1[n]:

								                        if rules[m][1] == gs_sentence1[n+1]:

								                            if rules[m][2] == gs_sentence1[n+2]:

								                                if rules[m][3] == gs_sentence1[n+3]:

								                                    ruleapplicable[m] = True

								                                    newrules[m] = sentence.split()[n:n+4]


								                for n in range(len(grammcorr_sentences1)):

								                    if ruleapplicable[m] == True:

								                        for p in range(len(grammcorr_sentences1[n])-3):

								                            if grammcorr_sentences1[n][p] == newrules[m][0]:

								                                if grammcorr_sentences1[n][p+1] == newrules[m][1]:

								                                    if grammcorr_sentences1[n][p+2] == newrules[m][2]:

								                                        if grammcorr_sentences1[n][p+3] == newrules[m][3]:

								                                            equals[n] += 40 * len(newrules[m])

								            if len(rules[m]) == 5:

								                for n in range(len(gs_sentence1)-4):

								                    if rules[m][0] == gs_sentence1[n]:

								                        if rules[m][1] == gs_sentence1[n+1]:

								                            if rules[m][2] == gs_sentence1[n+2]:

								                                if rules[m][3] == gs_sentence1[n+3]:

								                                    if rules[m][4] == gs_sentence1[n+4]:

								                                        ruleapplicable[m] = True

								                                        newrules[m] = sentence.split()[n:n+5]


								                for n in range(len(grammcorr_sentences1)):

								                    if ruleapplicable[m] == True:

								                        for p in range(len(grammcorr_sentences1[n])-4):

								                            if grammcorr_sentences1[n][p] == newrules[m][0]:

								                                if grammcorr_sentences1[n][p+1] == newrules[m][1]:

								                                    if grammcorr_sentences1[n][p+2] == newrules[m][2]:

								                                        if grammcorr_sentences1[n][p+3] == newrules[m][3]:

								                                            if grammcorr_sentences1[n][p+4] == newrules[m][4]:

								                                                equals[n] += 40 * len(newrules[m])


								        #print('the found rules from input:',newrules)


								        for n in range(len(grammcorr_sentences1)):

								            for m in range(len(specialrules)):

								                if len(specialrules[m]) == 2:

								                    for p in range(len(grammcorr_sentences1[n])-1):

								                        if right_gs_tupel1[n][p] == specialrules[m][0] and right_gs_tupel1[n+1][p] == specialrules[m][1]:

								                            equals[n] +=  len(grammcorr_sentences1[n])

								                        else:

								                            pass

								                if len(specialrules[m]) == 3:

								                    for p in range(len(grammcorr_sentences1[n])-2):

								                        if right_gs_tupel1[n][p] == specialrules[m][0] and right_gs_tupel1[n+1][p] == specialrules[m][1] and right_gs_tupel1[n+2][p] == specialrules[m][2]:

								                            equals[n] += len(grammcorr_sentences1[n])

								                        else:

								                            pass


								        #for n in range(len(grammcorr_sentences1)):

								            #if len(sentence.split()) == grammcorr_sentences1[n]:

								                #equals[n] += 50


								        indexedequals = []

								        for n in range(len(equals)):

								            indexedequals.append([n,equals[n]])


								        indexedequals_sorted = sorted(indexedequals[::-1], key=lambda tup: tup[1], reverse=True)


								        return grammcorr_sentences1[indexedequals_sorted[0][0]]


								    def checkSPO(self, splitsentence, convertedornot):


								        if convertedornot == 0:

								            gs_sentenceSPOProof = self.Sentence2GrammarSchema(' '.join(splitsentence), 'word.dep_')


								        if convertedornot == 1:

								            gs_sentenceSPOProof = splitsentence


								        spoCount = [0,0,0]


								        for word in gs_sentenceSPOProof:

								            if word == 'sb' or word == 'ep' or word == 'ph':

								                spoCount[0] = 1

								            if word == 'ROOT' or word == 'pd':

								                spoCount[1] = 1

								            if word == 'oa' or word == 'og' or word == 'oc' or word == 'op' or word == 'mo':

								                spoCount[2] = 1


								        return spoCount


								    def checkForAnnotation(self, splitsentence, token, spacyclass):


								        gs_sentence_RC_Proof = self.Sentence2GrammarSchema(' '.join(splitsentence), spacyclass)


								        AnnoORnot = 0

								        for word in gs_sentence_RC_Proof:

								            if word == token:

								                AnnoORnot = 1


								        return AnnoORnot


								    def checkForAnnotationInTokenizedSentence(self, splitsentence, token):


								        gs_sentence_RC_Proof = splitsentence


								        AnnoORnot = 0

								        for word in gs_sentence_RC_Proof:

								            if word == token:

								                AnnoORnot = 1


								        return AnnoORnot


								    def checkForAnnotationTuple(self, splitsentence, token, spacyclass, tupleinwords):

								        #self.spacyclass = spacyclass

								        gs_sentence_RC_Proof = self.Sentence2GrammarSchema(' '.join(splitsentence), spacyclass)

								        AnnotationtupleInwords = []

								        AnnoORnot = 0

								        #print(gs_sentence_RC_Proof)

								        for n in range(len(gs_sentence_RC_Proof) - 1):

								            if gs_sentence_RC_Proof[n] == token[0] and (splitsentence[n] == tupleinwords[0] or tupleinwords == 'None'):

								                #print('oioioiAYE')

								                #print(gs_sentence_RC_Proof)

								                AnnoORnot = 1

								                if gs_sentence_RC_Proof[n + 1] == token[1] and (splitsentence[n+1] == tupleinwords[1] or tupleinwords == 'None'):

								                    AnnoORnot = 2

								                    AnnotationtupleInwords.append(splitsentence[n:n+2])

								                    #print(token)


								        return AnnoORnot, AnnotationtupleInwords


								    def checkForAnnotationTriple(self, splitsentence, token, spacyclass, tripleinwords):

								        #self.spacyclass = spacyclass

								        gs_sentence_RC_Proof = self.Sentence2GrammarSchema(' '.join(splitsentence), spacyclass)


								        #print('gssentencercprooof', gs_sentence_RC_Proof)


								        AnnoORnot = 0

								        AnnotationtripleInwords = []

								        for n in range(len(gs_sentence_RC_Proof) - 2):

								            if gs_sentence_RC_Proof[n] == token[0] and (splitsentence[n] == tripleinwords[0] or tripleinwords == 'None'):

								                AnnoORnot = 1

								                if gs_sentence_RC_Proof[n + 1] == token[1] and (splitsentence[n+1] == tripleinwords[1] or tripleinwords == 'None'):

								                    AnnoORnot = 2

								                    if gs_sentence_RC_Proof[n + 2] == token[2] and (splitsentence[n+2] == tripleinwords[2] or tripleinwords == 'None'):

								                        AnnoORnot = 3

								                        AnnotationtripleInwords.append(splitsentence[n:n+3])


								        return AnnoORnot, AnnotationtripleInwords


								    def checkForAnnotationQuadruple(self, splitsentence, token, spacyclass, quadrupleinwords):

								        #self.spacyclass = spacyclass

								        gs_sentence_RC_Proof = self.Sentence2GrammarSchema(' '.join(splitsentence), spacyclass)


								        #print('gssentencercprooof', gs_sentence_RC_Proof)

								        #print('quadrupleinwords',quadrupleinwords)

								        #print('token', token)

								        AnnoORnot = 0

								        AnnotationquadrupleInwords = []

								        for n in range(len(gs_sentence_RC_Proof) - 3):

								            if gs_sentence_RC_Proof[n] == token[0] and (splitsentence[n] == quadrupleinwords[0] or quadrupleinwords == 'None'):

								                AnnoORnot = 1

								                if gs_sentence_RC_Proof[n + 1] == token[1] and (splitsentence[n+1] == quadrupleinwords[1] or quadrupleinwords == 'None'):

								                    AnnoORnot = 2

								                    if gs_sentence_RC_Proof[n + 2] == token[2] and (splitsentence[n+2] == quadrupleinwords[2] or quadrupleinwords == 'None'):

								                        AnnoORnot = 3

								                        if gs_sentence_RC_Proof[n + 3] == token[3] and (splitsentence[n+3] == quadrupleinwords[3] or quadrupleinwords == 'None'):

								                            AnnoORnot = 4

								                            AnnotationquadrupleInwords.append(splitsentence[n:n+4])


								        #print('AnnotationquadrupleInwords', AnnotationquadrupleInwords)


								        return AnnoORnot, AnnotationquadrupleInwords


								    #input ['this', 'is', 'a', 'sentence']


								    def GetTuplesinSentence(self,mainsentence):


								        tuplesToCheck = []

								        tuples = [['ART', 'NE'], ['ART', 'NN'], ['APPR','NN'], ['APPR','ADJD'], ['APPR','NE'], ['ART', 'CARD'], ['APPR', 'CARD'], ['PPOSAT', 'NN'], ['PPOSAT', 'NE'], ['ADV', 'ADJD'],['ADV', 'ADV'], ['ADV', 'PTKVZ'], ['PTKNEG', 'ADV'], ['ADJA','NN'], ['ADJA','NE'], ['ADV','PIS'], ['ADJA','PIS'], ['ADJD','PIS'], ['APPRART', 'NN'], ['APPRART', 'NE'], ['PDAT', 'NE'], ['PDAT', 'NN'], ['PWAT', 'NE'], ['PWAT', 'NN'], ['PIAT', 'NE'], ['PIAT', 'NN'], ['PROAV', 'ADJD'],['PDS', 'NE'], ['PDS', 'NN'], ['NE', 'NE'], ['CARD', 'NE'], ['CARD', 'NN'] ]

								        #print('beginning of gettuplesinsentence')

								        #print('inkb',resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)

								        for tupl in tuples:

								            #print('checking another tuple')

								            #print('inkb',resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)


								            checktupleindex, tupleInWords = self.checkForAnnotationTuple(mainsentence, tupl , 'word.tag_', 'None')

								            if len(tupleInWords) > 0:

								                for tup in tupleInWords:

								                    tuplesToCheck.append([tupl, tup])

								                    #print('oi a tuple was found')

								        #print('after the loop')

								        #print('inkb',resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)


								        #print('no going to the triples')

								        triplesToCheck = []

								        triples = [['APPR', 'ART', 'NN'],['APPR', 'PDAT', 'NN'], ['APPR', 'PDS', 'NN'], ['ART','ADJA','NN'], ['ART','ADJA','NE'], ['APPR', 'ART', 'NE'], ['KOKOM', 'ART', 'NN'], ['KOKOM', 'ART', 'NE'], ['APPR', 'PIAT', 'NN'], ['APPR', 'ADJA', 'NN'], ['APPR', 'ADJA', 'NE'], ['APPRART', 'NN', 'CARD'], ['APPRART', 'NE', 'CARD'], ['APPRART', 'NN', 'NE'], ['CARD', 'KON', 'CARD'], ['APPR', 'ADV', 'CARD'], ['ADJD', 'KOKOM', 'CARD'], ['APPR', 'NE', 'NE'], ['NN', 'KON', 'NN'], ['NE', 'NN', 'NE'], ['APPR', 'NE', 'NN'], ['APPR', 'CARD', 'NN'], ['APPR', 'CARD', 'NE']]

								        for tripl in triples:

								            #print('checking next triple')

								            checktripleindex, tripleInWords = self.checkForAnnotationTriple(mainsentence, tripl, 'word.tag_', 'None')

								            if len(tripleInWords) > 0:

								                for trip in tripleInWords:

								                    triplesToCheck.append([tripl, trip])

								                    #print('oi a triple was found')


								        quadruplesToCheck = []

								        quadruples = [['KOKOM', 'ADV', 'ADJA', 'NN'], ['KOKOM', 'ADV', 'ADJA', 'NE'], ['APPR', 'ADV', 'ADJA', 'NE'], ['APPR', 'ADV', 'ADJA', 'NN'], ['ART', 'NN', 'APPR', 'NE'], ['APPR', 'NE', 'NN', 'NE'], ['APPR', 'ART', 'ADJA', 'NN'], ['ART', 'ADJD', 'ADJA', 'NN']]


								        for quadrupl in quadruples:

								            #print('checking next triple')

								            checkquadrupleindex, quadrupleInWords = self.checkForAnnotationQuadruple(mainsentence, quadrupl, 'word.tag_', 'None')

								            if len(quadrupleInWords) > 0:

								                for quad in quadrupleInWords:

								                    quadruplesToCheck.append([quadrupl, quad])


								        #print('gettuples insentences is done')

								        return tuplesToCheck, triplesToCheck, quadruplesToCheck


								    def createTupleofGrammarpieces(self, sentence, tuplesToCheck, triplesToCheck, quadruplesToCheck):

								        #print('going in crate tuple of grammar pieces')

								        tuplestoremove = []

								        for tupl in tuplesToCheck:

								            for tripl in triplesToCheck:

								                if (tupl[1][0] == tripl[1][0] and tupl[1][1] == tripl[1][1]) or (tupl[1][0] == tripl[1][1] and tupl[1][1] == tripl[1][2]):

								                    tuplestoremove.append(tupl)

								        for tupletoremove in tuplestoremove:

								            tuplesToCheck.remove(tupletoremove)


								        #print('in between0', sentence, quadruplesToCheck, tuplesToCheck, triplesToCheck)


								        tuplestoremove = []


								        for tupl in tuplesToCheck:

								            for quad in quadruplesToCheck:

								                #print('I got here')

								                #print(tupl, quad)

								                #print(tupl[1][0], tupl[1][1], quad[1][2], quad[1][3])

								                if (tupl[1][0] == quad[1][0] and tupl[1][1] == quad[1][1]) or (tupl[1][0] == quad[1][1] and tupl[1][1] == quad[1][2]) or (tupl[1][0] == quad[1][2] and tupl[1][1] == quad[1][3]):

								                    #print('and I got here', tupl)

								                    tuplestoremove.append(tupl)

								        for tupletoremove in tuplestoremove:

								            tuplesToCheck.remove(tupletoremove)


								        #print('and until here?')


								        triplestoremove = []

								        for tripl in triplesToCheck:

								            for quad in quadruplesToCheck:

								                if (tripl[1][0] == quad[1][0] and tripl[1][1] == quad[1][1] and tripl[1][2] == quad[1][2]) or (tripl[1][0] == quad[1][1] and tripl[1][1] == quad[1][2] and tripl[1][2] == quad[1][3]):

								                    triplestoremove.append(tripl)

								        for tripltoremove in triplestoremove:

								            triplesToCheck.remove(tripltoremove)


								        bracketinfo = []

								        bracketinfos = []

								        bracketindex = 0


								        #print('in between1', sentence, quadruplesToCheck)


								        for n in range(len(sentence)):


								            if sentence[n] != '':

								                if sentence[n] == '(' or sentence[n][0] == '(':

								                    for m in range(n ,len(sentence)):

								                        bracketinfo.append(sentence[m])

								                        if sentence[m] == ')' or sentence[m][-1] == ')':


								                            wordbeforebracketinfo = None

								                            try:

								                                wordbeforebracketinfo = sentence[n-1]

								                            except:

								                                pass

								                            bracketinfos.append([bracketinfo, wordbeforebracketinfo])

								                            bracketinfo = []

								                            break


								        #print('in between2', sentence, quadruplesToCheck)

								        #print('bracketinfo',bracketinfos)

								        #print('sentence',sentence)

								        for bracketinfo in bracketinfos:

								            for word in bracketinfo[0]:

								                sentence.remove(word)


								        #print('in between3', sentence, quadruplesToCheck)


								        if len(quadruplesToCheck) != 0:

								            for n in range(len(quadruplesToCheck)):

								                for m in range(len(sentence) - 3):

								                    if sentence[m] == quadruplesToCheck[n][1][0]:

								                        if sentence[m + 1] == quadruplesToCheck[n][1][1]:

								                            if sentence[m + 2] == quadruplesToCheck[n][1][2]:

								                                if sentence[m + 3] == quadruplesToCheck[n][1][3]:

								                                    del sentence[m + 3]

								                                    del sentence[m + 2]

								                                    del sentence[m + 1]

								                                    del sentence[m]

								                                    sentence.insert(m,' '.join(quadruplesToCheck[n][1]))


								        if len(triplesToCheck) != 0:

								            for n in range(len(triplesToCheck)):

								                for m in range(len(sentence) - 2):

								                    if sentence[m] == triplesToCheck[n][1][0]:

								                        if sentence[m + 1] == triplesToCheck[n][1][1]:

								                            if sentence[m + 2] == triplesToCheck[n][1][2]:

								                                del sentence[m + 2]

								                                del sentence[m + 1]

								                                del sentence[m]

								                                sentence.insert(m,' '.join(triplesToCheck[n][1]))


								        if len(tuplesToCheck) != 0:

								            for n in range(len(tuplesToCheck)):

								                for m in range(len(sentence) - 1):

								                    if sentence[m] == tuplesToCheck[n][1][0]:

								                        if sentence[m + 1] == tuplesToCheck[n][1][1]:

								                            del sentence[m + 1]

								                            del sentence[m]

								                            sentence.insert(m,' '.join(tuplesToCheck[n][1]))


								        for bracketinfo in bracketinfos:

								            bracketinfowasthere = 0

								            for n in range(len(sentence)):

								                sentencensplit = sentence[n].split()

								                if bracketinfo[1] == sentencensplit[-1]:

								                    sentence[n] = sentence[n] + ' '.join(bracketinfo[0])

								                    bracketinfowasthere = 1

								                    break

								            if bracketinfowasthere == 0:

								                sentence.append(' '.join(bracketinfo[0]))

								        #print('sentence in gs create tuple of grammar pieces', sentence)

								        #print('thesentencein create tuple of grammarpieces ',sentence)

								        return sentence


								    # the following method is too computationally expensive

								    def filterpermutationsaccordingtotuples(self, sentences, tuplesToCheck, triplesToCheck):


								        filteredprobsentences = []

								        for sentence in sentences:


								            tuplchecked = 0

								            triplchecked = 0

								            #print('sentence and tuples to check', sentence, tuplesToCheck)

								            for tupl in tuplesToCheck:

								                #print(list(sentence))

								                checkedsecondtime, tupleinWords = self.checkForAnnotationTuple(sentence, tupl[0], 'word.tag_', tupl[1])


								                #print(checkedsecondtime)

								                if checkedsecondtime == 1:


								                    tuplchecked = 0


								                if checkedsecondtime == 2:


								                    tuplchecked = 1


								            for tripl in triplesToCheck:

								                #print(sentence)

								                checkedsecondtime, tripleinWords = self.checkForAnnotationTriple(sentence, tripl[0], 'word.tag_', tripl[1])

								                if checkedsecondtime == 1 or checkedsecondtime == 2:


								                    triplchecked = 0


								                if checkedsecondtime == 3:


								                    triplchecked = 1


								            if tuplchecked == 1 or triplchecked == 1:

								                filteredprobsentences.append(sentence)


								        return filteredprobsentences