# Class to resolve shortforms (abbreviations); the data comes from Abkuerzungen.txt
|
|
|
|
import hickle as hkl
|
|
|
|
import FASTsearch
|
|
|
|
class SolveShorts(object):
    """Expand German abbreviations ("Shortforms") found in tokenized sentences.

    Relies on two hickle databases (built by create_hklDB_from_csv) and a
    FASTsearch index plus a spaCy German model (loaded lazily in
    load_DB_into_FASTsearch) to find abbreviations and insert their
    expansions in parentheses after them.
    """

    def __init__(self, hklDatabaseDir_Shorts, hklDatabaseDir_Shorts_All):
        """Load the two pre-built hickle databases from the given paths.

        hklDatabaseDir_Shorts     -- path to the abbreviation-key database.
        hklDatabaseDir_Shorts_All -- path to the full database; entry [i][1]
                                     holds the expansion word list used in
                                     ExplainShortsInSentencesWithBrackets.
        """
        self.ShortsDB_All = hkl.load(hklDatabaseDir_Shorts_All)

        self.ShortsDB = hkl.load(hklDatabaseDir_Shorts)

    # Input: csv file with the form ['d.h.', n] , ['das', 'heißt'] for each line
    # Output: hkl dump of array in form [[1],[d.h.],['das', 'heißt']]

    def create_hklDB_from_csv(self, csvDbDir):
        """Parse the abbreviation CSV at csvDbDir and write two hkl dumps.

        Writes 'hkldbShorts_All.hkl' (every parsed line) and 'hkldbShorts.hkl'
        (only the abbreviation key of each line) into the current working
        directory. Returns the string 'done'.
        """
        with open(csvDbDir) as lines:

            ShortsDB_All = []

            for line in lines:

                # NOTE(review): eval() executes arbitrary code from the file;
                # safe only if Abkuerzungen.txt is fully trusted. Consider
                # ast.literal_eval for plain tuple/list lines.
                ShortsDB_All.append(list(eval(line)))

        #print(ShortsDB_All)

        #print(ShortsDB_All[0][0])

        hkldbShorts = []

        counter = 0

        for n in range(len(ShortsDB_All)):

            counter += 1

            #if counter % 1000 == 0:

            #print(counter)

            # Keep only the abbreviation key (first element of the first
            # tuple/list of each parsed line) for the search index.
            hkldbShorts.append([ShortsDB_All[n][0][0]])

        #print('hkldbShorts', hkldbShorts)

        #print('creating the hkl dump of ShortsDBAll')

        hkl.dump(ShortsDB_All, 'hkldbShorts_All.hkl', mode='w', compression='gzip')

        #print('done..')

        #print('Creating the hkl dump of ShortsDB')

        hkl.dump(hkldbShorts, 'hkldbShorts.hkl', mode='w', compression='gzip')

        #print('done..')

        return 'done'

    def load_DB_into_FASTsearch(self):
        """Initialise the FASTsearch index and the German spaCy pipeline.

        Loads 'hkldbShorts.hkl' into a FASTsearch instance, restores its
        pre-trained bag-of-words model from disk, and loads the
        'de_core_news_sm' spaCy model. Must be called before
        ExplainShortsInSentencesWithBrackets.
        """
        #print('loading hkldbShorts ..')

        self.fsearch1 = FASTsearch.FASTsearch('hkldbShorts.hkl')

        #print('done')

        #print('generating BoW Model..')

        #self.fsearch1.Gen_BoW_Model(3000, "word", punctuation = True)

        #print('done')

        #print('loading the bow model')

        # NOTE(review): these two files are expected in the working
        # directory; presumably produced by an earlier Gen_BoW_Model run.
        self.fsearch1.Load_BoW_Model('bagofwordshkldbShorts.pkl', 'DataBaseOneZeroshkldbShorts.hkl')

        #print('done')

        # Imported lazily so the class can be constructed without spaCy.
        import spacy

        #print('loading the german spacy model..')

        self.nlp = spacy.load('de_core_news_sm')

        #print('done')

        #print('oi thats the get_feature_names', self.fsearch1.vectorizer.get_feature_names())

    def ExplainShortsInSentencesWithBrackets(self, sentences):
        """Insert bracketed expansions after abbreviations in each sentence.

        sentences -- iterable of sentences, each a list of token strings.
                     NOTE: each sentence list is mutated in place.

        Returns a list of the (mutated) sentences, where every recognised
        abbreviation is followed by '(' + its expansion words + ')'.
        """
        outsentences = []

        count = 0

        for sentence in sentences:

            count += 1

            #print('processing sentence', count)

            # nshort / therewasapossibleshort are initialised but never used
            # below — left in place (NOTE(review): dead state).
            nshort = []

            therewasapossibleshort = 0

            # Each entry: [expansion_words, insert_index, interestingindex, start_index]
            explanationlist = []

            # NOTE(review): the spaCy doc is built from the sentence BEFORE
            # the multi-dot tokens are split below, so doc[n - 1] later may
            # not align with sentence[n] after any split — confirm intended.
            doc = self.nlp(' '.join(sentence))

            #print('da sentence', sentence)

            newshorts = []

            wordcount = 0

            # Pass 1: split tokens that contain several '.' (e.g. 'd.h.')
            # into one sub-token per dot, so each piece can be matched.
            for oriword in sentence:

                wordcount += 1

                if wordcount == len(sentence):

                    # The last token gets a trailing '.' appended for the
                    # dot-count check (sentence-final period assumption).
                    word = oriword + '.'

                else:

                    word = oriword

                newshort = []

                prenewshort = []

                punctcount = list(word).count('.')

                #print(word, list(word), punctcount)

                if punctcount > 1:

                    # NOTE(review): index() finds the FIRST occurrence, so a
                    # duplicated token would be replaced at the wrong
                    # position — confirm inputs cannot contain duplicates.
                    replaceindex = sentence.index(oriword)

                    dacount = 0

                    for letter in list(word):

                        #print('letter in word split', letter)

                        prenewshort.append(letter)

                        if letter == '.':

                            dacount += 1

                            # Close the current 'xx.' piece at every dot.
                            newshort.append(''.join(prenewshort))

                            prenewshort = []

                            if dacount == punctcount:

                                newshorts.append([newshort, replaceindex])

            #print(newshorts)

            # Apply the splits right-to-left so earlier replace indices
            # remain valid while the sentence grows.
            for newshort in newshorts[::-1]:

                if len(newshort) > 0:

                    del sentence[newshort[1]]

                    for part in newshort[0][::-1]:

                        sentence.insert(newshort[1], part)

            #print('sentence after newshortreplace', sentence)

            # Pass 2: scan every token and query the FASTsearch index.
            for n in range(len(sentence)):

                NhasToBeChecked = True

                # Skip tokens already covered by a found abbreviation span.
                for r in range(len(explanationlist)):

                    if explanationlist[r][3] <= n < explanationlist[r][1]:

                        NhasToBeChecked = False

                # Stop-list of words that were falsely recognised as
                # abbreviations (e.g. 'er' should not be recognised).
                if sentence[n] in ['Er', 'er', 'ab', 'Ab', 'so', 'da', 'an', 'mit', 'Am', 'am']:

                    NhasToBeChecked = False

                # NOTE(review): dep_[:2] is a 2-char slice and can never
                # equal the 3-char 'ART', so this comparison is always true
                # and the condition reduces to
                # n != 0 and sentence[n][-1] != '.'. Possibly tag_ and/or
                # [:3] were intended — confirm against spaCy's German tags.
                if n != 0 and sentence[n][-1] != '.' and doc[n - 1].dep_[:2] != 'ART':

                    NhasToBeChecked = False

                if NhasToBeChecked == True:

                    # Single-token lookup against the abbreviation index.
                    bestmatches1, matchindex = self.fsearch1.search_with_highest_multiplikation_Output(sentence[n], 1)

                    #print(bestmatches1, matchindex)

                    # Number of extra tokens (beyond sentence[n]) belonging
                    # to a multi-token abbreviation run ending in '.'.
                    interestingindex = 0

                    if sentence[n][-1] == '.':

                        #print(sentence[n])

                        #print('oioioioioi')

                        # Look ahead (max 5 tokens) for where the run of
                        # '.'-terminated tokens ends.
                        if len(sentence) - n > 5:

                            for m in range(5):

                                #print(n, m, n+m+1, len(sentence))

                                if sentence[n + m][-1] == '.' and sentence[n + m + 1][-1] != '.':

                                    interestingindex = m

                                    break

                        # Near the sentence end: bounded look-ahead with a
                        # special case for the final token.
                        if len(sentence) - n <= 5 and n != len(sentence) - 1:

                            for m in range((len(sentence) - n)):

                                #print('oleolaolu',n, m, n+m+1, len(sentence))

                                if m == (len(sentence) - n) - 1:

                                    if sentence[n + m][-1] == '.':

                                        interestingindex = m

                                        break

                                else:

                                    if sentence[n + m][-1] == '.' and sentence[n + m + 1][-1] != '.' :

                                        interestingindex = m

                                        break

                    #print(interestingindex, 'interestingindex')

                    if interestingindex == 0:

                        finalmatchindex = matchindex

                    if interestingindex >= 1:

                        # Re-query with the joined multi-token abbreviation.
                        thesentence = ''

                        for i in range(interestingindex + 1):

                            #print('sentence', sentence[n+i])

                            #print(thesentence + sentence[n+i])

                            if i == 0:

                                presentence = sentence[n + i]

                            if i >= 1:

                                presentence = ' ' + sentence[n + i]

                            thesentence = thesentence + presentence

                        #print('thesentence',thesentence)

                        mbestmatches, mmatchindex = self.fsearch1.search_with_highest_multiplikation_Output(thesentence , 1)

                        #print(mmatchindex)

                        finalmatchindex = mmatchindex

                    # finalmatchindex[1] == 1 — presumably an exact-match
                    # score from FASTsearch; only exact hits are expanded.
                    if finalmatchindex[1] == 1:

                        wordexplanationIndex = finalmatchindex[0]

                        wordexplanation = self.ShortsDB_All[wordexplanationIndex][1]

                        # Prepend so later insertion (below) walks spans in
                        # right-to-left sentence order, keeping indices valid.
                        explanationlist.insert(0, [wordexplanation, n + interestingindex + 1, interestingindex, n])

            #print('explanationlist', explanationlist)

            # De-duplicate spans that start at the same token: keep the
            # longer match.
            # NOTE(review): deleting from explanationlist while both range()
            # loops still use the original length can raise IndexError or
            # skip entries — confirm whether duplicates can actually occur.
            for i in range(len(explanationlist)):

                for k in range(len(explanationlist)):

                    if explanationlist[i][3] == explanationlist[k][3] and i != k:

                        if explanationlist[i][2] > explanationlist[k][2]:

                            del explanationlist[k]

                        if explanationlist[i][2] < explanationlist[k][2]:

                            del explanationlist[i]

            # Insert the bracketed expansions; entries are ordered
            # right-to-left, so earlier insert positions stay correct.
            for j in range(len(explanationlist)):

                sentence.insert(explanationlist[j][1], '(' + ' '.join(explanationlist[j][0]) + ')')

            #print(sentence)

            outsentences.append(sentence)

        # if uebereinstimmung (= a match), go to index and exchange
        return outsentences
|
|
|
|
|
|
|
|
|