alpcentaur
/
basabuuka_prototyp

# Class that resolves abbreviations (short forms); the data comes from Abkuerzungen.txt
import hickle as hkl
import FASTsearch
class SolveShorts(object):
    """Append bracketed explanations to abbreviations in tokenised sentences.

    The abbreviation database is a list whose entries have the form
    ``[['d.h.', n], ['das', 'heißt']]``: entry ``[0][0]`` is the abbreviation
    and entry ``[1]`` is the list of words that explain it.
    """

    # Words that were wrongly recognised as abbreviations and must never be
    # looked up (e.g. "er" should not be expanded).
    _IGNORED_TOKENS = frozenset(
        ['Er', 'er', 'ab', 'Ab', 'so', 'da', 'an', 'mit', 'Am', 'am']
    )

    # Number of consecutive tokens, starting at a dotted token, that are
    # inspected when looking for the end of a multi-part abbreviation.
    _MAX_SHORT_PARTS = 5

    def __init__(self, hklDatabaseDir_Shorts, hklDatabaseDir_Shorts_All):
        """Load both hickle databases.

        Args:
            hklDatabaseDir_Shorts: path of the hickle dump holding only the
                abbreviations.
            hklDatabaseDir_Shorts_All: path of the hickle dump holding the
                abbreviations together with their explanations.
        """
        self.ShortsDB_All = hkl.load(hklDatabaseDir_Shorts_All)
        self.ShortsDB = hkl.load(hklDatabaseDir_Shorts)

    def create_hklDB_from_csv(self, csvDbDir):
        """Build the two hickle dumps from a text database.

        Every non-blank line of the input must be a Python expression of the
        form ``['d.h.', n] , ['das', 'heißt']``.

        Writes ``hkldbShorts_All.hkl`` (all entries) and ``hkldbShorts.hkl``
        (one single-element list per abbreviation) into the current working
        directory.

        Args:
            csvDbDir: path of the input file.

        Returns:
            The string ``'done'``.
        """
        ShortsDB_All = []
        with open(csvDbDir) as lines:
            for line in lines:
                # A blank line (e.g. a trailing newline at the end of the
                # file) is not an expression and would make eval() fail.
                if not line.strip():
                    continue
                # NOTE(review): eval() executes arbitrary code found in the
                # file. Only feed trusted files; consider ast.literal_eval
                # if every line is a plain literal.
                ShortsDB_All.append(list(eval(line)))

        hkldbShorts = [[entry[0][0]] for entry in ShortsDB_All]

        hkl.dump(ShortsDB_All, 'hkldbShorts_All.hkl', mode='w', compression='gzip')
        hkl.dump(hkldbShorts, 'hkldbShorts.hkl', mode='w', compression='gzip')

        return 'done'

    def load_DB_into_FASTsearch(self):
        """Prepare the search index and the spaCy pipeline.

        Sets ``self.fsearch1`` (FASTsearch over ``hkldbShorts.hkl`` with the
        stored bag-of-words model loaded) and ``self.nlp`` (the
        ``de_core_news_sm`` spaCy model). All file names are relative to the
        current working directory.
        """
        self.fsearch1 = FASTsearch.FASTsearch('hkldbShorts.hkl')

        # The bag-of-words model is expected to exist already; it is loaded,
        # not regenerated.
        self.fsearch1.Load_BoW_Model(
            'bagofwordshkldbShorts.pkl', 'DataBaseOneZeroshkldbShorts.hkl'
        )

        # Imported here so that spaCy is only required once the models are
        # actually loaded.
        import spacy
        self.nlp = spacy.load('de_core_news_sm')

    def ExplainShortsInSentencesWithBrackets(self, sentences):
        """Insert ``'(explanation)'`` tokens after recognised abbreviations.

        Requires ``self.nlp`` and ``self.fsearch1`` to be set (see
        ``load_DB_into_FASTsearch``).

        Args:
            sentences: list of sentences, each a list of string tokens. The
                inner lists are modified in place.

        Returns:
            A list containing the same (modified) sentence lists.
        """
        outsentences = []
        for sentence in sentences:
            # The sentence is parsed before tokens are split, so ``doc`` may
            # have fewer tokens than the split sentence.
            doc = self.nlp(' '.join(sentence))

            self._split_multi_dot_tokens(sentence)

            explanationlist = self._find_explanations(sentence, doc)
            explanationlist = self._keep_longest_per_start(explanationlist)

            # Entries are ordered by descending position, so inserting in
            # this order never shifts a position that is still to be used.
            for explanation, insert_at, _span, _start in explanationlist:
                sentence.insert(insert_at, '(' + ' '.join(explanation) + ')')

            outsentences.append(sentence)
        return outsentences

    @staticmethod
    def _split_multi_dot_tokens(sentence):
        """Split tokens containing several dots ('z.B.' -> 'z.', 'B.') in place."""
        last_index = len(sentence) - 1
        pending = []
        for index, oriword in enumerate(sentence):
            # The final token is treated as if it were followed by the
            # sentence-ending full stop.
            word = oriword + '.' if index == last_index else oriword
            if word.count('.') <= 1:
                continue

            parts = []
            current = []
            for letter in word:
                current.append(letter)
                if letter == '.':
                    parts.append(''.join(current))
                    current = []
            if current:
                # Keep the text after the last dot ('www.example.com' must
                # not lose 'com').
                parts.append(''.join(current))

            # Remember the real position of this token; looking the token up
            # by value would hit the first occurrence of a repeated token.
            pending.append((index, parts))

        # Replace from the back so that earlier positions stay valid.
        for index, parts in reversed(pending):
            sentence[index:index + 1] = parts

    def _last_part_offset(self, sentence, n):
        """Return the offset of the last token of the dotted run starting at n.

        Returns 0 when ``sentence[n]`` does not end with a dot, when the run
        consists of that token only, or when no end is found within
        ``_MAX_SHORT_PARTS`` tokens.
        """
        if not sentence[n].endswith('.'):
            return 0
        total = len(sentence)
        for m in range(min(total - n, self._MAX_SHORT_PARTS)):
            following = n + m + 1
            next_is_dotted = following < total and sentence[following].endswith('.')
            if sentence[n + m].endswith('.') and not next_is_dotted:
                return m
        return 0

    def _find_explanations(self, sentence, doc):
        """Collect ``[explanation, insert_position, span, start]`` entries.

        Entries are prepended, so the result is ordered by descending start.
        """
        search = self.fsearch1.search_with_highest_multiplikation_Output
        explanationlist = []
        for n, token in enumerate(sentence):
            # Tokens inside an already explained abbreviation are skipped.
            if any(entry[3] <= n < entry[1] for entry in explanationlist):
                continue
            if token in self._IGNORED_TOKENS:
                continue
            if n != 0 and not token.endswith('.'):
                # NOTE(review): a two-character slice can never equal the
                # three-character 'ART', so this comparison is always true
                # and every non-initial token without a trailing dot is
                # skipped. Possibly an article check on the preceding token
                # was intended - confirm before changing.
                if doc[n - 1].dep_[:2] != 'ART':
                    continue

            interestingindex = self._last_part_offset(sentence, n)
            # A multi-part abbreviation is looked up as one phrase, its parts
            # joined by single spaces; a single token is looked up as is.
            phrase = ' '.join(sentence[n:n + interestingindex + 1])
            _bestmatches, finalmatchindex = search(phrase, 1)

            # The second element being 1 is treated as a hit; the first
            # element is then used as index into the full database.
            if finalmatchindex[1] == 1:
                wordexplanation = self.ShortsDB_All[finalmatchindex[0]][1]
                explanationlist.insert(
                    0,
                    [wordexplanation, n + interestingindex + 1, interestingindex, n],
                )
        return explanationlist

    @staticmethod
    def _keep_longest_per_start(explanationlist):
        """Drop entries that share a start with an entry spanning more tokens."""
        longest = {}
        for entry in explanationlist:
            start, span = entry[3], entry[2]
            if span > longest.get(start, -1):
                longest[start] = span
        return [entry for entry in explanationlist if entry[2] == longest[entry[3]]]