# Class to solve Shortforms, data comes from Abkuerzungen.txt

import ast

import hickle as hkl

import FASTsearch


class SolveShorts(object):
    """Expand abbreviations ("Shortforms") inside tokenized sentences.

    The abbreviation data lives in two hickle files: one holding every
    database row (abbreviation entry plus explanation words) and one holding
    only the abbreviation strings, which is what the FASTsearch index is
    built on.
    """

    # Words that were falsely recognized as abbreviations in practice
    # (e.g. 'er' must not be expanded), so they are never looked up.
    _NEVER_ABBREVIATIONS = frozenset(
        ['Er', 'er', 'ab', 'Ab', 'so', 'da', 'an', 'mit']
    )

    # How many consecutive dot-terminated tokens are considered when looking
    # for the end of a multi-token abbreviation such as 'z.' 'B.'.
    _MAX_RUN_LOOKAHEAD = 5

    def __init__(self, hklDatabaseDir_Shorts, hklDatabaseDir_Shorts_All):
        """Load both abbreviation databases from their hickle files.

        Args:
            hklDatabaseDir_Shorts: Path of the hickle file loaded into
                ``self.ShortsDB``.
            hklDatabaseDir_Shorts_All: Path of the hickle file loaded into
                ``self.ShortsDB_All``; row ``i`` is read as
                ``self.ShortsDB_All[i][1]`` to obtain the explanation words.
        """
        self.ShortsDB_All = hkl.load(hklDatabaseDir_Shorts_All)
        self.ShortsDB = hkl.load(hklDatabaseDir_Shorts)

    def create_hklDB_from_csv(self, csvDbDir):
        """Build the two hickle databases from a text database file.

        Each non-blank input line must be a Python literal of the form
        ``['d.h.', n] , ['das', 'heißt']``. Two files are written into the
        current working directory:

        * ``hkldbShorts_All.hkl`` - every parsed line converted to a list.
        * ``hkldbShorts.hkl`` - one ``[abbreviation]`` list per line, taken
          from the first element of the line's first entry.

        Args:
            csvDbDir: Path of the input file.

        Returns:
            The string ``'done'``.

        Raises:
            ValueError, SyntaxError: If a line is not a valid Python literal.
        """
        with open(csvDbDir) as lines:
            # SECURITY NOTE: this used to call eval() on every line, which
            # executes arbitrary code found in the file. The documented line
            # format consists of plain literals, so ast.literal_eval parses
            # it without that risk. Blank lines are skipped instead of
            # aborting the whole import.
            ShortsDB_All = [
                list(ast.literal_eval(line.strip()))
                for line in lines
                if line.strip()
            ]

        hkldbShorts = [[entry[0][0]] for entry in ShortsDB_All]

        hkl.dump(ShortsDB_All, 'hkldbShorts_All.hkl', mode='w', compression='gzip')
        hkl.dump(hkldbShorts, 'hkldbShorts.hkl', mode='w', compression='gzip')

        return 'done'

    def load_DB_into_FASTsearch(self):
        """Create the search index and load the German spaCy model.

        Sets ``self.fsearch1`` (a FASTsearch instance over
        ``hkldbShorts.hkl`` with its stored bag-of-words model loaded) and
        ``self.nlp`` (spaCy ``de_core_news_sm``).

        The bag-of-words files are expected to exist already; they were
        generated with ``Gen_BoW_Model(3000, "word", punctuation=True)``.
        """
        self.fsearch1 = FASTsearch.FASTsearch('hkldbShorts.hkl')
        self.fsearch1.Load_BoW_Model(
            'bagofwordshkldbShorts.pkl', 'DataBaseOneZeroshkldbShorts.hkl'
        )

        # Imported lazily so the class can be used for database creation
        # without spaCy being installed.
        import spacy
        self.nlp = spacy.load('de_core_news_sm')

    @staticmethod
    def _split_multi_dot_tokens(tokens):
        """Return a new token list with multi-dot tokens split after each dot.

        A token containing more than one '.' (e.g. ``'z.B.'``) becomes one
        token per dot-terminated part (``'z.'``, ``'B.'``). As before, a '.'
        is appended to the final token of the sentence before counting, so a
        final ``'Haus.'`` turns into ``'Haus.'`` and ``'.'``.
        """
        result = []
        last_position = len(tokens) - 1
        for position, original in enumerate(tokens):
            candidate = original + '.' if position == last_position else original
            if candidate.count('.') <= 1:
                result.append(original)
                continue
            pieces = candidate.split('.')
            # Every element except the last one was followed by a dot.
            result.extend(piece + '.' for piece in pieces[:-1])
            # Text after the final dot used to be dropped silently
            # ('3.5.2020' lost '2020'); keep it as its own token.
            if pieces[-1]:
                result.append(pieces[-1])
        return result

    @classmethod
    def _abbreviation_run_offset(cls, tokens, start):
        """Return the offset of the last token of the dot-run starting at start.

        ``tokens[start]`` must end with '.'. The run consists of consecutive
        dot-terminated tokens; at most ``_MAX_RUN_LOOKAHEAD`` tokens are
        inspected, and 0 is returned when no run end is found in that window.
        """
        last_position = len(tokens) - 1
        window = min(cls._MAX_RUN_LOOKAHEAD, len(tokens) - start)
        for offset in range(window):
            position = start + offset
            if not tokens[position].endswith('.'):
                continue
            if position == last_position or not tokens[position + 1].endswith('.'):
                return offset
        return 0

    def _find_explanations(self, tokens):
        """Return ``(explanation_words, insert_position)`` pairs, ascending.

        A token is looked up when it is the first token of the sentence or
        ends with '.', is not in ``_NEVER_ABBREVIATIONS`` and is not already
        part of a previously matched abbreviation.
        """
        # NOTE(review): the original code additionally parsed the sentence
        # with spaCy and tested ``doc[n - 1].dep_[:2] != 'ART'``. A slice of
        # at most two characters can never equal 'ART', so that test was
        # always true and had no effect on the result; its only observable
        # behaviour was a possible IndexError, because the parse was made
        # before tokens were split and its indices did not line up with the
        # token list. It looks like "also check words that follow an
        # article" was intended (probably via ``tag_``) - confirm before
        # re-introducing it.
        found = []
        covered_until = 0
        for position, token in enumerate(tokens):
            if position < covered_until:
                continue
            if token in self._NEVER_ABBREVIATIONS:
                continue
            ends_with_dot = token.endswith('.')
            if position != 0 and not ends_with_dot:
                continue

            offset = (
                self._abbreviation_run_offset(tokens, position)
                if ends_with_dot
                else 0
            )
            end = position + offset + 1
            # Only the phrase that decides the match is searched; the extra
            # single-token search whose result was discarded is gone.
            phrase = ' '.join(tokens[position:end])
            _matches, match_index = (
                self.fsearch1.search_with_highest_multiplikation_Output(phrase, 1)
            )

            # Second element == 1 is treated as "accepted match"; the first
            # element is the row index into ShortsDB_All.
            if match_index[1] == 1:
                found.append((self.ShortsDB_All[match_index[0]][1], end))
                covered_until = end
        return found

    def ExplainShortsInSentencesWithBrackets(self, sentences):
        """Insert '(explanation)' tokens behind recognized abbreviations.

        Requires ``load_DB_into_FASTsearch`` to have been called so that
        ``self.fsearch1`` exists.

        Args:
            sentences: Iterable of sentences, each a list of word strings.
                The lists are modified in place.

        Returns:
            A list containing the same (now modified) sentence lists.
        """
        outsentences = []
        for sentence in sentences:
            # Slice assignment keeps the caller's list object, matching the
            # previous in-place behaviour.
            sentence[:] = self._split_multi_dot_tokens(sentence)

            # Each start position is examined exactly once, so there can be
            # no two explanations for the same start; the former
            # de-duplication pass (which deleted from the list while
            # indexing it) is therefore unnecessary.
            explanations = self._find_explanations(sentence)

            # Insert from the back so earlier insert positions stay valid.
            for words, insert_at in reversed(explanations):
                sentence.insert(insert_at, '(' + ' '.join(words) + ')')

            outsentences.append(sentence)

        return outsentences