import hickle as hkl
import FASTsearch


class Medio(object):
    """Replace words in tokenised sentences using a two-column lookup database.

    The database is kept in two parallel lists, ``hkldbMedio1`` (the terms
    that are searched for) and ``hkldbMedio2`` (the entries used as
    replacement), both persisted as hickle files in the current working
    directory.  Lookups go through ``FASTsearch`` indexes built over these
    two files.
    """

    # Characters removed from the end of a token before it is looked up.
    _TRAILING_PUNCTUATION = (',', '.', '!', '?', ':', '_')

    def __init__(self, hklDatabaseDir_Medio, hklDatabaseDir_Medio_All):
        """Optionally load the complete database from a hickle file.

        Args:
            hklDatabaseDir_Medio: Not used by this constructor; kept so
                existing callers keep working.
            hklDatabaseDir_Medio_All: Path of a hickle file to load into
                ``self.MedioDB_All``.  When ``None`` nothing is loaded and
                the attribute is left unset.
        """
        if hklDatabaseDir_Medio_All is not None:
            self.MedioDB_All = hkl.load(hklDatabaseDir_Medio_All)

    def create_hklDB_from_csv(self, csvDbDir, StemOrNot):
        """Build the three hickle database files from a text file.

        Every non-blank line of ``csvDbDir`` is evaluated as a Python
        expression and stored as a list in ``self.MedioDB_All``.  From each
        row, ``row[0][0]`` goes into ``self.hkldbMedio1`` and ``row[1][0]``
        into ``self.hkldbMedio2``.  The three lists are then dumped to
        ``hkldbMedio_All.hkl``, ``hkldbMedio1.hkl`` and ``hkldbMedio2.hkl``
        in the current working directory.

        Args:
            csvDbDir: Path of the input file, one Python expression per line.
            StemOrNot: Not used by this method; kept for interface
                compatibility.

        Returns:
            The string ``'done'``.
        """
        print(csvDbDir)
        with open(csvDbDir) as lines:
            self.MedioDB_All = []
            for line in lines:
                # A blank line is not a valid expression and would make
                # eval() raise SyntaxError, so skip it.
                if not line.strip():
                    continue
                # SECURITY NOTE(review): eval() executes arbitrary code found
                # in the file.  Only feed trusted files to this method; if
                # the rows are plain literals, ast.literal_eval would be the
                # safe replacement -- confirm against the data before
                # switching.
                self.MedioDB_All.append(list(eval(line)))

        self.hkldbMedio1 = []
        self.hkldbMedio2 = []
        for counter, row in enumerate(self.MedioDB_All, start=1):
            # Progress output every 1000 rows.
            if counter % 1000 == 0:
                print(counter)
            self.hkldbMedio1.append([row[0][0]])
            self.hkldbMedio2.append([row[1][0]])

        print('creating the hkl dump of MedioDBAll')
        hkl.dump(self.MedioDB_All, 'hkldbMedio_All.hkl', mode='w', compression='lzf')
        print('done..')

        print('Creating the hkl dump of MedioDB 1')
        hkl.dump(self.hkldbMedio1, 'hkldbMedio1.hkl', mode='w', compression='lzf')

        print('Creating the hkl dump of MedioDB 2')
        hkl.dump(self.hkldbMedio2, 'hkldbMedio2.hkl', mode='w', compression='lzf')

        return 'done'

    def load_DB_into_FASTsearch(self):
        """Load both database columns and prepare their FASTsearch indexes.

        Reads ``hkldbMedio1.hkl`` and ``hkldbMedio2.hkl`` from the current
        working directory into ``self.hkldbMedio1`` / ``self.hkldbMedio2``,
        creates one ``FASTsearch`` object per file (``self.fsearch1`` /
        ``self.fsearch2``), generates a bag-of-words model for each and then
        loads the model files.
        """
        self.hkldbMedio1 = hkl.load('hkldbMedio1.hkl')
        self.hkldbMedio2 = hkl.load('hkldbMedio2.hkl')

        self.fsearch1 = FASTsearch.FASTsearch('hkldbMedio1.hkl')
        self.fsearch2 = FASTsearch.FASTsearch('hkldbMedio2.hkl')

        self.fsearch1.Gen_BoW_Model(50000, "word", punctuation=False)
        self.fsearch2.Gen_BoW_Model(50000, "word", punctuation=False)

        # NOTE(review): presumably these are the files Gen_BoW_Model just
        # wrote -- confirm against the FASTsearch implementation.
        self.fsearch1.Load_BoW_Model('bagofwordshkldbMedio1.pkl', 'DataBaseOneZeroshkldbMedio1.hkl')
        self.fsearch2.Load_BoW_Model('bagofwordshkldbMedio2.pkl', 'DataBaseOneZeroshkldbMedio2.hkl')

    def Medioreplace(self, sentences, punctuations):
        """Replace every token that has an exact database match.

        For each token (with one trailing punctuation character removed, if
        present) the best hit in ``self.fsearch1`` is fetched.  When the
        first word of the matching ``hkldbMedio1`` entry equals the token,
        the token is replaced by the first word of the corresponding
        ``hkldbMedio2`` entry.  A trailing punctuation character of the
        original token is kept on the replacement.

        ``load_DB_into_FASTsearch`` must have been called before.

        Args:
            sentences: Iterable of sentences, each a mutable list of string
                tokens.  The lists are modified in place.
            punctuations: Passed through unchanged.

        Returns:
            A tuple ``(outsentences, punctuations)`` where ``outsentences``
            is a list holding the same (now modified) sentence lists.
        """
        trailing = self._TRAILING_PUNCTUATION
        outsentences = []

        for sentence in sentences:
            # Tokens as they are looked up: without trailing punctuation.
            # Empty tokens are skipped; they cannot match a database word.
            candidates = []
            for word in sentence:
                if word and word[-1] in trailing:
                    word = word[:-1]
                if word:
                    candidates.append(word)

            # Collect [token, replacement words] for every exact match.
            medioeintraege = []
            for word in candidates:
                _, matchindex = self.fsearch1.search_with_highest_multiplikation_Output(word, 1)
                medio = self.hkldbMedio1[matchindex[0]][0].split()
                medioeintrag = self.hkldbMedio2[matchindex[0]][0].split()
                # Empty or whitespace-only entries yield empty lists; skip
                # them so that indexing [0] cannot fail.
                if medio and medioeintrag and medio[0] == word:
                    medioeintraege.append([word, medioeintrag])

            for word, medioeintrag in medioeintraege:
                replacement = medioeintrag[0]
                for n, token in enumerate(sentence):
                    if token == word:
                        sentence[n] = replacement
                    elif token and token[-1] in trailing and token[:-1] == word:
                        # The token was matched without its punctuation
                        # mark; put the mark back after the replacement.
                        sentence[n] = replacement + token[-1]

            outsentences.append(sentence)

        return outsentences, punctuations