155 lines
No EOL
5 KiB
Python
155 lines
No EOL
5 KiB
Python
|
|
import hickle as hkl
|
|
import FASTsearch
|
|
|
|
|
|
class Medio(object):
    """Word-replacement helper backed by two parallel hickle databases.

    ``hkldbMedio1`` holds the lookup terms and ``hkldbMedio2`` holds the
    replacement entries; row ``i`` of one corresponds to row ``i`` of the
    other. Lookups go through ``FASTsearch`` (a project module whose exact
    semantics are not visible from this file).
    """

    # Trailing characters that are stripped from a token before it is looked
    # up, and preserved when the token is replaced.
    _TRAILING_PUNCTUATION = (',', '.', '!', '?', ':', '_')

    def __init__(self, hklDatabaseDir_Medio, hklDatabaseDir_Medio_All):
        """Optionally load the combined database.

        Args:
            hklDatabaseDir_Medio: Unused; kept for interface compatibility.
            hklDatabaseDir_Medio_All: Path of a hickle file to load into
                ``self.MedioDB_All``, or ``None`` to skip loading.
        """
        # Always define the attribute so later access cannot raise
        # AttributeError when no path was supplied.
        self.MedioDB_All = None
        if hklDatabaseDir_Medio_All is not None:
            self.MedioDB_All = hkl.load(hklDatabaseDir_Medio_All)

    def create_hklDB_from_csv(self, csvDbDir, StemOrNot):
        """Build the three hickle dumps from a line-oriented source file.

        Every non-blank line of ``csvDbDir`` is evaluated as a Python
        expression and stored as a list in ``self.MedioDB_All``. The first
        element of column 0 and of column 1 of each row are then split off
        into ``self.hkldbMedio1`` / ``self.hkldbMedio2``. All three lists are
        dumped into the current working directory as ``hkldbMedio_All.hkl``,
        ``hkldbMedio1.hkl`` and ``hkldbMedio2.hkl``.

        Args:
            csvDbDir: Path of the input file.
            StemOrNot: Unused; kept for interface compatibility.

        Returns:
            The string ``'done'``.
        """
        print(csvDbDir)

        self.MedioDB_All = []
        with open(csvDbDir) as lines:
            for line in lines:
                # A blank line is not a valid expression and would make
                # eval() raise SyntaxError.
                if not line.strip():
                    continue
                # NOTE(review): eval() executes arbitrary code contained in
                # the input file - only use this on trusted files. If the
                # rows are pure literals, ast.literal_eval would be a safer
                # drop-in; confirm against the real data before switching.
                self.MedioDB_All.append(list(eval(line)))

        self.hkldbMedio1 = []
        self.hkldbMedio2 = []
        for counter, row in enumerate(self.MedioDB_All, start=1):
            # Progress indicator for large inputs.
            if counter % 1000 == 0:
                print(counter)
            self.hkldbMedio1.append([row[0][0]])
            self.hkldbMedio2.append([row[1][0]])

        print('creating the hkl dump of MedioDBAll')
        hkl.dump(self.MedioDB_All, 'hkldbMedio_All.hkl', mode='w', compression='lzf')
        print('done..')

        print('Creating the hkl dump of MedioDB 1')
        hkl.dump(self.hkldbMedio1, 'hkldbMedio1.hkl', mode='w', compression='lzf')

        print('Creating the hkl dump of MedioDB 2')
        hkl.dump(self.hkldbMedio2, 'hkldbMedio2.hkl', mode='w', compression='lzf')

        return 'done'

    def load_DB_into_FASTsearch(self):
        """Load both databases and prepare one FASTsearch index for each.

        Reads ``hkldbMedio1.hkl`` and ``hkldbMedio2.hkl`` from the current
        working directory, creates ``self.fsearch1`` / ``self.fsearch2`` on
        them, then generates and loads a bag-of-words model for each.
        """
        self.hkldbMedio1 = hkl.load('hkldbMedio1.hkl')
        self.hkldbMedio2 = hkl.load('hkldbMedio2.hkl')

        self.fsearch1 = FASTsearch.FASTsearch('hkldbMedio1.hkl')
        self.fsearch2 = FASTsearch.FASTsearch('hkldbMedio2.hkl')

        # Presumably Gen_BoW_Model writes the .pkl/.hkl files that are loaded
        # right below - verify against the FASTsearch module.
        self.fsearch1.Gen_BoW_Model(50000, "word", punctuation=False)
        self.fsearch2.Gen_BoW_Model(50000, "word", punctuation=False)

        self.fsearch1.Load_BoW_Model('bagofwordshkldbMedio1.pkl', 'DataBaseOneZeroshkldbMedio1.hkl')
        self.fsearch2.Load_BoW_Model('bagofwordshkldbMedio2.pkl', 'DataBaseOneZeroshkldbMedio2.hkl')

    def Medioreplace(self, sentences, punctuations):
        """Replace words that have an exact database entry.

        For every token (with one trailing punctuation character stripped)
        the best match is fetched from ``self.fsearch1``. If the first word
        of the matched ``hkldbMedio1`` entry equals the token, every
        occurrence of that token in the sentence is replaced by the first
        word of the corresponding ``hkldbMedio2`` entry. A trailing
        punctuation character on the token is kept.

        Args:
            sentences: List of sentences, each a list of string tokens.
                The inner lists are modified in place.
            punctuations: Passed through unchanged.

        Returns:
            A tuple ``(outsentences, punctuations)`` where ``outsentences``
            contains the (mutated) sentence lists in input order.
        """
        trailing = self._TRAILING_PUNCTUATION
        outsentences = []

        for sentence in sentences:
            # Lookup candidates: tokens without their trailing punctuation.
            # The emptiness check avoids IndexError on '' tokens.
            candidates = []
            for word in sentence:
                if word and word[-1] in trailing:
                    word = word[:-1]
                candidates.append(word)

            medioeintraege = []
            for word in candidates:
                if not word:
                    # Nothing to look up; an empty string can never equal the
                    # first word of a split() result anyway.
                    continue

                _, matchindex = self.fsearch1.search_with_highest_multiplikation_Output(word, 1)
                medio = self.hkldbMedio1[matchindex[0]][0].split()
                medioeintrag = self.hkldbMedio2[matchindex[0]][0].split()

                # Only accept exact hits, and skip rows whose term or
                # replacement is empty (indexing [0] would raise).
                if medio and medioeintrag and medio[0] == word:
                    medioeintraege.append([word, medioeintrag])

            # Entries are applied one after another, in the order found.
            for original, replacement in medioeintraege:
                for n, token in enumerate(sentence):
                    if token == original:
                        sentence[n] = replacement[0]
                    elif token and token[-1] in trailing and token[:-1] == original:
                        # Token carried punctuation: replace the word part
                        # and keep the punctuation character.
                        sentence[n] = replacement[0] + token[-1]

            outsentences.append(sentence)

        return outsentences, punctuations
|
|
|
|
|