126 lines
4.1 KiB
Python
126 lines
4.1 KiB
Python
import hickle as hkl
|
|
|
|
import FASTsearch
|
|
|
|
|
|
class PluritonUpdater(object):
    """Build, load and query two parallel translation databases.

    The class works with three hickle files in the current working
    directory: 'hkldbTranslations_All.hkl' (every parsed CSV row),
    'hkldbTranslations1.hkl' and 'hkldbTranslations2.hkl' (one
    single-element list per row, taken from the first and second group of
    each row respectively). Both per-side files are also handed to
    FASTsearch so that a query text can be matched against side 1 and the
    aligned entry of side 2 can be returned.
    """

    def __init__(self):
        # NOTE(review): not read anywhere in this class; kept in case
        # external code relies on the attribute.
        self.ole = 1

    def create_hklDB_from_csv(self, csvDbDir):
        """Parse a translation CSV and dump it into three hickle files.

        Each non-blank line of the file is evaluated as a Python expression
        that must yield a sequence of at least two indexable groups, e.g.
        ``['eine', 'schwere', 'Sprache'] , ['in', 'leicht']``.

        Files written (gzip-compressed, overwritten if present):
          * 'hkldbTranslations_All.hkl': every row as a list of its groups,
            ``[[group1, group2], ...]``.
          * 'hkldbTranslations1.hkl': ``[[row[0][0]], ...]``, i.e. only the
            first element of each row's first group.
          * 'hkldbTranslations2.hkl': ``[[row[1][0]], ...]``, i.e. only the
            first element of each row's second group.

        Args:
            csvDbDir: Path of the input file (despite the name it is opened
                as a file, not a directory).

        Returns:
            The string 'done'.

        Raises:
            IndexError: If a row has fewer than two groups or a group is
                empty.
            Any exception raised by evaluating a malformed line.
        """
        translation_rows = []

        # NOTE(review): the file is opened with the platform default
        # encoding; confirm whether the data should be read as UTF-8.
        with open(csvDbDir) as lines:
            for line in lines:
                # Blank lines (e.g. a trailing newline at the end of the
                # file) cannot be evaluated and used to abort the whole
                # import with a SyntaxError, so they are skipped.
                if not line.strip():
                    continue

                # SECURITY: eval() executes arbitrary code contained in the
                # file. Only feed this method trusted input; consider
                # ast.literal_eval if rows are guaranteed to be literals.
                translation_rows.append(list(eval(line)))

        # Only the first element of each group goes into the per-side
        # databases, wrapped in a one-element list.
        first_side = [[row[0][0]] for row in translation_rows]
        second_side = [[row[1][0]] for row in translation_rows]

        hkl.dump(translation_rows, 'hkldbTranslations_All.hkl', mode='w', compression='gzip')
        hkl.dump(first_side, 'hkldbTranslations1.hkl', mode='w', compression='gzip')
        hkl.dump(second_side, 'hkldbTranslations2.hkl', mode='w', compression='gzip')

        return 'done'

    def _load_databases_into_FASTsearch(self):
        """Load both per-side hickle databases and wrap each in FASTsearch.

        Sets ``self.hkldbTranslations1``, ``self.hkldbTranslations2``,
        ``self.fsearch1`` and ``self.fsearch2``. Shared by
        ``load_DB_into_FASTsearch_and_generate_BOW`` and ``loadModels``.
        """
        print('loading the hkldbTranslations1...')
        self.hkldbTranslations1 = hkl.load('hkldbTranslations1.hkl')
        print('done')

        print('loading the hkldbTranslations2...')
        self.hkldbTranslations2 = hkl.load('hkldbTranslations2.hkl')
        print('done')

        print('loading hkldbTranslations 1 into FASTsearch..')
        self.fsearch1 = FASTsearch.FASTsearch('hkldbTranslations1.hkl')
        print('done')

        print('loading hkldbTranslations 2 into FASTsearch..')
        self.fsearch2 = FASTsearch.FASTsearch('hkldbTranslations2.hkl')
        print('done')

    def load_DB_into_FASTsearch_and_generate_BOW(self):
        """Load both databases and generate a bag-of-words model for each.

        Calls ``Gen_BoW_Model(50000, "word", punctuation=False)`` on both
        FASTsearch instances after loading the databases.

        Returns:
            The string 'done'.
        """
        self._load_databases_into_FASTsearch()

        print('generating BoW Model 1..')
        self.fsearch1.Gen_BoW_Model(50000, "word", punctuation=False)
        print('done')

        print('generating BoW Model 2..')
        self.fsearch2.Gen_BoW_Model(50000, "word", punctuation=False)
        print('done')

        return 'done'

    def loadModels(self):
        """Load both databases and their previously stored bag-of-words models.

        The models are read via ``Load_BoW_Model`` from
        'bagofwordshkldbTranslations{1,2}.pkl' and
        'DataBaseOneZeroshkldbTranslations{1,2}.hkl'; presumably these are
        the files produced by ``load_DB_into_FASTsearch_and_generate_BOW``
        (TODO confirm against FASTsearch).

        Returns:
            The string 'done'.
        """
        self._load_databases_into_FASTsearch()

        print('loading the bow model 1')
        self.fsearch1.Load_BoW_Model('bagofwordshkldbTranslations1.pkl', 'DataBaseOneZeroshkldbTranslations1.hkl')
        print('done')

        print('loading the bow model 2')
        self.fsearch2.Load_BoW_Model('bagofwordshkldbTranslations2.pkl', 'DataBaseOneZeroshkldbTranslations2.hkl')
        print('done')

        return 'done'

    def searchNearest2Translate(self, text):
        """Return the best side-1 match for ``text`` and its side-2 partner.

        Requires ``self.fsearch1``, ``self.hkldbTranslations1`` and
        ``self.hkldbTranslations2`` to be set (see ``loadModels``).

        Args:
            text: Query passed to
                ``fsearch1.search_with_highest_multiplikation_Output``.

        Returns:
            A tuple ``(DifficultText, LeichterText)``: the whitespace-split
            tokens of the matched side-1 entry and of the side-2 entry at
            the same index.
        """
        # Only the index list is needed; the first index is taken as the
        # best hit.
        _best_matches, match_indexes = self.fsearch1.search_with_highest_multiplikation_Output(text, 1)
        best_index = match_indexes[0]

        DifficultText = self.hkldbTranslations1[best_index][0].split()
        LeichterText = self.hkldbTranslations2[best_index][0].split()

        return DifficultText, LeichterText
|
|
|