Search on legal documents using Tensorflow and a web_actix web interface
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

126 lines
4.1 KiB

3 years ago
  1. import hickle as hkl
  2. import FASTsearch
  3. class PluritonUpdater(object):
  4. def __init__(self):
  5. self.ole = 1
  6. # Input: csv file with the form ['eine', 'schwere', 'Sprache'] , ['in', 'leicht'] for each line
  7. # Output: hkl dump of array in form [[['eine', 'schwere', 'Sprache'],['in', 'leicht']],[..]]
  8. def create_hklDB_from_csv(self, csvDbDir):
  9. with open(csvDbDir) as lines:
  10. TranslationsDB_All = []
  11. for line in lines:
  12. TranslationsDB_All.append(list(eval(line)))
  13. #print(ShortsDB_All)
  14. #print(ShortsDB_All[0][0])
  15. hkldbTranslations1 = []
  16. hkldbTranslations2 = []
  17. counter = 0
  18. for n in range(len(TranslationsDB_All)):
  19. counter += 1
  20. #if counter % 1000 == 0:
  21. #print(counter)
  22. hkldbTranslations1.append([TranslationsDB_All[n][0][0]])
  23. hkldbTranslations2.append([TranslationsDB_All[n][1][0]])
  24. #print(hkldbTranslations1, TranslationsDB_All)
  25. #print('creating the hkl dump of TranslationsDBAll')
  26. hkl.dump(TranslationsDB_All, 'hkldbTranslations_All.hkl', mode='w', compression='gzip')
  27. #print('done..')
  28. #print('Creating the hkl dump of TranslationsDB')
  29. hkl.dump(hkldbTranslations1, 'hkldbTranslations1.hkl', mode='w', compression='gzip')
  30. hkl.dump(hkldbTranslations2, 'hkldbTranslations2.hkl', mode='w', compression='gzip')
  31. #print('done..')
  32. return 'done'
  33. def load_DB_into_FASTsearch_and_generate_BOW(self):
  34. print('loading the hkldbTranslations1...')
  35. self.hkldbTranslations1 = hkl.load('hkldbTranslations1.hkl')
  36. print('done')
  37. print('loading the hkldbTranslations2...')
  38. self.hkldbTranslations2 = hkl.load('hkldbTranslations2.hkl')
  39. print('done')
  40. print('loading hkldbTranslations 1 into FASTsearch..')
  41. self.fsearch1 = FASTsearch.FASTsearch('hkldbTranslations1.hkl')
  42. print('done')
  43. print('loading hkldbTranslations 2 into FASTsearch..')
  44. self.fsearch2 = FASTsearch.FASTsearch('hkldbTranslations2.hkl')
  45. print('done')
  46. print('generating BoW Model 1..')
  47. self.fsearch1.Gen_BoW_Model(50000, "word", punctuation = False)
  48. print('done')
  49. print('generating BoW Model 2..')
  50. self.fsearch2.Gen_BoW_Model(50000, "word", punctuation = False)
  51. print('done')
  52. return 'done'
  53. def loadModels(self):
  54. print('loading the hkldbTranslations1...')
  55. self.hkldbTranslations1 = hkl.load('hkldbTranslations1.hkl')
  56. print('done')
  57. print('loading the hkldbTranslations2...')
  58. self.hkldbTranslations2 = hkl.load('hkldbTranslations2.hkl')
  59. print('done')
  60. print('loading hkldbTranslations 1 into FASTsearch..')
  61. self.fsearch1 = FASTsearch.FASTsearch('hkldbTranslations1.hkl')
  62. print('done')
  63. print('loading hkldbTranslations 2 into FASTsearch..')
  64. self.fsearch2 = FASTsearch.FASTsearch('hkldbTranslations2.hkl')
  65. print('done')
  66. print('loading the bow model 1')
  67. self.fsearch1.Load_BoW_Model('bagofwordshkldbTranslations1.pkl', 'DataBaseOneZeroshkldbTranslations1.hkl')
  68. print('done')
  69. print('loading the bow model 2')
  70. self.fsearch2.Load_BoW_Model('bagofwordshkldbTranslations2.pkl', 'DataBaseOneZeroshkldbTranslations2.hkl')
  71. print('done')
  72. return 'done'
  73. def searchNearest2Translate(self, text):
  74. bestmatches2, matchindex2 = self.fsearch1.search_with_highest_multiplikation_Output(text, 1)
  75. DifficultText = self.hkldbTranslations1[matchindex2[0]][0].split()
  76. LeichterText = self.hkldbTranslations2[matchindex2[0]][0].split()
  77. return DifficultText, LeichterText