You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

154 lines
5.0 KiB

  1. import hickle as hkl
  2. import FASTsearch
  3. class Medio(object):
  4. def __init__(self, hklDatabaseDir_Medio, hklDatabaseDir_Medio_All):
  5. if hklDatabaseDir_Medio_All is not None:
  6. self.MedioDB_All = hkl.load(hklDatabaseDir_Medio_All)
  7. return
  8. def create_hklDB_from_csv(self, csvDbDir, StemOrNot):
  9. print(csvDbDir)
  10. with open(csvDbDir) as lines:
  11. self.MedioDB_All = []
  12. for line in lines:
  13. #print('oi')
  14. #print(line)
  15. #print(eval(line))
  16. self.MedioDB_All.append(list(eval(line)))
  17. self.hkldbMedio1 = []
  18. self.hkldbMedio2 = []
  19. counter = 0
  20. for n in range(len(self.MedioDB_All)):
  21. counter += 1
  22. if counter % 1000 == 0:
  23. print(counter)
  24. self.hkldbMedio1.append([self.MedioDB_All[n][0][0]] )
  25. self.hkldbMedio2.append([self.MedioDB_All[n][1][0]] )
  26. print('creating the hkl dump of MedioDBAll')
  27. hkl.dump(self.MedioDB_All, 'hkldbMedio_All.hkl', mode='w', compression='lzf')
  28. print('done..')
  29. print('Creating the hkl dump of MedioDB 1')
  30. hkl.dump(self.hkldbMedio1, 'hkldbMedio1.hkl', mode='w', compression='lzf')
  31. #print('done..')
  32. print('Creating the hkl dump of MedioDB 2')
  33. hkl.dump(self.hkldbMedio2, 'hkldbMedio2.hkl', mode='w', compression='lzf')
  34. #print('done..')
  35. return 'done'
  36. def load_DB_into_FASTsearch(self):
  37. #print('loading the hkldbFremd_WB1...')
  38. self.hkldbMedio1 = hkl.load('hkldbMedio1.hkl')
  39. #print('done')
  40. #print('loading the hkldbFremd_WB2...')
  41. self.hkldbMedio2 = hkl.load('hkldbMedio2.hkl')
  42. #print('done')
  43. #print('loading hkldbFremd_WB 1 into FASTsearch..')
  44. self.fsearch1 = FASTsearch.FASTsearch('hkldbMedio1.hkl')
  45. #print('done')
  46. #print('loading hkldbFremd_WB 2 into FASTsearch..')
  47. self.fsearch2 = FASTsearch.FASTsearch('hkldbMedio2.hkl')
  48. #print('done')
  49. #print('generating BoW Model 1..')
  50. self.fsearch1.Gen_BoW_Model(50000, "word", punctuation = False)
  51. #print('done')
  52. #print('generating BoW Model 2..')
  53. self.fsearch2.Gen_BoW_Model(50000, "word", punctuation = False)
  54. #print('done')
  55. #print('loading the bow model 1')
  56. self.fsearch1.Load_BoW_Model('bagofwordshkldbMedio1.pkl', 'DataBaseOneZeroshkldbMedio1.hkl')
  57. #print('done')
  58. #print('loading the bow model 2')
  59. self.fsearch2.Load_BoW_Model('bagofwordshkldbMedio2.pkl', 'DataBaseOneZeroshkldbMedio2.hkl')
  60. #print('done')
  61. #print('oi thats the get_feature_names', self.fsearch1.vectorizer.get_feature_names())
  62. #print('oi thats the get_feature_names', self.fsearch2.vectorizer.get_feature_names())
  63. def Medioreplace(self, sentences, punctuations):
  64. outsentences = []
  65. #print('something')
  66. sentencecount = 0
  67. alleeintraege = []
  68. for sentence in sentences:
  69. medios_of_sentence = []
  70. for word in sentence:
  71. if word[-1] in [',', '.', '!', '?', ':', '_']:
  72. word = word[:-1]
  73. medios_of_sentence.append(word)
  74. #print('mediosofsentence',medios_of_sentence)
  75. medioeintraege = []
  76. for word in medios_of_sentence:
  77. bestmatches2, matchindex2 = self.fsearch1.search_with_highest_multiplikation_Output(word, 1)
  78. medio = self.hkldbMedio1[matchindex2[0]][0].split()
  79. medioeintrag = self.hkldbMedio2[matchindex2[0]][0].split()
  80. #print(medio)
  81. #print('medioeintrag', medioeintrag)
  82. if medio[0] == word:
  83. medioeintraege.append([word, medioeintrag])
  84. #print('medioeintraege',medioeintraege)
  85. for eintrag in medioeintraege:
  86. for n in range(len(sentence)):
  87. if eintrag[0] == sentence[n]:
  88. sentence[n] = eintrag[1][0]
  89. if eintrag[0] == sentence[:-1]:
  90. sentence[n][:-1] = eintrag[1][0]
  91. outsentences.append(sentence)
  92. #print('the endsentence',sentence)
  93. return outsentences, punctuations