You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

185 lines
5.9 KiB

4 years ago
4 years ago
  1. import spacy
  2. import nltk
  3. from nltk.stem.snowball import SnowballStemmer
  4. import hickle as hkl
  5. import FASTsearch
  # Module-level German Snowball stemmer. Nothing in the visible part of this
  # file reads it; FremdWB.__init__ creates its own instance as self.stemmer.
  6. stemmer = SnowballStemmer("german")
  7. class FremdWB(object):
  def __init__(self, hklDatabaseDir_Fremd_WB, hklDatabaseDir_Fremd_WB_All):
      """Load the optional combined database, the spaCy model and a stemmer.

      Args:
          hklDatabaseDir_Fremd_WB: Not used by this constructor.
          hklDatabaseDir_Fremd_WB_All: Path handed to ``hkl.load``. When it
              is ``None`` nothing is loaded and ``self.Fremd_WBDB_All`` is
              not set here.
      """
      wants_full_db = hklDatabaseDir_Fremd_WB_All is not None
      if wants_full_db:
          self.Fremd_WBDB_All = hkl.load(hklDatabaseDir_Fremd_WB_All)

      # German spaCy pipeline; fremdEintragAppend reads the tags it assigns.
      self.nlp = spacy.load('de_core_news_sm')

      self.stemmer = SnowballStemmer("german")
  def create_hklDB_from_csv(self, csvDbDir, StemOrNot):
      """Build the three hickle database files from a text file of entries.

      Every non-blank line of ``csvDbDir`` is evaluated as a Python
      expression and stored as a list in ``self.Fremd_WBDB_All``. For each
      entry, ``entry[0][0]`` is appended to ``self.hkldbFremd_WB1`` and
      ``entry[1][0]`` to ``self.hkldbFremd_WB2``, each wrapped in a
      one-element list. The three lists are then dumped to
      ``hkldbFremd_WB_All.hkl``, ``hkldbFremd_WB1.hkl`` and
      ``hkldbFremd_WB2.hkl`` (relative paths, lzf-compressed).

      Args:
          csvDbDir: Path of the input file, one entry per line.
          StemOrNot: Accepted for interface compatibility; not used.

      Returns:
          The string ``'done'``.
      """
      with open(csvDbDir) as lines:
          entries = self.Fremd_WBDB_All = []
          for line in lines:
              # A blank (e.g. trailing) line is not an entry; evaluating it
              # would raise SyntaxError and abort the whole import.
              if not line.strip():
                  continue
              # SECURITY NOTE(review): eval() executes whatever the file
              # contains, so only feed trusted files. If every line is a
              # plain literal, ast.literal_eval would be the safe choice.
              entries.append(list(eval(line)))

      self.hkldbFremd_WB1 = []
      self.hkldbFremd_WB2 = []
      for counter, entry in enumerate(entries, start=1):
          if counter % 1000 == 0:
              # Progress indicator for large inputs.
              print(counter)
          self.hkldbFremd_WB1.append([entry[0][0]])
          self.hkldbFremd_WB2.append([entry[1][0]])

      print('creating the hkl dump of Fremd_WBDBAll')
      hkl.dump(self.Fremd_WBDB_All, 'hkldbFremd_WB_All.hkl', mode='w', compression='lzf')
      print('done..')

      print('Creating the hkl dump of Fremd_WBDB 1')
      hkl.dump(self.hkldbFremd_WB1, 'hkldbFremd_WB1.hkl', mode='w', compression='lzf')

      print('Creating the hkl dump of Fremd_WBDB 2')
      hkl.dump(self.hkldbFremd_WB2, 'hkldbFremd_WB2.hkl', mode='w', compression='lzf')

      return 'done'
  def load_DB_into_FASTsearch(self):
      """Load both word databases and set up a FASTsearch object for each.

      Reads ``hkldbFremd_WB1.hkl`` and ``hkldbFremd_WB2.hkl`` (relative
      paths) with ``hkl.load``, creates one ``FASTsearch`` object per file,
      calls ``Gen_BoW_Model`` on both and finally ``Load_BoW_Model`` on both.
      Results are stored in ``self.hkldbFremd_WB1``, ``self.hkldbFremd_WB2``,
      ``self.fsearch1`` and ``self.fsearch2``.
      """
      first_db_file = 'hkldbFremd_WB1.hkl'
      second_db_file = 'hkldbFremd_WB2.hkl'

      self.hkldbFremd_WB1 = hkl.load(first_db_file)
      self.hkldbFremd_WB2 = hkl.load(second_db_file)

      self.fsearch1 = FASTsearch.FASTsearch(first_db_file)
      self.fsearch2 = FASTsearch.FASTsearch(second_db_file)

      # Call order is kept: generate both models first, then load both.
      for searcher in (self.fsearch1, self.fsearch2):
          searcher.Gen_BoW_Model(50000, "word", punctuation=False)

      self.fsearch1.Load_BoW_Model(
          'bagofwordshkldbFremd_WB1.pkl',
          'DataBaseOneZeroshkldbFremd_WB1.hkl',
      )
      self.fsearch2.Load_BoW_Model(
          'bagofwordshkldbFremd_WB2.pkl',
          'DataBaseOneZeroshkldbFremd_WB2.hkl',
      )
  def fremdEintragAppend(self, sentences, punctuations):
      """Insert dictionary entries after the sentences that use their words.

      Each sentence (a list of tokens) is joined with spaces and passed to
      ``self.nlp``. Every resulting token whose ``tag_`` starts with ``V``,
      ``N`` or ``A`` is looked up through ``self.fsearch1``. If the first
      word of the best match in ``self.hkldbFremd_WB1`` equals the token
      text exactly, the entry at the same index of ``self.hkldbFremd_WB2``
      (split on whitespace, one trailing ``.`` removed) is placed directly
      after the sentence. Each distinct entry is added at most once per call.

      If processing a sentence raises, the sentence is kept without any
      entries (best effort) and the remaining sentences are still handled.

      Args:
          sentences: Iterable of sentences, each a list of string tokens.
          punctuations: List of punctuation marks, presumably one per
              sentence and aligned by index with ``sentences`` (confirm
              against the caller). Modified in place: a ``'.'`` is inserted
              at the output position of every added entry, so the list stays
              aligned with the returned sentences.

      Returns:
          Tuple ``(outsentences, punctuations)``. ``outsentences`` is a new
          list holding every input sentence followed by its newly added
          entries; ``punctuations`` is the list object that was passed in.
      """
      import logging

      outsentences = []
      alleeintraege = []  # entries already emitted during this call

      for sentence in sentences:
          # Phase 1: collect candidate entries. Nothing is written to the
          # outputs here, so a failure cannot leave them half-updated.
          fremdeintraege = []
          try:
              doc = self.nlp(' '.join(sentence))
              for token in doc:
                  # startswith() also copes with an empty tag string.
                  if not token.tag_.startswith(('V', 'N', 'A')):
                      continue
                  word = token.text
                  _, matchindex = (
                      self.fsearch1.search_with_highest_multiplikation_Output(word, 1)
                  )
                  fremd = self.hkldbFremd_WB1[matchindex[0]][0].split()
                  fremdeintrag = self.hkldbFremd_WB2[matchindex[0]][0].split()
                  # The search always returns a best match; accept it only
                  # when its headword equals the token, and skip empty rows.
                  if fremd and fremdeintrag and fremd[0] == word:
                      if fremdeintrag[-1].endswith('.'):
                          fremdeintrag[-1] = fremdeintrag[-1][:-1]
                      fremdeintraege.append(fremdeintrag)
          except Exception:
              # Deliberately best effort: keep the sentence, drop its entries.
              logging.getLogger(__name__).debug(
                  'could not process %r in FremdWB', sentence, exc_info=True
              )
              fremdeintraege = []

          # Phase 2: commit the sentence and its new entries.
          outsentences.append(sentence)
          for eintrag in fremdeintraege:
              if eintrag in alleeintraege:
                  continue
              outsentences.append(eintrag)
              # Insert at the entry's own output position; earlier
              # insertions have already shifted everything that follows.
              punctuations.insert(len(outsentences) - 1, '.')
              alleeintraege.append(eintrag)

      return outsentences, punctuations