You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

327 lines
13 KiB

4 years ago
# Class for converting negated statements ("nicht ..." / "kein ...") into positive statements.
# Note: "nicht" + adjective can be replaced directly by the adjective's antonym.
# "nicht" + verb can be replaced by the verb's antonym, but then e.g. "nach Hause" has to be cut away: "Er ging nicht nach Hause." -> "Er blieb."
# Antonyms are listed in Wiktionary under the {{Gegenwörter}} category.
  5. import spacy
  6. import nltk
  7. from nltk.stem.snowball import SnowballStemmer
  8. import hickle as hkl
  9. import FASTsearch
  # Module-level German Snowball stemmer instance (SayYes.__init__ also builds its own as self.stemmer).
  10. stemmer = SnowballStemmer("german")
  11. class SayYes(object):
  12. def __init__(self, hklDatabaseDir_Opposites, hklDatabaseDir_Opposites_All):
  13. if hklDatabaseDir_Opposites is not None:
  14. self.OppositesDB = hkl.load(hklDatabaseDir_Opposites)
  15. #print('loading the german spacy model..')
  16. self.nlp = spacy.load('de_core_news_sm')
  17. #print('done')
  18. #print('loading the stemmer..')
  19. self.stemmer = SnowballStemmer("german")
  20. #print('done')
  21. return
  22. def create_hklDB_from_csv(self, csvDbDir):
  23. with open(csvDbDir) as lines:
  24. self.OppositesDB_All = []
  25. for line in lines:
  26. #print(line)
  27. self.OppositesDB_All.append(list(eval(line)))
  28. self.hkldbOpposites1 = []
  29. self.hkldbOpposites2 = []
  30. counter = 0
  31. for n in range(len(self.OppositesDB_All)):
  32. counter += 1
  33. if counter % 1000 == 0:
  34. print(counter)
  35. self.hkldbOpposites1.append([self.OppositesDB_All[n][0][0]] + [self.stemmer.stem(word) for word in self.OppositesDB_All[n][0]] )
  36. self.hkldbOpposites2.append([self.OppositesDB_All[n][1][0]] + [stemmer.stem(word) for word in self.OppositesDB_All[n][1]] )
  37. #print('hkldbOpposites1', self.hkldbOpposites1)
  38. #print('hkldbOpposites2', self.hkldbOpposites2)
  39. print('creating the hkl dump of OppositesDBAll')
  40. hkl.dump(self.OppositesDB_All, 'hkldbOpposites_All.hkl', mode='w', compression='gzip')
  41. print('done..')
  42. print('Creating the hkl dump of OppositesDB 1')
  43. hkl.dump(self.hkldbOpposites1, 'hkldbOpposites1.hkl', mode='w', compression='gzip')
  44. print('done..')
  45. print('Creating the hkl dump of OppositesDB 2')
  46. hkl.dump(self.hkldbOpposites2, 'hkldbOpposites2.hkl', mode='w', compression='gzip')
  47. print('done..')
  48. return 'done'
  49. def load_DB_into_FASTsearch(self):
  50. #print('Loading the hklDB1..')
  51. self.hkldbOpposites1 = hkl.load('hkldbOpposites1.hkl')
  52. #print('done')
  53. #print('Loading the hklDB2')
  54. self.hkldbOpposites2 = hkl.load('hkldbOpposites2.hkl')
  55. #print('done')
  56. #print('loading hkldbOpposites 1..')
  57. self.fsearch1 = FASTsearch.FASTsearch('hkldbOpposites1.hkl')
  58. #print('done')
  59. #print('loading hkldbOpposites 2..')
  60. self.fsearch2 = FASTsearch.FASTsearch('hkldbOpposites2.hkl')
  61. #print('done')
  62. #print('generating BoW Model 1..')
  63. #self.fsearch1.Gen_BoW_Model(3000, "word", punctuation = False)
  64. #print('done')
  65. #print('generating BoW Model 2..')
  66. #self.fsearch2.Gen_BoW_Model(3000, "word", punctuation = False)
  67. #print('done')
  68. #print('loading the bow model 1')
  69. self.fsearch1.Load_BoW_Model('bagofwordshkldbOpposites1.pkl', 'DataBaseOneZeroshkldbOpposites1.hkl')
  70. #print('done')
  71. #print('loading the bow model 2')
  72. self.fsearch2.Load_BoW_Model('bagofwordshkldbOpposites2.pkl', 'DataBaseOneZeroshkldbOpposites2.hkl')
  73. #print('done')
  74. #print('oi thats the get_feature_names', self.fsearch1.vectorizer.get_feature_names())
  75. #print('oi thats the get_feature_names', self.fsearch2.vectorizer.get_feature_names())
  76. def replaceOpposites(self, sentences):
  77. outsentences = []
  78. #print('wenigstens etwas')
  79. sentencecount = 0
  80. for sentence in sentences:
  81. #print('oloa')
  82. sentencecount += 1
  83. #print('processing sentence', sentencecount)
  84. listofAdjektives = []
  85. nichtIndex = None
  86. KeinIndex = None
  87. for m in range(len(sentence)):
  88. if sentence[m] == 'nicht':
  89. nichtIndex = m
  90. if sentence[m][:4] == 'kein':
  91. KeinIndex = m
  92. #if KeinIndex not None or nichtIndex not None:
  93. #if len(listofAdjektives) == 0:
  94. #if word.dep_[0] == 'V':
  95. #print('ola')
  96. if (KeinIndex is not None) or (nichtIndex is not None):
  97. doc = self.nlp(' '.join(sentence))
  98. count = 0
  99. for word in doc:
  100. count += 1
  101. if word.text == ',':
  102. count -= 1
  103. #print(word.text, word.tag_, word.tag_[:1])
  104. if word.tag_[:2] == 'AD':
  105. listofAdjektives.append([word.text, count - 1])
  106. listOfOpposites = []
  107. if (KeinIndex is not None):
  108. #print(sentence[KeinIndex + 1])
  109. if len(listofAdjektives) == 0 or len(listofAdjektives) > 1:
  110. AdjIndex = 1
  111. #print('listofadjectives', listofAdjektives)
  112. for n in range(len(listofAdjektives)):
  113. ad = listofAdjektives[n]
  114. #print(ad[1])
  115. if ad[1] == KeinIndex + 2 and ad[0] == 'zu':
  116. if listofAdjektives[n + 1][1] == KeinIndex + 3:
  117. AdjIndex = 2
  118. #print('Adj und stemadj 0 2')
  119. #print(sentence[KeinIndex + AdjIndex])
  120. #print(self.stemmer.stem(sentence[KeinIndex + AdjIndex]))
  121. bestmatches1, matchindex1 = self.fsearch1.search_with_highest_multiplikation_Output(self.stemmer.stem(sentence[KeinIndex + AdjIndex]), 1)
  122. bestmatches2, matchindex2 = self.fsearch2.search_with_highest_multiplikation_Output(self.stemmer.stem(sentence[KeinIndex + AdjIndex]), 1)
  123. Austauschindex = KeinIndex + AdjIndex
  124. else:
  125. Adjektiv = listofAdjektives[0][0]
  126. #print('Adj und stemadj')
  127. #print(Adjektiv)
  128. #print(self.stemmer.stem(Adjektiv))
  129. Austauschindex = listofAdjektives[0][1]
  130. bestmatches1, matchindex1 = self.fsearch1.search_with_highest_multiplikation_Output(self.stemmer.stem(Adjektiv), 1)
  131. bestmatches2, matchindex2 = self.fsearch2.search_with_highest_multiplikation_Output(self.stemmer.stem(Adjektiv), 1)
  132. Opposite = None
  133. #print('thetheone')
  134. if (nichtIndex is not None):
  135. #print(sentence[nichtIndex + 1])
  136. #print('theone')
  137. if len(listofAdjektives) == 0 or len(listofAdjektives) > 1:
  138. #print('1')
  139. #print(nichtIndex)
  140. #print('2')
  141. if nichtIndex == (len(sentence) - 1):
  142. Austauschindex = nichtIndex - 1
  143. else:
  144. Austauschindex = nichtIndex + 1
  145. # TO DO: egal formen auf infinitiv mappen
  146. # Das muss mit machine learnign gelöst werden..
  147. # --> ergiebt sich aus den übersetzungen ( welches wort fehl, welches neu
  148. # da, dann daraus eine maschine die sich die gegenteile merkt =)
  149. #itisaVerb = False
  150. #if doc[Austauschindex].dep_[0] == 'V':
  151. # itisaVerb = True
  152. #someform = sentence[Austauschindex]
  153. bestmatches1, matchindex1 = self.fsearch1.search_with_highest_multiplikation_Output(self.stemmer.stem(sentence[Austauschindex]), 1)
  154. bestmatches2, matchindex2 = self.fsearch2.search_with_highest_multiplikation_Output(self.stemmer.stem(sentence[Austauschindex]), 1)
  155. else:
  156. Adjektiv = listofAdjektives[0][0]
  157. Austauschindex = listofAdjektives[0][1]
  158. bestmatches1, matchindex1 = self.fsearch1.search_with_highest_multiplikation_Output(self.stemmer.stem(Adjektiv), 1)
  159. bestmatches2, matchindex2 = self.fsearch2.search_with_highest_multiplikation_Output(self.stemmer.stem(Adjektiv), 1)
  160. Opposite = None
  161. #print(sentence)
  162. #print(bestmatches1, matchindex1)
  163. #print(bestmatches1, matchindex1)
  164. #print(len(listOfOpposites))
  165. if matchindex1[1] >= 1:
  166. OppositeIndex = matchindex1[0]
  167. Opposite = self.hkldbOpposites2[OppositeIndex][0]
  168. #print('Opposite in match1', Opposite)
  169. listOfOpposites.append([Opposite,Austauschindex])
  170. if matchindex2[1] >= 1:
  171. OppositeIndex = matchindex2[0]
  172. Opposite = self.hkldbOpposites1[OppositeIndex][0]
  173. #print('opposite in match2', Opposite)
  174. listOfOpposites.append([Opposite,Austauschindex])
  175. #print(listOfOpposites)
  176. for opposite in listOfOpposites:
  177. if sentence[opposite[1]][-1] == ',':
  178. if sentence[opposite[1]][-3:] == 'es,':
  179. opposite[0] = opposite[0] + 'es'
  180. if sentence[opposite[1]][-3:] == 'er,':
  181. opposite[0] = opposite[0] + 'er'
  182. if sentence[opposite[1]][-3:] == 'em,':
  183. opposite[0] = opposite[0] + 'em'
  184. if sentence[opposite[1]][-2:] == 'e,':
  185. opposite[0] = opposite[0] + 'e'
  186. sentence[opposite[1]] = opposite[0] + ','
  187. else:
  188. if sentence[opposite[1]][-2:] == 'es':
  189. opposite[0] = opposite[0] + 'es'
  190. if sentence[opposite[1]][-2:] == 'er':
  191. opposite[0] = opposite[0] + 'er'
  192. if sentence[opposite[1]][-2:] == 'em':
  193. opposite[0] = opposite[0] + 'em'
  194. if sentence[opposite[1]][-1:] == 'e':
  195. opposite[0] = opposite[0] + 'e'
  196. sentence[opposite[1]] = opposite[0]
  197. if KeinIndex is not None and len(listOfOpposites) > 0:
  198. #print(KeinIndex)
  199. sentence[KeinIndex] = sentence[KeinIndex][1:]
  200. if nichtIndex is not None and len(listOfOpposites) > 0:
  201. #print(nichtIndex)
  202. printer = sentence.pop(nichtIndex)
  203. #print(printer)
  204. outsentences.append(sentence)
  205. return outsentences