# First, check what kind of database Leo has parsed.
# Then I can see what kind of DB still needs to be created, or what still has to be extended.
# If a form is in the DB as subjunctive (Konjunktiv) but not as indicative (the two are often
# identical, in which case swapping makes no sense), then replace it.
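# Worked example of the intended replacement (an assumption about the intended behaviour,
# not taken from this file):
#   'Er sagte, er sei krank.'  ->  'Er sagte, er ist krank.'   (Konjunktiv I 'sei' -> indicative 'ist')
#   'Wir haben Zeit.'          ->  unchanged, since 'haben' is identical in both moods.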
import spacy
import nltk
from nltk.stem.snowball import SnowballStemmer

import hickle as hkl
import FASTsearch

stemmer = SnowballStemmer("german")
class ConjunctSolve(object):

    def __init__(self, hklDatabaseDir_Indi_Conju, hklDatabaseDir_Indi_Conju_All):

        if hklDatabaseDir_Indi_Conju_All is not None:
            self.Indi_ConjuDB_All = hkl.load(hklDatabaseDir_Indi_Conju_All)

        #print('loading the german spacy model..')
        self.nlp = spacy.load('de_core_news_sm')
        #print('done')

        #print('loading the stemmer..')
        self.stemmer = SnowballStemmer("german")
        #print('done')

        return
    def create_hklDB_from_csv(self, csvDbDir, StemOrNot):

        with open(csvDbDir) as lines:

            self.Indi_ConjuDB_All = []

            for line in lines:
                #print(line)
                # Each line is expected to evaluate to a pair of entries:
                # index 0 holds the indicative form(s), index 1 the subjunctive form(s).
                self.Indi_ConjuDB_All.append(list(eval(line)))

        self.hkldbIndi_Conju1 = []
        self.hkldbIndi_Conju2 = []

        counter = 0
        for n in range(len(self.Indi_ConjuDB_All)):
            counter += 1
            if counter % 1000 == 0:
                print(counter)

            self.hkldbIndi_Conju1.append([self.Indi_ConjuDB_All[n][0][0]])
            self.hkldbIndi_Conju2.append([self.Indi_ConjuDB_All[n][1][0]])

        print('creating the hkl dump of Indi_ConjuDBAll')
        hkl.dump(self.Indi_ConjuDB_All, 'hkldbIndi_Conju_All.hkl', mode='w', compression='lzf')
        print('done..')

        print('Creating the hkl dump of Indi_ConjuDB 1')
        hkl.dump(self.hkldbIndi_Conju1, 'hkldbIndi_Conju1.hkl', mode='w', compression='lzf')
        #print('done..')

        print('Creating the hkl dump of Indi_ConjuDB 2')
        hkl.dump(self.hkldbIndi_Conju2, 'hkldbIndi_Conju2.hkl', mode='w', compression='lzf')
        #print('done..')

        return 'done'
    def load_DB_into_FASTsearch(self):

        #print('loading the hkldbIndi_Conju1...')
        self.hkldbIndi_Conju1 = hkl.load('hkldbIndi_Conju1.hkl')
        #print('done')

        #print('loading the hkldbIndi_Conju2...')
        self.hkldbIndi_Conju2 = hkl.load('hkldbIndi_Conju2.hkl')
        #print('done')

        #print('loading hkldbIndi_Conju 1 into FASTsearch..')
        self.fsearch1 = FASTsearch.FASTsearch('hkldbIndi_Conju1.hkl')
        #print('done')

        #print('loading hkldbIndi_Conju 2 into FASTsearch..')
        self.fsearch2 = FASTsearch.FASTsearch('hkldbIndi_Conju2.hkl')
        #print('done')

        #print('generating BoW Model 1..')
        #self.fsearch1.Gen_BoW_Model(50000, "word", punctuation = False)
        #print('done')

        #print('generating BoW Model 2..')
        #self.fsearch2.Gen_BoW_Model(50000, "word", punctuation = False)
        #print('done')

        #print('loading the bow model 1')
        self.fsearch1.Load_BoW_Model('bagofwordshkldbIndi_Conju1.pkl', 'DataBaseOneZeroshkldbIndi_Conju1.hkl')
        #print('done')

        #print('loading the bow model 2')
        self.fsearch2.Load_BoW_Model('bagofwordshkldbIndi_Conju2.pkl', 'DataBaseOneZeroshkldbIndi_Conju2.hkl')
        #print('done')

        #print('oi thats the get_feature_names', self.fsearch1.vectorizer.get_feature_names())
        #print('oi thats the get_feature_names', self.fsearch2.vectorizer.get_feature_names())
    def replaceConjunctives(self, sentences):

        outsentences = []
        sentencecount = 0

        for sentence in sentences:

            sentencecount += 1
            #print('processing sentence', sentencecount)

            doc = self.nlp(' '.join(sentence))

            verbs_of_sentence = []
            wordindex_to_replace = []

            count = 0
            thereisanIch = 0
            thereisaDu = 0
            thereisaWir = 0
            thereisanIhr = 0
            thereisaSie = 0

            # Collect the verbs of the sentence and note which personal pronouns occur.
            for word in doc:
                count += 1

                if word.text == 'ich' or word.text == 'Ich':
                    thereisanIch = 1
                if word.text == 'du' or word.text == 'Du':
                    thereisaDu = 1
                if word.text == 'wir' or word.text == 'Wir':
                    thereisaWir = 1
                if word.text == 'ihr' or word.text == 'Ihr':
                    thereisanIhr = 1
                if word.text == 'sie' or word.text == 'Sie':
                    thereisaSie = 1

                if word.tag_[0] == 'V':
                    #print(word.tag_)
                    #print(word.text)
                    verbs_of_sentence.append(word.text)

            # Remember the (1-based) positions of those verbs in the token list.
            for verb in verbs_of_sentence:
                verbcounter = 0
                for word in sentence:
                    verbcounter += 1
                    if word == verb or word[:-1] == verb or word[1:] == verb:
                        wordindex_to_replace.append(verbcounter)

            # 'habe' and 'sei' are ambiguous between persons, so append the pronouns
            # found in the sentence to make the database lookup more specific.
            for n in range(len(verbs_of_sentence)):
                if verbs_of_sentence[n] == 'habe' or verbs_of_sentence[n] == 'sei':
                    if thereisanIch == 0:
                        verbs_of_sentence.append('er/sie/es')
                    if thereisanIch == 1:
                        verbs_of_sentence.append('ich')
                    if thereisaDu == 1:
                        verbs_of_sentence.append('du')
                    if thereisaWir == 1:
                        verbs_of_sentence.append('wir')
                    if thereisanIhr == 1:
                        verbs_of_sentence.append('ihr')
                    if thereisaSie == 1:
                        verbs_of_sentence.append('sie')

            nothingtodo = 0

            if nothingtodo == 0:

                verbs_of_sentence_string = ' '.join(verbs_of_sentence)

                # Look up the closest subjunctive entry and its indicative counterpart.
                bestmatches2, matchindex2 = self.fsearch2.search_with_highest_multiplikation_Output(verbs_of_sentence_string, 1)
                #print(bestmatches2, matchindex2)

                indicative_form = self.hkldbIndi_Conju1[matchindex2[0]][0].split()
                conjunctive_form = self.hkldbIndi_Conju2[matchindex2[0]][0].split()

                #print('verbsofsentencestring', verbs_of_sentence_string)
                #print('indicative form', indicative_form)
                #print('conjunctive_form', conjunctive_form)

                therewasaconjunctive = 0
                for n in range(len(conjunctive_form)):
                    for m in range(len(verbs_of_sentence)):
                        if conjunctive_form[n] == verbs_of_sentence[m] and n != 0:
                            therewasaconjunctive = 1

                if therewasaconjunctive == 1:

                    # Pair up matching positions: [index in conjunctive_form, index in verbs_of_sentence].
                    count = 0
                    exchangeindizee = []
                    for verb in conjunctive_form:
                        count += 1
                        count2 = 0
                        for ver in verbs_of_sentence:
                            count2 += 1
                            #print(verb)
                            #print(ver)
                            if verb == ver:
                                exchangeindizee.append([count, count2])

                    #print('indicative form', indicative_form)
                    #print('the exchangeindizee ', exchangeindizee)
                    #print('verbs of sentence before split', verbs_of_sentence)
                    #print('conjunctive form', conjunctive_form)

                    # Swap each matched subjunctive verb for its indicative counterpart,
                    # but never insert a bare pronoun from the database entry.
                    for indizee in exchangeindizee:
                        #print('indizee', indizee)
                        if indicative_form[indizee[0] - 1] not in ['euch', 'ihr', 'wir', 'sie', 'du', 'er/sie/es']:
                            verbs_of_sentence[indizee[1] - 1] = indicative_form[indizee[0] - 1]

                    #print('verbs of sentence after change', verbs_of_sentence)

                donothing = 0
                if therewasaconjunctive == 0:
                    donothing = 1

                #print(conjunctive_form)
                #print(conjunctive_form[0].split())
                #print(conjunctive_form[0].split()[0])

                if thereisanIch == 1 and conjunctive_form[0].split()[0] == 'er/sie/es':
                    donothing = 1

                if donothing == 0:

                    #print(wordindex_to_replace)
                    if len(verbs_of_sentence) < len(wordindex_to_replace):
                        thelen = len(verbs_of_sentence)
                    else:
                        thelen = len(wordindex_to_replace)

                    #print('cs sentence and verbsofsentence', sentence, verbs_of_sentence, wordindex_to_replace)

                    # Write the (possibly replaced) verbs back into the sentence,
                    # preserving a leading '(' or a trailing ',', '.', ')'.
                    for n in range(thelen):
                        #print(indicative_form, wordindex_to_replace, sentence, verbs_of_sentence)

                        wasreplaced = 0

                        if sentence[wordindex_to_replace[n] - 1][-1] == ',':
                            changesent = list(sentence[wordindex_to_replace[n] - 1])
                            changesent[:-1] = list(verbs_of_sentence[n])
                            sentence[wordindex_to_replace[n] - 1] = ''.join(changesent)
                            wasreplaced = 1

                        if sentence[wordindex_to_replace[n] - 1][-1] == '.':
                            changesent = list(sentence[wordindex_to_replace[n] - 1])
                            changesent[:-1] = list(verbs_of_sentence[n])
                            sentence[wordindex_to_replace[n] - 1] = ''.join(changesent)
                            wasreplaced = 1

                        if sentence[wordindex_to_replace[n] - 1][-1] == ')':
                            changesent = list(sentence[wordindex_to_replace[n] - 1])
                            changesent[:-1] = list(verbs_of_sentence[n])
                            sentence[wordindex_to_replace[n] - 1] = ''.join(changesent)
                            wasreplaced = 1

                        if sentence[wordindex_to_replace[n] - 1][0] == '(':
                            changesent = list(sentence[wordindex_to_replace[n] - 1])
                            changesent[1:] = list(verbs_of_sentence[n])
                            sentence[wordindex_to_replace[n] - 1] = ''.join(changesent)
                            wasreplaced = 1

                        if wasreplaced == 0:
                            sentence[wordindex_to_replace[n] - 1] = verbs_of_sentence[n]

            #print(word.tag_)
            outsentences.append(sentence)
            #print('the endsentence', sentence)

        return outsentences
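    # A minimal usage sketch (an assumption, not taken from this file; the CSV name is
    # hypothetical, and the dumped .hkl / .pkl artefacts from create_hklDB_from_csv()
    # and FASTsearch must already exist on disk before load_DB_into_FASTsearch() is called):
    #
    #   cs = ConjunctSolve(None, None)
    #   cs.create_hklDB_from_csv('indicative_conjunctive_pairs.csv', False)
    #   cs.load_DB_into_FASTsearch()
    #   sentences = [['Er', 'sagte,', 'er', 'sei', 'krank', 'gewesen.']]
    #   print(cs.replaceConjunctives(sentences))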