You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

224 lines
8.6 KiB

4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
  1. # Class to solve Shortforms, data comes from Abkuerzungen.txt
  2. import hickle as hkl
  3. import FASTsearch
  4. class SolveShorts(object):
  5. def __init__(self, hklDatabaseDir_Shorts, hklDatabaseDir_Shorts_All):
  6. self.ShortsDB_All = hkl.load(hklDatabaseDir_Shorts_All)
  7. self.ShortsDB = hkl.load(hklDatabaseDir_Shorts)
  8. # Input: csv file with the form ['d.h.', n] , ['das', 'heißt'] for each line
  9. # Output: hkl dump of array in form [[1],[d.h.],['das', 'heißt']]
  10. def create_hklDB_from_csv(self, csvDbDir):
  11. with open(csvDbDir) as lines:
  12. ShortsDB_All = []
  13. for line in lines:
  14. ShortsDB_All.append(list(eval(line)))
  15. #print(ShortsDB_All)
  16. #print(ShortsDB_All[0][0])
  17. hkldbShorts = []
  18. counter = 0
  19. for n in range(len(ShortsDB_All)):
  20. counter += 1
  21. #if counter % 1000 == 0:
  22. #print(counter)
  23. hkldbShorts.append([ShortsDB_All[n][0][0]])
  24. #print('hkldbShorts', hkldbShorts)
  25. #print('creating the hkl dump of ShortsDBAll')
  26. hkl.dump(ShortsDB_All, 'hkldbShorts_All.hkl', mode='w', compression='gzip')
  27. #print('done..')
  28. #print('Creating the hkl dump of ShortsDB')
  29. hkl.dump(hkldbShorts, 'hkldbShorts.hkl', mode='w', compression='gzip')
  30. #print('done..')
  31. return 'done'
  32. def load_DB_into_FASTsearch(self):
  33. #print('loading hkldbShorts ..')
  34. self.fsearch1 = FASTsearch.FASTsearch('hkldbShorts.hkl')
  35. #print('done')
  36. #print('generating BoW Model..')
  37. #self.fsearch1.Gen_BoW_Model(3000, "word", punctuation = True)
  38. #print('done')
  39. #print('loading the bow model')
  40. self.fsearch1.Load_BoW_Model('bagofwordshkldbShorts.pkl', 'DataBaseOneZeroshkldbShorts.hkl')
  41. #print('done')
  42. import spacy
  43. #print('loading the german spacy model..')
  44. self.nlp = spacy.load('de_core_news_sm')
  45. #print('done')
  46. #print('oi thats the get_feature_names', self.fsearch1.vectorizer.get_feature_names())
  47. def ExplainShortsInSentencesWithBrackets(self, sentences):
  48. outsentences = []
  49. count = 0
  50. for sentence in sentences:
  51. count += 1
  52. #print('processing sentence', count)
  53. nshort = []
  54. therewasapossibleshort = 0
  55. explanationlist = []
  56. doc = self.nlp(' '.join(sentence))
  57. #print('da sentence', sentence)
  58. newshorts = []
  59. wordcount = 0
  60. for oriword in sentence:
  61. wordcount += 1
  62. if wordcount == len(sentence):
  63. word = oriword + '.'
  64. else:
  65. word = oriword
  66. newshort = []
  67. prenewshort = []
  68. punctcount = list(word).count('.')
  69. #print(word, list(word), punctcount)
  70. if punctcount > 1:
  71. replaceindex = sentence.index(oriword)
  72. dacount = 0
  73. for letter in list(word):
  74. #print('letter in word split', letter)
  75. prenewshort.append(letter)
  76. if letter == '.':
  77. dacount += 1
  78. newshort.append(''.join(prenewshort))
  79. prenewshort = []
  80. if dacount == punctcount:
  81. newshorts.append([newshort, replaceindex])
  82. #print(newshorts)
  83. for newshort in newshorts[::-1]:
  84. if len(newshort) > 0:
  85. del sentence[newshort[1]]
  86. for part in newshort[0][::-1]:
  87. sentence.insert(newshort[1], part)
  88. #print('sentence after newshortreplace', sentence)
  89. for n in range(len(sentence)):
  90. NhasToBeChecked = True
  91. for r in range(len(explanationlist)):
  92. if explanationlist[r][3] <= n < explanationlist[r][1]:
  93. NhasToBeChecked = False
  94. # Liste von falsch erkannten, zb er sollte nicht erkannt werden :)
  95. if sentence[n] in ['Er', 'er', 'ab', 'Ab', 'so', 'da', 'an', 'mit', 'Am', 'am']:
  96. NhasToBeChecked = False
  97. if n != 0 and sentence[n][-1] != '.' and doc[n - 1].dep_[:2] != 'ART':
  98. NhasToBeChecked = False
  99. if NhasToBeChecked == True:
  100. bestmatches1, matchindex = self.fsearch1.search_with_highest_multiplikation_Output(sentence[n], 1)
  101. #print(bestmatches1, matchindex)
  102. interestingindex = 0
  103. if sentence[n][-1] == '.':
  104. #print(sentence[n])
  105. #print('oioioioioi')
  106. if len(sentence) - n > 5:
  107. for m in range(5):
  108. #print(n, m, n+m+1, len(sentence))
  109. if sentence[n + m][-1] == '.' and sentence[n + m + 1][-1] != '.':
  110. interestingindex = m
  111. break
  112. if len(sentence) - n <= 5 and n != len(sentence) - 1:
  113. for m in range((len(sentence) - n)):
  114. #print('oleolaolu',n, m, n+m+1, len(sentence))
  115. if m == (len(sentence) - n) - 1:
  116. if sentence[n + m][-1] == '.':
  117. interestingindex = m
  118. break
  119. else:
  120. if sentence[n + m][-1] == '.' and sentence[n + m + 1][-1] != '.' :
  121. interestingindex = m
  122. break
  123. #print(interestingindex, 'interestingindex')
  124. if interestingindex == 0:
  125. finalmatchindex = matchindex
  126. if interestingindex >= 1:
  127. thesentence = ''
  128. for i in range(interestingindex + 1):
  129. #print('sentence', sentence[n+i])
  130. #print(thesentence + sentence[n+i])
  131. if i == 0:
  132. presentence = sentence[n + i]
  133. if i >= 1:
  134. presentence = ' ' + sentence[n + i]
  135. thesentence = thesentence + presentence
  136. #print('thesentence',thesentence)
  137. mbestmatches, mmatchindex = self.fsearch1.search_with_highest_multiplikation_Output(thesentence , 1)
  138. #print(mmatchindex)
  139. finalmatchindex = mmatchindex
  140. if finalmatchindex[1] == 1:
  141. wordexplanationIndex = finalmatchindex[0]
  142. wordexplanation = self.ShortsDB_All[wordexplanationIndex][1]
  143. explanationlist.insert(0, [wordexplanation, n + interestingindex + 1, interestingindex, n])
  144. #print('explanationlist', explanationlist)
  145. for i in range(len(explanationlist)):
  146. for k in range(len(explanationlist)):
  147. if explanationlist[i][3] == explanationlist[k][3] and i != k:
  148. if explanationlist[i][2] > explanationlist[k][2]:
  149. del explanationlist[k]
  150. if explanationlist[i][2] < explanationlist[k][2]:
  151. del explanationlist[i]
  152. for j in range(len(explanationlist)):
  153. sentence.insert(explanationlist[j][1], '(' + ' '.join(explanationlist[j][0]) + ')')
  154. #print(sentence)
  155. outsentences.append(sentence)
  156. # if uebereinstimmung, go to index and exchange
  157. return outsentences