You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

521 lines
24 KiB

4 years ago
  1. # class to implement GS utils and Search
  2. import resource
  3. class GS_Utils(object):
  def __init__(self, language):
      """Load the spaCy pipeline that the tagging helpers of this class use.

      :param language: value handed unchanged to ``spacy.load``.
      """
      # spaCy is imported here, inside the constructor, rather than at
      # module level.
      import spacy

      pipeline = spacy.load(language)
      self.nlp = pipeline
      # Not read by any method in the visible part of this file.
      self.oi = 'oi'
  10. def Sentence2GrammarSchema(self, sentence, spacyclass):
  11. doc = self.nlp(sentence)
  12. #print(doc)
  13. GsDBsentence = []
  14. for word in doc:
  15. # es eignet sich hierbei word.pos_ fuer noun und verb, word.dep_ fuer sb pd, und evtl tag
  16. if len(eval(spacyclass)) > 1:
  17. GsDBsentence.append(eval(spacyclass))
  18. return GsDBsentence
  def Sentence2RightGrammarTupel(self, sentence, gs_sentence, right_gs_tupel):
      """Re-order the words of ``sentence`` after each given grammar schema.

      :param sentence: whitespace separated words.
      :param gs_sentence: annotation of the sentence; position ``i`` is
          used as the annotation of the ``i``-th word.
      :param right_gs_tupel: iterable of schemas, each a whitespace
          separated string of annotations.
      :returns: one word list per schema.  Every schema element picks the
          first not yet used word carrying that annotation; elements
          without such a word contribute nothing.
      """
      words = sentence.split()
      reordered = []
      for schema in right_gs_tupel:
          picked = []
          taken = set()
          for wanted in schema.split():
              for idx, annotation in enumerate(gs_sentence):
                  if annotation == wanted and idx not in taken:
                      picked.append(words[idx])
                      taken.add(idx)
                      # only the first free match is used per element
                      break
          reordered.append(picked)
      return reordered
  44. # gets the best grammar scheme from both, depending on which one appears the most in both, and whether rules are still present.
  def GetBestgsAccordingRules(self, sentence, gs_sentence1, right_gs_tupel1, right_gs_tupel2, grammcorr_sentences1 , grammcorr_sentences2, rules , specialrules):
      """Pick the best scoring candidate out of ``grammcorr_sentences1``.

      Scoring, per candidate:

      * +1 for every position at which the candidate holds the same word
        as a sentence of ``grammcorr_sentences2`` (compared up to the
        shorter length);
      * +40 * rule length for every place where the candidate contains,
        contiguously, the words that a rule covers in the input sentence;
      * + candidate length for every special-rule hit.

      :param sentence: whitespace separated input sentence.
      :param gs_sentence1: annotation list of ``sentence``.
      :param right_gs_tupel1: indexed by the special-rule check (see the
          review note there).
      :param right_gs_tupel2: not used by this method.
      :param grammcorr_sentences1: candidate word lists.
      :param grammcorr_sentences2: word lists the candidates are compared to.
      :param rules: annotation sequences; only lengths 2 to 5 are looked at.
      :param specialrules: sequences; only lengths 2 and 3 are looked at.
      :returns: the candidate with the highest score (the last one on a
          tie), or ``[]`` when there are no candidates.
      """
      # Previously an empty candidate list ended in an IndexError.
      if not grammcorr_sentences1:
          return []

      # Base score: positional agreement with the second candidate set.
      equals = []
      for candidate in grammcorr_sentences1:
          equalcount = 0
          for other in grammcorr_sentences2:
              # zip stops at the shorter of the two sentences
              equalcount += sum(1 for own, their in zip(candidate, other) if own == their)
          equals.append(equalcount)

      # From here on: check whether a rule occurs in the input; if so,
      # look for its words in the candidates and raise their scores.
      words = sentence.split()
      for rule in rules:
          size = len(rule)
          if size not in (2, 3, 4, 5):
              continue
          # Words covered by the rule in the input (last occurrence wins).
          rule_words = None
          for start in range(len(gs_sentence1) - size + 1):
              if all(rule[i] == gs_sentence1[start + i] for i in range(size)):
                  rule_words = words[start:start + size]
          if rule_words is None:
              continue
          for n, candidate in enumerate(grammcorr_sentences1):
              for p in range(len(candidate) - size + 1):
                  if all(candidate[p + i] == rule_words[i] for i in range(size)):
                      equals[n] += 40 * len(rule_words)

      # NOTE(review): the indexing below (row n+i, column p) is kept exactly
      # as it was; it looks like [n][p+i] may have been intended -- confirm
      # against the callers before changing it.
      for n, candidate in enumerate(grammcorr_sentences1):
          for special in specialrules:
              size = len(special)
              if size not in (2, 3):
                  continue
              for p in range(len(candidate) - size + 1):
                  if all(right_gs_tupel1[n + i][p] == special[i] for i in range(size)):
                      equals[n] += len(candidate)

      # Highest score wins; on equal scores the later candidate is taken.
      best = max(range(len(equals)), key=lambda idx: (equals[idx], idx))
      return grammcorr_sentences1[best]
  150. def checkSPO(self, splitsentence, convertedornot):
  151. if convertedornot == 0:
  152. gs_sentenceSPOProof = self.Sentence2GrammarSchema(' '.join(splitsentence), 'word.dep_')
  153. if convertedornot == 1:
  154. gs_sentenceSPOProof = splitsentence
  155. spoCount = [0,0,0]
  156. for word in gs_sentenceSPOProof:
  157. if word == 'sb' or word == 'ep' or word == 'ph':
  158. spoCount[0] = 1
  159. if word == 'ROOT' or word == 'pd':
  160. spoCount[1] = 1
  161. if word == 'oa' or word == 'og' or word == 'oc' or word == 'op' or word == 'mo':
  162. spoCount[2] = 1
  163. return spoCount
  164. def checkForAnnotation(self, splitsentence, token, spacyclass):
  165. gs_sentence_RC_Proof = self.Sentence2GrammarSchema(' '.join(splitsentence), spacyclass)
  166. AnnoORnot = 0
  167. for word in gs_sentence_RC_Proof:
  168. if word == token:
  169. AnnoORnot = 1
  170. return AnnoORnot
  def checkForAnnotationInTokenizedSentence(self, splitsentence, token):
      """Tell whether ``token`` occurs in an already annotated sentence.

      :param splitsentence: iterable of annotation values.
      :param token: value to look for.
      :returns: 1 if any element equals ``token``, otherwise 0.
      """
      for annotation in splitsentence:
          if annotation == token:
              return 1
      return 0
  178. def checkForAnnotationTuple(self, splitsentence, token, spacyclass, tupleinwords):
  179. #self.spacyclass = spacyclass
  180. gs_sentence_RC_Proof = self.Sentence2GrammarSchema(' '.join(splitsentence), spacyclass)
  181. AnnotationtupleInwords = []
  182. AnnoORnot = 0
  183. #print(gs_sentence_RC_Proof)
  184. for n in range(len(gs_sentence_RC_Proof) - 1):
  185. if gs_sentence_RC_Proof[n] == token[0] and (splitsentence[n] == tupleinwords[0] or tupleinwords == 'None'):
  186. #print('oioioiAYE')
  187. #print(gs_sentence_RC_Proof)
  188. AnnoORnot = 1
  189. if gs_sentence_RC_Proof[n + 1] == token[1] and (splitsentence[n+1] == tupleinwords[1] or tupleinwords == 'None'):
  190. AnnoORnot = 2
  191. AnnotationtupleInwords.append(splitsentence[n:n+2])
  192. #print(token)
  193. return AnnoORnot, AnnotationtupleInwords
  194. def checkForAnnotationTriple(self, splitsentence, token, spacyclass, tripleinwords):
  195. #self.spacyclass = spacyclass
  196. gs_sentence_RC_Proof = self.Sentence2GrammarSchema(' '.join(splitsentence), spacyclass)
  197. #print('gssentencercprooof', gs_sentence_RC_Proof)
  198. AnnoORnot = 0
  199. AnnotationtripleInwords = []
  200. for n in range(len(gs_sentence_RC_Proof) - 2):
  201. if gs_sentence_RC_Proof[n] == token[0] and (splitsentence[n] == tripleinwords[0] or tripleinwords == 'None'):
  202. AnnoORnot = 1
  203. if gs_sentence_RC_Proof[n + 1] == token[1] and (splitsentence[n+1] == tripleinwords[1] or tripleinwords == 'None'):
  204. AnnoORnot = 2
  205. if gs_sentence_RC_Proof[n + 2] == token[2] and (splitsentence[n+2] == tripleinwords[2] or tripleinwords == 'None'):
  206. AnnoORnot = 3
  207. AnnotationtripleInwords.append(splitsentence[n:n+3])
  208. return AnnoORnot, AnnotationtripleInwords
  209. def checkForAnnotationQuadruple(self, splitsentence, token, spacyclass, quadrupleinwords):
  210. #self.spacyclass = spacyclass
  211. gs_sentence_RC_Proof = self.Sentence2GrammarSchema(' '.join(splitsentence), spacyclass)
  212. #print('gssentencercprooof', gs_sentence_RC_Proof)
  213. #print('quadrupleinwords',quadrupleinwords)
  214. #print('token', token)
  215. AnnoORnot = 0
  216. AnnotationquadrupleInwords = []
  217. for n in range(len(gs_sentence_RC_Proof) - 3):
  218. if gs_sentence_RC_Proof[n] == token[0] and (splitsentence[n] == quadrupleinwords[0] or quadrupleinwords == 'None'):
  219. AnnoORnot = 1
  220. if gs_sentence_RC_Proof[n + 1] == token[1] and (splitsentence[n+1] == quadrupleinwords[1] or quadrupleinwords == 'None'):
  221. AnnoORnot = 2
  222. if gs_sentence_RC_Proof[n + 2] == token[2] and (splitsentence[n+2] == quadrupleinwords[2] or quadrupleinwords == 'None'):
  223. AnnoORnot = 3
  224. if gs_sentence_RC_Proof[n + 3] == token[3] and (splitsentence[n+3] == quadrupleinwords[3] or quadrupleinwords == 'None'):
  225. AnnoORnot = 4
  226. AnnotationquadrupleInwords.append(splitsentence[n:n+4])
  227. #print('AnnotationquadrupleInwords', AnnotationquadrupleInwords)
  228. return AnnoORnot, AnnotationquadrupleInwords
  229. #input ['this', 'is', 'a', 'sentence']
  230. def GetTuplesinSentence(self,mainsentence):
  231. tuplesToCheck = []
  232. tuples = [['ART', 'NE'], ['ART', 'NN'], ['APPR','NN'], ['APPR','ADJD'], ['APPR','NE'], ['ART', 'CARD'], ['APPR', 'CARD'], ['PPOSAT', 'NN'], ['PPOSAT', 'NE'], ['ADV', 'ADJD'],['ADV', 'ADV'], ['ADV', 'PTKVZ'], ['PTKNEG', 'ADV'], ['ADJA','NN'], ['ADJA','NE'], ['ADV','PIS'], ['ADJA','PIS'], ['ADJD','PIS'], ['APPRART', 'NN'], ['APPRART', 'NE'], ['PDAT', 'NE'], ['PDAT', 'NN'], ['PWAT', 'NE'], ['PWAT', 'NN'], ['PIAT', 'NE'], ['PIAT', 'NN'], ['PROAV', 'ADJD'],['PDS', 'NE'], ['PDS', 'NN'], ['NE', 'NE'], ['CARD', 'NE'], ['CARD', 'NN'] ]
  233. #print('beginning of gettuplesinsentence')
  234. #print('inkb',resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)
  235. for tupl in tuples:
  236. #print('checking another tuple')
  237. #print('inkb',resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)
  238. checktupleindex, tupleInWords = self.checkForAnnotationTuple(mainsentence, tupl , 'word.tag_', 'None')
  239. if len(tupleInWords) > 0:
  240. for tup in tupleInWords:
  241. tuplesToCheck.append([tupl, tup])
  242. #print('oi a tuple was found')
  243. #print('after the loop')
  244. #print('inkb',resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)
  245. #print('no going to the triples')
  246. triplesToCheck = []
  247. triples = [['APPR', 'ART', 'NN'],['APPR', 'PDAT', 'NN'], ['APPR', 'PDS', 'NN'], ['ART','ADJA','NN'], ['ART','ADJA','NE'], ['APPR', 'ART', 'NE'], ['KOKOM', 'ART', 'NN'], ['KOKOM', 'ART', 'NE'], ['APPR', 'PIAT', 'NN'], ['APPR', 'ADJA', 'NN'], ['APPR', 'ADJA', 'NE'], ['APPRART', 'NN', 'CARD'], ['APPRART', 'NE', 'CARD'], ['APPRART', 'NN', 'NE'], ['CARD', 'KON', 'CARD'], ['APPR', 'ADV', 'CARD'], ['ADJD', 'KOKOM', 'CARD'], ['APPR', 'NE', 'NE'], ['NN', 'KON', 'NN'], ['NE', 'NN', 'NE'], ['APPR', 'NE', 'NN'], ['APPR', 'CARD', 'NN'], ['APPR', 'CARD', 'NE']]
  248. for tripl in triples:
  249. #print('checking next triple')
  250. checktripleindex, tripleInWords = self.checkForAnnotationTriple(mainsentence, tripl, 'word.tag_', 'None')
  251. if len(tripleInWords) > 0:
  252. for trip in tripleInWords:
  253. triplesToCheck.append([tripl, trip])
  254. #print('oi a triple was found')
  255. quadruplesToCheck = []
  256. quadruples = [['KOKOM', 'ADV', 'ADJA', 'NN'], ['KOKOM', 'ADV', 'ADJA', 'NE'], ['APPR', 'ADV', 'ADJA', 'NE'], ['APPR', 'ADV', 'ADJA', 'NN'], ['ART', 'NN', 'APPR', 'NE'], ['APPR', 'NE', 'NN', 'NE'], ['APPR', 'ART', 'ADJA', 'NN'], ['ART', 'ADJD', 'ADJA', 'NN']]
  257. for quadrupl in quadruples:
  258. #print('checking next triple')
  259. checkquadrupleindex, quadrupleInWords = self.checkForAnnotationQuadruple(mainsentence, quadrupl, 'word.tag_', 'None')
  260. if len(quadrupleInWords) > 0:
  261. for quad in quadrupleInWords:
  262. quadruplesToCheck.append([quadrupl, quad])
  263. #print('gettuples insentences is done')
  264. return tuplesToCheck, triplesToCheck, quadruplesToCheck
  def createTupleofGrammarpieces(self, sentence, tuplesToCheck, triplesToCheck, quadruplesToCheck):
      """Fuse the detected word groups of a sentence into single pieces.

      Steps:

      1. drop every tuple whose words occur contiguously inside a triple or
         quadruple, and every triple contained in a quadruple;
      2. cut bracketed passages (from a word starting with ``(`` to the
         next word ending with ``)``) out of the sentence;
      3. replace the words of every quadruple, then triple, then tuple by
         one blank-joined string;
      4. re-attach each bracketed passage to the piece ending with the word
         that preceded it, or append it at the end if there is none.

      ``sentence``, ``tuplesToCheck`` and ``triplesToCheck`` are modified in
      place.

      :param sentence: list of words.
      :param tuplesToCheck: ``[pattern, words]`` entries with two words.
      :param triplesToCheck: ``[pattern, words]`` entries with three words.
      :param quadruplesToCheck: ``[pattern, words]`` entries with four words.
      :returns: ``sentence`` (the same list object) after fusing.
      """
      def covered(words, size, longer_groups, longer_size):
          # True when the first `size` words occur contiguously in a longer group.
          for longer in longer_groups:
              for offset in range(longer_size - size + 1):
                  if all(words[i] == longer[1][offset + i] for i in range(size)):
                      return True
          return False

      def fuse(groups, size):
          # Replace every occurrence of each group's words by one joined piece.
          for group in groups:
              words = group[1]
              joined = ' '.join(words)
              pos = 0
              # The bound is re-read every round because fusing shortens
              # the list; a fixed range could run past its end.
              while pos + size <= len(sentence):
                  if all(sentence[pos + i] == words[i] for i in range(size)):
                      sentence[pos:pos + size] = [joined]
                  pos += 1

      # Filtering in place: removing entry by entry failed with ValueError
      # when one group was covered by two different longer groups.
      tuplesToCheck[:] = [
          tupl for tupl in tuplesToCheck
          if not (covered(tupl[1], 2, triplesToCheck, 3)
                  or covered(tupl[1], 2, quadruplesToCheck, 4))
      ]
      triplesToCheck[:] = [
          tripl for tripl in triplesToCheck
          if not covered(tripl[1], 3, quadruplesToCheck, 4)
      ]

      # Locate the bracketed passages.
      bracketinfos = []
      spans = []
      n = 0
      while n < len(sentence):
          if sentence[n].startswith('('):
              for m in range(n, len(sentence)):
                  if sentence[m].endswith(')'):
                      # No preceding word at index 0 (n - 1 would wrap
                      # around to the last word of the sentence).
                      wordbeforebracketinfo = sentence[n - 1] if n > 0 else None
                      bracketinfos.append([sentence[n:m + 1], wordbeforebracketinfo])
                      spans.append((n, m + 1))
                      # continue after the closing word
                      n = m
                      break
          n += 1
      # Cut by position (back to front, so earlier spans stay valid); cutting
      # by value could hit an equal word outside the brackets.
      for start, stop in reversed(spans):
          del sentence[start:stop]

      fuse(quadruplesToCheck, 4)
      fuse(triplesToCheck, 3)
      fuse(tuplesToCheck, 2)

      for bracket_words, word_before in bracketinfos:
          bracket_text = ' '.join(bracket_words)
          for n, piece in enumerate(sentence):
              parts = piece.split()
              if parts and parts[-1] == word_before:
                  # NOTE(review): joined without a separating blank, as
                  # before -- confirm whether that is intended.
                  sentence[n] = piece + bracket_text
                  break
          else:
              sentence.append(bracket_text)
      return sentence
  # NOTE: the following method is too computationally expensive.
  363. def filterpermutationsaccordingtotuples(self, sentences, tuplesToCheck, triplesToCheck):
  364. filteredprobsentences = []
  365. for sentence in sentences:
  366. tuplchecked = 0
  367. triplchecked = 0
  368. #print('sentence and tuples to check', sentence, tuplesToCheck)
  369. for tupl in tuplesToCheck:
  370. #print(list(sentence))
  371. checkedsecondtime, tupleinWords = self.checkForAnnotationTuple(sentence, tupl[0], 'word.tag_', tupl[1])
  372. #print(checkedsecondtime)
  373. if checkedsecondtime == 1:
  374. tuplchecked = 0
  375. if checkedsecondtime == 2:
  376. tuplchecked = 1
  377. for tripl in triplesToCheck:
  378. #print(sentence)
  379. checkedsecondtime, tripleinWords = self.checkForAnnotationTriple(sentence, tripl[0], 'word.tag_', tripl[1])
  380. if checkedsecondtime == 1 or checkedsecondtime == 2:
  381. triplchecked = 0
  382. if checkedsecondtime == 3:
  383. triplchecked = 1
  384. if tuplchecked == 1 or triplchecked == 1:
  385. filteredprobsentences.append(sentence)
  386. return filteredprobsentences