# split sentences
# 'sondern' is still missing from the lists (and a few other things..)
# The following conjunctions need no sentence transformation:
# woraufhin, zudem, zumal, umso - desto
# 'sondern' is hard to handle.. best to drop 'sondern' and run SentGlue afterwards
class SentSeg(object):

    def __init__(self, language):
        self.language = language
        self.punktuation_list = ['.', '?', '!', ';', ':']
        self.wrappunktuation_list = [',', '-']
        self.adversativ_list = ['wohingegen', 'Wohingegen', 'aber', 'Aber', 'wobei', 'Wobei', 'hingegen']
        self.final_list = ['damit', 'Damit', 'um', 'Um']
        self.kausal_list = ['weil', 'Weil', 'da', 'Da', 'denn', 'falls', 'Falls']
        self.konditional_list = ['wenn', 'Wenn', 'sobald', 'Sobald', 'als', 'falls']
        self.konsekutiv_list = ['dass', 'Dass']
        self.konzessiv_list = ['obwohl', 'Obwohl', 'obgleich', 'Obgleich', 'trotzdem', 'Trotzdem', 'wenngleich', 'doch']
        self.lokal_list = ['wo', 'Wo']
        self.temporal_list_vor = ['bevor', 'Bevor']
        self.temporal_list_nach = ['nachdem', 'Nachdem']
        self.instrumental_list = ['indem', 'Indem']
        self.indirectspeech_list = ['ob', 'Ob', 'wann', 'Wann', 'wer', 'Wer', 'wie', 'Wie', 'warum', 'Warum', 'weshalb', 'Weshalb', 'wieso', 'Wieso']
        self.firstwordlist = []
        #self.firstwordlist = ['wann', 'Wann', 'wer', 'Wer', 'wie', 'Wie', 'warum', 'Warum', 'weshalb', 'Weshalb', 'wieso', 'Wieso', 'dies', 'dann', 'jedoch', 'deswegen', 'trotzdem', 'danach', 'davor', 'wenn', 'sobald']
        self.full_list = self.adversativ_list + self.final_list + self.kausal_list + self.konditional_list + self.konsekutiv_list + self.konzessiv_list + self.lokal_list + self.temporal_list_nach + self.temporal_list_vor + self.instrumental_list + self.indirectspeech_list
    def ReadDoc2Sent(self, document):
        splitsentences = []
        splitsentence = []
        with open(document) as sentences:
            counter = 0
            for sentence in sentences:
                counter += 1
                if counter % 1000 == 0:
                    print(counter)
                words = sentence.split()
                for word in words:
                    splitsentence.append(word)
                    if (word[-1] in self.punktuation_list or word in self.punktuation_list) and len(word) > 2:
                        splitsentences.append([splitsentence])
                        splitsentence = []
        return splitsentences
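    # A minimal usage sketch for ReadDoc2Sent (illustrative only; the filename and
    # the example text are assumptions, not part of this module):
    #
    #   seg = SentSeg('de')
    #   sents = seg.ReadDoc2Sent('input.txt')
    #   # each entry wraps one tokenized sentence:
    #   # [[['Das', 'ist', 'ein', 'Satz.']], [['Noch', 'ein', 'Satz.']], ...]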
    def AndOrSolver(self, sentences, punctuations):
        for n in range(len(punctuations)):
            if punctuations[n] == ':' or punctuations[n] == '-':
                punctuations[n] = '.'
        splitsentences = []
        counter = 0
        newsentences = []
        for sentence in sentences:
            newpunctuationsindexes = []
            utterancenumber = sentence[2]
            commainfo = sentence[1]
            commaornot = commainfo[0]
            sentence = sentence[0]
            counter += 1
            doc = self.nlp(' '.join(sentence))
            subjectcount = 0
            separationwords = []
            subjectcounts = []
            doccounter = 0
            subjectindex = []
            rcornot = 0
            for word in doc:
                doccounter += 1
                if word.dep_ == 'sb' or word.dep_ == 'ep':
                    subjectcount += 1
                    subjectindex.append(doccounter - 1)
                if word.dep_ == 'rc':
                    rcornot = 1
                if word.tag_ == '$,':
                    subjectcounts.append([subjectcount, doccounter - 2, subjectindex, rcornot])
                    subjectindex = []
                    subjectcount = 0
                    if len(sentence[doccounter - 2]) > 1:
                        doccounter -= 1
                if word.text == 'und' or word.text == 'also' or word.text == 'oder' or word.text == 'schon' or word.text == 'bald' or word.text == 'doch' or word.text == 'jedoch' or word.text == 'sondern':
                    separationwords.append(doccounter - 1)
            #print('separationwords', separationwords)
            #print('subjectcounts', subjectcounts)
            separationwordstocut = []
            listofownsentencessubjectindexes = []
            for n in range(len(subjectcounts) - 1):
                if subjectcounts[n][0] > 0 and subjectcounts[n + 1][0] > 0 and subjectcounts[n + 1][3] == 0:
                    listofownsentencessubjectindexes.append(subjectcounts[n])
                    for m in range(len(separationwords)):
                        if subjectcounts[n][1] < separationwords[m] < subjectcounts[n + 1][1]:
                            if subjectcounts[n + 1][0] > 1:
                                if subjectcounts[n + 1][2][0] < separationwords[m] <= subjectcounts[n + 1][2][-1]:
                                    separationwordstocut.append(separationwords[m])
            processed = 0
            if len(listofownsentencessubjectindexes) > 0:
                for n in range(len(listofownsentencessubjectindexes)):
                    sentence[listofownsentencessubjectindexes[n][1]] = sentence[listofownsentencessubjectindexes[n][1]] + 'alohaseparator'
                    newpunctuationsindexes.append([punctuations[counter - 1], counter - 1])
                processed = 1
            if len(separationwordstocut) > 0:
                for n in range(len(separationwordstocut)):
                    sentence[separationwordstocut[n] - 1] = sentence[separationwordstocut[n] - 1] + 'alohaseparator'
                    newpunctuationsindexes.append([punctuations[counter - 1], counter - 1])
                processed = 1
            if processed == 0:
                newsentences.append([sentence])
            if processed == 1:
                splitsentence = []
                for word in sentence:
                    splitsentence.append(word)
                    if word[-14:] == 'alohaseparator':
                        if splitsentence[-1][-15] == ',':
                            splitsentence[-1] = splitsentence[-1][:-15]
                        else:
                            splitsentence[-1] = splitsentence[-1][:-14]
                        newsentences.append([splitsentence])
                        splitsentence = []
                newsentences.append([splitsentence])
            newpunctuationsindexes = newpunctuationsindexes[::-1]
            for n in range(len(newpunctuationsindexes)):
                punctuations.insert(newpunctuationsindexes[n][1], newpunctuationsindexes[n][0])
        return newsentences, punctuations
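    # How the cut works, in short: AndOrSolver appends the marker string
    # 'alohaseparator' to the last word of a clause and afterwards starts a new
    # sentence at every marked word. A sketch of the effect (the example sentence
    # is an assumption):
    #
    #   ['Er', 'lacht,alohaseparator', 'und', 'sie', 'weint']
    #   -> [['Er', 'lacht']], [['und', 'sie', 'weint']]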
    def LoadBoWModelAndDatabaseOnesZeros(self):
        import FASTsearch
        #print('loading the tag hkl db..')
        self.fsearch1 = FASTsearch.FASTsearch('GS_DB_word.tag_.hkl')
        #print('generating BoW Model..')
        self.fsearch1.Gen_BoW_Model(1000, "word")
        #print('loading the bow model')
        self.fsearch1.Load_BoW_Model('bagofwordsGS_DB_word.tag_.pkl', 'DataBaseOneZerosGS_DB_word.tag_.hkl')
        #print('loading the dep hkl db..')
        self.fsearch2 = FASTsearch.FASTsearch('GS_DB_word.dep_.hkl')
        #print('generating BoW Model..')
        self.fsearch2.Gen_BoW_Model(1000, "word")
        #print('loading the bow model')
        self.fsearch2.Load_BoW_Model('bagofwordsGS_DB_word.dep_.pkl', 'DataBaseOneZerosGS_DB_word.dep_.hkl')
    def LoadSentGlueSGDandGSUtils(self):
        import GS_Utils
        #print('initializing the gs utils..')
        self.gs = GS_Utils.GS_Utils('de_core_news_sm')
        from SentGlue import SentGlueMach
        #print('loading the Stochastic Gradient models..')
        self.sgm = SentGlueMach('trainedSGD_twolabel.pkl', 'bagofwordstwolabel.pkl')
        #print('initializing the SGM..')
        self.sgm.initialize()
        #print('importing spacy and the german model..')
        import spacy
        self.nlp = spacy.load('de_core_news_sm')
        return 'done'
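    # Note on initialization order (an observation, not enforced anywhere):
    # LoadSentGlueSGDandGSUtils() sets self.nlp, self.gs and self.sgm, which
    # almost every method below dereferences, so it has to run before
    # AndOrSolver, CommaSentenceOrNot, EnumerationSolver and the splitters:
    #
    #   seg = SentSeg('de')
    #   seg.LoadSentGlueSGDandGSUtils()
    #   seg.LoadBoWModelAndDatabaseOnesZeros()  # only needed for the fsearch fallback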
    def CommaSentenceOrNot(self, sentences):
        nlp = self.nlp
        commasentences = []
        counter = 0
        # create the array of comma-or-not info
        for sentence in sentences:
            doc = nlp(' '.join(sentence[0]))
            counter += 1
            n = 0
            firstone = 0
            token = []
            nextword = 0
            for word in doc:
                # word.pos_ works well here for noun and verb, word.dep_ for sb and pd, and possibly tag
                if firstone == 0:
                    token.append(word.text)
                    firstone = 1
                if nextword == 1:
                    token.append(word.text)
                    nextword = 0
                if word.tag_ == '$,':
                    n += 1
                    nextword = 1
            sentence.append([n, token])
            commasentences.append(sentence)
        return commasentences
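    # Shape sketch: CommaSentenceOrNot appends [comma_count, token] to each
    # sentence entry, where token holds the first word of the sentence plus the
    # first word after every comma (the example is an assumption):
    #
    #   [['Er', 'lacht,', 'weil', 'sie', 'weint']]
    #   -> [['Er', 'lacht,', 'weil', 'sie', 'weint'], [1, ['Er', 'weil']]]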
    def EnumerationSolver(self, sentences):
        gs = self.gs
        nlp = self.nlp
        sgm = self.sgm
        enumerationsentences = []
        counter = 0
        NOTenumerations = []
        #print('processing enumerations..')
        for sentence in sentences:
            doc = nlp(' '.join(sentence[0]))
            counter += 1
            n = 0
            firstone = 0
            token = []
            nextword = 0
            enumeration = False
            splitsentence = []
            splitsentence_deps = []
            splitsentence_tags = []
            splitsentences = []
            splitsentences_deps = []
            splitsentences_tags = []
            for word in doc:
                # word.pos_ works well here for noun and verb, word.dep_ for sb and pd, and possibly tag
                nextword = 0
                if word.tag_ == '$,':
                    n += 1
                    nextword = 1
                if (word.text == 'und' or word.text == 'oder') and n >= 1:
                    enumeration = True
                    break
            output = []
            if enumeration == True:
                for word in doc:
                    if word.text != ',' and word.text != '.' and word.text != 'und':
                        splitsentence.append(word.text)
                        splitsentence_deps.append(word.dep_)
                        splitsentence_tags.append(word.tag_)
                    if word.text == ',' or word.text == 'und':
                        splitsentences.append(splitsentence)
                        splitsentences_deps.append(splitsentence_deps)
                        splitsentences_tags.append(splitsentence_tags)
                        splitsentence = []
                        splitsentence_deps = []
                        splitsentence_tags = []
                splitsentences.append(splitsentence)
                splitsentences_deps.append(splitsentence_deps)
                splitsentences_tags.append(splitsentence_tags)
                #print('splitsentences', splitsentences)
                token = []
                enumerations = []
                enumerationsSPOs = []
                NOTenumerations = []
                for sentence in splitsentences:
                    token.append(sentence[0])
                    if sentence[0] not in self.full_list:
                        enumerations.append(sentence)
                        enumerationsSPOs.append(gs.checkSPO(sentence, 0))
                    else:
                        NOTenumerations.append(sentence)
                biggest = []
                for i in range(len(enumerationsSPOs)):
                    biggest.append([i, sum(enumerationsSPOs[i])])
                sortedbiggest = sorted(biggest[::-1], key=lambda tup: tup[1], reverse=True)
                for i in range(len(sortedbiggest)):
                    if sortedbiggest[i][0] == 0:
                        mainsentenceIndex = sortedbiggest[i][0]
                        lastornot = 0
                        break
                    if sortedbiggest[i][0] == len(biggest) - 1:
                        mainsentenceIndex = sortedbiggest[i][0]
                        lastornot = 1
                        break
                # The case 'Er, sie und der Beamte LACHTEN den Clown aus' still has to be
                # intercepted here with the database of cases, i.e. for an enumeration with
                # spo 1 0 0 the plural verb has to become singular, depending on the articles.
                mainsentence = enumerations[mainsentenceIndex]
                probablemainsentences = []
                for i in range(len(enumerations)):
                    if i != mainsentenceIndex:
                        iprobablemainsentences = []
                        probablemainsentence = []
                        if lastornot == 0:
                            for j in range(1, len(mainsentence)):
                                probablemainsentence = mainsentence[0:j] + enumerations[i]
                                iprobablemainsentences.append(' '.join(probablemainsentence))
                        if lastornot == 1:
                            for j in range(1, len(mainsentence)):
                                probablemainsentence = enumerations[i] + mainsentence[-j:]
                                iprobablemainsentences.append(' '.join(probablemainsentence))
                        probablemainsentences.append(iprobablemainsentences)
                # here it also checks for 'noch da', but in this case it is more important
                # that a tuple does not show up torn apart. CHANGE !!!!
                tuplesToCheck = []
                tuples = [['ART', 'NN'], ['APPR', 'NN'], ['ART', 'CARD']]
                for tupl in tuples:
                    checktupleindex, tupleInWords = gs.checkForAnnotationTuple(mainsentence, tupl, 'word.tag_', 'None')
                    if checktupleindex == 2:
                        tuplesToCheck.append([tupl, tupleInWords])
                triplesToCheck = []
                triples = [['ART', 'ADJA', 'NN'], ['APPR', 'ART', 'NN'], ['KOKOM', 'ART', 'NN']]
                for tripl in triples:
                    checktripleindex, tripleInWords = gs.checkForAnnotationTriple(mainsentence, tripl, 'word.tag_', 'None')
                    if checktripleindex == 3:
                        triplesToCheck.append([tripl, tripleInWords])
                for probsentences in probablemainsentences:
                    checktripleindexes = []
                    checktupleindexes = []
                    filteredprobsentences = []
                    for sentence in probsentences:
                        tuplchecked = 0
                        triplchecked = 0
                        for tupl in tuplesToCheck:
                            checkedsecondtime, tupleinWords = gs.checkForAnnotationTuple(sentence.split(), tupl[0], 'word.tag_', tupl[1])
                            if checkedsecondtime == 1:
                                tuplchecked = 0
                            if checkedsecondtime == 2:
                                tuplchecked = 1
                        for tripl in triplesToCheck:
                            checkedsecondtime, tripleinWords = gs.checkForAnnotationTriple(sentence.split(), tripl[0], 'word.tag_', tripl[1])
                            if checkedsecondtime == 1 or checkedsecondtime == 2:
                                triplchecked = 0
                            if checkedsecondtime == 3:
                                triplchecked = 1
                        if triplchecked == 1 or tuplchecked == 1:
                            filteredprobsentences.append(sentence)
                    if len(filteredprobsentences) == 0:
                        filteredprobsentences = probsentences
                    # here there is still the problem that these are lists of words instead of proper sentences..
                    probsMatrix = sgm.predictprobsOnSentenceList(filteredprobsentences, filteredprobsentences)
                    for i in range(len(probsMatrix)):
                        probsMatrix[i][0] = i
                    sortedprobsMatrix = sorted(probsMatrix[::-1], key=lambda tup: tup[1], reverse=True)
                    bestindex = sortedprobsMatrix[0][0]
                    probablemainsentence = filteredprobsentences[int(bestindex)]
                    enumerationsentences.append([probablemainsentence])
                enumerationsentences.append([' '.join(mainsentence)])
                for notenum in NOTenumerations:
                    enumerationsentences[-1].append(' '.join(notenum))
                    enumerationsentences[-1] = [', '.join(enumerationsentences[-1])]
            else:
                enumerationsentences.append([sentence])
            output.append(enumerationsentences)
        for n in range(len(output[0])):
            try:
                output[0][n] = [output[0][n][0].split()]
            except:
                output[0][n] = [output[0][n][0][0]]
        return output[0]
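    # Rough idea of EnumerationSolver on one sentence (example and result are
    # illustrative assumptions): 'Er kauft Brot, Milch und Eier.' is cut at the
    # commas and at 'und', the chunk with the highest SPO score is taken as the
    # main clause, every other chunk is recombined with growing parts of it, and
    # the SentGlue model then keeps the most grammatical recombination.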
    def GetUtteranceNumber(self, sentences):
        nlp = self.nlp
        uttersentences = []
        for sentence in sentences:
            doc = nlp(' '.join(sentence[0]))
            subjectcount = 0
            for word in doc:
                if word.dep_ == 'sb' or word.dep_ == 'ep':
                    subjectcount += 1
            sentence.append(subjectcount)
            uttersentences.append(sentence)
        return uttersentences
    def GetQuestionOrNot(self, sentences):
        nlp = self.nlp
        uttersentences = []
        for sentence in sentences:
            doc = nlp(' '.join(sentence[0]))
            count = 0
            # reset per sentence: the flag was originally initialized once outside
            # the loop, so one question marked every following sentence as a question
            questionmark = 0
            for word in doc:
                count += 1
                if word.text == '?':
                    questionmark = 1
            sentence.append(questionmark)
            uttersentences.append(sentence)
        return uttersentences
    def SplitSentencesIntoHauptNebenTuple(self, sentences, punctuations):
        oldsplitsentences = []
        gs = self.gs
        import spacy
        nlp = self.nlp
        outputsentences = []
        sentencesThatAreOutoutput = []
        outsentences = []
        for generalindex in range(len(sentences)):
            presentence = sentences[generalindex]
            splitsentence = []
            splitsentence_deps = []
            splitsentence_tags = []
            splitsentences = []
            splitsentences_deps = []
            splitsentences_tags = []
            commainfo = presentence[1]
            outputsentence = []
            token = commainfo[1]
            commaornot = commainfo[0]
            numberutterances = presentence[2]
            sentence = presentence[0]
            oldsentence = presentence[0]
            if commaornot >= 2:
                sentence[0] = sentence[0].title()
                doc = nlp(' '.join(sentence))
                for word in doc:
                    if word.text != ',' and word.text != '.':
                        splitsentence.append(word.text)
                        splitsentence_deps.append(word.dep_)
                        splitsentence_tags.append(word.tag_)
                    if word.text == ',':
                        splitsentences.append(splitsentence)
                        splitsentences_deps.append(splitsentence_deps)
                        splitsentences_tags.append(splitsentence_tags)
                        splitsentence = []
                        splitsentence_deps = []
                        splitsentence_tags = []
                splitsentences.append(splitsentence)
                splitsentences[0][0] = splitsentences[0][0].lower()
                splitsentences_deps.append(splitsentence_deps)
                splitsentences_tags.append(splitsentence_tags)
                oldsplitsentences = splitsentences
                spo = []
                for n in range(len(splitsentences)):
                    prespo = []
                    prespo = gs.checkSPO(splitsentences_deps[n], 1)
                    prespo.append(gs.checkForAnnotation(splitsentences[n], 'VVINF', 'word.tag_'))
                    prespo.append(gs.checkForAnnotation(splitsentences[n], 'VAFIN', 'word.tag_'))
                    prespo.append(gs.checkForAnnotation(splitsentences[n], 'VVFIN', 'word.tag_'))
                    prespo.append(gs.checkForAnnotation(splitsentences[n], 'VMFIN', 'word.tag_'))
                    spo.append(prespo)
                indexSPO = []
                lastm = len(splitsentences)
                for o in range(len(splitsentences)):
                    m = len(splitsentences) - 1 - o
                    for n in range(len(splitsentences)):
                        if m < n - 1 and n < lastm:
                            sb = spo[m][0] + spo[n][0]
                            Vafin = 1
                            if spo[m][3] == 1 or spo[n][3] == 1:
                                Vafin = spo[m][3] + spo[n][3]
                            Vvinf = 1
                            if spo[m][4] == 1 or spo[n][4] == 1:
                                Vvinf = spo[m][4] + spo[n][4]
                            Vvfin = 1
                            if spo[m][5] == 1 or spo[n][5] == 1:
                                Vvfin = spo[m][5] + spo[n][5]
                            Vmfin = 1
                            if spo[m][6] == 1 or spo[n][6] == 1:
                                Vmfin = spo[m][6] + spo[n][6]  # fixed: was a no-op comparison ('==')
                            if sb == 1 and Vafin == 1 and Vvinf == 1 and (Vvfin == 1 or Vmfin == 1):
                                indexSPO.append([m, n])
                                lastm = m
                Hauptsentences = []
                for n in range(len(indexSPO)):
                    if indexSPO[n][0] > indexSPO[n][1]:
                        i = 1
                        j = 0
                    else:
                        i = 0
                        j = 1
                    Hauptsentences.append([splitsentences[indexSPO[n][i]] + splitsentences[indexSPO[n][j]], indexSPO[n][i], indexSPO[n][j]])
                HauptSentences = []
                for n in range(len(Hauptsentences)):
                    m = len(Hauptsentences) - 1 - n
                    HauptSentences.append(Hauptsentences[m])
                sentencesThatAreOut = []
                for n in range(len(HauptSentences)):
                    index = HauptSentences[n][1]
                    finish = 0
                    if n == len(HauptSentences) - 1:
                        stopindex = len(splitsentences)
                        finish = 1
                    else:
                        stopindex = HauptSentences[n + 1][1]
                    vvfinisthere = 0
                    if finish == 0:
                        if splitsentences_tags[stopindex][0] == 'VVFIN':
                            stopindex -= 1
                            vvfinisthere = 1
                    if splitsentences_tags[index][0] == 'VVFIN':
                        vvfinisthere = 1
                    if vvfinisthere == 1:
                        HNTuple = HauptSentences[n][0] + [','] + splitsentences[index - 1]
                        outputsentence.append(HNTuple)
                        sentencesThatAreOut.append(index - 1)
                        sentencesThatAreOut.append(Hauptsentences[n][1])
                        sentencesThatAreOut.append(Hauptsentences[n][2])
                    for m in range(index + 1, stopindex):
                        if m != HauptSentences[n][2]:
                            HNTuple = HauptSentences[n][0] + [','] + splitsentences[m]
                            outputsentence.append(HNTuple)
                            sentencesThatAreOut.append(m)
                            sentencesThatAreOut.append(Hauptsentences[n][1])
                            sentencesThatAreOut.append(Hauptsentences[n][2])
                sentencesThatAreOutoutput.append(sentencesThatAreOut)
                cpOrNots = []
                rcOrNots = []
                for splitsentence in splitsentences_deps:
                    cpOrNot = gs.checkForAnnotationInTokenizedSentence(splitsentence, 'cp')
                    cpOrNots.append(cpOrNot)
                    rcOrNot = gs.checkForAnnotationInTokenizedSentence(splitsentence, 'rc')
                    rcOrNots.append(rcOrNot)
                pairs = []
                for n in range(len(cpOrNots)):
                    index = len(cpOrNots) - 1 - n
                    done = 0
                    if rcOrNots[index] == 1:
                        pairs.append([index, index - 1])
                        done = 1
                    if done == 0 and cpOrNots[index] == 1:
                        try:
                            if splitsentences_tags[index + 1][0] == 'VVFIN':
                                pairs.append([index, index + 1])
                                done = 1
                        except:
                            pass
                        try:
                            if done == 0 and rcOrNots[index - 1] == 0:
                                pairs.append([index, index - 1])
                                done = 1
                        except:
                            pass
                        try:
                            if done == 0 and rcOrNots[index - 1] == 1:
                                if rcOrNots[index - 2] == 0:
                                    pairs.append([index, index - 2])
                        except:
                            pass
                for pair in pairs[::-1]:
                    if pair[0] not in set(sentencesThatAreOut) or pair[1] not in set(sentencesThatAreOut):
                        outputsentence.append(splitsentences[pair[1]] + [','] + splitsentences[pair[0]])
                sentences[generalindex][0] = outputsentence
            try:
                if type(sentences[generalindex][0][0]) == str:
                    sentences[generalindex][0] = [sentences[generalindex][0]]
            except:
                pass
            newgeneratedsentences = len(sentences[generalindex][0])
            if newgeneratedsentences > 1:
                for sentence in sentences[generalindex][0]:
                    punctuations.insert(generalindex, punctuations[generalindex])
                    outsentences.append(sentence)
                del punctuations[generalindex]
            if newgeneratedsentences == 1:
                if len(sentences[generalindex][0][0]) > 1:
                    outsentences.append(sentences[generalindex][0][0])
                else:
                    outsentences.append(oldsentence)
            if newgeneratedsentences == 0:
                outsentences.append(oldsentence)
        # connect alone-standing commas with the preceding word
        for outsentence in outsentences:
            todelete = []
            for n in range(len(outsentence)):
                if outsentence[n] == ',':
                    todelete.append(n)
                    outsentence[n - 1] = outsentence[n - 1] + ','
            for deleteindex in todelete[::-1]:
                del outsentence[deleteindex]
        for index in range(len(outsentences)):
            outsentences[index] = [outsentences[index]]
        # removing doubles
        doubledsentences = []
        for o in range(len(outsentences)):
            sentence = outsentences[o][0]
            for m in range(len(outsentences)):
                if m != o:
                    count = 0
                    for n in range(len(sentence)):
                        if sentence[n] in outsentences[m][0] or sentence[n][:-1] in outsentences[m][0]:
                            count += 1
                    if count == len(sentence):
                        doubledsentences.append(sentence)
        punctdeleteindex = []
        tmp = set()
        for sentence in doubledsentences:
            tmp.add(tuple(sentence))
        doubledsentences = []
        for tup in tmp:
            doubledsentences.append([list(tup)])
        punctdeleteindexes = []
        for double in doubledsentences:
            if double in outsentences:
                punctdeleteindex = outsentences[::-1].index(double)
                del outsentences[len(outsentences) - 1 - punctdeleteindex]
                punctdeleteindexes.append(punctdeleteindex)
        for index in punctdeleteindexes[::-1]:
            del punctuations[len(outsentences) - 1 - index]
        for o in range(len(oldsplitsentences)):
            for m in range(len(outsentences)):
                counter = 0
                for n in range(len(oldsplitsentences[o])):
                    if oldsplitsentences[o][n] in outsentences[m][0] or oldsplitsentences[o][n] + ',' in outsentences[m][0]:
                        counter += 1
                if counter >= len(oldsplitsentences[o]):
                    break
                if m == len(outsentences) - 1 and counter < len(oldsplitsentences[o]):
                    if o == 0:
                        outsentences.insert(0, [oldsplitsentences[o]])
                        punctuations.insert(0, punctuations[0])
                    else:
                        newones = []
                        for i in range(len(outsentences)):
                            if outsentences[i][0][-1] == oldsplitsentences[o - 1][-1]:
                                if len(outsentences[i][0]) > 2 and len(oldsplitsentences[o - 1]) > 2:
                                    if outsentences[i][0][-2] == oldsplitsentences[o - 1][-2]:
                                        if outsentences[i][0][-3] == oldsplitsentences[o - 1][-3]:
                                            newones.append([i + 1, [oldsplitsentences[o]]])
                        for newone in newones[::-1]:
                            outsentences.insert(newone[0], newone[1])
                            punctuations.insert(newone[0], punctuations[newone[0] - 1])
        return outsentences, punctuations
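    # Expected input shape for the splitters (inferred from the fields they read):
    # each element is [tokenlist, [comma_count, token], utterance_count], i.e. the
    # output of CommaSentenceOrNot followed by GetUtteranceNumber, and punctuations
    # holds one end mark per sentence, kept in sync on every split.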
    # Note: the input here must always be pairs of main clause/subordinate clause,
    # i.e. a further upstream class is needed.
    def SplitCommatas(self, Inputsentences, punctuations):
        gs = self.gs
        nlp = self.nlp
        gramcorr_splitsentences = []
        counter = 0
        newpunctuationsindex = []
        for Inputsentence in Inputsentences:
            counter += 1
            commainfo = Inputsentence[1]
            token = commainfo[1]
            commaornot = commainfo[0]
            numberutterances = Inputsentence[2]
            if commaornot == 0:
                gramcorr_splitsentences.append(Inputsentence[0])
            if commaornot > 1:
                gramcorr_splitsentences.append(Inputsentence[0])
            if commaornot == 1:
                oldsentence = Inputsentence[0]
                Inputsentence = [[Inputsentence[0]]]
                for sentence in Inputsentence[0]:
                    splitsentence = []
                    splitsentences = []
                    processed = 0
                    wasNotInAnyList = 0
                    try:
                        for n in range(len(token)):
                            if token[n] in self.final_list:
                                splitsentence = []
                                for word in sentence:
                                    if word != token[n]:
                                        if word[-1] == ',':
                                            splitsentence.append(word[:-1])
                                        if word[-1] != ',':
                                            splitsentence.append(word)
                                    if word[-1] == ',' or word == ',':
                                        splitsentences.append(splitsentence)
                                        splitsentence = []
                                splitsentences.append(splitsentence)
                                if n == 1:
                                    if token[n] == 'um' or token[n] == 'Um':
                                        splitsentences[n].insert(0, 'dies')
                                        splitsentences[n].insert(0, 'um')
                                    else:
                                        splitsentences[n].insert(0, 'dann')
                                if n == 0:
                                    if token[n] == 'um' or token[n] == 'Um':
                                        splitsentences[n].insert(0, 'dies')
                                        splitsentences[n].insert(0, 'um')
                                        splitsentences = splitsentences[::-1]
                                    else:
                                        splitsentences[n].insert(0, 'dann')
                                        splitsentences[0], splitsentences[1] = splitsentences[1], splitsentences[0]
                                generalrules = [['ADV', 'VAFIN'], ['ADV', 'VVFIN']]
                                processed = 1
                            if token[n] in self.adversativ_list:
                                splitsentence = []
                                for word in sentence:
                                    if word != token[n]:
                                        if word[-1] == ',':
                                            splitsentence.append(word[:-1])
                                        if word == ',':
                                            pass
                                        if word[-1] != ',':
                                            splitsentence.append(word)
                                    if word[-1] == ',' or word == ',':
                                        splitsentences.append(splitsentence)
                                        splitsentence = []
                                splitsentences.append(splitsentence)
                                splitsentences[n].append('jedoch')
                                generalrules = [['ADV', 'VAFIN'], ['ADV', 'VVFIN']]
                                processed = 1
                            if token[n] in self.kausal_list:
                                splitsentence = []
                                for word in sentence:
                                    if word != token[n]:
                                        if word[-1] == ',':
                                            splitsentence.append(word[:-1])
                                        if word == ',':
                                            pass
                                        if word[-1] != ',':
                                            splitsentence.append(word)
                                    if word[-1] == ',' or word == ',':
                                        splitsentences.append(splitsentence)
                                        splitsentence = []
                                splitsentences.append(splitsentence)
                                # Since 'deswegen' gets attached to the other sentence, the input
                                # to commasentences must always be TWO sentences.
                                if n == 1:
                                    splitsentences[n - 1].insert(0, 'deswegen')
                                    splitsentences[0], splitsentences[1] = splitsentences[1], splitsentences[0]
                                if n == 0:
                                    splitsentences[n + 1].insert(0, 'deswegen')
                                generalrules = [['PROAV', 'VAFIN'], ['PROAV', 'VVFIN']]
                                processed = 1
                            # from here on come konsekutiv clauses; they have to be split
                            # according to https://www.deutschplus.net/pages/Konsekutivsatz
                            if token[n] in self.konsekutiv_list:
                                splitsentence = []
                                for word in sentence:
                                    if word != token[n]:
                                        if word[-1] == ',':
                                            splitsentence.append(word[:-1])
                                        if word == ',':
                                            pass
                                        if word[-1] != ',':
                                            splitsentence.append(word)
                                    if word[-1] == ',' or word == ',':
                                        splitsentences.append(splitsentence)
                                        splitsentence = []
                                splitsentences.append(splitsentence)
                                generalrules = [['KOUS', 'PPER']]
                                processed = 1
                            if token[n] in self.konditional_list:
                                splitsentence = []
                                for word in sentence:
                                    if word[-1] == ',':
                                        splitsentence.append(word[:-1])
                                    if word == ',':
                                        pass
                                    if word[-1] != ',':
                                        splitsentence.append(word)
                                    if word[-1] == ',' or word == ',':
                                        splitsentences.append(splitsentence)
                                        splitsentence = []
                                splitsentences.append(splitsentence)
                                if n == 1:
                                    spoCount = gs.checkSPO(splitsentences[n], 0)
                                    spoCount = sum(spoCount)
                                    if spoCount == 2:
                                        thereisanes = 0
                                        for word in splitsentences[n]:
                                            if word == 'es' or word == 'Es':
                                                thereisanes = 1
                                        if thereisanes == 0:
                                            splitsentences[n].append('es')
                                if n == 0:
                                    spoCount = gs.checkSPO(splitsentences[n], 0)
                                    spoCount = sum(spoCount)
                                    if spoCount == 2:
                                        thereisanes = 0
                                        for word in splitsentences[n]:
                                            if word == 'es' or word == 'Es':
                                                thereisanes = 1
                                        if thereisanes == 0:
                                            splitsentences[n].append('es')
                                    splitsentences[0], splitsentences[1] = splitsentences[1], splitsentences[0]
                                generalrules = [['KOUS', 'PPER']]
                                processed = 1
                            if token[n] in self.konzessiv_list:
                                splitsentence = []
                                for word in sentence:
                                    if word != token[n]:
                                        if word[-1] == ',':
                                            splitsentence.append(word[:-1])
                                        if word == ',':
                                            pass
                                        if word[-1] != ',':
                                            splitsentence.append(word)
                                    if word[-1] == ',' or word == ',':
                                        splitsentences.append(splitsentence)
                                        splitsentence = []
                                splitsentences.append(splitsentence)
                                if n == 1:
                                    splitsentences[n - 1].insert(0, 'trotzdem')
                                    splitsentences[0], splitsentences[1] = splitsentences[1], splitsentences[0]
                                if n == 0:
                                    splitsentences[n + 1].insert(0, 'trotzdem')
                                generalrules = [['PROAV', 'VAFIN'], ['PROAV', 'VVFIN']]
                                processed = 1
                            if token[n] in self.lokal_list:
                                splitsentence = []
                                for word in sentence:
                                    if word != token[n]:
                                        if word[-1] == ',':
                                            splitsentence.append(word[:-1])
                                        if word == ',':
                                            pass
                                        if word[-1] != ',':
                                            splitsentence.append(word)
                                    if word[-1] == ',' or word == ',':
                                        splitsentences.append(splitsentence)
                                        splitsentence = []
                                splitsentences.append(splitsentence)
                                if n == 1:
                                    splitsentences[n - 1].insert(0, 'dort')
                                    splitsentences[0], splitsentences[1] = splitsentences[1], splitsentences[0]
                                if n == 0:
                                    splitsentences[n + 1].insert(0, 'dort')
                                generalrules = [['PROAV', 'VAFIN'], ['PROAV', 'VVFIN']]
                                processed = 1
                            if token[n] in self.instrumental_list:
                                splitsentence = []
                                for word in sentence:
                                    if word != token[n]:
                                        if word[-1] == ',':
                                            splitsentence.append(word[:-1])
                                        if word == ',':
                                            pass
                                        if word[-1] != ',':
                                            splitsentence.append(word)
                                    if word[-1] == ',' or word == ',':
                                        splitsentences.append(splitsentence)
                                        splitsentence = []
                                splitsentences.append(splitsentence)
                                if n == 1:
                                    splitsentences[n - 1].insert(0, 'so')
                                    splitsentences[0], splitsentences[1] = splitsentences[1], splitsentences[0]
                                if n == 0:
                                    splitsentences[n + 1].insert(0, 'so')
                                generalrules = [['ADV', 'VAFIN'], ['ADV', 'VVFIN']]
                                processed = 1
                            if token[n] in self.temporal_list_vor:
                                splitsentence = []
                                for word in sentence:
                                    if word != token[n]:
                                        if word[-1] == ',':
                                            splitsentence.append(word[:-1])
                                        if word == ',':
                                            pass
                                        if word[-1] != ',':
                                            splitsentence.append(word)
                                    if word[-1] == ',' or word == ',':
                                        splitsentences.append(splitsentence)
                                        splitsentence = []
                                splitsentences.append(splitsentence)
                                if n == 1:
                                    splitsentences[n].insert(0, 'danach')
                                if n == 0:
                                    splitsentences[n].insert(0, 'danach')
                                    splitsentences[0], splitsentences[1] = splitsentences[1], splitsentences[0]
                                generalrules = [['PROAV', 'VAFIN'], ['PROAV', 'VVFIN']]
                                processed = 1
                            if token[n] in self.temporal_list_nach:
                                splitsentence = []
                                for word in sentence:
                                    if word != token[n]:
                                        if word[-1] == ',':
                                            splitsentence.append(word[:-1])
                                        if word == ',':
                                            pass
                                        if word[-1] != ',':
                                            splitsentence.append(word)
                                    if word[-1] == ',' or word == ',':
                                        splitsentences.append(splitsentence)
                                        splitsentence = []
                                splitsentences.append(splitsentence)
                                if n == 1:
                                    splitsentences[n].insert(0, 'davor')
                                if n == 0:
                                    splitsentences[n].insert(0, 'davor')
                                    splitsentences[0], splitsentences[1] = splitsentences[1], splitsentences[0]
                                generalrules = [['PROAV', 'VAFIN'], ['PROAV', 'VVFIN']]
                                processed = 1
                            if token[n] == 'der' or token[n] == 'welcher':
                                tokens = self.nlp(' '.join(sentence))
                                for word in tokens:
                                    if word.dep_ == 'rc':
                                        wordwithrc = word.text
                                rcORnot = gs.checkForAnnotation(sentence, 'rc', 'word.dep_')
                                oldsplitsentences = splitsentences
                                splitsentences = []
                                if rcORnot == 1:
                                    splitsentence = []
                                    for word in sentence:
                                        if word[-1] == ',':
                                            splitsentence.append(word[:-1])
                                        if word == ',':
                                            pass
                                        if word[-1] != ',':
                                            splitsentence.append(word)
                                        if word[-1] == ',' or word == ',':
                                            splitsentences.append(splitsentence)
                                            splitsentence = []
                                    splitsentences.append(splitsentence)
                                    # the swap is done here, since otherwise spacy does not read
                                    # this one as PDS.. analogous in the other cases.
                                    if wordwithrc in splitsentences[n]:
                                        splitsentences[n][0] = 'dieser'
                                        verb = splitsentences[n][-1]
                                        splitsentences[n] = splitsentences[n][:-1]
                                        splitsentences[n].insert(1, verb)
                                    generalrules = [['PDS', 'VAFIN'], ['PDS', 'VVFIN']]
                                    processed = 1
                                else:
                                    splitsentences = oldsplitsentences
                                    splitsentence = []
                            if token[n] == 'die' or token[n] == 'welche':
                                tokens = self.nlp(' '.join(sentence))
                                for word in tokens:
                                    if word.dep_ == 'rc':
                                        wordwithrc = word.text
                                rcORnot = gs.checkForAnnotation(sentence, 'rc', 'word.dep_')
                                oldsplitsentences = splitsentences
                                splitsentences = []
                                if rcORnot == 1:
                                    splitsentence = []
                                    for word in sentence:
                                        if word[-1] == ',':
                                            splitsentence.append(word[:-1])
                                        if word == ',':
                                            pass
                                        if word[-1] != ',':
                                            splitsentence.append(word)
                                        if word[-1] == ',' or word == ',':
                                            splitsentences.append(splitsentence)
                                            splitsentence = []
                                    splitsentences.append(splitsentence)
                                    if wordwithrc in splitsentences[n]:
                                        splitsentences[n][0] = 'diese'
                                        verb = splitsentences[n][-1]
                                        splitsentences[n] = splitsentences[n][:-1]
                                        splitsentences[n].insert(1, verb)
                                    generalrules = [['PDS', 'VAFIN'], ['PDS', 'VVFIN']]
                                    processed = 1
                                else:
                                    splitsentences = oldsplitsentences
                                    splitsentence = []
                            if token[n] == 'dem':
                                tokens = self.nlp(' '.join(sentence))
                                for word in tokens:
                                    if word.dep_ == 'rc':
                                        wordwithrc = word.text
                                rcORnot = gs.checkForAnnotation(sentence, 'rc', 'word.dep_')
                                oldsplitsentences = splitsentences
                                splitsentences = []
                                if rcORnot == 1:
                                    splitsentence = []
                                    for word in sentence:
                                        if word[-1] == ',':
                                            splitsentence.append(word[:-1])
                                        if word == ',':
                                            pass
                                        if word[-1] != ',' and word[-1] != '.':
                                            splitsentence.append(word)
                                        if word[-1] == ',':
                                            splitsentences.append(splitsentence)
                                            splitsentence = []
                                    splitsentences.append(splitsentence)
                                    if wordwithrc in splitsentences[n]:
                                        splitsentences[n][0] = 'diesem'
                                        verb = splitsentences[n][-1]
                                        splitsentences[n] = splitsentences[n][:-1]
                                        splitsentences[n].insert(1, verb)
                                    generalrules = [['PDS', 'VAFIN'], ['PDS', 'VVFIN']]
                                    processed = 1
                                else:
                                    splitsentences = oldsplitsentences
                                    splitsentence = []
                            if token[n] == 'das' or token[n] == 'welches':
                                tokens = self.nlp(' '.join(sentence))
                                for word in tokens:
                                    if word.dep_ == 'rc':
                                        wordwithrc = word.text
                                rcORnot = gs.checkForAnnotation(sentence, 'rc', 'word.dep_')
                                oldsplitsentences = splitsentences
                                splitsentences = []
                                if rcORnot == 1:
                                    splitsentence = []
                                    for word in sentence:
                                        if word[-1] == ',':
                                            splitsentence.append(word[:-1])
                                        if word == ',':
                                            pass
                                        if word[-1] != ',':
                                            splitsentence.append(word)
                                        if word[-1] == ',' or word == ',':
                                            splitsentences.append(splitsentence)
                                            splitsentence = []
                                    splitsentences.append(splitsentence)
                                    if wordwithrc in splitsentences[n]:
                                        splitsentences[n][0] = 'dieses'
                                        verb = splitsentences[n][-1]
                                        splitsentences[n] = splitsentences[n][:-1]
                                        splitsentences[n].insert(1, verb)
                                    generalrules = [['PDS', 'VAFIN'], ['PDS', 'VVFIN']]
                                    processed = 1
                                else:
                                    splitsentences = oldsplitsentences
                                    splitsentence = []
                            if token[n] == 'dessen' or token[n] == 'wessen':
                                tokens = self.nlp(' '.join(sentence))
                                for word in tokens:
                                    if word.dep_ == 'rc':
                                        wordwithrc = word.text
                                rcORnot = gs.checkForAnnotation(sentence, 'rc', 'word.dep_')
                                oldsplitsentences = splitsentences
                                splitsentences = []
                                if rcORnot == 1:
                                    splitsentence = []
                                    for word in sentence:
                                        if word[-1] == ',':
                                            splitsentence.append(word[:-1])
                                        if word == ',':
                                            pass
                                        if word[-1] != ',':
                                            splitsentence.append(word)
                                        if word[-1] == ',' or word == ',':
                                            splitsentences.append(splitsentence)
                                            splitsentence = []
                                    splitsentences.append(splitsentence)
                                    if wordwithrc in splitsentences[n]:
                                        verb = splitsentences[n][-1]
                                        splitsentences[n] = splitsentences[n][:-1]
                                        splitsentences[n].insert(1, verb)
                                    generalrules = [['PDS', 'VAFIN'], ['PDS', 'VVFIN']]
                                    processed = 1
                                else:
                                    splitsentences = oldsplitsentences
                                    splitsentence = []
                            if token[n] == 'den' or token[n] == 'welchen':
                                tokens = self.nlp(' '.join(sentence))
                                for word in tokens:
                                    if word.dep_ == 'rc':
                                        wordwithrc = word.text
                                rcORnot = gs.checkForAnnotation(sentence, 'rc', 'word.dep_')
                                oldsplitsentences = splitsentences
                                splitsentences = []
                                if rcORnot == 1:
                                    splitsentence = []
                                    for word in sentence:
                                        if word[-1] == ',':
                                            splitsentence.append(word[:-1])
                                        if word == ',':
                                            pass
                                        if word[-1] != ',':
                                            splitsentence.append(word)
                                        if word[-1] == ',' or word == ',':
                                            splitsentences.append(splitsentence)
                                            splitsentence = []
                                    splitsentences.append(splitsentence)
                                    if wordwithrc in splitsentences[n]:
                                        splitsentences[n][0] = 'diesen'
                                        verb = splitsentences[n][-1]
                                        splitsentences[n] = splitsentences[n][:-1]
                                        splitsentences[n].insert(1, verb)
                                    generalrules = [['PDS', 'VAFIN'], ['PDS', 'VVFIN']]
                                    processed = 1
                                else:
                                    splitsentences = oldsplitsentences
                                    splitsentence = []
                            if token[n] == 'wem' or token[n] == 'Wem' or token[n] == 'welchem':
                                daORnot = gs.checkForAnnotation(sentence, 'da', 'word.dep_')
                                oaORnot = gs.checkForAnnotation(sentence, 'oa', 'word.dep_')
                                reORnot = gs.checkForAnnotation(sentence, 're', 'word.dep_')
                                oldsplitsentences = splitsentences
                                splitsentences = []
                                for word in sentence:
                                    if word[-1] == ',':
                                        splitsentence.append(word[:-1])
                                    if word == ',':
                                        pass
                                    if word[-1] != ',':
                                        splitsentence.append(word)
                                    if word[-1] == ',' or word == ',':
                                        splitsentences.append(splitsentence)
                                        splitsentence = []
                                splitsentences.append(splitsentence)
                                if n == 0:
                                    index = 1
                                if n == 1:
                                    index = 0
                                if reORnot == 1:
                                    pass
                                if daORnot == 1 and reORnot == 0:
                                    splitsentences[index].insert(1, 'das')
                                if oaORnot == 1 and reORnot == 0:
                                    splitsentences[index].insert(1, 'dem')
                                if n == 1:
                                    splitsentences[0], splitsentences[1] = splitsentences[1], splitsentences[0]
                                generalrules = [['PDS', 'VAFIN'], ['PDS', 'VVFIN']]
                                processed = 1
                            if token[n] in self.indirectspeech_list and token[1] not in self.konsekutiv_list:
                                reORnot = gs.checkForAnnotation(sentence, 're', 'word.dep_')
                                oldsplitsentences = splitsentences
                                splitsentences = []
                                splitsentence = []
                                for word in sentence:
                                    if word[-1] == ',':
                                        splitsentence.append(word[:-1])
                                    if word == ',':
                                        pass
                                    if word[-1] != ',':
                                        splitsentence.append(word)
                                    if word[-1] == ',' or word == ',':
                                        splitsentences.append(splitsentence)
                                        splitsentence = []
                                splitsentences.append(splitsentence)
                                if n == 0:
                                    index = 1
                                if n == 1:
                                    index = 0
                                if reORnot == 0:
                                    if splitsentences[index][0] != 'was':
                                        splitsentences[index].insert(1, 'das')
                                if n == 1:
                                    splitsentences[0], splitsentences[1] = splitsentences[1], splitsentences[0]
                                generalrules = [['PDS', 'VAFIN'], ['PDS', 'VVFIN']]
                                processed = 1
                            if processed == 0 and n == 1:
                                ZUVINFTupelORnot = gs.checkForAnnotationTuple(sentence, ['PTKZU', 'VVINF'], 'word.tag_', 'None')
                                if ZUVINFTupelORnot == 0:
                                    ZUVINFTupelORnot = gs.checkForAnnotationTuple(sentence, ['PTKZU', 'VAINF'], 'word.tag_', 'None')
                                if ZUVINFTupelORnot == 1:
                                    reORnot = gs.checkForAnnotation(sentence, 're', 'word.dep_')
                                    splitsentence = []
                                    for word in sentence:
                                        if word[-1] == ',':
                                            splitsentence.append(word[:-1])
                                        if word == ',':
                                            pass
                                        if word[-1] != ',':
                                            splitsentence.append(word)
                                        if word[-1] == ',' or word == ',':
                                            splitsentences.append(splitsentence)
                                            processed = 1
                                            splitsentence = []
                                    splitsentences.append(splitsentence)
                                    for m in range(2):
                                        ZUINForNOT = gs.checkForAnnotationTuple(splitsentences[m], ['PTKZU', 'VVINF'], 'word.tag_', 'None')
                                        if ZUINForNOT == 0:
                                            ZUINForNOT = gs.checkForAnnotationTuple(splitsentences[m], ['PTKZU', 'VAINF'], 'word.tag_', 'None')
                                        if ZUINForNOT == 1:
                                            r = m
                                            ZUINForNOT = 0
                                    if r == 0:
                                        index = 1
                                    if r == 1:
                                        index = 0
                                    objectORnot = gs.checkForAnnotation(splitsentences[index], 'oa', 'word.dep_')
                                    if reORnot == 0 and objectORnot == 0:
                                        splitsentences[index].insert(1, 'das')
                                    if r == 1:
                                        splitsentences[0], splitsentences[1] = splitsentences[1], splitsentences[0]
                                else:
                                    processed = 2  # fixed: was a no-op comparison ('==')
                    except:
                        wasNotInAnyList = 1
                    #rules = [['ART','ADJA','NN'], ['ART','ADJA','NE'], ['ART', 'NN'], ['ART', 'NE'], ['APPR','NN'], ['APPR','NE'], ['APPR', 'ART', 'NN'], ['APPR', 'ART', 'NE'], ['APPR','ART','NN','ADJA','NN'], ['APPR','ART','NN','ADJA','NE'], ['KOKOM', 'ART', 'NN'], ['KOKOM', 'ART', 'NE'], ['PPOSAT', 'NN'], ['PPOSAT', 'NE'], ['ADV', 'ADJD']]
                    endsentences = []
                    if (processed == 2 or processed == 0) and n == 1:
                        wasNotInAnyList = 1
                    try:
                        if wasNotInAnyList == 0:
                            newpunctuationsindex.insert(0, [counter - 1, punctuations[counter - 1]])
                            if len(splitsentences) > 2:
                                splitsentences = splitsentences[:2]
                            for splitsentence in splitsentences:
                                wordtoputfirst = 'nada'
                                for word in self.firstwordlist:
                                    if word == splitsentence[0]:
                                        wordtoputfirst = word
                                        splitsentence.remove(word)
                                # get the tuples and triples to check
                                tuplesTocheck, triplesTocheck, quadruplesTocheck = self.gs.GetTuplesinSentence(splitsentence)
                                grammpiecessentence = self.gs.createTupleofGrammarpieces(splitsentence, tuplesTocheck, triplesTocheck, quadruplesTocheck)
                                if len(grammpiecessentence) > 7:
                                    print('A sentence is too long, too many permutations. \n piping wrong grammar..')
                                    endsentence = ' '.join(grammpiecessentence)
                                else:
                                    permutations = self.sgm.GeneratePermutationsOfSentence(grammpiecessentence)
                                    firstwordwithverblist = ['deswegen', 'danach']
                                    permutationstodelete = []
                                    for permutation in permutations:
                                        if permutation[0] in firstwordwithverblist:
                                            count = 1
                                            for word in self.nlp(permutation[1]):
                                                if word.tag_[0] != 'V':
                                                    permutationstodelete.append(permutation)
                                                    break
                                                else:
                                                    break
                                    for delperm in permutationstodelete:
                                        try:
                                            permutations.remove(delperm)
                                        except:
                                            pass
                                    sentencesToCheck = []
                                    if wordtoputfirst in self.firstwordlist:
                                        for sentence in permutations:
                                            sentencesToCheck.append(wordtoputfirst + ' ' + ' '.join(sentence))
                                    else:
                                        for sentence in permutations:
                                            sentencesToCheck.append(' '.join(sentence))
                                    endsentence = self.sgm.GetBestSentenceFromSentencesAccordingToGrammar(sentencesToCheck, ' '.join(splitsentence))
                                endsentences.append(endsentence)
                    except:
                        # there was an error
                        wasNotInAnyList = 1
                        endsentences = []
                        todelete = []
                        for index in range(len(newpunctuationsindex)):
                            if newpunctuationsindex[index][0] == counter - 1:
                                todelete.append(index)
                        for todel in todelete[::-1]:
                            del newpunctuationsindex[todel]
                    if wasNotInAnyList == 1:
                        endsplisentences = []
                        splisentence = []
                        for word in oldsentence:
                            if word[-1] == ',':
                                splisentence.append(word[:-1])
                            if word == ',':
                                pass
                            if word[-1] != ',':
                                splisentence.append(word)
                            if word[-1] == ',' or word == ',':
                                endsplisentences.append(splisentence)
                                splisentence = []
                        endsplisentences.append(splisentence)
                        newpunctuationsindex.insert(0, [counter - 1, punctuations[counter - 1]])
                        for splsentence in endsplisentences:
                            endsentences.append(' '.join(splsentence))
                    '''
                    fsearch1 = self.fsearch1
                    spacyclass1 = 'word.tag_'
                    gs_sentence1 = gs.Sentence2GrammarSchema(' '.join(splitsentence), spacyclass1)
                    print('searchPatternMatch for tags')
                    bestmatches1 = fsearch1.searchPatternMatch(' '.join(gs_sentence1), 1)
                    print('done')
                    right_gs_tupel1 = []
                    if len(bestmatches1) < 10:
                        bestndocs1 = len(bestmatches1)
                    else:
                        bestndocs1 = 10
                    for m in range(bestndocs1):
                        right_gs_tupel1.append(fsearch1.database[bestmatches1[m][0]])
                    statistically_correct_sentences1 = gs.Sentence2RightGrammarTupel(' '.join(splitsentence), gs_sentence1, right_gs_tupel1)
                    fsearch2 = self.fsearch2
                    spacyclass2 = 'word.dep_'
                    gs_sentence2 = gs.Sentence2GrammarSchema(' '.join(splitsentence), spacyclass2)
                    print('searchPatternMatch for deps')
                    bestmatches2 = fsearch2.searchPatternMatch(' '.join(gs_sentence2), 1)
                    print('done')
                    right_gs_tupel2 = []
                    if len(bestmatches2) < 10:
                        bestndocs2 = len(bestmatches2)
                    else:
                        bestndocs2 = 10
                    for m in range(bestndocs2):
                        right_gs_tupel2.append(fsearch2.database[bestmatches2[m][0]])
                    statistically_correct_sentences2 = gs.Sentence2RightGrammarTupel(' '.join(splitsentence), gs_sentence2, right_gs_tupel2)
                    print(splitsentence)
                    Rightsentence = gs.GetBestgsAccordingRules(' '.join(splitsentence), gs_sentence1, right_gs_tupel1, right_gs_tupel2, statistically_correct_sentences1, statistically_correct_sentences2, rules, generalrules)
                    '''
                    for endsentence in endsentences:
                        gramcorr_splitsentences.append(endsentence.split())
        for index in newpunctuationsindex:
            punctuations.insert(index[0], index[1])
        return gramcorr_splitsentences, punctuations
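    # Worked example of the 'kausal' branch above (sentence and result are
    # illustrative assumptions): 'Er bleibt zu Hause, weil es regnet.' becomes
    # 'es regnet.' plus 'deswegen Er bleibt zu Hause'; the permutation plus
    # SentGlue step then reorders the latter towards 'Deswegen bleibt er zu Hause'.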
    def putAppendixesIntoOwnSentences(self, sentences, punctuations):
        gs = self.gs
        #triples = [['NN', 'ART', 'NN'], ['NE', 'ART', 'NN'], ['NN', 'ART', 'NN'], ['NE', 'ART', 'NE']]
        quadruples = [['NN', 'APPR', 'NE', 'NN'], ['NN', 'APPR', 'NE', 'NN'], ['NN', 'APPR', 'ART', 'NN'], ['NE', 'APPR', 'ART', 'NN'], ['NN', 'APPR', 'ART', 'NE'], ['NE', 'APPR', 'ART', 'NE']]
        quadruplestochange = []
        triplestochange = []
        newsentences = []
        newpunctuations = []
        Whatisofnouns = []
        oldsentences = sentences
        oldpunctuations = punctuations
        for hauptindex in range(len(sentences)):
            sentence = sentences[hauptindex]
            try:
                #for triple in triples:
                #    AnnoOrNot, tripleInWords = gs.checkForAnnotationTriple(sentence, triple, 'word.tag_', 'None')
                #    for tripleinwor in tripleInWords:
                #        triplestochange.append([triple, tripleinwor])
                for quadruple in quadruples:
                    AnnoOrNot, quadrupleInWords = gs.checkForAnnotationQuadruple(sentence, quadruple, 'word.tag_', 'None')
                    for quadrupleInWo in quadrupleInWords:
                        quadruplestochange.append([quadruple, quadrupleInWo])
                for quad in quadruplestochange:
                    for n in range(len(sentence) - 4):
                        if sentence[n] == quad[1][0]:
                            if sentence[n + 1] == quad[1][1]:
                                if sentence[n + 2] == quad[1][2]:
                                    artword = None
                                    longerWhatisnoun = 0
                                    for m in range(2):
                                        for word in self.nlp(sentence[n - m]):
                                            if word.tag_ == 'ART':
                                                Nounthatis = sentence[n - m:n + 1]
                                                # use the already loaded model instead of re-importing
                                                # spacy and reloading 'de_core_news_sm' on every match
                                                token3 = self.nlp(sentence[n + 4])
                                                counter = 0
                                                Whatisnoun = sentence[n + 1:n + 4]
                                                for wor in token3:
                                                    counter += 1
                                                    if wor.tag_ == 'NN' or wor.tag_ == 'NE':
                                                        if counter == 1:
                                                            Whatisnoun = sentence[n + 1:n + 5]
                                                            longerWhatisnoun = 1
                                                        if counter == 2:
                                                            Whatisnoun = sentence[n + 1:n + 4]
                                                artword = word.text
                                    if ((artword == 'die' or artword == 'Die') and sentence[n][-1] != 'n') or ((artword == 'der' or artword == 'einer' or artword == 'dieser') and (sentence[n - 2] in ['von', 'in', 'auf', 'ueber', 'unter', 'nach', 'mit'])):
                                        if artword == 'der':
                                            Nounthatis[0] = 'die'
                                        donothing = 0
                                        if sentence[n + 1] == 'mit':
                                            if sentence[n + 2] == 'den':
                                                verb = ' hat die '
                                                Whatisnoun = Whatisnoun[2:]
                                            if sentence[n + 2] == 'der':
                                                verb = ' hat eine '
                                                Whatisnoun = Whatisnoun[2:]
                                            if sentence[n + 2] != 'der' and sentence[n + 2] != 'den':
                                                donothing = 1
                                        else:
                                            verb = ' ist '
                                        if donothing == 0:
                                            newsentence = ' '.join(Nounthatis) + verb + ' '.join(Whatisnoun)
                                            newsentences.append([hauptindex + 1, newsentence.split()])
                                            newpunctuations.append([hauptindex + 1, punctuations[hauptindex]])
                                            if longerWhatisnoun == 0:
                                                Whatisofnouns.append([n + 1, n + 4, hauptindex])
                                            else:
                                                Whatisofnouns.append([n + 1, n + 5, hauptindex])
            except:
                print('Could not process ' + str(sentence) + ' in the per-sentence characterization..')
        try:
            for whatis in Whatisofnouns[::-1]:
                thereisacomma = 0
                if sentences[whatis[2]][whatis[1] - 1][-1] == ',':
                    thereisacomma = 1
                if thereisacomma == 1:
                    sentences[whatis[2]][whatis[0] - 1] = sentences[whatis[2]][whatis[0] - 1] + ','
                del sentences[whatis[2]][whatis[0]:whatis[1]]
            for newsent in newsentences[::-1]:
                sentences.insert(newsent[0], newsent[1])
            for newpunct in newpunctuations[::-1]:
                punctuations.insert(newpunct[0], newpunct[1])
            for sentence in sentences:
                if sentence[-1][-1] == ',':
                    sentence[-1] = sentence[-1][:-1]
        except:
            print('Could not process the collected characterizations')
            sentences = oldsentences
            punctuations = oldpunctuations
        return sentences, punctuations
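    # A hypothetical end-to-end run, kept as a comment so the module stays
    # importable; the method order follows this class, the filename and the
    # punctuation bootstrap are assumptions:
    #
    #   seg = SentSeg('de')
    #   seg.LoadSentGlueSGDandGSUtils()
    #   sentences = seg.ReadDoc2Sent('input.txt')
    #   sentences = seg.CommaSentenceOrNot(sentences)
    #   sentences = seg.GetUtteranceNumber(sentences)
    #   punctuations = ['.'] * len(sentences)
    #   sentences, punctuations = seg.AndOrSolver(sentences, punctuations)
    #   sentences = seg.CommaSentenceOrNot(sentences)   # re-annotate the new splits
    #   sentences = seg.GetUtteranceNumber(sentences)
    #   sentences, punctuations = seg.SplitCommatas(sentences, punctuations)
    #   sentences, punctuations = seg.putAppendixesIntoOwnSentences(sentences, punctuations)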