You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

241 lines
6.5 KiB

4 years ago
  1. {
  2. "cells": [
  3. {
  4. "cell_type": "code",
  5. "execution_count": 1,
  6. "metadata": {},
  7. "outputs": [],
  8. "source": [
  9. "import SentSeg\n"
  10. ]
  11. },
  12. {
  13. "cell_type": "code",
  14. "execution_count": 2,
  15. "metadata": {},
  16. "outputs": [
  17. {
  18. "name": "stdout",
  19. "output_type": "stream",
  20. "text": [
  21. "[[['Die', 'Frau', 'von', 'den', 'Bergen', 'ist', 'eine', 'gute', 'Partie.']], [['Es', 'war', 'die', 'Frau', 'mit', 'den', 'Klauen,', 'die', 'ein', 'größeres', 'Problem', 'mit', 'der', 'Geschichte', 'hatte.']]]\n"
  22. ]
  23. }
  24. ],
  25. "source": [
  26. "sent_seg = SentSeg.SentSeg('de')\n",
  27. "\n",
  28. " \n",
  29. "sentences = sent_seg.ReadDoc2Sent('atest1')\n",
  30. "print(sentences)"
  31. ]
  32. },
  33. {
  34. "cell_type": "code",
  35. "execution_count": 3,
  36. "metadata": {},
  37. "outputs": [
  38. {
  39. "data": {
  40. "text/plain": [
  41. "'done'"
  42. ]
  43. },
  44. "execution_count": 3,
  45. "metadata": {},
  46. "output_type": "execute_result"
  47. }
  48. ],
  49. "source": [
  50. "sent_seg.LoadSentGlueSGDandGSUtils()"
  51. ]
  52. },
  53. {
  54. "cell_type": "code",
  55. "execution_count": 7,
  56. "metadata": {},
  57. "outputs": [
  58. {
  59. "name": "stdout",
  60. "output_type": "stream",
  61. "text": [
  62. "creating array of comma or not..\n",
  63. "done\n",
  64. "[[['Hallo', 'was', 'gehe', 'denn', 'hier', 'so.'], [0, ['Hallo']], [0, ['Hallo']], [0, ['Hallo']]], [['Ich', 'habe', 'echt', 'keine', 'Ahnung', 'verdammt.'], [0, ['Ich']], [0, ['Ich']], [0, ['Ich']]], [['I.', 'd.', 'R.', 'gibt', 'es', 'keine', 'Abschiebungen.'], [0, ['I.']], [0, ['I.']], [0, ['I.']]], [['Ende', 'd.', 'J.', 'wird', 'alles', 'problematisch.'], [0, ['Ende']], [0, ['Ende']], [0, ['Ende']]], [['Sie', 'gingen', 'nach', 'Hause,', 'weil', 'es', 'in', 'Strömen', 'regnete.'], [1, ['Sie', 'weil']], [1, ['Sie', 'weil']], [1, ['Sie', 'weil']]], [['Heute', 'war', 'die', 'Straße', 'blau', 'angemalt,', 'damit', 'der', 'Marathon', 'funktionierte.'], [1, ['Heute', 'damit']], [1, ['Heute', 'damit']], [1, ['Heute', 'damit']]], [['Er', 'habe', 'es', 'sehr', 'schwer.'], [0, ['Er']], [0, ['Er']], [0, ['Er']]], [['Es', 'war', 'die', 'Hose', 'des', 'Gauners.'], [0, ['Es']], [0, ['Es']], [0, ['Es']]], [['Bliblablub.'], [0, ['Bliblablub']], [0, ['Bliblablub']], [0, ['Bliblablub']]], [['Sie', 'ist', 'nicht', 'schön', 'heute.'], [0, ['Sie']], [0, ['Sie']], [0, ['Sie']]], [['Oleoleole.'], [0, ['Oleoleole']], [0, ['Oleoleole']], [0, ['Oleoleole']]], [['Mannoman.'], [0, ['Mannoman']], [0, ['Mannoman']], [0, ['Mannoman']]], [['Er', 'ginge', 'nicht', 'schnell.'], [0, ['Er']], [0, ['Er']], [0, ['Er']]], [['Die', 'Hühner', 'lieben', 'sich', 'nicht.'], [0, ['Die']], [0, ['Die']], [0, ['Die']]]]\n"
  65. ]
  66. }
  67. ],
  68. "source": [
  69. "sentences = sent_seg.putAppendixesIntoOwnSentences(sentences, punctuations)\n",
  70. "print(sentences)"
  71. ]
  72. },
  73. {
  74. "cell_type": "code",
  75. "execution_count": 6,
  76. "metadata": {},
  77. "outputs": [],
  78. "source": [
  79. "sentences = sent_seg.GetUtteranceNumber(sentences)"
  80. ]
  81. },
  82. {
  83. "cell_type": "code",
  84. "execution_count": 7,
  85. "metadata": {},
  86. "outputs": [],
  87. "source": [
  88. "sentences = sent_seg.GetQuestionOrNot(sentences)"
  89. ]
  90. },
  91. {
  92. "cell_type": "code",
  93. "execution_count": 8,
  94. "metadata": {},
  95. "outputs": [
  96. {
  97. "name": "stdout",
  98. "output_type": "stream",
  99. "text": [
  100. "importing spacy..\n",
  101. "done\n"
  102. ]
  103. }
  104. ],
  105. "source": [
  106. "sentences1 = sent_seg.SplitSentencesIntoHauptNebenTuple(sentences)"
  107. ]
  108. },
  109. {
  110. "cell_type": "code",
  111. "execution_count": 9,
  112. "metadata": {},
  113. "outputs": [
  114. {
  115. "name": "stdout",
  116. "output_type": "stream",
  117. "text": [
  118. "['es', 'regnete', 'in Strömen']\n",
  119. "['deswegen', 'Sie', 'gingen', 'nach Hause']\n",
  120. "['Heute', 'war', 'blau', 'angemalt', 'die Straße']\n",
  121. "100\n",
  122. "['dann', 'funktionierte', 'der Marathon']\n"
  123. ]
  124. }
  125. ],
  126. "source": [
  127. "outsentences = sent_seg.SplitCommatas(sentences1)"
  128. ]
  129. },
  130. {
  131. "cell_type": "code",
  132. "execution_count": 18,
  133. "metadata": {},
  134. "outputs": [
  135. {
  136. "name": "stdout",
  137. "output_type": "stream",
  138. "text": [
  139. "[['Hallo', 'was', 'gehe', 'denn', 'hier', 'so'], ['Ich', 'habe', 'echt', 'keine', 'Ahnung', 'verdammt'], ['I.', 'd.', 'R.', 'gibt', 'es', 'keine', 'Abschiebungen'], ['Ende', 'd.', 'J.', 'wird', 'alles', 'problematisch'], ['in', 'Strömen', 'regnete', 'e'], ['deswegen', 'gingen', 'Sie', 'nach', 'Haus'], ['angemalt', 'war', 'die', 'Straße', 'blau', 'Heut'], ['dann', 'der', 'Marathon', 'funktioniert'], ['Er', 'habe', 'es', 'sehr', 'schwer'], ['Es', 'war', 'die', 'Hose', 'des', 'Gauners'], ['Bliblablub'], ['Oleoleole'], ['Mannoman'], ['Er', 'ginge', 'nicht', 'schnell'], ['Der', 'Satz', 'davor', 'funktioniert', 'nicht', 'im', 'Modul', 'Konjunktsolve'], ['Weil', 'er', 'zu', 'viele', 'verben', 'hat']]\n"
  140. ]
  141. }
  142. ],
  143. "source": [
  144. "print(outsentences)"
  145. ]
  146. },
  147. {
  148. "cell_type": "code",
  149. "execution_count": 19,
  150. "metadata": {},
  151. "outputs": [],
  152. "source": [
  153. "# One '.' terminator per sentence; strip a trailing '.' from each sentence's last token.\n",
  154. "punctuations = ['.'] * len(outsentences)\n",
  155. "for tokens in outsentences:\n",
  156. "    if tokens and tokens[-1].endswith('.'):  # skips empty sentences/tokens instead of raising IndexError\n",
  157. "        tokens[-1] = tokens[-1][:-1]"
  158. ]
  159. },
  160. {
  161. "cell_type": "code",
  162. "execution_count": 20,
  163. "metadata": {},
  164. "outputs": [],
  165. "source": [
  166. "from oi import oi as oi_factory  # explicit import instead of a wildcard; alias avoids rebinding the imported name\n",
  167. "oi = oi_factory()  # instance name `oi` is kept: the export cell calls oi.PrintSplitSentencesToTextFile"
  168. ]
  169. },
  170. {
  171. "cell_type": "code",
  172. "execution_count": 21,
  173. "metadata": {},
  174. "outputs": [
  175. {
  176. "name": "stdout",
  177. "output_type": "stream",
  178. "text": [
  179. ".\n",
  180. ".\n",
  181. ".\n",
  182. ".\n",
  183. ".\n",
  184. ".\n",
  185. ".\n",
  186. ".\n",
  187. ".\n",
  188. ".\n",
  189. ".\n",
  190. ".\n",
  191. ".\n",
  192. ".\n",
  193. ".\n",
  194. ".\n"
  195. ]
  196. },
  197. {
  198. "data": {
  199. "text/plain": [
  200. "'OK'"
  201. ]
  202. },
  203. "execution_count": 21,
  204. "metadata": {},
  205. "output_type": "execute_result"
  206. }
  207. ],
  208. "source": [
  209. "oi.PrintSplitSentencesToTextFile(punctuations, outsentences, 'test1out')\n"
  210. ]
  211. },
  212. {
  213. "cell_type": "code",
  214. "execution_count": null,
  215. "metadata": {},
  216. "outputs": [],
  217. "source": []
  218. }
  219. ],
  220. "metadata": {
  221. "kernelspec": {
  222. "display_name": "Python 3",
  223. "language": "python",
  224. "name": "python3"
  225. },
  226. "language_info": {
  227. "codemirror_mode": {
  228. "name": "ipython",
  229. "version": 3
  230. },
  231. "file_extension": ".py",
  232. "mimetype": "text/x-python",
  233. "name": "python",
  234. "nbconvert_exporter": "python",
  235. "pygments_lexer": "ipython3",
  236. "version": "3.5.3"
  237. }
  238. },
  239. "nbformat": 4,
  240. "nbformat_minor": 2
  241. }