You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

270 lines
7.8 KiB

4 years ago
  1. import ipywidgets as widgets
  2. from IPython.display import display, HTML
  3. javascript_functions = {False: "hide()", True: "show()"}
  4. button_descriptions = {False: "Code anzeigen", True: "Code verstecken"}
  5. class oi(object):
  6. def __init__(self):
  7. self.punktuation_list = ['.', '?', '!', ';', ':']
  8. return None
  9. def ReadDoc2Sent(self, document):
  10. splitsentences = []
  11. splitsentence = []
  12. punctuations = []
  13. with open(document) as sentences:
  14. counter = 0
  15. for sentence in sentences:
  16. counter += 1
  17. if counter % 1000 == 0:
  18. print(counter)
  19. words = sentence.split()
  20. for word in words:
  21. if(word[-1] in self.punktuation_list or word in self.punktuation_list) and len(word) > 2:
  22. splitsentence.append(word[:-1])
  23. splitsentences.append(splitsentence)
  24. punctuations.append(word[-1])
  25. splitsentence = []
  26. else:
  27. splitsentence.append(word)
  28. return splitsentences, punctuations
  29. def PrintSplitSentencesToTextFile(self, punctuations, sentences, document):
  30. with open(document, 'a') as doc:
  31. for n in range(len(sentences)):
  32. towrite = ' '.join(sentences[n])
  33. #print(punctuations[n])
  34. punctuation = punctuations[n]
  35. doc.write(towrite + punctuation + '\n')
  36. return 'OK'
  37. def CellInputText2Splitsentences(self, sentences):
  38. splitsentences = []
  39. splitsentence = []
  40. punctuations = []
  41. words = sentences.split()
  42. for word in words:
  43. if word in self.punktuation_list or (word[-1] in self.punktuation_list and len(word) > 2):
  44. splitsentence.append(word[:-1])
  45. splitsentences.append(splitsentence)
  46. punctuations.append(word[-1])
  47. splitsentence = []
  48. else:
  49. splitsentence.append(word)
  50. return splitsentences, punctuations
  51. def CellInputText2SplitsentencesWithspacy(self, sentences):
  52. #print('iamhere')
  53. punctuations = []
  54. splitsentences = []
  55. import spacy
  56. nlp = spacy.load('de_core_news_sm')
  57. spacysentences = nlp(sentences)
  58. for sent in spacysentences.sents:
  59. #print('sent', sent)
  60. firstwordisaNoun = False
  61. if sent[0].tag_ == 'NN' or sent[0].tag_ == 'NE':
  62. firstwordisaNoun = True
  63. #print('da taaaag', sent[0].tag_)
  64. splitsent = sent.text.split()
  65. lastword = splitsent[-1]
  66. if firstwordisaNoun == False:
  67. try:
  68. splitsent[0] = splitsent[0].lower()
  69. except:
  70. #print('lower did not work')
  71. pass
  72. #print('Firstword',splitsent[0])
  73. count = 0
  74. indextocutlastword = 1
  75. for letter in lastword:
  76. count += 1
  77. if letter in self.punktuation_list:
  78. punctuations.append(letter)
  79. indextocutlastword = count
  80. splitsent[-1] = splitsent[-1][:indextocutlastword - 1]
  81. splitsentences.append(splitsent)
  82. #print('senttext' , splitsent, indextocutlastword)
  83. if len(splitsentences) != len(punctuations):
  84. splitsentences = []
  85. splitsentence = []
  86. punctuations = []
  87. words = sentences.split()
  88. #print('sentencessplit', words)
  89. for word in words:
  90. if word in self.punktuation_list or (word[-1] in self.punktuation_list and len(word) > 2):
  91. splitsentence.append(word[:-1])
  92. splitsentences.append(splitsentence)
  93. punctuations.append(word[-1])
  94. splitsentence = []
  95. else:
  96. splitsentence.append(word)
  97. for splisentence in splitsentences:
  98. for word in spacysentences:
  99. if word.text == splisentence[0]:
  100. if word.tag_ == 'NN' or word.tag_ == 'NE':
  101. pass
  102. else:
  103. splisentence[0] = splisentence[0].lower()
  104. return splitsentences, punctuations
  105. def printSplitsentences2Text(self, punctuations, splitsentences):
  106. for n in range(len(sentences)):
  107. towrite = ' '.join(sentences[n])
  108. punctuation = punctuations[n]
  109. print(towrite + punctuation + '\n')
  110. return 'done'
  111. def toggle_code(self, state):
  112. """
  113. Toggles the JavaScript show()/hide() function on the div.input element.
  114. """
  115. output_string = "<script>$(\"div.input\").{}</script>"
  116. output_args = (javascript_functions[state],)
  117. output = output_string.format(*output_args)
  118. display(HTML(output))
  119. def button_action(self, value):
  120. """
  121. Calls the toggle_code function and updates the button description.
  122. """
  123. state = value.new
  124. self.toggle_code(state)
  125. value.owner.description = button_descriptions[state]
  126. def log_progress(self, sequence, every=None, size=None, name='Items'):
  127. from ipywidgets import IntProgress, HTML, VBox
  128. from IPython.display import display
  129. is_iterator = False
  130. if size is None:
  131. try:
  132. size = len(sequence)
  133. except TypeError:
  134. is_iterator = True
  135. if size is not None:
  136. if every is None:
  137. if size <= 200:
  138. every = 1
  139. else:
  140. every = int(size / 200) # every 0.5%
  141. else:
  142. assert every is not None, 'sequence is iterator, set every'
  143. if is_iterator:
  144. progress = IntProgress(min=0, max=1, value=1)
  145. progress.bar_style = 'info'
  146. else:
  147. progress = IntProgress(min=0, max=size, value=0)
  148. label = HTML()
  149. box = VBox(children=[label, progress])
  150. display(box)
  151. index = 0
  152. try:
  153. for index, record in enumerate(sequence, 1):
  154. if index == 1 or index % every == 0:
  155. if is_iterator:
  156. label.value = '{name}: {index} / ?'.format(
  157. name=name,
  158. index=index
  159. )
  160. else:
  161. progress.value = index
  162. label.value = u'{name}: {index} / {size}'.format(
  163. name=name,
  164. index=index,
  165. size=size
  166. )
  167. yield record
  168. except:
  169. progress.bar_style = 'danger'
  170. raise
  171. else:
  172. progress.bar_style = 'success'
  173. progress.value = index
  174. label.value = "{name}: {index}".format(
  175. name=name,
  176. index=str(index or '?')
  177. )