|
|
- import ipywidgets as widgets
- from IPython.display import display, HTML
-
# jQuery call applied to the notebook's ``div.input`` elements for each
# toggle state (False -> hide the code cells, True -> show them).
javascript_functions = {
    False: "hide()",
    True: "show()",
}

# Button caption shown for each toggle state (user-facing text, German).
button_descriptions = {
    False: "Code anzeigen",
    True: "Code verstecken",
}
-
-
-
class oi(object):
    """Sentence-splitting helpers plus small Jupyter notebook UI utilities.

    The splitting methods all return a pair ``(splitsentences, punctuations)``:
    ``splitsentences[i]`` is the list of words of the i-th sentence (without
    its terminating punctuation mark) and ``punctuations[i]`` is that mark.
    Words that follow the last terminating mark are not returned.
    """

    # Part-of-speech tags treated as nouns; a sentence-initial word carrying
    # one of these tags keeps its capitalisation.
    _NOUN_TAGS = ('NN', 'NE')

    def __init__(self):
        # Characters that terminate a sentence.
        self.punktuation_list = ['.', '?', '!', ';', ':']
        # Lazily loaded spaCy pipeline (see CellInputText2SplitsentencesWithspacy).
        self._nlp = None

    # ------------------------------------------------------------------
    # private helpers
    # ------------------------------------------------------------------
    def _is_sentence_end(self, word):
        """Return True if *word* terminates a sentence.

        A word terminates a sentence when it is itself a punctuation mark, or
        when it is longer than two characters and ends with one.  Shorter
        words such as ``"a."`` are kept as ordinary words (presumably to
        avoid splitting on short abbreviations -- confirm with the author).
        """
        marks = self.punktuation_list
        return word in marks or (len(word) > 2 and word[-1] in marks)

    def _split_words(self, words):
        """Group an iterable of whitespace-separated words into sentences.

        A standalone punctuation token contributes an empty trailing word, so
        ``' '.join(sentence) + mark`` reproduces the original spacing
        (``"Hallo Welt ."``).
        """
        splitsentences = []
        punctuations = []
        current = []
        for word in words:
            if self._is_sentence_end(word):
                current.append(word[:-1])
                splitsentences.append(current)
                punctuations.append(word[-1])
                current = []
            else:
                current.append(word)
        return splitsentences, punctuations

    @staticmethod
    def _join_sentences(sentences, punctuations):
        """Return one ``"word word<mark>"`` string per sentence.

        Raises:
            IndexError: if there are fewer punctuation marks than sentences.
        """
        if len(punctuations) < len(sentences):
            raise IndexError(
                'fewer punctuation marks ({}) than sentences ({})'.format(
                    len(punctuations), len(sentences)
                )
            )
        return [' '.join(words) + mark
                for words, mark in zip(sentences, punctuations)]

    # ------------------------------------------------------------------
    # sentence splitting
    # ------------------------------------------------------------------
    def ReadDoc2Sent(self, document, encoding=None):
        """Read the text file *document* and split it into sentences.

        Sentences may span several lines.  The line number is printed every
        1000 lines as a progress indicator.

        Args:
            document: path of the text file to read.
            encoding: text encoding passed to ``open``; ``None`` keeps the
                platform default.

        Returns:
            ``(splitsentences, punctuations)`` as described on the class.
        """
        def words_in_file():
            with open(document, encoding=encoding) as lines:
                for line_number, line in enumerate(lines, 1):
                    if line_number % 1000 == 0:
                        print(line_number)
                    for word in line.split():
                        yield word

        return self._split_words(words_in_file())

    def PrintSplitSentencesToTextFile(self, punctuations, sentences, document,
                                      encoding=None):
        """Append every sentence, followed by its mark, as a line of *document*.

        Args:
            punctuations: terminating mark of each sentence.
            sentences: list of word lists.
            document: path of the file to append to.
            encoding: text encoding passed to ``open``; ``None`` keeps the
                platform default.

        Returns:
            The string ``'OK'``.

        Raises:
            IndexError: if there are fewer punctuation marks than sentences;
                raised before anything is written.
        """
        lines = self._join_sentences(sentences, punctuations)
        with open(document, 'a', encoding=encoding) as doc:
            doc.writelines(line + '\n' for line in lines)
        return 'OK'

    def CellInputText2Splitsentences(self, sentences):
        """Split the string *sentences* into sentences on whitespace only.

        Returns:
            ``(splitsentences, punctuations)`` as described on the class.
        """
        return self._split_words(sentences.split())

    def CellInputText2SplitsentencesWithspacy(self, sentences):
        """Split *sentences* with spaCy and lower-case non-noun first words.

        spaCy's sentence segmentation is used when every sentence ends in a
        word containing exactly one punctuation mark.  Otherwise the text is
        re-split with the plain whitespace rule and spaCy is only consulted
        for the part-of-speech tag of each sentence's first word.

        Returns:
            ``(splitsentences, punctuations)`` as described on the class.
        """
        import spacy  # local import: only this method needs spaCy

        # Load the model once per instance instead of on every call.
        nlp = getattr(self, '_nlp', None)
        if nlp is None:
            nlp = self._nlp = spacy.load('de_core_news_sm')
        doc = nlp(sentences)

        splitsentences = []
        punctuations = []
        parsed_cleanly = True
        for sent in doc.sents:
            words = sent.text.split()
            if not words:
                # Whitespace-only sentence: nothing to record.
                continue
            if sent[0].tag_ not in self._NOUN_TAGS:
                words[0] = words[0].lower()

            last_word = words[-1]
            mark_positions = [position
                              for position, letter in enumerate(last_word)
                              if letter in self.punktuation_list]
            if len(mark_positions) != 1:
                # Zero or several marks: the sentence cannot be paired with a
                # single mark.  Decide this per sentence, because comparing
                # total counts lets errors in two sentences cancel out.
                parsed_cleanly = False
                break

            cut = mark_positions[0]
            punctuations.append(last_word[cut])
            words[-1] = last_word[:cut]
            splitsentences.append(words)

        if parsed_cleanly:
            return splitsentences, punctuations

        # Fallback: plain whitespace split, then lower-case a first word when
        # a token with the same text is tagged as something other than a noun.
        splitsentences, punctuations = self._split_words(sentences.split())
        for words in splitsentences:
            first_word = words[0]
            if any(token.text == first_word
                   and token.tag_ not in self._NOUN_TAGS
                   for token in doc):
                words[0] = first_word.lower()
        return splitsentences, punctuations

    def printSplitsentences2Text(self, punctuations, splitsentences):
        """Print every sentence followed by its mark and a blank line.

        Returns:
            The string ``'done'``.

        Raises:
            IndexError: if there are fewer punctuation marks than sentences.
        """
        for line in self._join_sentences(splitsentences, punctuations):
            print(line + '\n')
        return 'done'

    # ------------------------------------------------------------------
    # notebook helpers
    # ------------------------------------------------------------------
    def toggle_code(self, state):
        """
        Toggles the JavaScript show()/hide() function on the div.input element.
        """
        output_string = "<script>$(\"div.input\").{}</script>"
        output = output_string.format(javascript_functions[state])
        display(HTML(output))

    def button_action(self, value):
        """
        Calls the toggle_code function and updates the button description.

        *value* is the change object handed to the callback; its ``new``
        attribute is the new toggle state and ``owner`` is the button.
        """
        state = value.new
        self.toggle_code(state)
        value.owner.description = button_descriptions[state]

    def log_progress(self, sequence, every=None, size=None, name='Items'):
        """Yield the items of *sequence* while updating a progress widget.

        Args:
            sequence: iterable to wrap.
            every: update the widget every *every* items.  Defaults to 1 for
                up to 200 items and to ``size / 200`` otherwise; it must be
                given when the size is unknown.
            size: number of items; taken from ``len(sequence)`` when omitted.
            name: label shown in front of the counter.

        The bar is marked ``'danger'`` if iteration is interrupted by an
        exception and ``'success'`` when the sequence is exhausted.
        """
        from ipywidgets import IntProgress, HTML, VBox
        from IPython.display import display

        is_iterator = False
        if size is None:
            try:
                size = len(sequence)
            except TypeError:
                is_iterator = True
        if size is not None:
            if every is None:
                if size <= 200:
                    every = 1
                else:
                    every = int(size / 200)  # every 0.5%
        else:
            assert every is not None, 'sequence is iterator, set every'

        if is_iterator:
            # Unknown length: show a full bar and only count items.
            progress = IntProgress(min=0, max=1, value=1)
            progress.bar_style = 'info'
        else:
            progress = IntProgress(min=0, max=size, value=0)
        label = HTML()
        box = VBox(children=[label, progress])
        display(box)

        index = 0
        try:
            for index, record in enumerate(sequence, 1):
                if index == 1 or index % every == 0:
                    if is_iterator:
                        label.value = '{name}: {index} / ?'.format(
                            name=name,
                            index=index
                        )
                    else:
                        progress.value = index
                        label.value = u'{name}: {index} / {size}'.format(
                            name=name,
                            index=index,
                            size=size
                        )
                yield record
        except BaseException:
            # Flag the bar, then let the original exception propagate.
            progress.bar_style = 'danger'
            raise
        else:
            progress.bar_style = 'success'
            progress.value = index
            label.value = "{name}: {index}".format(
                name=name,
                index=str(index or '?')
            )
-
-
|