try:
    import ipywidgets as widgets
    from IPython.display import display, HTML
except ImportError:
    # The Jupyter stack is only needed by the notebook helpers (toggle_code,
    # button_action, log_progress).  Keep the module importable without it so
    # the text-splitting methods stay usable in a plain interpreter.
    widgets = display = HTML = None

# JavaScript call applied to the notebook input cells for each toggle state.
javascript_functions = {False: "hide()", True: "show()"}
# Button caption shown for each toggle state (German UI strings).
button_descriptions = {False: "Code anzeigen", True: "Code verstecken"}

# spaCy fine-grained tags that this module treats as nouns (their first
# letter is kept capitalised at the start of a sentence).
_NOUN_TAGS = ('NN', 'NE')


class oi(object):
    """Sentence-splitting helpers plus a few Jupyter notebook utilities."""

    def __init__(self):
        # Characters that terminate a sentence.
        self.punktuation_list = ['.', '?', '!', ';', ':']

    # ------------------------------------------------------------------
    # internal helpers
    # ------------------------------------------------------------------
    def _ends_sentence(self, word):
        """Return True when *word* closes the current sentence.

        A word closes a sentence when it is a punctuation mark on its own,
        or when it is longer than two characters and ends in one.  The
        length limit keeps two-character tokens such as ``a.`` intact.
        """
        marks = self.punktuation_list
        return word in marks or (word[-1] in marks and len(word) > 2)

    def _split_words(self, words, pending=None):
        """Group *words* into sentences at sentence-ending words.

        Args:
            words: iterable of non-empty whitespace-free tokens.
            pending: words of a sentence started earlier (may be None).

        Returns:
            ``(splitsentences, punctuations, pending)`` where *pending* holds
            the trailing words that have not been closed by punctuation yet.
        """
        splitsentences = []
        punctuations = []
        current = [] if pending is None else pending
        for word in words:
            if self._ends_sentence(word):
                # Strip the mark; a stand-alone mark leaves '' so that
                # re-joining with ' ' restores the space in front of it.
                current.append(word[:-1])
                splitsentences.append(current)
                punctuations.append(word[-1])
                current = []
            else:
                current.append(word)
        return splitsentences, punctuations, current

    def _load_spacy_model(self):
        """Return the German spaCy pipeline, loading it on first use only."""
        nlp = getattr(self, '_nlp', None)
        if nlp is None:
            import spacy  # heavy optional dependency, imported lazily
            nlp = spacy.load('de_core_news_sm')
            self._nlp = nlp
        return nlp

    # ------------------------------------------------------------------
    # text splitting
    # ------------------------------------------------------------------
    def ReadDoc2Sent(self, document):
        """Read the text file *document* and split it into sentences.

        Sentences may span several lines.  Words after the last
        sentence-ending mark are dropped, so both returned lists always
        have the same length.  The line number is printed every 1000 lines
        as a progress indication.

        Returns:
            ``(splitsentences, punctuations)``: a list of word lists and the
            list of the marks that closed them.
        """
        splitsentences = []
        punctuations = []
        pending = []
        with open(document) as lines:
            for counter, line in enumerate(lines, 1):
                if counter % 1000 == 0:
                    print(counter)
                finished, marks, pending = self._split_words(
                    line.split(), pending
                )
                splitsentences.extend(finished)
                punctuations.extend(marks)
        return splitsentences, punctuations

    def PrintSplitSentencesToTextFile(self, punctuations, sentences, document):
        """Append one line per sentence (words joined by spaces + mark) to
        the file *document*.

        Raises:
            IndexError: if *punctuations* is shorter than *sentences*.

        Returns:
            The string ``'OK'``.
        """
        with open(document, 'a') as doc:
            for n, words in enumerate(sentences):
                doc.write(' '.join(words) + punctuations[n] + '\n')
        return 'OK'

    def CellInputText2Splitsentences(self, sentences):
        """Split the string *sentences* into word lists and closing marks.

        Words after the last sentence-ending mark are dropped.

        Returns:
            ``(splitsentences, punctuations)`` of equal length.
        """
        splitsentences, punctuations, _ = self._split_words(sentences.split())
        return splitsentences, punctuations

    def CellInputText2SplitsentencesWithspacy(self, sentences):
        """Split *sentences* with spaCy and lower-case non-noun first words.

        spaCy's sentence segmentation is used first.  If any sentence does
        not end in exactly one punctuation mark, the result is discarded and
        the text is re-split on whitespace (as in
        ``CellInputText2Splitsentences``); the first word of each sentence
        is then lower-cased whenever spaCy tagged an equal token as
        something other than NN/NE.

        Returns:
            ``(splitsentences, punctuations)``.
        """
        nlp = self._load_spacy_model()
        doc = nlp(sentences)

        splitsentences = []
        punctuations = []
        needs_fallback = False
        for sent in doc.sents:
            words = sent.text.split()
            if not words:
                # Whitespace-only span: nothing to split.
                continue
            last_word = words[-1]
            if sent[0].tag_ not in _NOUN_TAGS:
                words[0] = words[0].lower()

            # Collect every mark in the last word; cut at the last one.
            found = 0
            cut = 1
            for position, letter in enumerate(last_word, 1):
                if letter in self.punktuation_list:
                    punctuations.append(letter)
                    found += 1
                    cut = position
            if found != 1:
                needs_fallback = True
            words[-1] = words[-1][:cut - 1]
            splitsentences.append(words)

        if needs_fallback or len(splitsentences) != len(punctuations):
            splitsentences, punctuations, _ = self._split_words(
                sentences.split()
            )
            for sentence in splitsentences:
                for token in doc:
                    if (token.text == sentence[0]
                            and token.tag_ not in _NOUN_TAGS):
                        sentence[0] = sentence[0].lower()
        return splitsentences, punctuations

    def printSplitsentences2Text(self, punctuations, splitsentences):
        """Print every sentence (words joined by spaces + mark), each
        followed by an empty line.

        Raises:
            IndexError: if *punctuations* is shorter than *splitsentences*.

        Returns:
            The string ``'done'``.
        """
        for n, words in enumerate(splitsentences):
            print(' '.join(words) + punctuations[n] + '\n')
        return 'done'

    # ------------------------------------------------------------------
    # notebook helpers
    # ------------------------------------------------------------------
    def toggle_code(self, state):
        """
        Toggles the JavaScript show()/hide() function on the div.input element.
        """
        from IPython.display import HTML, display

        # The template must contain the placeholder; an empty template
        # would silently emit nothing.
        output_string = "<script>$(\"div.input\").{}</script>"
        output_args = (javascript_functions[state],)
        output = output_string.format(*output_args)
        display(HTML(output))

    def button_action(self, value):
        """
        Calls the toggle_code function and updates the button description.

        *value* is the change object passed by the widget: its ``new``
        attribute is the new toggle state and ``owner`` is the button.
        """
        state = value.new
        self.toggle_code(state)
        value.owner.description = button_descriptions[state]

    def log_progress(self, sequence, every=None, size=None, name='Items'):
        """Yield the items of *sequence* while updating a progress widget.

        Args:
            sequence: iterable to consume.
            every: refresh the widget every *every* items; derived from
                *size* when omitted (must be given if the size is unknown).
            size: number of items; ``len(sequence)`` is used when omitted.
            name: label shown in front of the counter.
        """
        from ipywidgets import IntProgress, HTML, VBox
        from IPython.display import display

        is_iterator = False
        if size is None:
            try:
                size = len(sequence)
            except TypeError:
                is_iterator = True
        if size is not None:
            if every is None:
                if size <= 200:
                    every = 1
                else:
                    every = int(size / 200)  # every 0.5%
        else:
            assert every is not None, 'sequence is iterator, set every'

        if is_iterator:
            # Unknown length: show a full bar in "info" style.
            progress = IntProgress(min=0, max=1, value=1)
            progress.bar_style = 'info'
        else:
            progress = IntProgress(min=0, max=size, value=0)
        label = HTML()
        box = VBox(children=[label, progress])
        display(box)

        index = 0
        try:
            for index, record in enumerate(sequence, 1):
                if index == 1 or index % every == 0:
                    if is_iterator:
                        label.value = '{name}: {index} / ?'.format(
                            name=name,
                            index=index
                        )
                    else:
                        progress.value = index
                        label.value = u'{name}: {index} / {size}'.format(
                            name=name,
                            index=index,
                            size=size
                        )
                yield record
        except BaseException:
            # Same reach as a bare except: also flags an abandoned generator;
            # the exception is always re-raised.
            progress.bar_style = 'danger'
            raise
        else:
            progress.bar_style = 'success'
            progress.value = index
            label.value = "{name}: {index}".format(
                name=name,
                index=str(index or '?')
            )