You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 
 
 

270 lines
7.8 KiB

import ipywidgets as widgets
from IPython.display import display, HTML
# jQuery call applied to the notebook's input cells, keyed by the toggle
# state (True shows the cells, False hides them).
javascript_functions = {
    False: "hide()",
    True: "show()",
}
# Caption of the toggle button for each state (German UI text:
# "show code" / "hide code").
button_descriptions = {
    False: "Code anzeigen",
    True: "Code verstecken",
}
class oi(object):
    """Sentence-splitting helpers plus a few Jupyter notebook utilities.

    The splitters turn raw text into two parallel lists: a list of
    sentences (each a list of word strings with the terminator removed)
    and a list holding the terminator character of each sentence.
    """

    def __init__(self):
        # Characters treated as sentence terminators by every splitter.
        self.punktuation_list = ['.', '?', '!', ';', ':']

    def ReadDoc2Sent(self, document):
        """Split the text file at path *document* into sentences.

        A word ends a sentence when it is longer than two characters and
        its last character is a terminator. Sentences may span several
        lines of the file. Words after the last terminator are dropped.
        The running line count is printed every 1000 lines as a progress
        indicator.

        Returns:
            ``(splitsentences, punctuations)``: parallel lists of word
            lists and terminator characters.
        """
        splitsentences = []
        punctuations = []
        current = []
        with open(document) as lines:
            for line_number, line in enumerate(lines, 1):
                if line_number % 1000 == 0:
                    print(line_number)
                for word in line.split():
                    # A bare terminator token can never be longer than two
                    # characters, so only the last character is checked.
                    if len(word) > 2 and word[-1] in self.punktuation_list:
                        current.append(word[:-1])
                        splitsentences.append(current)
                        punctuations.append(word[-1])
                        current = []
                    else:
                        current.append(word)
        return splitsentences, punctuations

    def PrintSplitSentencesToTextFile(self, punctuations, sentences, document):
        """Append the sentences to the file *document*, one per line.

        Each line is the sentence's words joined by single spaces,
        followed directly by the matching entry of *punctuations*.

        Returns:
            The string ``'OK'``.

        Raises:
            IndexError: if *punctuations* is shorter than *sentences*.
        """
        with open(document, 'a') as doc:
            for position, words in enumerate(sentences):
                doc.write(' '.join(words) + punctuations[position] + '\n')
        return 'OK'

    def CellInputText2Splitsentences(self, sentences):
        """Split the text *sentences* into sentences without using spaCy.

        A word ends a sentence when it is itself a terminator, or when it
        is longer than two characters and ends with one. A bare terminator
        token contributes an empty string as the final word, so joining
        the words with spaces reproduces the space before it. Words after
        the last terminator are dropped.

        Returns:
            ``(splitsentences, punctuations)``: parallel lists of word
            lists and terminator characters.
        """
        splitsentences = []
        punctuations = []
        current = []
        for word in sentences.split():
            ends_sentence = word in self.punktuation_list or (
                word[-1] in self.punktuation_list and len(word) > 2
            )
            if ends_sentence:
                current.append(word[:-1])
                splitsentences.append(current)
                punctuations.append(word[-1])
                current = []
            else:
                current.append(word)
        return splitsentences, punctuations

    def _cut_at_last_punctuation(self, word):
        """Return ``(stem, marks)`` for *word*.

        *marks* lists every terminator character found in *word*, in
        order. *stem* is *word* up to, but excluding, the last of them;
        when there is none, *stem* is *word* unchanged.
        """
        marks = [char for char in word if char in self.punktuation_list]
        if not marks:
            return word, marks
        cut = max(word.rfind(mark) for mark in marks)
        return word[:cut], marks

    def CellInputText2SplitsentencesWithspacy(self, sentences):
        """Split the text *sentences* using spaCy's sentence boundaries.

        The first word of each sentence is lower-cased unless spaCy tags
        it ``NN`` or ``NE``. The result is only trusted when every
        sentence's last word carries exactly one terminator; otherwise
        the text is re-split with ``CellInputText2Splitsentences`` and
        the same lower-casing rule is applied to that result.

        Returns:
            ``(splitsentences, punctuations)``: parallel lists of word
            lists and terminator characters.
        """
        import spacy  # local import: only this method needs spaCy

        noun_tags = ('NN', 'NE')
        nlp = spacy.load('de_core_news_sm')
        doc = nlp(sentences)

        splitsentences = []
        punctuations = []
        aligned = True
        for sent in doc.sents:
            words = sent.text.split()
            if not words:
                # Whitespace-only sentence: nothing to record.
                continue
            if sent[0].tag_ not in noun_tags:
                words[0] = words[0].lower()
            stem, marks = self._cut_at_last_punctuation(words[-1])
            if len(marks) != 1:
                # Zero or several terminators would leave the two lists
                # misaligned (or blank out the last word), so give up on
                # the spaCy segmentation altogether.
                aligned = False
                break
            words[-1] = stem
            splitsentences.append(words)
            punctuations.append(marks[0])

        if not aligned:
            splitsentences, punctuations = self.CellInputText2Splitsentences(
                sentences
            )
            # NOTE(review): the original indentation was lost; this
            # lower-casing pass is assumed to belong to the fallback only.
            # A first word is lower-cased when any token with the same
            # text is tagged as something other than a noun.
            not_nouns = {
                token.text for token in doc if token.tag_ not in noun_tags
            }
            for words in splitsentences:
                if words[0] in not_nouns:
                    words[0] = words[0].lower()
        return splitsentences, punctuations

    def printSplitsentences2Text(self, punctuations, splitsentences):
        """Print every sentence followed by its terminator and a blank line.

        Returns:
            The string ``'done'``.

        Raises:
            IndexError: if *punctuations* is shorter than *splitsentences*.
        """
        # The loop previously read an undefined name ``sentences`` and so
        # raised NameError on every call.
        for position, words in enumerate(splitsentences):
            print(' '.join(words) + punctuations[position] + '\n')
        return 'done'

    def toggle_code(self, state):
        """
        Toggles the JavaScript show()/hide() function on the div.input element.
        """
        output_string = "<script>$(\"div.input\").{}</script>"
        output = output_string.format(javascript_functions[state])
        display(HTML(output))

    def button_action(self, value):
        """
        Calls the toggle_code function and updates the button description.
        """
        state = value.new
        self.toggle_code(state)
        value.owner.description = button_descriptions[state]

    def log_progress(self, sequence, every=None, size=None, name='Items'):
        """Yield the items of *sequence* while displaying a progress widget.

        Args:
            sequence: iterable to pass through unchanged.
            every: refresh the widget every *every* items. Defaults to 1
                for up to 200 items, else ``int(size / 200)``. Must be
                given when the size is unknown.
            size: number of items; taken from ``len(sequence)`` when
                omitted and available.
            name: label shown in front of the counter.

        Raises:
            AssertionError: if the size is unknown and *every* is not set.
        """
        from ipywidgets import IntProgress, HTML, VBox
        from IPython.display import display

        is_iterator = False
        if size is None:
            try:
                size = len(sequence)
            except TypeError:
                is_iterator = True
        if size is not None:
            if every is None:
                if size <= 200:
                    every = 1
                else:
                    every = int(size / 200)  # every 0.5%
        else:
            assert every is not None, 'sequence is iterator, set every'

        if is_iterator:
            # Unknown length: show a full bar in the "info" style.
            progress = IntProgress(min=0, max=1, value=1)
            progress.bar_style = 'info'
        else:
            progress = IntProgress(min=0, max=size, value=0)
        label = HTML()
        box = VBox(children=[label, progress])
        display(box)

        index = 0
        try:
            for index, record in enumerate(sequence, 1):
                if index == 1 or index % every == 0:
                    if is_iterator:
                        label.value = '{name}: {index} / ?'.format(
                            name=name,
                            index=index
                        )
                    else:
                        progress.value = index
                        label.value = u'{name}: {index} / {size}'.format(
                            name=name,
                            index=index,
                            size=size
                        )
                yield record
        except BaseException:
            # Also covers the generator being closed before exhaustion;
            # the bar is marked as failed and the exception propagates.
            progress.bar_style = 'danger'
            raise
        else:
            progress.bar_style = 'success'
            progress.value = index
            label.value = "{name}: {index}".format(
                name=name,
                index=str(index or '?')
            )