271 lines
7.8 KiB
Python
271 lines
7.8 KiB
Python
|
import ipywidgets as widgets
|
||
|
from IPython.display import display, HTML
|
||
|
|
||
|
# Maps the boolean toggle state to the jQuery method call that
# oi.toggle_code() applies to the notebook's "div.input" elements.
javascript_functions = {False: "hide()", True: "show()"}
|
||
|
# Button label for each toggle state, set by oi.button_action().
# The labels are German: "Code anzeigen" = "show code",
# "Code verstecken" = "hide code".
button_descriptions = {False: "Code anzeigen", True: "Code verstecken"}
|
||
|
|
||
|
|
||
|
|
||
|
class oi(object):
    """Notebook helpers: sentence splitting, code toggling, progress bars.

    The sentence-splitting methods all return a pair
    ``(splitsentences, punctuations)`` where ``splitsentences[i]`` is the
    list of words of sentence ``i`` and ``punctuations[i]`` is the mark
    that terminated it.
    """

    def __init__(self):
        # Characters that terminate a sentence.
        self.punktuation_list = ['.', '?', '!', ';', ':']
        # spaCy pipeline, loaded lazily on first use (loading is slow).
        self._nlp = None

    # ------------------------------------------------------------------
    # private helpers
    # ------------------------------------------------------------------
    def _ends_sentence(self, word):
        """Return True if *word* closes a sentence.

        A word closes a sentence when it is itself a punctuation mark or
        when it is longer than two characters and ends in one.
        """
        marks = self.punktuation_list
        return word in marks or (word[-1] in marks and len(word) > 2)

    def _split_words(self, words):
        """Group an iterable of words into sentences.

        The terminating mark is stripped from the last word of each
        sentence. For a stand-alone mark this leaves an empty string as
        the last word, so that ``' '.join(words) + mark`` reproduces the
        original spacing. Words after the last terminating mark are
        dropped, which keeps both returned lists the same length.
        """
        splitsentences = []
        punctuations = []
        current = []
        for word in words:
            if self._ends_sentence(word):
                current.append(word[:-1])
                splitsentences.append(current)
                punctuations.append(word[-1])
                current = []
            else:
                current.append(word)
        return splitsentences, punctuations

    def _get_nlp(self):
        """Return the German spaCy pipeline, loading it on first use."""
        nlp = getattr(self, '_nlp', None)
        if nlp is None:
            import spacy
            nlp = spacy.load('de_core_news_sm')
            self._nlp = nlp
        return nlp

    # ------------------------------------------------------------------
    # sentence splitting
    # ------------------------------------------------------------------
    def ReadDoc2Sent(self, document):
        """Read the text file *document* and split it into sentences.

        Sentences may span several lines. The line count is printed
        every 1000 lines as a progress indication.

        Returns:
            ``(splitsentences, punctuations)``, see the class docstring.
        """
        def iter_words():
            with open(document) as lines:
                for counter, line in enumerate(lines, 1):
                    if counter % 1000 == 0:
                        print(counter)
                    for word in line.split():
                        yield word

        return self._split_words(iter_words())

    def PrintSplitSentencesToTextFile(self, punctuations, sentences, document):
        """Append the sentences to the file *document*, one per line.

        Each line is the sentence's words joined by single spaces,
        followed directly by its punctuation mark.

        Raises:
            IndexError: if there are fewer punctuation marks than
                sentences. This is checked before the file is opened so
                that no partial output is written.

        Returns:
            The string ``'OK'``.
        """
        if len(punctuations) < len(sentences):
            raise IndexError('fewer punctuation marks than sentences')

        with open(document, 'a') as doc:
            doc.writelines(
                ' '.join(words) + mark + '\n'
                for words, mark in zip(sentences, punctuations)
            )
        return 'OK'

    def CellInputText2Splitsentences(self, sentences):
        """Split the text string *sentences* into sentences.

        Returns:
            ``(splitsentences, punctuations)``, see the class docstring.
        """
        return self._split_words(sentences.split())

    def CellInputText2SplitsentencesWithspacy(self, sentences):
        """Split *sentences* using spaCy's sentence segmentation.

        The first word of every sentence is lower-cased unless spaCy
        tags the first token as a noun ('NN') or proper noun ('NE').
        If the spaCy pass does not produce exactly one punctuation mark
        per sentence in total, the result is discarded and the plain
        whitespace splitter is used instead, with the same lower-casing
        rule applied afterwards.

        Requires the ``de_core_news_sm`` spaCy model to be installed.

        Returns:
            ``(splitsentences, punctuations)``, see the class docstring.
        """
        noun_tags = ('NN', 'NE')
        punctuations = []
        splitsentences = []
        doc = self._get_nlp()(sentences)

        for sent in doc.sents:
            tokens = sent.text.split()
            if not tokens:
                # Whitespace-only span: nothing to record.
                continue

            # Capture the last word before any lower-casing, because the
            # cut position below is computed on the original text.
            last_word = tokens[-1]
            if sent[0].tag_ not in noun_tags:
                tokens[0] = tokens[0].lower()

            # Every punctuation character in the last word is recorded.
            # The word is cut just before the last one found, or emptied
            # if there is none; the length check below then triggers
            # the fallback.
            cut = 1
            for position, letter in enumerate(last_word, 1):
                if letter in self.punktuation_list:
                    punctuations.append(letter)
                    cut = position
            tokens[-1] = tokens[-1][:cut - 1]
            splitsentences.append(tokens)

        if len(splitsentences) != len(punctuations):
            # spaCy's segmentation did not line up with the punctuation;
            # fall back to the whitespace-based splitter.
            splitsentences, punctuations = \
                self.CellInputText2Splitsentences(sentences)
            for words in splitsentences:
                for token in doc:
                    if token.text == words[0] and token.tag_ not in noun_tags:
                        words[0] = words[0].lower()

        return splitsentences, punctuations

    def printSplitsentences2Text(self, punctuations, splitsentences):
        """Print every sentence followed by its mark and a blank line.

        Raises:
            IndexError: if there are fewer punctuation marks than
                sentences.

        Returns:
            The string ``'done'``.
        """
        if len(punctuations) < len(splitsentences):
            raise IndexError('fewer punctuation marks than sentences')

        for words, mark in zip(splitsentences, punctuations):
            print(' '.join(words) + mark + '\n')
        return 'done'

    # ------------------------------------------------------------------
    # notebook UI helpers
    # ------------------------------------------------------------------
    def toggle_code(self, state):
        """Show or hide the ``div.input`` elements via jQuery.

        *state* selects the call from the module-level
        ``javascript_functions`` mapping: True gives ``show()`` and
        False gives ``hide()``.
        """
        output = "<script>$(\"div.input\").{}</script>".format(
            javascript_functions[state]
        )
        display(HTML(output))

    def button_action(self, value):
        """Toggle code visibility and update the button description.

        *value* must provide ``.new`` (the new toggle state) and
        ``.owner`` (the object whose ``description`` is updated).
        Presumably it is the change notification of an ipywidgets
        toggle button; confirm against the caller.
        """
        state = value.new
        self.toggle_code(state)
        value.owner.description = button_descriptions[state]

    def log_progress(self, sequence, every=None, size=None, name='Items'):
        """Yield the items of *sequence* while displaying a progress bar.

        Args:
            sequence: iterable to walk through.
            every: refresh the display every *every* items. Defaults to
                1 for up to 200 items, otherwise to 0.5% of *size*. It
                must be given when the size is unknown.
            size: number of items; taken from ``len(sequence)`` if
                omitted.
            name: label shown in front of the counter.
        """
        # ipywidgets.HTML (a widget) is imported locally under the name
        # HTML, so it is the widget that is used inside this method.
        from ipywidgets import IntProgress, HTML, VBox
        from IPython.display import display

        is_iterator = False
        if size is None:
            try:
                size = len(sequence)
            except TypeError:
                is_iterator = True
        if size is not None:
            if every is None:
                if size <= 200:
                    every = 1
                else:
                    every = int(size / 200)  # every 0.5%
        else:
            assert every is not None, 'sequence is iterator, set every'

        if is_iterator:
            progress = IntProgress(min=0, max=1, value=1)
            progress.bar_style = 'info'
        else:
            progress = IntProgress(min=0, max=size, value=0)
        label = HTML()
        box = VBox(children=[label, progress])
        display(box)

        index = 0
        try:
            for index, record in enumerate(sequence, 1):
                if index == 1 or index % every == 0:
                    if is_iterator:
                        label.value = '{name}: {index} / ?'.format(
                            name=name,
                            index=index
                        )
                    else:
                        progress.value = index
                        label.value = u'{name}: {index} / {size}'.format(
                            name=name,
                            index=index,
                            size=size
                        )
                yield record
        except BaseException:
            # Any interruption turns the bar red; the exception is then
            # propagated unchanged.
            progress.bar_style = 'danger'
            raise
        else:
            progress.bar_style = 'success'
            progress.value = index
            label.value = "{name}: {index}".format(
                name=name,
                index=str(index or '?')
            )
|
||
|
|
||
|
|