diff --git a/Prototyp/.ipynb_checkpoints/Prototype-checkpoint.ipynb b/Prototyp/.ipynb_checkpoints/Prototype-checkpoint.ipynb index 70925802..e6d0b42d 100644 --- a/Prototyp/.ipynb_checkpoints/Prototype-checkpoint.ipynb +++ b/Prototyp/.ipynb_checkpoints/Prototype-checkpoint.ipynb @@ -241,7 +241,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "loading SentSeg Databases\n", + "loading SolveShorts Databases\n", "Creating the bag of words...\n", "\n", "dumping the data to hkl format..\n", @@ -257,7 +257,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "6793c5121aaf498e8960726a40709e19", + "model_id": "82646fa586ba44aabc1608ec7a268b2c", "version_major": 2, "version_minor": 0 }, @@ -315,6 +315,10 @@ "import dill\n", "dill.load_session('voilastate.db')\n", "\n", + "#from SolveShorts import *\n", + "#print('loading SolveShorts Databases')\n", + "#solSh.load_DB_into_FASTsearch()\n", + "\n", "#import SentSeg\n", "#sent_seg = SentSeg.SentSeg('de')\n", "#print('loading SentSeg Databases')\n", @@ -440,7 +444,7 @@ "outputs": [], "source": [ "import ipywidgets\n", - "out = 'Hier kommt der übersetzte Text raus.'\n", + "out = 'Hier kommt der übersetzte Text heraus.'\n", "\n", "TextFeld.add_class(\"thotext\")\n", "TextFelddouble.add_class(\"thotext\")\n", @@ -470,7 +474,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "c833de5ff5d340bbb1988584eee0c368", + "model_id": "9e547f27f67f484c9b455ead6f63afb2", "version_major": 2, "version_minor": 0 }, @@ -607,7 +611,7 @@ " check = 1\n", " \n", " #print('sentences after cs', outsentences)\n", - " #print(len(punctuations))\n", + " print(len(punctuations))\n", " \n", " if FremdWB.value == True:\n", " if check == 1:\n", @@ -617,6 +621,7 @@ " outsentences, punctuations = fwb.fremdEintragAppend(insentences, punctuations)\n", " #print('outsentences')\n", " check = 1\n", + " \n", " if Medio.value == True:\n", " if check == 1:\n", " insentences = outsentences\n", @@ -630,7 +635,7 @@ " \n", " #print('sentences after fwb', outsentences)\n", " \n", - " #print(len(punctuations))\n", + " print(len(punctuations))\n", " \n", " if check == 1:\n", " out = ''\n", @@ -669,12 +674,12 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "aa136a24ef044b4fb5d10f6c9278d35f", + "model_id": "0d27a028dcb449e2a2a6a7dfd25acd49", "version_major": 2, "version_minor": 0 }, "text/plain": [ - "interactive(children=(Button(description='Übersetzen', style=ButtonStyle()), Output()), _dom_classes=('widget-…" + "interactive(children=(Button(description='Übersetzen in Leichte Sprache', style=ButtonStyle()), Output()), _do…" ] }, "metadata": {}, @@ -690,7 +695,7 @@ "\n", "#print(widgets.interact_manual.opts)\n", "\n", - "widgets.interact_manual.opts['manual_name']= 'Übersetzen'\n", + "widgets.interact_manual.opts['manual_name']= 'Übersetzen in Leichte Sprache'\n", "ola = widgets.interact_manual(function)\n", "ola.widget.children[0].style.button_color = 'lightgreen'\n", "ola.widget.children[0].layout.height = '50px'\n", @@ -709,12 +714,12 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "1474be19da7a4b1bbd7fee229dd5a8ee", + "model_id": "564058b35ab743fabff90d4c49c5aac3", "version_major": 2, "version_minor": 0 }, "text/plain": [ - "Textarea(value='', layout=Layout(height='180px', width='99%'), placeholder='Hier kommt der übersetzte Text rau…" + "Textarea(value='', layout=Layout(height='180px', width='99%'), placeholder='Hier kommt der übersetzte Text her…" ] }, "metadata": {}, @@ -831,7 +836,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "c2f3338821ae4ee59d205af8cb1083a8", + "model_id": "2e67ffb1c4ec4ddeb2c18935f4d0fdc4", "version_major": 2, "version_minor": 0 }, @@ -857,7 +862,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "1182e1d2f6c44b9ab0d50c9d388a2765", + "model_id": "f8e8a92efa8e41bbb3efe44c35c37ec1", "version_major": 2, "version_minor": 0 }, diff --git a/Prototyp/DataBaseOneZeroshkldbFremd_WB1.hkl b/Prototyp/DataBaseOneZeroshkldbFremd_WB1.hkl index b8950323..1e744cbc 100644 Binary files a/Prototyp/DataBaseOneZeroshkldbFremd_WB1.hkl and b/Prototyp/DataBaseOneZeroshkldbFremd_WB1.hkl differ diff --git a/Prototyp/DataBaseOneZeroshkldbFremd_WB2.hkl b/Prototyp/DataBaseOneZeroshkldbFremd_WB2.hkl index a1442e0f..c1c59733 100644 Binary files a/Prototyp/DataBaseOneZeroshkldbFremd_WB2.hkl and b/Prototyp/DataBaseOneZeroshkldbFremd_WB2.hkl differ diff --git a/Prototyp/FremdWB.py b/Prototyp/FremdWB.py index 041c800e..be44f6f4 100644 --- a/Prototyp/FremdWB.py +++ b/Prototyp/FremdWB.py @@ -176,8 +176,9 @@ class FremdWB(object): #print('the endsentence',sentence) except: - print('konnte nicht' + str(sentence) + 'in FremdWB prozessieren..') - if sentence != outsentences[-1]: + #print('konnte nicht' + str(sentence) + 'in FremdWB prozessieren..') + #print('outsentence und co ', outsentences[-1], eintrag, sentence) + if sentence != outsentences[-1] and alleeintraege[-1] != outsentences[-1]: outsentences.append(sentence) punctuations = oldpunctuations return outsentences, punctuations diff --git a/Prototyp/Prototype.ipynb b/Prototyp/Prototype.ipynb index 70925802..e6d0b42d 100644 --- a/Prototyp/Prototype.ipynb +++ b/Prototyp/Prototype.ipynb @@ -241,7 +241,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "loading SentSeg Databases\n", + "loading SolveShorts Databases\n", "Creating the bag of words...\n", "\n", "dumping the data to hkl format..\n", @@ -257,7 +257,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "6793c5121aaf498e8960726a40709e19", + "model_id": "82646fa586ba44aabc1608ec7a268b2c", "version_major": 2, "version_minor": 0 }, @@ -315,6 +315,10 @@ "import dill\n", "dill.load_session('voilastate.db')\n", "\n", + "#from SolveShorts import *\n", + "#print('loading SolveShorts Databases')\n", + "#solSh.load_DB_into_FASTsearch()\n", + "\n", "#import SentSeg\n", "#sent_seg = SentSeg.SentSeg('de')\n", "#print('loading SentSeg Databases')\n", @@ -440,7 +444,7 @@ "outputs": [], "source": [ "import ipywidgets\n", - "out = 'Hier kommt der übersetzte Text raus.'\n", + "out = 'Hier kommt der übersetzte Text heraus.'\n", "\n", "TextFeld.add_class(\"thotext\")\n", "TextFelddouble.add_class(\"thotext\")\n", @@ -470,7 +474,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "c833de5ff5d340bbb1988584eee0c368", + "model_id": "9e547f27f67f484c9b455ead6f63afb2", "version_major": 2, "version_minor": 0 }, @@ -607,7 +611,7 @@ " check = 1\n", " \n", " #print('sentences after cs', outsentences)\n", - " #print(len(punctuations))\n", + " print(len(punctuations))\n", " \n", " if FremdWB.value == True:\n", " if check == 1:\n", @@ -617,6 +621,7 @@ " outsentences, punctuations = fwb.fremdEintragAppend(insentences, punctuations)\n", " #print('outsentences')\n", " check = 1\n", + " \n", " if Medio.value == True:\n", " if check == 1:\n", " insentences = outsentences\n", @@ -630,7 +635,7 @@ " \n", " #print('sentences after fwb', outsentences)\n", " \n", - " #print(len(punctuations))\n", + " print(len(punctuations))\n", " \n", " if check == 1:\n", " out = ''\n", @@ -669,12 +674,12 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "aa136a24ef044b4fb5d10f6c9278d35f", + "model_id": "0d27a028dcb449e2a2a6a7dfd25acd49", "version_major": 2, "version_minor": 0 }, "text/plain": [ - "interactive(children=(Button(description='Übersetzen', style=ButtonStyle()), Output()), _dom_classes=('widget-…" + "interactive(children=(Button(description='Übersetzen in Leichte Sprache', style=ButtonStyle()), Output()), _do…" ] }, "metadata": {}, @@ -690,7 +695,7 @@ "\n", "#print(widgets.interact_manual.opts)\n", "\n", - "widgets.interact_manual.opts['manual_name']= 'Übersetzen'\n", + "widgets.interact_manual.opts['manual_name']= 'Übersetzen in Leichte Sprache'\n", "ola = widgets.interact_manual(function)\n", "ola.widget.children[0].style.button_color = 'lightgreen'\n", "ola.widget.children[0].layout.height = '50px'\n", @@ -709,12 +714,12 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "1474be19da7a4b1bbd7fee229dd5a8ee", + "model_id": "564058b35ab743fabff90d4c49c5aac3", "version_major": 2, "version_minor": 0 }, "text/plain": [ - "Textarea(value='', layout=Layout(height='180px', width='99%'), placeholder='Hier kommt der übersetzte Text rau…" + "Textarea(value='', layout=Layout(height='180px', width='99%'), placeholder='Hier kommt der übersetzte Text her…" ] }, "metadata": {}, @@ -831,7 +836,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "c2f3338821ae4ee59d205af8cb1083a8", + "model_id": "2e67ffb1c4ec4ddeb2c18935f4d0fdc4", "version_major": 2, "version_minor": 0 }, @@ -857,7 +862,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "1182e1d2f6c44b9ab0d50c9d388a2765", + "model_id": "f8e8a92efa8e41bbb3efe44c35c37ec1", "version_major": 2, "version_minor": 0 }, diff --git a/Prototyp/SolveShorts.py b/Prototyp/SolveShorts.py index 7e412a09..16e7c898 100644 --- a/Prototyp/SolveShorts.py +++ b/Prototyp/SolveShorts.py @@ -91,14 +91,19 @@ class SolveShorts(object): doc = self.nlp(' '.join(sentence)) #print('da sentence', sentence) newshorts = [] - for word in sentence: - + wordcount = 0 + for oriword in sentence: + wordcount += 1 + if wordcount == len(sentence): + word = oriword + '.' + else: + word = oriword newshort = [] prenewshort = [] punctcount = list(word).count('.') #print(word, list(word), punctcount) if punctcount > 1: - replaceindex = sentence.index(word) + replaceindex = sentence.index(oriword) dacount = 0 for letter in list(word): #print('letter in word split', letter) @@ -153,11 +158,18 @@ class SolveShorts(object): interestingindex = m break if len(sentence) - n <= 5 and n != len(sentence) - 1: - for m in range((len(sentence) - n) - 1): - #print(n, m, n+m+1, len(sentence)) - if sentence[n + m][-1] == '.' and sentence[n + m + 1][-1] != '.': - interestingindex = m - break + for m in range((len(sentence) - n)): + #print('oleolaolu',n, m, n+m+1, len(sentence)) + + if m == (len(sentence) - n) - 1: + if sentence[n + m][-1] == '.': + interestingindex = m + break + else: + + if sentence[n + m][-1] == '.' and sentence[n + m + 1][-1] != '.' : + interestingindex = m + break #print(interestingindex, 'interestingindex') if interestingindex == 0: diff --git a/Prototyp/Verbesserungen/Input143.txt b/Prototyp/Verbesserungen/Input143.txt new file mode 100644 index 00000000..0f65af11 --- /dev/null +++ b/Prototyp/Verbesserungen/Input143.txt @@ -0,0 +1,2 @@ +mit dem Rechtsmittel soll die mittlerweile seit 439 Tagen (Stand Dienstag) andauernde Prüfung nach mehrfach enttäuschten Versprechen gegenüber der Bürger-Ini auf juristischem Weg erzwungen werden . +die Volks-Initiative klagt seit Mai gegen die lange Prüfdauer ihres Anliegens, große Wohnungskonzerne gegen Entschädigungen zu vergesellschaften und spricht von Verschleppung . diff --git a/Prototyp/Verbesserungen/Output143.txt b/Prototyp/Verbesserungen/Output143.txt new file mode 100644 index 00000000..d7ee4c7e --- /dev/null +++ b/Prototyp/Verbesserungen/Output143.txt @@ -0,0 +1,3 @@ +mit dem Rechtsmittel soll die mittlerweile seit 439 Tagen (Stand Dienstag) andauernde Pruefung nach mehrfach enttaeuschten Versprechen gegenueber der Buerger-Ini auf juristischem Weg erzwungen werden . +die Volks-Initiative klagt seit Mai gegen die lange Pruefdauer ihres Anliegens . +Grosse Wohnungskonzerne gegen Entschaedigungen zu vergesellschaften und spricht von Verschleppung . diff --git a/Prototyp/Verbesserungen/Verbesserungen143.txt b/Prototyp/Verbesserungen/Verbesserungen143.txt new file mode 100644 index 00000000..4011899d --- /dev/null +++ b/Prototyp/Verbesserungen/Verbesserungen143.txt @@ -0,0 +1,5 @@ +mit dem Rechtsmittel soll die Pruefung nach mehrfach enttaeuschten Versprechen gegenueber der Buerger-Ini auf juristischen Weg erzwungen werden . +die Pruefung dauert mittlerweile seit 439 Tagen an . +die Volks-Initiative klagt seit Mai gegen die lange Pruefdauer ihres Anliegens . +ihr Anliegen ist Grosse Wohnungskonzerne gegen Entschaedigungen zu vergesellschaften . +und die Volks-Initiative spricht von Verschleppung . diff --git a/Prototyp/Verbesserungen/indexDerVerbesserungen.txt b/Prototyp/Verbesserungen/indexDerVerbesserungen.txt index 83248fb9..aa59885c 100644 --- a/Prototyp/Verbesserungen/indexDerVerbesserungen.txt +++ b/Prototyp/Verbesserungen/indexDerVerbesserungen.txt @@ -1 +1 @@ -142 \ No newline at end of file +143 \ No newline at end of file diff --git a/Prototyp/__pycache__/FremdWB.cpython-35.pyc b/Prototyp/__pycache__/FremdWB.cpython-35.pyc index a4630b77..ccbcdc56 100644 Binary files a/Prototyp/__pycache__/FremdWB.cpython-35.pyc and b/Prototyp/__pycache__/FremdWB.cpython-35.pyc differ diff --git a/Prototyp/__pycache__/SolveShorts.cpython-35.pyc b/Prototyp/__pycache__/SolveShorts.cpython-35.pyc index 747d0252..b734f0f0 100644 Binary files a/Prototyp/__pycache__/SolveShorts.cpython-35.pyc and b/Prototyp/__pycache__/SolveShorts.cpython-35.pyc differ diff --git a/Prototyp/bagofwordshkldbFremd_WB1.pkl b/Prototyp/bagofwordshkldbFremd_WB1.pkl index 046aaad7..8b0ee7f1 100644 Binary files a/Prototyp/bagofwordshkldbFremd_WB1.pkl and b/Prototyp/bagofwordshkldbFremd_WB1.pkl differ diff --git a/Prototyp/bagofwordshkldbFremd_WB2.pkl b/Prototyp/bagofwordshkldbFremd_WB2.pkl index 91425417..f5f7354f 100644 Binary files a/Prototyp/bagofwordshkldbFremd_WB2.pkl and b/Prototyp/bagofwordshkldbFremd_WB2.pkl differ diff --git a/Prototyp/voilastate.db b/Prototyp/voilastate.db index 5a539940..4ecac697 100644 Binary files a/Prototyp/voilastate.db and b/Prototyp/voilastate.db differ