|
@ -0,0 +1,787 @@ |
|
|
|
|
|
# Parse wiktionary.xml with pure Python, so that it can be run with PyPy (a Python interpreter with a just-in-time compiler).
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Further optimization would be possible with Cython, hand-written assembler loops, etc.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# On a Linux system, get the first n lines of a document with:
|
|
|
|
|
|
|
|
|
|
|
# head -n1000000 dewiktionary-20181201-pages-articles.xml > wiktionaryFirstMio.xml |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import sys |
|
|
|
|
|
import os |
|
|
|
|
|
|
|
|
|
|
|
import re |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class Parser(object):
    """Character-level text extraction helpers for a Wiktionary XML dump.

    The class holds an input path and an output path.  ``GetSeparators`` and
    ``GetPayloadBetweenTwoSymbols`` read the input file; every other method
    works purely on the values passed to it.
    """

    def __init__(self, InputDokument, OutputDokument):
        """Remember the input path (read) and the output path (written)."""
        self.Indok = InputDokument
        self.Outdok = OutputDokument

    # ------------------------------------------------------------------
    # Shared scanner
    # ------------------------------------------------------------------

    @staticmethod
    def _iter_payloads(letters, SymbolA, SymbolB):
        """Yield every outermost payload enclosed by SymbolA ... SymbolB.

        ``letters`` is any iterable of single items (normally characters).
        Nested regions are supported: the payload of an outer region contains
        the inner opening and closing symbols verbatim.  Payloads that would
        be empty are not yielded.

        Raises:
            ValueError: if either symbol is empty.
        """
        len_a = len(SymbolA)
        len_b = len(SymbolB)
        if len_a == 0 or len_b == 0:
            raise ValueError("SymbolA and SymbolB must not be empty")

        # ``depth`` counts in units of len_a: depth // len_a is the nesting
        # level, depth % len_a is how much of the next SymbolA has matched.
        depth = 0
        closing = 0       # number of SymbolB items matched so far
        stale_a = False   # True when the previous item did not extend SymbolA
        stale_b = False   # True when the previous item did not extend SymbolB
        buf = []

        for letter in letters:
            # Drop a half-matched SymbolA once an item failed to extend it.
            if depth % len_a:
                if stale_a:
                    depth -= depth % len_a
                stale_a = True

            # Same for a half-matched SymbolB.
            if 0 < closing < len_b:
                if stale_b:
                    closing = 0
                stale_b = True

            # Closing symbols only count while inside a region; otherwise a
            # stray closer would push the depth below zero for good.
            if depth >= len_a and letter == SymbolB[closing]:
                closing += 1
                stale_b = False

            if closing == len_b:
                closing = 0
                depth -= len_a

            if depth >= len_a:
                buf.append(letter)

            if depth == 0:
                # The buffer ends with the first len_b - 1 items of the closing
                # symbol (the last one is never stored).  Use an explicit end
                # index: ``buf[:-0]`` would be empty for a one-item SymbolB.
                payload = buf[:len(buf) - (len_b - 1)]
                if payload:
                    yield ''.join(payload)
                buf = []

            if letter == SymbolA[depth % len_a]:
                depth += 1
                stale_a = False

    # ------------------------------------------------------------------
    # File based extraction
    # ------------------------------------------------------------------

    def GetSeparators(self):
        """Collect the distinct texts found between '<' and '>' in the input.

        Tags are detected per line; a tag cannot span a line break.  A '>'
        that is not preceded by '<' on the same line contributes the empty
        string.  The line number is printed every 10 lines.

        The result is also written to ``self.Outdok`` as ``[``, one entry per
        line in sorted order, and ``]``.

        Returns:
            set of str: the distinct tag texts.
        """
        tags = set()
        with open(self.Indok) as xmldok:
            for counter, line in enumerate(xmldok, 1):
                if counter % 10 == 0:
                    print(counter)

                inside = False
                current = []
                for letter in line:
                    if letter == '>':
                        inside = False
                        # Store the joined text right away instead of keeping
                        # one list of characters per tag occurrence.
                        tags.add(''.join(current))
                        current = []
                    if inside:
                        current.append(letter)
                    if letter == '<':
                        inside = True

        with open(self.Outdok, 'w') as getsepdok:
            getsepdok.write('[' + '\n')
            for tag in sorted(tags):
                getsepdok.write(tag + '\n')
            getsepdok.write(']')

        return tags

    def GetPayloadBetweenTwoSymbols(self, SymbolA, SymbolB , LogLineNumber=False, Doc = True):
        """Extract the distinct payloads between SymbolA and SymbolB from the input file.

        The file is scanned as one continuous character stream, so a payload
        may span several lines.

        Args:
            SymbolA: opening symbol.
            SymbolB: closing symbol.
            LogLineNumber: when true, print the line number every 10000 lines.
            Doc: unused; kept so existing calls keep working.

        Returns:
            list of ``[payload, ID]`` pairs.  Duplicated payloads are reported
            once; IDs count up from 0 in order of first occurrence, so the
            numbering is reproducible between runs.

        Note:
            ``self.Outdok`` is not touched by this method.
        """
        def letters(xmldok):
            for counter, line in enumerate(xmldok, 1):
                if LogLineNumber and counter % 10000 == 0:
                    print(counter)
                for letter in line:
                    yield letter

        output = []
        seen = set()
        with open(self.Indok) as xmldok:
            for payload in Parser._iter_payloads(letters(xmldok), SymbolA, SymbolB):
                if payload not in seen:
                    seen.add(payload)
                    output.append([payload, len(output)])
        return output

    # ------------------------------------------------------------------
    # In-memory extraction
    # ------------------------------------------------------------------

    def GetPayloadBetweenTwoSymbolsInPayload(self, Payload, SymbolA, SymbolB, LogElementNumber):
        """Extract payloads between SymbolA and SymbolB from each payload row.

        Args:
            Payload: iterable of rows; ``row[0]`` is scanned and ``row[1]`` is
                copied into every result produced from that row.
            SymbolA: opening symbol.
            SymbolB: closing symbol.
            LogElementNumber: when true, print the row number every 1000 rows.

        Returns:
            list of ``[payload, row[1]]`` pairs in order of occurrence.  The
            scanner state is reset for every row.
        """
        seperators = []
        for counter, element in enumerate(Payload, 1):
            if LogElementNumber and counter % 1000 == 0:
                print(counter)
            for payload in Parser._iter_payloads(element[0], SymbolA, SymbolB):
                seperators.append([payload, element[1]])
        return seperators

    def GetPayloadBetweenTwoOneSymbolsInPayload(self, Payload, SymbolA, SymbolB, LogElementNumber, Payloadrow, IDrow):
        """Extract payloads between two single-item symbols from each row.

        Args:
            Payload: iterable of rows.
            SymbolA: opening symbol, compared against one item at a time.
            SymbolB: closing symbol, compared against one item at a time.
            LogElementNumber: when true, print the running item count every
                10 items.
            Payloadrow: index of the row entry that is scanned.
            IDrow: index of the row entry copied into each result.

        Returns:
            list of ``[payload, row[IDrow]]`` pairs.  Nested regions are kept
            inside the outer payload.
        """
        counter = 0
        seperators = []

        for payload in Payload:
            depth = 0
            # Start every row with an empty buffer so that text of an
            # unterminated region is not reported under the next row's ID.
            seperator = []
            for letter in payload[Payloadrow]:
                counter += 1
                if LogElementNumber and counter % 10 == 0:
                    print(counter)

                if letter == SymbolB:
                    depth -= 1

                if depth >= 1:
                    seperator.append(letter)

                if depth == 0 and seperator:
                    seperators.append([''.join(seperator), payload[IDrow]])
                    seperator = []

                if letter == SymbolA:
                    depth += 1

        return seperators

    def CutTextAtSymbol(self, text, symbol):
        """Return ``text`` up to and including the first occurrence of ``symbol``.

        When ``symbol`` does not occur (or is empty) the whole text is
        returned.  ``text`` and ``symbol`` may be strings or sequences of
        single-character strings.
        """
        letters = list(text)
        pattern = list(symbol)
        width = len(pattern)
        if width:
            # Compare full windows so that overlapping prefixes such as
            # "aab" inside "aaab" are found.
            for start in range(len(letters) - width + 1):
                if letters[start:start + width] == pattern:
                    return ''.join(letters[:start + width])
        return ''.join(letters)

    def GetPayloadBetweenTwoSymbolsInText(self, text, SymbolA, SymbolB):
        """Extract payloads between SymbolA and SymbolB from ``text``.

        Returns:
            list of one-element lists ``[payload]`` in order of occurrence.
        """
        return [[payload] for payload in Parser._iter_payloads(text, SymbolA, SymbolB)]

    def GetPayloadBetweenTwoSameSymbolsInText(self, text, Symbol):
        """Collect the items between two occurrences of the same ``Symbol``.

        ``text`` is iterated item by item.  Collection starts with the item
        that completes the opening symbol (that item is part of the result)
        and stops when the symbol has been counted down again.  The collected
        items are joined with single spaces.

        NOTE(review): the space separator suggests ``text`` is meant to be a
        sequence of tokens rather than a plain string - confirm against the
        callers before changing it.

        Returns:
            list of str.
        """
        seperators = []
        seperator = []
        closing = False
        val = 0
        size = len(Symbol)

        for letter in text:
            # Count up while opening, down while closing.
            if letter == Symbol[val % size]:
                if closing:
                    val -= 1
                else:
                    val += 1

            if val == size:
                seperator.append(letter)
                closing = True

            if val == 0 and seperator:
                seperators.append(' '.join(seperator))
                seperator = []
                closing = False

        return seperators

    def ParseWordswithSymbolFromSymbolongoing(self, text, Symbol):
        """Return, for each word containing ``Symbol``, the word's tail.

        ``text`` is split on whitespace.  For a word that contains ``Symbol``
        the result starts at the last character of the first occurrence of
        the symbol and runs to the end of the word (e.g. ``"[[foo]]"`` with
        symbol ``"[["`` gives ``"[foo]]"``).  Words without the symbol are
        skipped.  An empty ``Symbol`` returns every word unchanged.

        Args:
            text: str to scan.
            Symbol: str to look for.

        Returns:
            list of str.
        """
        keep_from = max(len(Symbol) - 1, 0)
        seperators = []
        for word in text.split():
            hit = word.find(Symbol)
            if hit < 0:
                continue
            tail = word[hit + keep_from:]
            if tail:
                seperators.append(tail)
        return seperators

    # ------------------------------------------------------------------
    # Abbreviation resolution
    # ------------------------------------------------------------------

    @staticmethod
    def _split_abbreviation(Letters):
        """Split an abbreviation into a list of letter lists (one per part).

        With a '.' present, each dot ends a part ("z.B." -> [['z'], ['b']]).
        Without a dot every character is its own part ("EU" -> [['e'], ['u']]).
        """
        # Lower-case first so that upper-case umlauts survive the filter.
        cleaned = re.sub("[^a-zA-Züäö.]", " ", Letters.lower())
        if '.' not in Letters:
            return [[letter] for letter in cleaned]

        parts = []
        current = []
        for letter in cleaned:
            if letter == '.':
                parts.append(current)
                current = []
            elif letter != ' ':
                current.append(letter)
        if current:
            parts.append(current)
        return parts

    @staticmethod
    def _score_word(word, parts):
        """Return, per part, how many of its letters ``word`` consumes."""
        remaining = [list(part) for part in parts]
        scores = [0] * len(parts)
        for position, letter in enumerate(word):
            if position == 0 and remaining[0]:
                if letter == remaining[0][0]:
                    scores[0] += 1
                    remaining[0].remove(letter)
                else:
                    # The word does not start with the first letter of the
                    # first part, so that letter can no longer be matched.
                    del remaining[0][0]
            for index, pool in enumerate(remaining):
                if letter in pool:
                    scores[index] += 1
                    pool.remove(letter)
        return scores

    def ParseWithHighestLetterAccordance(self, inputtext, Letters):
        """Find the run of words in ``inputtext`` that best matches ``Letters``.

        ``Letters`` is an abbreviation such as "z.B." or "EU".  Every window
        of 1..number-of-parts consecutive words whose first word matches at
        least one letter of the first part is scored:

        * each word that contains all letters of a part adds that part's
          length,
        * +3 if the first word starts with the first letter of the first part,
        * +3 if the last word starts with the first letter of the last part,
        * -0.1 per word in the window (penalise longer tuples, i.e. when
          additional words add no score).

        On equal scores the window found last wins.

        Returns:
            list of str: the lower-cased words of the best window, or an
            empty list when there is no candidate window.
        """
        words = inputtext.lower().split()
        parts = Parser._split_abbreviation(Letters)
        if not words or not parts:
            return []

        wordscores = [Parser._score_word(word, parts) for word in words]
        needed = [len(part) for part in parts]

        best = None
        best_score = None
        for start, scores in enumerate(wordscores):
            if scores[0] < 1:
                continue
            for width in range(1, len(parts) + 1):
                if start > len(words) - width:
                    break

                score = 0
                for offset in range(width):
                    for index, target in enumerate(needed):
                        if wordscores[start + offset][index] == target:
                            score += target

                if parts[0] and words[start][0] == parts[0][0]:
                    score += 3
                if parts[-1] and words[start + width - 1][0] == parts[-1][0]:
                    score += 3
                score -= width * 0.1

                # ">=" keeps the latest window among equal scores.
                if best is None or score >= best_score:
                    best = (start, width)
                    best_score = score

        if best is None:
            return []
        start, width = best
        return words[start:start + width]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#def parseWordsContainingCertainSymbols(self, text, symbols): |
|
|
|
|
|
#print() |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#fooSeparator = 'title' |
|
|
|
|
|
|
|
|
|
|
|
#cwd = os.getcwd() |
|
|
|
|
|
|
|
|
|
|
|
#with open('dewiktionary-20181201-pages-articles.xml') as xmldok: |
|
|
|
|
|
#with open(cwd + '/' + 'classes.txt', 'w') as Outdok: |
|
|
|
|
|
#n = 0 |
|
|
|
|
|
#done = False |
|
|
|
|
|
#while done == False: |
|
|
|
|
|
#for line in xmldok: |
|
|
|
|
|
#n += 1 |
|
|
|
|
|
##print(line) |
|
|
|
|
|
##print(dok_to_token(line)) |
|
|
|
|
|
##print(n) |
|
|
|
|
|
#for word in line: |
|
|
|
|
|
#print(word) |
|
|
|
|
|
|
|
|
|
|
|
#try: |
|
|
|
|
|
#if dok_to_token(line)[:(len(fooSeparator) + 2)] == '<' + fooSeparator + '>': |
|
|
|
|
|
#Outdok.write(dok_to_token(line)[len(fooSeperator):-len(fooSeperator)] + '\n') |
|
|
|
|
|
#except: |
|
|
|
|
|
#pass |
|
|
|
|
|
#if n >= 100000: |
|
|
|
|
|
#quit() |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|