# A parser that runs under PyPy at 20 times the speed of the standard Python interpreter, because it is written in pure Python.
# You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 

787 lines
26 KiB

# Parse wiktionary.xml with pure python, such that it can be run with pypy (python just in time compiler)
# optimization would be possible through cython and assembler loops etc
# on a linux system, get the first n lines of a document with:
# head -n1000000 dewiktionary-20181201-pages-articles.xml > wiktionaryFirstMio.xml
import sys
import os
import re
class Parser(object):
    """Character-level extraction helpers for a (Wiktionary) XML/wiki dump.

    The class only uses plain loops and ``str`` methods so that it runs
    unchanged under PyPy.  ``Indok`` is the path of the input document and
    ``Outdok`` the path of the output document used by the file-based
    methods.
    """

    def __init__(self, InputDokument, OutputDokument):
        """Remember the input and output document paths; no file is opened."""
        self.Indok = InputDokument
        self.Outdok = OutputDokument

    @staticmethod
    def _iter_payloads(chunks, SymbolA, SymbolB):
        """Yield every text span found between SymbolA and SymbolB.

        ``chunks`` is an iterable of strings (lines of a file, or a single
        text).  The match state is carried from one chunk to the next, so a
        payload may span several chunks; only the two "wait one letter"
        flags are reset at the start of each chunk.

        Both symbols must be non-empty.  ``valA`` counts matched SymbolA
        letters: ``valA >= len(SymbolA)`` means "inside a payload", and a
        SymbolA seen inside a payload raises the nesting depth, so the
        payload only ends at the matching SymbolB.

        The matcher is deliberately simple: after a partial match fails,
        the partial state is dropped one letter later and the failing
        letter itself is not retried as the first letter of the symbol.
        """
        lenA = len(SymbolA)
        lenB = len(SymbolB)
        valA = 0
        valB = 0
        seperator = []
        for chunk in chunks:
            wait1letterA = False
            wait1letterB = False
            for letter in chunk:
                # A partial SymbolA match that did not advance on the
                # previous letter is abandoned here.
                if valA % lenA != 0:
                    if wait1letterA:
                        valA -= valA % lenA
                    wait1letterA = True
                # Same rule for a partial SymbolB match.
                if 0 < valB < lenB:
                    if wait1letterB:
                        valB = 0
                    wait1letterB = True
                # SymbolB is only looked for inside a payload, and advances
                # by at most one position per letter.
                if valA >= lenA and letter == SymbolB[valB]:
                    valB += 1
                    wait1letterB = False
                    if valB == lenB:
                        valB = 0
                        valA -= lenA
                if valA >= lenA:
                    seperator.append(letter)
                if valA == 0 and seperator:
                    # The buffer still ends with the first len(SymbolB) - 1
                    # letters of the closing symbol; cut them off.  An
                    # explicit end index is used because [:-0] would be
                    # empty for a one-letter SymbolB.
                    payload = seperator[:len(seperator) - (lenB - 1)]
                    if payload:
                        yield ''.join(payload)
                    seperator = []
                if letter == SymbolA[valA % lenA]:
                    valA += 1
                    wait1letterA = False

    def GetSeparators(self):
        """Collect every distinct ``<...>`` tag content of the input file.

        For each line the text between a ``<`` and the next ``>`` is
        recorded (a tag cannot span lines).  A ``>`` without a preceding
        ``<`` records the empty string.  The line number is printed every
        10 lines.  The result is written to the output file, one entry per
        line in sorted order between a ``[`` line and a closing ``]``.

        Returns:
            The set of distinct tag contents.
        """
        seperatorsSet = set()
        with open(self.Indok) as xmldok:
            with open(self.Outdok, 'w') as getsepdok:
                counter = 0
                for line in xmldok:
                    counter += 1
                    if (counter % 10) == 0:
                        print(counter)
                    seperator = []
                    inside = False
                    for letter in line:
                        if letter == '>':
                            inside = False
                            seperatorsSet.add(''.join(seperator))
                            seperator = []
                        if inside:
                            seperator.append(letter)
                        if letter == '<':
                            inside = True
                getsepdok.write('[' + '\n')
                for element in sorted(seperatorsSet):
                    getsepdok.write(element + '\n')
                getsepdok.write(']')
        return seperatorsSet

    def GetPayloadBetweenTwoSymbols(self, SymbolA, SymbolB, LogLineNumber=False, Doc=True):
        """Extract the distinct payloads between two symbols from the input file.

        Args:
            SymbolA: non-empty start symbol.
            SymbolB: non-empty end symbol.
            LogLineNumber: when true, print the line number every 10000 lines.
            Doc: unused; kept for interface compatibility.

        Returns:
            A list of ``[payload, ID]`` pairs without duplicate payloads.
            IDs are assigned in order of first occurrence, so repeated runs
            give the same numbering.

        The output file is opened for writing (and thereby truncated) but
        nothing is written to it.
        """
        def logged_lines(handle):
            counter = 0
            for line in handle:
                counter += 1
                if LogLineNumber and (counter % 10000) == 0:
                    print(counter)
                yield line

        output = []
        seen = set()
        with open(self.Indok) as xmldok:
            with open(self.Outdok, 'w'):
                for payload in self._iter_payloads(logged_lines(xmldok), SymbolA, SymbolB):
                    if payload not in seen:
                        seen.add(payload)
                        output.append([payload, len(output)])
        return output

    def GetPayloadBetweenTwoSymbolsInPayload(self, Payload, SymbolA, SymbolB, LogElementNumber):
        """Extract the payloads between two symbols from each Payload element.

        Args:
            Payload: iterable of ``[text, ID]`` entries.
            SymbolA: non-empty start symbol.
            SymbolB: non-empty end symbol.
            LogElementNumber: when true, print the element number every
                1000 elements.

        Returns:
            A list of ``[payload, ID]`` pairs in order of appearance; the
            ID is the one of the element the payload was found in.  Every
            element is scanned independently of the others.
        """
        seperators = []
        counter = 0
        for element in Payload:
            counter += 1
            if LogElementNumber and (counter % 1000) == 0:
                print(counter)
            for payload in self._iter_payloads([element[0]], SymbolA, SymbolB):
                seperators.append([payload, element[1]])
        return seperators

    def GetPayloadBetweenTwoOneSymbolsInPayload(self, Payload, SymbolA, SymbolB, LogElementNumber, Payloadrow, IDrow):
        """Extract the text between two one-letter symbols, honouring nesting.

        Args:
            Payload: iterable of rows; ``row[Payloadrow]`` is the text to
                scan and ``row[IDrow]`` the ID reported with each result.
            SymbolA: one-letter opening symbol.
            SymbolB: one-letter closing symbol.
            LogElementNumber: when true, print the running letter count
                every 10 letters.
            Payloadrow: index of the text inside a row.
            IDrow: index of the ID inside a row.

        Returns:
            A list of ``[payload, ID]`` pairs.  Nested symbols stay inside
            the payload of the outermost pair.  A closing symbol without a
            matching opening symbol is ignored, and an unterminated payload
            at the end of a row is dropped instead of leaking into the
            next row.
        """
        seperators = []
        counter = 0
        for payload in Payload:
            val = 0
            seperator = []
            for letter in payload[Payloadrow]:
                counter += 1
                if LogElementNumber and (counter % 10) == 0:
                    print(counter)
                # Never let the nesting depth drop below zero.
                if letter == SymbolB and val > 0:
                    val -= 1
                if val >= 1:
                    seperator.append(letter)
                if val == 0 and seperator:
                    seperators.append([''.join(seperator), payload[IDrow]])
                    seperator = []
                if letter == SymbolA:
                    val += 1
        return seperators

    def CutTextAtSymbol(self, text, symbol):
        """Return ``text`` up to and including the first ``symbol``.

        If ``symbol`` does not occur in ``text`` (or is empty), ``text`` is
        returned unchanged.
        """
        if not symbol:
            return text
        position = text.find(symbol)
        if position < 0:
            return text
        return text[:position + len(symbol)]

    def GetPayloadBetweenTwoSymbolsInText(self, text, SymbolA, SymbolB):
        """Extract the payloads between two symbols from a single text.

        Returns:
            A list of one-element lists ``[payload]`` in order of
            appearance.  An end symbol that appears before any start
            symbol is ignored.
        """
        return [[payload] for payload in self._iter_payloads([text], SymbolA, SymbolB)]

    def GetPayloadBetweenTwoSameSymbolsInText(self, text, Symbol):
        """Extract text enclosed by two occurrences of the same symbol.

        Letters of ``Symbol`` are counted up until the whole symbol was
        seen, then every letter is collected until the symbol letters have
        been counted down to zero again.

        Returns:
            A list of strings, one per enclosed span.

        NOTE(review): each result starts with the last letter of the
        opening symbol and its letters are joined with single spaces
        (``"''ab''"`` gives ``"' a b"``).  This looks unintended but is
        kept as-is because callers may rely on it -- confirm.
        NOTE(review): the symbol letters are not required to be adjacent
        in ``text``; the counter is never reset on a non-matching letter.
        """
        seperators = []
        seperator = []
        nowendit = False
        val = 0
        for letter in text:
            if letter == Symbol[val % len(Symbol)]:
                # Count up while opening, down while closing.
                if nowendit:
                    val -= 1
                else:
                    val += 1
            if val == len(Symbol):
                seperator.append(letter)
                nowendit = True
            if val == 0 and seperator:
                seperators.append(' '.join(seperator))
                seperator = []
                nowendit = False
        return seperators

    def ParseWordswithSymbolFromSymbolongoing(self, text, Symbol):
        """Return the tail of every word that contains ``Symbol``.

        ``text`` is split on whitespace.  For each word containing
        ``Symbol`` the result holds the word from the last letter of the
        first ``Symbol`` occurrence onwards (``"[[Haus]]"`` with ``"[["``
        gives ``"[Haus]]"``).  With an empty ``Symbol`` every word is
        returned whole.
        """
        seperators = []
        for word in text.split():
            position = word.find(Symbol)
            if position >= 0:
                start = max(position + len(Symbol) - 1, 0)
                seperators.append(word[start:])
        return seperators

    def ParseWithHighestLetterAccordance(self, inputtext, Letters):
        """Find the run of consecutive words that best fits an abbreviation.

        Args:
            inputtext: text to search; it is lower-cased before matching.
            Letters: the abbreviation.  If it contains a ``.`` the letters
                between dots form one group each (``"z.B."`` gives the
                groups ``z`` and ``b``); otherwise every character is a
                group of its own.  Characters other than ``a-z``, ``A-Z``,
                ``ü``, ``ä``, ``ö`` and ``.`` are replaced by a blank first.

        Returns:
            The best scoring run of lower-cased words as a list.

        Raises:
            IndexError: if no word qualifies as the start of a run, or if
                the first letter group is empty while the text has words.

        Scoring: a word counts for a group when it contains all letters of
        that group (for the first group the word must also start with the
        group's first letter).  A run of ``m`` words starting at a word
        with at least one first-group letter gets the summed sizes of the
        groups fully matched by its words, plus 3 if its first word starts
        with the first group letter, plus 3 if its last word starts with
        the first letter of the last group, minus ``0.1 * m``.  On equal
        scores the run generated last wins.

        NOTE(review): the length penalty is assumed to apply to every
        candidate run, not only to runs that earned the second bonus.
        """
        text = inputtext.lower()
        words = text.split()
        cleaned = re.sub("[^a-zA-Züäö.]", " ", Letters)

        # Build the letter groups of the abbreviation.
        Lettervector = []
        if '.' in Letters:
            lettervect = []
            for letter in cleaned:
                letter = letter.lower()
                if letter == '.':
                    Lettervector.append(lettervect)
                    lettervect = []
                elif letter != ' ':
                    lettervect.append(letter)
            if lettervect:
                Lettervector.append(lettervect)
        else:
            for letter in cleaned:
                Lettervector.append([letter.lower()])
        group_count = len(Lettervector)
        group_sizes = [len(group) for group in Lettervector]

        # Per word and group: how many group letters the word contains.
        wordscores = []
        for word in words:
            remaining = [list(group) for group in Lettervector]
            wordscore = [0] * group_count
            for position, letter in enumerate(word):
                if position == 0:
                    if letter == remaining[0][0]:
                        wordscore[0] += 1
                        remaining[0].remove(letter)
                    else:
                        # The word cannot start the abbreviation: drop the
                        # leading letter so group 0 is never fully matched.
                        del remaining[0][0]
                for n in range(group_count):
                    if letter in remaining[n]:
                        wordscore[n] += 1
                        remaining[n].remove(letter)
            wordscores.append(wordscore)

        # Score every run of 1..group_count words and keep the best one.
        best = None
        for o in range(len(words)):
            if wordscores[o][0] < 1:
                continue
            for m in range(1, group_count + 1):
                if o > len(words) - m:
                    continue
                score = 0
                for n in range(m):
                    for p in range(group_count):
                        if wordscores[o + n][p] == group_sizes[p]:
                            score += wordscores[o + n][p]
                if words[o][0] == Lettervector[0][0]:
                    score += 3
                if words[o + m - 1][0] == Lettervector[-1][0]:
                    score += 3
                # Penalize longer runs, i.e. when further words add no score.
                score -= m * 0.1
                if best is None or score >= best[0]:
                    best = (score, o, m)
        if best is None:
            raise IndexError('no word sequence matches the given letters')
        start, length = best[1], best[2]
        return words[start:start + length]
#def parseWordsContainingCertainSymbols(self, text, symbols):
#print()
#fooSeparator = 'title'
#cwd = os.getcwd()
#with open('dewiktionary-20181201-pages-articles.xml') as xmldok:
#with open(cwd + '/' + 'classes.txt', 'w') as Outdok:
#n = 0
#done = False
#while done == False:
#for line in xmldok:
#n += 1
##print(line)
##print(dok_to_token(line))
##print(n)
#for word in line:
#print(word)
#try:
#if dok_to_token(line)[:(len(fooSeparator) + 2)] == '<' + fooSeparator + '>':
#Outdok.write(dok_to_token(line)[len(fooSeperator):-len(fooSeperator)] + '\n')
#except:
#pass
#if n >= 100000:
#quit()