# Parse wiktionary.xml with pure python, such that it can be run with pypy
# (python just in time compiler).
# Optimization would be possible through cython and assembler loops etc.
# On a linux system, get the first n lines of a document with:
# head -n1000000 dewiktionary-20181201-pages-articles.xml > wiktionaryFirstMio.xml
import sys
import os
import re


class _PairScanner(object):
    """Character-fed scanner collecting the text between an opener and a closer.

    ``depth`` counts matched opener characters: every completed opener adds
    ``len(opener)`` and a partially matched opener adds less than that.
    Nested openers therefore raise the depth further, and a payload is only
    finished once the depth is back at exactly 0.
    """

    def __init__(self, opener, closer):
        self.opener = opener
        self.closer = closer
        self.depth = 0
        self.closing = 0            # matched characters of the closer so far
        self.pending_open = False   # one-letter grace before a partial opener is dropped
        self.pending_close = False  # same for a partial closer
        self.buffer = []

    def start_line(self):
        """Reset the one-letter grace flags; counters and buffer are kept."""
        self.pending_open = False
        self.pending_close = False

    def feed(self, letter):
        """Consume one character.

        Returns the finished payload as a string when this character
        completed a non-empty opener...closer pair, otherwise None.
        """
        open_len = len(self.opener)
        close_len = len(self.closer)

        # A partially matched opener survives exactly one non-matching
        # letter; on the letter after that the partial match is discarded.
        partial = self.depth % open_len
        if partial:
            if self.pending_open:
                self.depth -= partial
            self.pending_open = True
        if 0 < self.closing < close_len:
            if self.pending_close:
                self.closing = 0
            self.pending_close = True

        # Closers only count while at least one opener is open; otherwise a
        # stray closer would push the depth below zero and hide later pairs.
        if self.depth >= open_len and letter == self.closer[self.closing]:
            self.closing += 1
            self.pending_close = False
            if self.closing == close_len:
                self.closing = 0
                self.depth -= open_len

        if self.depth >= open_len:
            self.buffer.append(letter)

        finished = None
        if self.depth == 0:
            # All but the last closer character were buffered before the
            # closer was recognised; cut them off again. max() keeps the
            # slice empty instead of wrapping around for short buffers, and
            # a one-character closer strips nothing.
            keep = max(0, len(self.buffer) - (close_len - 1))
            if keep >= 1:
                finished = ''.join(self.buffer[:keep])
            self.buffer = []

        if letter == self.opener[self.depth % open_len]:
            self.depth += 1
            self.pending_open = False
        return finished


class Parser(object):
    """Hand-written text scanners used to pull pieces out of a wiktionary dump."""

    def __init__(self, InputDokument, OutputDokument):
        """Store the input path (read) and the output path (written)."""
        self.Indok = InputDokument
        self.Outdok = OutputDokument

    def GetSeparators(self):
        """Collect every distinct ``<...>`` content found in the input file.

        A tag has to open and close on the same line. The distinct strings
        are written to the output file between a ``[`` line and a closing
        ``]``, one per line and sorted so the file is reproducible.
        The line number is printed every 10 lines.

        Returns:
            set of str: the distinct strings found between '<' and '>'.
        """
        tags = set()
        with open(self.Indok) as xmldok:
            with open(self.Outdok, 'w') as getsepdok:
                for counter, line in enumerate(xmldok, 1):
                    if counter % 10 == 0:
                        print(counter)
                    current = []
                    inside = False
                    for letter in line:
                        if letter == '>':
                            inside = False
                            tags.add(''.join(current))
                            current = []
                        if inside:
                            current.append(letter)
                        if letter == '<':
                            inside = True
                getsepdok.write('[' + '\n')
                for tag in sorted(tags):
                    getsepdok.write(tag + '\n')
                getsepdok.write(']')
        return tags

    def GetPayloadBetweenTwoSymbols(self, SymbolA, SymbolB, LogLineNumber=False, Doc=True):
        """Return the distinct texts found between SymbolA and SymbolB in the input file.

        A pair may span several lines. Each distinct payload gets an ID in
        order of first appearance, so the numbering is stable between runs.

        Args:
            SymbolA: opening marker (non-empty string).
            SymbolB: closing marker (non-empty string).
            LogLineNumber: print the line number every 10000 lines.
            Doc: unused, kept for call compatibility.

        Returns:
            list of [payload, ID] pairs.
        """
        scanner = _PairScanner(SymbolA, SymbolB)
        seen = set()
        output = []
        with open(self.Indok) as xmldok:
            # NOTE(review): the output file is opened for writing (created or
            # truncated) but nothing is written to it; kept as it was.
            with open(self.Outdok, 'w'):
                for counter, line in enumerate(xmldok, 1):
                    if LogLineNumber and counter % 10000 == 0:
                        print(counter)
                    scanner.start_line()
                    for letter in line:
                        payload = scanner.feed(letter)
                        if payload is not None and payload not in seen:
                            seen.add(payload)
                            output.append([payload, len(output)])
        return output

    def GetPayloadBetweenTwoSymbolsInPayload(self, Payload, SymbolA, SymbolB, LogElementNumber):
        """Extract the texts between SymbolA and SymbolB from every payload entry.

        Args:
            Payload: iterable of entries; entry[0] is the text to scan and
                entry[1] is carried over into the result.
            SymbolA: opening marker (non-empty string).
            SymbolB: closing marker (non-empty string).
            LogElementNumber: print the entry number every 1000 entries.

        Returns:
            list of [text, entry[1]] pairs in order of appearance.
        """
        seperators = []
        for counter, element in enumerate(Payload, 1):
            if LogElementNumber and counter % 1000 == 0:
                print(counter)
            # Fresh scanner per entry: no state is carried between entries.
            scanner = _PairScanner(SymbolA, SymbolB)
            for letter in element[0]:
                payload = scanner.feed(letter)
                if payload is not None:
                    seperators.append([payload, element[1]])
        return seperators

    def GetPayloadBetweenTwoOneSymbolsInPayload(self, Payload, SymbolA, SymbolB,
                                                LogElementNumber, Payloadrow, IDrow):
        """Extract texts between two single-character markers from every entry.

        Nested pairs are kept inside the outer payload. Text of a pair that
        is not closed within its entry is dropped instead of being attached
        to the following entry.

        Args:
            Payload: iterable of indexable entries.
            SymbolA: opening character.
            SymbolB: closing character.
            LogElementNumber: print the running letter count every 10 letters.
            Payloadrow: index of the text inside an entry.
            IDrow: index of the value carried over into the result.

        Returns:
            list of [text, entry[IDrow]] pairs in order of appearance.
        """
        counter = 0
        seperators = []
        for payload in Payload:
            depth = 0
            collected = []
            for letter in payload[Payloadrow]:
                counter += 1
                if LogElementNumber and counter % 10 == 0:
                    print(counter)
                # Ignore closers that have no matching opener.
                if letter == SymbolB and depth > 0:
                    depth -= 1
                if depth >= 1:
                    collected.append(letter)
                if depth == 0 and collected:
                    seperators.append([''.join(collected), payload[IDrow]])
                    collected = []
                if letter == SymbolA:
                    depth += 1
        return seperators

    def CutTextAtSymbol(self, text, symbol):
        """Return text up to and including the first occurrence of symbol.

        The whole text is returned when symbol does not occur or is empty.
        """
        if not symbol:
            return text
        position = text.find(symbol)
        if position < 0:
            return text
        return text[:position + len(symbol)]

    def GetPayloadBetweenTwoSymbolsInText(self, text, SymbolA, SymbolB):
        """Extract the texts between SymbolA and SymbolB from a string.

        Returns:
            list of one-element lists, ``[[payload], ...]``, in order of
            appearance.
        """
        scanner = _PairScanner(SymbolA, SymbolB)
        seperators = []
        for letter in text:
            payload = scanner.feed(letter)
            if payload is not None:
                seperators.append([payload])
        return seperators

    def GetPayloadBetweenTwoSameSymbolsInText(self, text, Symbol):
        """Collect the text enclosed by two occurrences of the same Symbol.

        NOTE(review): each result starts with the last character of the
        opening Symbol and its characters are joined with single spaces;
        this is the existing behaviour - confirm it is intended.

        Returns:
            list of str.
        """
        seperators = []
        collected = []
        closing = False
        val = 0
        for letter in text:
            if letter == Symbol[val % len(Symbol)]:
                # Count up while opening, down again while closing.
                if closing:
                    val -= 1
                else:
                    val += 1
            if val == len(Symbol):
                collected.append(letter)
                closing = True
            if val == 0 and collected:
                seperators.append(' '.join(collected))
                collected = []
                closing = False
        return seperators

    def ParseWordswithSymbolFromSymbolongoing(self, text, Symbol):
        """Return, for every word containing Symbol, the rest of that word.

        The returned piece starts at the last character of the first
        occurrence of Symbol (for a one-character Symbol that is the Symbol
        itself). Words without Symbol are skipped.
        """
        seperators = []
        for word in text.split():
            matched = 0
            tail = []
            for letter in word:
                if matched < len(Symbol):
                    if letter == Symbol[matched]:
                        matched += 1
                    elif letter == Symbol[0]:
                        # A mismatch may itself start a new match.
                        matched = 1
                    else:
                        matched = 0
                if matched == len(Symbol):
                    tail.append(letter)
            if tail:
                seperators.append(''.join(tail))
        return seperators

    def ParseWithHighestLetterAccordance(self, inputtext, Letters):
        """Find the run of consecutive words that best matches the letter groups.

        Letters is split into groups: at every '.' when it contains one
        (e.g. "z.B." gives the groups z and b), otherwise one group per
        character. Every word is scored per group by how many group letters
        it contains; a group only counts for a word when all of its letters
        were found. The first group additionally requires the word to start
        with the group's first letter.

        Args:
            inputtext: text to search; it is lower-cased before matching.
            Letters: the letters to look for.

        Returns:
            list of str: the lower-cased words of the best run, or an empty
            list when nothing matches.
        """
        words = inputtext.lower().split()
        cleaned = re.sub("[^a-zA-Züäö.]", " ", Letters)

        groups = []
        if '.' in Letters:
            current = []
            for letter in cleaned:
                letter = letter.lower()
                if letter == '.':
                    groups.append(current)
                    current = []
                elif letter != ' ':
                    current.append(letter)
            if current:
                groups.append(current)
        else:
            groups = [[letter.lower()] for letter in cleaned]

        # The first and the last group are indexed below; without them (or
        # without any word) there is nothing that could match.
        if not words or not groups or not groups[0] or not groups[-1]:
            return []

        # Per word: number of letters found for each group.
        wordscores = []
        for word in words:
            remaining = [list(group) for group in groups]
            scores = [0] * len(groups)
            for position, letter in enumerate(word):
                if position == 0:
                    if letter == remaining[0][0]:
                        scores[0] += 1
                    # The leading letter of the first group can only be
                    # matched by the first letter of a word, so it is
                    # consumed here either way.
                    del remaining[0][0]
                for n, group in enumerate(remaining):
                    if letter in group:
                        scores[n] += 1
                        group.remove(letter)
            wordscores.append(scores)

        group_sizes = [len(group) for group in groups]
        best = None
        best_score = None
        for start, start_scores in enumerate(wordscores):
            if start_scores[0] < 1:
                continue
            for span in range(1, len(groups) + 1):
                if start > len(words) - span:
                    break
                score = 0
                for offset in range(span):
                    for p, size in enumerate(group_sizes):
                        if wordscores[start + offset][p] == size:
                            score += size
                if words[start][0] == groups[0][0]:
                    score += 3
                if words[start + span - 1][0] == groups[-1][0]:
                    score += 3
                # Penalise longer runs, i.e. extra words that add no score.
                score -= span * 0.1
                # ">=": on equal scores the later candidate wins.
                if best is None or score >= best_score:
                    best = (start, span)
                    best_score = score

        if best is None:
            return []
        return words[best[0]:best[0] + best[1]]