# Provenance: parse1.py as added by commit
# 1263460e05d4eac88c51a39cb1e045e2b28891e7 ("first publishing",
# author corsaronero, Fri Feb 17 14:14:04 2023 +0000).
#
# Parse wiktionary.xml with pure python, such that it can be run with pypy
# (python just in time compiler).
#
# Optimization would be possible through cython and assembler loops etc.
#
# On a linux system, get the first n lines of a document with:
#   head -n1000000 dewiktionary-20181201-pages-articles.xml > wiktionaryFirstMio.xml

import sys
import os

import re


def _iter_payloads(letters, SymbolA, SymbolB):
    """Yield every text span enclosed by SymbolA ... SymbolB in *letters*.

    This is the character-level state machine shared by the
    ``GetPayloadBetweenTwoSymbols*`` methods of :class:`Parser`.

    *letters* is any iterable of single characters (a string, or a generator
    over the characters of a file).  SymbolA opens a payload and SymbolB
    closes it.  An opener seen inside a payload raises the nesting level, so
    the yielded payload is the outermost span and contains any nested
    opener/closer pairs verbatim.  Empty payloads are not yielded, and a
    payload that is never closed is dropped.

    The symbols are expected to be distinct; passing the same string for
    both is not supported by this matcher.

    Raises ValueError if either symbol is empty.
    """
    len_a = len(SymbolA)
    len_b = len(SymbolB)
    if len_a == 0 or len_b == 0:
        raise ValueError('SymbolA and SymbolB must be non-empty')

    # The final character of SymbolB is never collected, the len_b - 1
    # characters before it are; they are cut off again when flushing.
    tail = len_b - 1

    # ``opened`` counts matched SymbolA characters.  opened // len_a is the
    # nesting depth, opened % len_a is the length of a partial opener match.
    opened = 0
    # Number of SymbolB characters matched so far (always < len_b here).
    closing = 0
    # A partial match is discarded one letter after it failed to advance:
    # advancing clears the flag, the next letter sets it, and the letter
    # after that performs the reset if the flag is still set.
    retry_open = False
    retry_close = False
    collected = []

    for letter in letters:
        if opened % len_a:
            if retry_open:
                opened -= opened % len_a
            retry_open = True

        # The closer is only searched for while inside a payload, so a stray
        # closer in front of any opener cannot corrupt the state.
        if opened >= len_a:
            if closing:
                if retry_close:
                    closing = 0
                retry_close = True
            if letter == SymbolB[closing]:
                closing += 1
                retry_close = False
            if closing == len_b:
                closing = 0
                opened -= len_a

        if opened >= len_a:
            collected.append(letter)

        if opened == 0 and collected:
            payload = collected[:-tail] if tail else collected
            if payload:
                yield ''.join(payload)
            collected = []

        if letter == SymbolA[opened % len_a]:
            opened += 1
            retry_open = False

    # The payload was closed, but a partial opener match delayed the flush
    # until the input ran out.
    if collected and opened < len_a:
        payload = collected[:-tail] if tail else collected
        if payload:
            yield ''.join(payload)


class Parser(object):
    """Character-level extraction helpers for a (Wiktionary) XML dump.

    ``Indok`` is the path of the input document; ``Outdok`` is the path of
    the output document written by :meth:`GetSeparators`.
    """

    def __init__(self, InputDokument, OutputDokument):
        self.Indok = InputDokument
        self.Outdok = OutputDokument

    def GetSeparators(self):
        """Collect the distinct tag texts found between '<' and '>'.

        Each line of ``Indok`` is scanned on its own; a tag that is not
        closed on the line it was opened on is dropped.  A '>' without a
        preceding '<' on the same line records an empty string.

        The distinct tags are written to ``Outdok`` as a ``[`` line, one tag
        per line in sorted order (so the file is reproducible between runs),
        and a closing ``]``.  Every tenth line number is printed as progress.

        Returns the set of distinct tag texts.
        """
        tags = []
        with open(self.Indok) as xmldok:
            with open(self.Outdok, 'w') as getsepdok:
                counter = 0
                for line in xmldok:
                    counter += 1
                    if counter % 10 == 0:
                        print(counter)

                    current = []
                    inside = False
                    for letter in line:
                        if letter == '>':
                            inside = False
                            tags.append(''.join(current))
                            current = []
                        if inside:
                            current.append(letter)
                        if letter == '<':
                            inside = True

                seperatorsSet = set(tags)
                getsepdok.write('[' + '\n')
                for element in sorted(seperatorsSet):
                    getsepdok.write(element + '\n')
                getsepdok.write(']')
                return seperatorsSet

    def GetPayloadBetweenTwoSymbols(self, SymbolA, SymbolB, LogLineNumber=False, Doc=True):
        """Extract the distinct payloads between SymbolA and SymbolB in ``Indok``.

        The file is treated as one continuous character stream, so a payload
        may span several lines.  See :func:`_iter_payloads` for the matching
        rules.

        SymbolA / SymbolB -- non-empty opening and closing strings.
        LogLineNumber     -- when true, print every 10000th line number.
        Doc               -- accepted for backward compatibility; unused.

        Returns a list of ``[payload, ID]`` pairs.  Duplicated payloads are
        reported once, and IDs count up from 0 in order of first occurrence,
        so they are stable between runs.  ``Outdok`` is not touched.

        Raises ValueError if either symbol is empty.
        """
        output = []
        seen = set()
        with open(self.Indok) as xmldok:

            def letters():
                counter = 0
                for line in xmldok:
                    counter += 1
                    if LogLineNumber and counter % 10000 == 0:
                        print(counter)
                    for letter in line:
                        yield letter

            for payload in _iter_payloads(letters(), SymbolA, SymbolB):
                if payload not in seen:
                    seen.add(payload)
                    output.append([payload, len(output)])
        return output

    def GetPayloadBetweenTwoSymbolsInPayload(self, Payload, SymbolA, SymbolB, LogElementNumber):
        """Extract the payloads between SymbolA and SymbolB from each element.

        Payload          -- iterable of sequences whose item 0 is the text to
                            scan and whose item 1 is an ID to carry over.
        LogElementNumber -- when true, print every 1000th element number.

        Every element is scanned independently (see :func:`_iter_payloads`).
        Returns a list of ``[payload, ID]`` pairs in order of appearance;
        duplicates are kept.

        Raises ValueError if either symbol is empty.
        """
        seperators = []
        counter = 0
        for element in Payload:
            counter += 1
            if LogElementNumber and counter % 1000 == 0:
                print(counter)
            for payload in _iter_payloads(element[0], SymbolA, SymbolB):
                seperators.append([payload, element[1]])
        return seperators

    def GetPayloadBetweenTwoOneSymbolsInPayload(self, Payload, SymbolA, SymbolB, LogElementNumber, Payloadrow, IDrow):
        """Extract spans enclosed by two single-character symbols.

        Payload          -- iterable of indexable rows.
        SymbolA, SymbolB -- single characters that open / close a span.
        LogElementNumber -- when true, print every 10th processed letter count.
        Payloadrow       -- index of the text inside each row.
        IDrow            -- index of the ID inside each row.

        Nested openers are tracked, so the span reported is the outermost
        one and includes the nested symbols.  A closer without a matching
        opener is ignored, and a span left open at the end of a row is
        dropped rather than carried into the next row.

        Returns a list of ``[span, ID]`` pairs in order of appearance.
        """
        seperators = []
        counter = 0
        for payload in Payload:
            depth = 0
            collected = []
            for letter in payload[Payloadrow]:
                counter += 1
                if LogElementNumber and counter % 10 == 0:
                    print(counter)

                closed = False
                if letter == SymbolB and depth > 0:
                    depth -= 1
                    closed = True

                if depth >= 1:
                    collected.append(letter)

                if depth == 0 and collected:
                    seperators.append([''.join(collected), payload[IDrow]])
                    collected = []

                # A letter that just closed a span must not reopen one; this
                # only matters when both symbols are the same character.
                if letter == SymbolA and not closed:
                    depth += 1
        return seperators

    def CutTextAtSymbol(self, text, symbol):
        """Return *text* up to and including the first occurrence of *symbol*.

        *text* and *symbol* are strings.  If *symbol* does not occur in
        *text*, or is empty, *text* is returned unchanged.
        """
        if not symbol:
            return text
        index = text.find(symbol)
        if index < 0:
            return text
        return text[:index + len(symbol)]

    def GetPayloadBetweenTwoSymbolsInText(self, text, SymbolA, SymbolB):
        """Extract the payloads between SymbolA and SymbolB from *text*.

        See :func:`_iter_payloads` for the matching rules.  Returns a list
        of one-element lists, ``[[payload], ...]``, in order of appearance.

        Raises ValueError if either symbol is empty.
        """
        return [[payload] for payload in _iter_payloads(text, SymbolA, SymbolB)]

    def GetPayloadBetweenTwoSameSymbolsInText(self, text, Symbol):
        """Collect the items found between two occurrences of *Symbol*.

        *text* is iterated item by item and each item is compared with one
        position of *Symbol*.  Collecting starts on the item that completes
        the opening *Symbol* (that item is included in the result) and stops
        once a closing *Symbol* has been matched.  The collected items are
        joined with single spaces.

        NOTE(review): the space-joined output and the inclusion of the item
        completing the opener look unusual for character input; presumably
        *text* may also be a sequence of word tokens - confirm with callers
        before changing either.
        NOTE(review): items seen while the closing symbol is only partly
        matched are not collected, and a partial match is never reset.

        Returns a list of strings.
        """
        seperators = []
        collected = []
        closing = False
        matched = 0

        for letter in text:
            if letter == Symbol[matched % len(Symbol)]:
                # Count up while opening, back down while closing.
                matched += -1 if closing else 1

            if matched == len(Symbol):
                collected.append(letter)
                closing = True

            if matched == 0 and collected:
                seperators.append(' '.join(collected))
                collected = []
                closing = False

        return seperators

    def ParseWordswithSymbolFromSymbolongoing(self, text, Symbol):
        """Return, for every word containing *Symbol*, the rest of that word.

        *text* is split on whitespace.  For each word that contains *Symbol*
        the result starts at the last character of the first occurrence of
        *Symbol* and runs to the end of the word (for a one-character symbol
        that is the symbol itself plus everything after it).  With an empty
        *Symbol* every word is returned unchanged.

        Returns a list of strings in word order.
        """
        seperators = []
        offset = max(len(Symbol) - 1, 0)
        for word in text.split():
            index = word.find(Symbol)
            if index >= 0:
                seperators.append(word[index + offset:])
        return seperators

    def ParseWithHighestLetterAccordance(self, inputtext, Letters):
        """Find the run of words in *inputtext* that best matches *Letters*.

        *Letters* is split into letter groups: if it contains a '.', the
        letters between dots form one group each (e.g. ``"z.B."`` gives the
        groups ``z`` and ``b``); otherwise every character is its own group.
        Characters outside ``a-zA-Züäö.`` are treated as blanks.

        Every word of the lower-cased text is scored against every group by
        counting the group letters it contains; the first group only scores
        fully if the word starts with the group's first letter.  Candidate
        runs start at a word that scored on the first group and are at most
        as long as there are groups.  A run earns the size of every group
        that one of its words matched completely, plus 3 if its first word
        starts with the first group's first letter, plus 3 if its last word
        starts with the last group's first letter, minus 0.1 per word.

        Returns the words (lower-cased) of the best-scoring run, or an empty
        list when there is no candidate (no words, no groups, or no word
        matching the first group).
        """
        words = inputtext.lower().split()
        cleaned = re.sub("[^a-zA-Züäö.]", " ", Letters)

        Lettervector = []
        if '.' in Letters:
            group = []
            for letter in cleaned:
                letter = letter.lower()
                if letter != '.' and letter != ' ':
                    group.append(letter)
                if letter == '.':
                    Lettervector.append(group)
                    group = []
            if group:
                Lettervector.append(group)
        else:
            for letter in cleaned:
                Lettervector.append([letter.lower()])

        if not words or not Lettervector:
            return []

        n_groups = len(Lettervector)
        n_words = len(words)
        group_sizes = [len(group) for group in Lettervector]

        # wordscores[i][p]: how many letters of group p word i accounts for.
        wordscores = []
        for word in words:
            remaining = [list(group) for group in Lettervector]
            hits = [0] * n_groups
            for position, letter in enumerate(word):
                if position == 0 and remaining[0]:
                    # The leading letter of the first group is consumed by
                    # the first letter of the word whether it matches or
                    # not, so a full score on that group requires the word
                    # to start with it.
                    if letter == remaining[0][0]:
                        hits[0] += 1
                    remaining[0].pop(0)
                for p, group in enumerate(remaining):
                    if letter in group:
                        hits[p] += 1
                        group.remove(letter)
            wordscores.append(hits)

        # Candidates are [start index, run length, score].
        candidates = []
        for start, hits in enumerate(wordscores):
            if hits[0] < 1:
                continue
            for length in range(1, n_groups + 1):
                if start > n_words - length:
                    break
                score = 0
                for step in range(length):
                    for p in range(n_groups):
                        hit = wordscores[start + step][p]
                        if hit == group_sizes[p]:
                            score += hit
                candidates.append([start, length, score])

        if not candidates:
            return []

        first_letter = Lettervector[0][0] if Lettervector[0] else None
        last_letter = Lettervector[-1][0] if Lettervector[-1] else None
        for candidate in candidates:
            start, length = candidate[0], candidate[1]
            if words[start][0] == first_letter:
                candidate[2] += 3
            if words[start + length - 1][0] == last_letter:
                candidate[2] += 3
            # Penalize longer runs, i.e. when additional words add no score.
            candidate[2] -= length * 0.1

        # On equal scores the candidate generated last wins.
        best = max(reversed(candidates), key=lambda candidate: candidate[2])
        return words[best[0]:best[0] + best[1]]