|
|
# Parse wiktionary.xml with pure Python, so that it can also be run with PyPy
# (a Python implementation with a just-in-time compiler).
#
# Further optimization would be possible through Cython, assembler loops, etc.
#
# On a Linux system, get the first n lines of a document with:
#
#     head -n1000000 dewiktionary-20181201-pages-articles.xml > wiktionaryFirstMio.xml
-
-
-
-
-
-
import os
import re
import sys
-
-
-
class Parser(object):
    """Plain-Python text-mining helpers for a (Wiktionary) XML dump.

    The class only uses built-in string handling and ``re`` so that it runs
    unchanged under PyPy.  ``InputDokument`` is the path of the file that the
    file-based methods read; ``OutputDokument`` is the path that
    ``GetSeparators`` writes its report to.
    """

    def __init__(self, InputDokument, OutputDokument):
        """Remember the input and output paths; no file is opened here."""
        self.Indok = InputDokument
        self.Outdok = OutputDokument

    @staticmethod
    def _IterNestedPayloads(chunks, SymbolA, SymbolB):
        """Yield each non-empty payload enclosed by ``SymbolA`` ... ``SymbolB``.

        ``chunks`` is an iterable of strings (for example the lines of a
        file) that is treated as one continuous text, so a payload -- or even
        a marker -- may span several chunks.  Markers may nest: only the
        outermost pair delimits a payload and inner pairs are kept verbatim
        inside it.  A closing marker met while nothing is open is ignored,
        and a payload that is still open when the input ends is dropped.
        Nothing is yielded when either marker is empty.
        """
        if not SymbolA or not SymbolB:
            return
        # A marker that starts within the last ``tail`` characters of a chunk
        # may continue in the next chunk, so that stretch is carried over.
        tail = max(len(SymbolA), len(SymbolB)) - 1
        # Group 1 is the closing marker; it is listed first so that it wins
        # when both markers match at the same position.
        either = re.compile(
            '(' + re.escape(SymbolB) + ')|' + re.escape(SymbolA))
        depth = 0
        pieces = []
        pending = ''
        iterator = iter(chunks)
        final = False
        while not final:
            chunk = next(iterator, None)
            if chunk is None:
                final = True
                data = pending
            else:
                data = pending + chunk
            safe_end = len(data) if final else max(0, len(data) - tail)
            pos = 0
            while True:
                if depth == 0:
                    hit = data.find(SymbolA, pos)
                    if hit < 0 or hit >= safe_end:
                        break
                    depth = 1
                    pos = hit + len(SymbolA)
                    continue
                match = either.search(data, pos)
                if match is None or match.start() >= safe_end:
                    break
                if match.group(1) is None:
                    # Nested opening marker: it belongs to the payload.
                    depth += 1
                    pieces.append(data[pos:match.end()])
                else:
                    depth -= 1
                    if depth == 0:
                        pieces.append(data[pos:match.start()])
                        payload = ''.join(pieces)
                        pieces = []
                        if payload:
                            yield payload
                    else:
                        # Inner closing marker: it belongs to the payload.
                        pieces.append(data[pos:match.end()])
                pos = match.end()
            cut = max(pos, safe_end)
            if depth and cut > pos:
                pieces.append(data[pos:cut])
            pending = data[cut:]

    def GetSeparators(self):
        """Collect the distinct ``<...>`` tag contents of the input file.

        Tags are recognised line by line (a tag never spans two lines).  A
        ``>`` that is not preceded by ``<`` on the same line is plain text and
        is ignored.  The tags are written to the output file, one per line
        between ``[`` and ``]``, in sorted order so that the report is
        reproducible.  The line number is printed every 10 lines.

        Returns:
            The set of tag contents (text between ``<`` and ``>``).
        """
        tags = set()
        with open(self.Indok) as xmldok:
            for number, line in enumerate(xmldok, 1):
                if number % 10 == 0:
                    print(number)
                inside = False
                current = []
                for letter in line:
                    if letter == '>':
                        if inside:
                            tags.add(''.join(current))
                        inside = False
                        current = []
                    elif inside:
                        # Includes a nested '<' met inside an open tag.
                        current.append(letter)
                    elif letter == '<':
                        inside = True
        with open(self.Outdok, 'w') as getsepdok:
            getsepdok.write('[' + '\n')
            for tag in sorted(tags):
                getsepdok.write(tag + '\n')
            getsepdok.write(']')
        return tags

    def GetPayloadBetweenTwoSymbols(self, SymbolA, SymbolB, LogLineNumber=False, Doc=True):
        """Extract the distinct payloads between two markers in the input file.

        The whole file is scanned as one text, so a payload may span lines.
        See ``_IterNestedPayloads`` for the nesting rules.  The output
        document is not touched.

        Args:
            SymbolA: Opening marker (one or more characters).
            SymbolB: Closing marker (one or more characters).
            LogLineNumber: When true, print the line number every 10000 lines.
            Doc: Unused; kept for backward compatibility.

        Returns:
            A list of ``[payload, ID]`` pairs with one entry per distinct
            payload.  IDs count up from 0 in order of first appearance, so
            they are stable between runs.
        """
        def lines_with_progress(handle):
            for number, line in enumerate(handle, 1):
                if LogLineNumber and number % 10000 == 0:
                    print(number)
                yield line

        seen = set()
        output = []
        with open(self.Indok) as xmldok:
            for payload in self._IterNestedPayloads(
                    lines_with_progress(xmldok), SymbolA, SymbolB):
                if payload not in seen:
                    seen.add(payload)
                    output.append([payload, len(output)])
        return output

    def GetPayloadBetweenTwoSymbolsInPayload(self, Payload, SymbolA, SymbolB, LogElementNumber):
        """Extract payloads between two markers from ``[text, ID]`` rows.

        Every row is scanned on its own (nothing carries over between rows).

        Args:
            Payload: Iterable of rows with the text at index 0 and its ID at
                index 1.
            SymbolA: Opening marker.
            SymbolB: Closing marker.
            LogElementNumber: When true, print the row count every 1000 rows.

        Returns:
            A list of ``[payload, ID]`` pairs in order of appearance.
        """
        seperators = []
        for counter, element in enumerate(Payload, 1):
            if LogElementNumber and counter % 1000 == 0:
                print(counter)
            for payload in self._IterNestedPayloads(
                    [element[0]], SymbolA, SymbolB):
                seperators.append([payload, element[1]])
        return seperators

    def GetPayloadBetweenTwoOneSymbolsInPayload(self, Payload, SymbolA, SymbolB, LogElementNumber, Payloadrow, IDrow):
        """Extract payloads between two (single-character) markers from rows.

        Works like ``GetPayloadBetweenTwoSymbolsInPayload`` but lets the
        caller choose which row entries hold the text and the ID.  Every row
        is scanned on its own, so an unterminated payload cannot leak into
        the next row.

        Args:
            Payload: Iterable of indexable rows.
            SymbolA: Opening marker.
            SymbolB: Closing marker.
            LogElementNumber: When true, print the running character count
                every 10 characters.
            Payloadrow: Index/key of the text inside a row.
            IDrow: Index/key of the ID inside a row.

        Returns:
            A list of ``[payload, ID]`` pairs in order of appearance.
        """
        seperators = []
        counter = 0
        for row in Payload:
            text = row[Payloadrow]
            if LogElementNumber:
                first_mark = counter - counter % 10 + 10
                for mark in range(first_mark, counter + len(text) + 1, 10):
                    print(mark)
            counter += len(text)
            for payload in self._IterNestedPayloads([text], SymbolA, SymbolB):
                seperators.append([payload, row[IDrow]])
        return seperators

    def CutTextAtSymbol(self, text, symbol):
        """Return ``text`` up to and including the first ``symbol``.

        The whole text is returned when ``symbol`` is empty or does not occur.
        """
        if not symbol:
            return text
        position = text.find(symbol)
        if position < 0:
            return text
        return text[:position + len(symbol)]

    def GetPayloadBetweenTwoSymbolsInText(self, text, SymbolA, SymbolB):
        """Extract the payloads between two markers from one string.

        See ``_IterNestedPayloads`` for the nesting rules.

        Returns:
            A list of one-element lists ``[payload]`` in order of appearance.
        """
        return [[payload]
                for payload in self._IterNestedPayloads([text], SymbolA, SymbolB)]

    def GetPayloadBetweenTwoSameSymbolsInText(self, text, Symbol):
        """Extract the text between successive pairs of the same marker.

        The 1st and 2nd occurrence of ``Symbol`` form a pair, the 3rd and 4th
        the next one, and so on.  Empty payloads and an unpaired trailing
        marker are ignored.

        Returns:
            A list of payload strings (without the markers).
        """
        if not Symbol or not text:
            return []
        parts = text.split(Symbol)
        # Odd-numbered parts lie between an opening and a closing marker; the
        # very last part has no marker after it and is therefore never one.
        return [parts[index]
                for index in range(1, len(parts) - 1, 2)
                if parts[index]]

    def ParseWordswithSymbolFromSymbolongoing(self, text, Symbol):
        """Return the tail of every word that contains ``Symbol``.

        ``text`` is split on whitespace.  For each word containing ``Symbol``
        the result holds the word from the *last character* of the first
        occurrence of ``Symbol`` to the end of the word -- i.e. a
        one-character symbol is included (``"a#b"`` -> ``"#b"``) and of
        ``"[["`` only the second bracket is kept (``"x[[y"`` -> ``"[y"``).
        With an empty ``Symbol`` every word is returned unchanged.
        """
        if not Symbol:
            return text.split()
        seperators = []
        for word in text.split():
            position = word.find(Symbol)
            if position >= 0:
                seperators.append(word[position + len(Symbol) - 1:])
        return seperators

    def ParseWithHighestLetterAccordance(self, inputtext, Letters):
        """Find the run of words that best spells out an abbreviation.

        ``Letters`` is split into letter groups: at each ``.`` when it
        contains one (``"z.B."`` -> ``z`` | ``b``), otherwise one group per
        character.  Every window of 1..len(groups) consecutive words whose
        first word matches the first group is scored:

        * each word that contains all letters of a group adds the size of
          that group (the first letter of the first group only counts at the
          start of a word),
        * +3 if the first word starts with the first group's first letter,
        * +3 if the last word starts with the last group's first letter,
        * -0.1 per word, so that words adding nothing are penalised.

        Args:
            inputtext: Text to search; it is lower-cased first.
            Letters: The abbreviation.

        Returns:
            The lower-cased words of the best window (on a tie the later /
            longer window wins), or ``[]`` when there is no candidate.
        """
        words = inputtext.lower().split()

        # Lower-case before filtering so that upper-case umlauts survive.
        cleaned = re.sub("[^a-zA-Züäö.]", " ", Letters.lower())
        if '.' in Letters:
            groups = []
            current = []
            for letter in cleaned:
                if letter == '.':
                    groups.append(current)
                    current = []
                elif letter != ' ':
                    current.append(letter)
            if current:
                groups.append(current)
        else:
            groups = [[letter] for letter in cleaned]

        if not words or not groups:
            return []

        # scores[w][g]: how many letters of group g were found in word w.
        scores = []
        for word in words:
            pools = [list(group) for group in groups]
            hits = [0] * len(groups)
            if pools[0]:
                # The leading letter of the abbreviation must be the first
                # letter of the word; either way it is used up afterwards.
                if word[0] == pools[0][0]:
                    hits[0] += 1
                del pools[0][0]
            for letter in word:
                for index, pool in enumerate(pools):
                    if letter in pool:
                        hits[index] += 1
                        pool.remove(letter)
            scores.append(hits)

        best = None
        best_score = None
        for start, hits in enumerate(scores):
            if hits[0] < 1:
                continue
            longest = min(len(groups), len(words) - start)
            for length in range(1, longest + 1):
                score = 0
                for window_hits in scores[start:start + length]:
                    for group, count in zip(groups, window_hits):
                        if count == len(group):
                            score += count
                if words[start][0] == groups[0][0]:
                    score += 3
                if groups[-1] and words[start + length - 1][0] == groups[-1][0]:
                    score += 3
                # Penalise longer windows whose extra words add no score.
                score -= length * 0.1
                # ">=" keeps the last of several equally good candidates.
                if best is None or score >= best_score:
                    best = (start, length)
                    best_score = score

        if best is None:
            return []
        return words[best[0]:best[0] + best[1]]
-
-
- #def parseWordsContainingCertainSymbols(self, text, symbols):
- #print()
-
-
-
-
-
- #fooSeparator = 'title'
-
- #cwd = os.getcwd()
-
- #with open('dewiktionary-20181201-pages-articles.xml') as xmldok:
- #with open(cwd + '/' + 'classes.txt', 'w') as Outdok:
- #n = 0
- #done = False
- #while done == False:
- #for line in xmldok:
- #n += 1
- ##print(line)
- ##print(dok_to_token(line))
- ##print(n)
- #for word in line:
- #print(word)
-
- #try:
- #if dok_to_token(line)[:(len(fooSeparator) + 2)] == '<' + fooSeparator + '>':
- #Outdok.write(dok_to_token(line)[len(fooSeperator):-len(fooSeperator)] + '\n')
- #except:
- #pass
- #if n >= 100000:
- #quit()
-
-
-
-
|