first publishing
This commit is contained in:
commit
1263460e05
2 changed files with 787 additions and 0 deletions
787
parse1.py
Normal file
787
parse1.py
Normal file
|
@ -0,0 +1,787 @@
|
|||
# Parse wiktionary.xml with pure python, such that it can be run with pypy (python just in time compiler)
|
||||
|
||||
|
||||
# optimization would be possible through cython and assembler loops etc
|
||||
|
||||
|
||||
# on a linux system, get the first n lines of a document with:
|
||||
|
||||
# head -n1000000 dewiktionary-20181201-pages-articles.xml > wiktionaryFirstMio.xml
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
import sys
|
||||
import os
|
||||
|
||||
import re
|
||||
|
||||
|
||||
|
||||
class Parser(object):
    """Character-level text extraction helpers for a Wiktionary XML dump.

    The class deliberately sticks to plain Python (no XML library) so that it
    can be run under PyPy.  Symbols are matched letter by letter with a simple
    matcher: after a mismatch the match restarts with the *next* letter, so a
    symbol whose prefix overlaps the preceding text (e.g. ``<t>`` in ``<<t>``)
    is not recognised.
    """

    def __init__(self, InputDokument, OutputDokument):
        """Remember the paths of the input document and the output document."""
        self.Indok = InputDokument
        self.Outdok = OutputDokument

    @staticmethod
    def _iter_letters(lines, log_every):
        """Yield every letter of every line; print the line number every
        ``log_every`` lines (0 disables the progress output)."""
        for number, line in enumerate(lines, 1):
            if log_every and number % log_every == 0:
                print(number)
            for letter in line:
                yield letter

    @staticmethod
    def _iter_payloads(letters, SymbolA, SymbolB):
        """Yield each span of text enclosed by SymbolA ... SymbolB.

        Nested SymbolA/SymbolB pairs are counted; a span ends when the
        outermost pair closes and then still contains the inner delimiters.
        Spans that are empty or never closed are not reported.

        Raises:
            ValueError: if SymbolA or SymbolB is empty.
        """
        lenA = len(SymbolA)
        lenB = len(SymbolB)
        if lenA == 0 or lenB == 0:
            raise ValueError('SymbolA and SymbolB must not be empty')

        # All letters of the closing symbol except its last one are collected
        # before the symbol is complete and have to be cut off again.
        tail = lenB - 1
        depth = 0   # number of currently open SymbolA
        posA = 0    # how many letters of SymbolA matched so far
        posB = 0    # how many letters of SymbolB matched so far
        collected = []

        for letter in letters:
            # The closing symbol only counts inside an open span, otherwise a
            # stray closing symbol would drive the depth below zero.
            if depth >= 1:
                if letter == SymbolB[posB]:
                    posB += 1
                    if posB == lenB:
                        posB = 0
                        depth -= 1
                else:
                    posB = 0

            if depth >= 1:
                collected.append(letter)
            elif posA == 0 and collected:
                payload = collected[:len(collected) - tail]
                if payload:
                    yield ''.join(payload)
                # Always start clean, also after an empty span.
                collected = []

            # Checked last, so the letters of the outermost opening symbol
            # are never part of the payload.
            if letter == SymbolA[posA]:
                posA += 1
                if posA == lenA:
                    posA = 0
                    depth += 1
            else:
                posA = 0

    def GetSeparators(self):
        """Collect the distinct texts found between ``<`` and ``>``.

        Tags are looked for line by line; a tag that is not closed on the
        line it was opened on is dropped.  A ``>`` without a preceding ``<``
        contributes the empty string.  The result is written to the output
        document (``[``, one tag per line in sorted order, ``]``).  The line
        number is printed every 10 lines.

        Returns:
            set: the distinct tag texts.
        """
        tags = set()
        with open(self.Indok) as xmldok:
            with open(self.Outdok, 'w') as getsepdok:
                for number, line in enumerate(xmldok, 1):
                    if number % 10 == 0:
                        print(number)

                    inside = False
                    current = []
                    for letter in line:
                        if letter == '>':
                            inside = False
                            # A set keeps memory bounded by the number of
                            # distinct tags instead of all occurrences.
                            tags.add(''.join(current))
                            current = []
                        if inside:
                            current.append(letter)
                        if letter == '<':
                            inside = True

                getsepdok.write('[' + '\n')
                # Sorted, so that repeated runs produce identical files.
                for tag in sorted(tags):
                    getsepdok.write(tag + '\n')
                getsepdok.write(']')
        return tags

    def GetPayloadBetweenTwoSymbols(self, SymbolA, SymbolB , LogLineNumber=False, Doc = True):
        """Extract the distinct texts between SymbolA and SymbolB from the
        input document.

        A span may extend over several lines.  The output document is not
        touched (it used to be truncated without anything being written).

        Args:
            SymbolA: opening symbol (non-empty string).
            SymbolB: closing symbol (non-empty string).
            LogLineNumber: print the line number every 10000 lines.
            Doc: unused, kept for compatibility with existing callers.

        Returns:
            list: ``[payload, ID]`` pairs; IDs count up from 0 in order of
            first occurrence, so they are stable between runs.
        """
        output = []
        seen = set()
        with open(self.Indok) as xmldok:
            letters = Parser._iter_letters(xmldok, 10000 if LogLineNumber else 0)
            for payload in Parser._iter_payloads(letters, SymbolA, SymbolB):
                if payload not in seen:
                    seen.add(payload)
                    output.append([payload, len(output)])
        return output

    def GetPayloadBetweenTwoSymbolsInPayload(self, Payload, SymbolA, SymbolB, LogElementNumber):
        """Extract the texts between SymbolA and SymbolB from each element.

        Args:
            Payload: iterable of elements; ``element[0]`` is the text to
                search, ``element[1]`` is handed through as ID.
            SymbolA: opening symbol (non-empty string).
            SymbolB: closing symbol (non-empty string).
            LogElementNumber: print the element number every 1000 elements.

        Returns:
            list: ``[payload, element[1]]`` for every span found, in order.
        """
        results = []
        for number, element in enumerate(Payload, 1):
            if LogElementNumber and number % 1000 == 0:
                print(number)
            # Every element is scanned with fresh state.
            for payload in Parser._iter_payloads(element[0], SymbolA, SymbolB):
                results.append([payload, element[1]])
        return results

    def GetPayloadBetweenTwoOneSymbolsInPayload(self, Payload, SymbolA, SymbolB, LogElementNumber, Payloadrow, IDrow):
        """Extract the texts between two one-letter symbols from each payload.

        Args:
            Payload: iterable of rows.
            SymbolA: opening letter.
            SymbolB: closing letter.
            LogElementNumber: print the running letter count every 10 letters.
            Payloadrow: index of the text inside a row.
            IDrow: index of the ID inside a row.

        Returns:
            list: ``[payload, row[IDrow]]`` for every span found.  Nested
            pairs stay part of the enclosing span.
        """
        results = []
        counter = 0
        for payload in Payload:
            depth = 0
            # Fresh buffer per payload: an unterminated span must not be
            # reported under the ID of the following payload.
            collected = []
            for letter in payload[Payloadrow]:
                counter += 1
                if LogElementNumber and counter % 10 == 0:
                    print(counter)

                # Ignore a closing letter that has no opening partner.
                if letter == SymbolB and depth > 0:
                    depth -= 1

                if depth >= 1:
                    collected.append(letter)
                elif collected:
                    results.append([''.join(collected), payload[IDrow]])
                    collected = []

                if letter == SymbolA:
                    depth += 1
        return results

    def CutTextAtSymbol(self, text, symbol):
        """Return text up to and including the first occurrence of symbol.

        The whole text is returned when symbol does not occur in it or when
        symbol is empty.
        """
        if not symbol:
            return text
        start = text.find(symbol)
        if start == -1:
            return text
        return text[:start + len(symbol)]

    def GetPayloadBetweenTwoSymbolsInText(self, text, SymbolA, SymbolB):
        """Extract the texts between SymbolA and SymbolB from text.

        Returns:
            list: one single-element list ``[payload]`` per span found.
        """
        return [[payload] for payload in Parser._iter_payloads(text, SymbolA, SymbolB)]

    def GetPayloadBetweenTwoSameSymbolsInText(self, text, Symbol):
        """Extract the texts enclosed by two occurrences of Symbol.

        NOTE(review): every result starts with the last letter of the opening
        Symbol and its letters are joined with single spaces (``''ab''`` with
        Symbol ``''`` gives ``"' a b"``).  A mismatch does not reset a partly
        matched Symbol either.  This looks unintended, but the behaviour is
        kept as it is until the callers have been checked.
        """
        results = []
        collected = []
        closing = False
        matched = 0
        length = len(Symbol)

        for letter in text:
            if letter == Symbol[matched % length]:
                # Count up while opening, count down again while closing.
                if closing:
                    matched -= 1
                else:
                    matched += 1

            if matched == length:
                collected.append(letter)
                closing = True

            if matched == 0 and collected:
                results.append(' '.join(collected))
                collected = []
                closing = False
        return results

    def ParseWordswithSymbolFromSymbolongoing(self, text, Symbol):
        """Return, for every word containing Symbol, the rest of the word
        starting at the last letter of the first occurrence of Symbol.

        For a one-letter Symbol this is the word from the symbol onwards
        (``a#b`` gives ``#b``).  Words without Symbol are skipped.
        """
        # Letters of Symbol that are dropped in front of the result.
        skipped = max(len(Symbol) - 1, 0)
        results = []
        for word in text.split():
            start = word.find(Symbol)
            if start != -1:
                results.append(word[start + skipped:])
        return results

    def ParseWithHighestLetterAccordance(self, inputtext, Letters):
        """Find the run of consecutive words that fits an abbreviation best.

        Letters is split into letter groups: at every ``.`` if it contains
        one (``z.B.`` gives ``z`` and ``b``), otherwise into single letters.
        Every word of the lower-cased text is scored per group by the number
        of group letters it contains; the first letter of the first group
        only counts as first letter of a word.  Runs of 1 to (number of
        groups) words that begin with a word matching the first group are
        rated by the fully matched groups, +3 if the first word starts with
        the first letter of the first group, +3 if the last word starts with
        the first letter of the last group, and -0.1 per word.

        Returns:
            list: the lower-cased words of the best run; an empty list when
            nothing can be matched (empty text, no usable letters, no word
            matching the first group).
        """
        words = inputtext.lower().split()
        cleaned = re.sub("[^a-zA-Züäö.]", " ", Letters).lower()

        if '.' in Letters:
            groups = []
            current = []
            for letter in cleaned:
                if letter == '.':
                    groups.append(current)
                    current = []
                elif letter != ' ':
                    current.append(letter)
            if current:
                groups.append(current)
        else:
            groups = [[letter] for letter in cleaned]

        # These inputs used to end in an IndexError.
        if not words or not groups or not groups[0] or not groups[-1]:
            return []

        # wordscores[i][g]: how many letters of group g occur in word i.
        wordscores = []
        for word in words:
            remaining = [list(group) for group in groups]
            scores = [0] * len(groups)
            for position, letter in enumerate(word):
                if position == 0:
                    if letter == remaining[0][0]:
                        scores[0] += 1
                    # Matched or not: the leading letter of the first group
                    # can only be used by the first letter of a word.
                    del remaining[0][0]
                for index, group in enumerate(remaining):
                    if letter in group:
                        scores[index] += 1
                        group.remove(letter)
            wordscores.append(scores)

        needed = [len(group) for group in groups]
        candidates = []
        for start in range(len(words)):
            if wordscores[start][0] < 1:
                continue
            for length in range(1, len(groups) + 1):
                if start > len(words) - length:
                    break
                score = 0
                for offset in range(length):
                    for index, full in enumerate(needed):
                        # Only completely matched groups count.
                        if wordscores[start + offset][index] == full:
                            score += full
                candidates.append([start, length, score])

        if not candidates:
            return []

        for candidate in candidates:
            start, length = candidate[0], candidate[1]
            if words[start][0] == groups[0][0]:
                candidate[2] += 3
            if words[start + length - 1][0] == groups[-1][0]:
                candidate[2] += 3
            # Penalise longer runs whose extra words add no score.
            candidate[2] -= length * 0.1

        # max() returns the first maximum; searching backwards lets the
        # candidate found last win a tie.
        best = max(reversed(candidates), key=lambda candidate: candidate[2])
        return words[best[0]:best[0] + best[1]]
|
||||
|
||||
|
||||
#def parseWordsContainingCertainSymbols(self, text, symbols):
|
||||
#print()
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
#fooSeparator = 'title'
|
||||
|
||||
#cwd = os.getcwd()
|
||||
|
||||
#with open('dewiktionary-20181201-pages-articles.xml') as xmldok:
|
||||
#with open(cwd + '/' + 'classes.txt', 'w') as Outdok:
|
||||
#n = 0
|
||||
#done = False
|
||||
#while done == False:
|
||||
#for line in xmldok:
|
||||
#n += 1
|
||||
##print(line)
|
||||
##print(dok_to_token(line))
|
||||
##print(n)
|
||||
#for word in line:
|
||||
#print(word)
|
||||
|
||||
#try:
|
||||
#if dok_to_token(line)[:(len(fooSeparator) + 2)] == '<' + fooSeparator + '>':
|
||||
#Outdok.write(dok_to_token(line)[len(fooSeperator):-len(fooSeperator)] + '\n')
|
||||
#except:
|
||||
#pass
|
||||
#if n >= 100000:
|
||||
#quit()
|
||||
|
||||
|
||||
|
||||
|
BIN
parse1.pyc
Normal file
BIN
parse1.pyc
Normal file
Binary file not shown.
Loading…
Reference in a new issue