A parser that can run under PyPy at about 20 times the speed of the standard Python interpreter, because it is written in pure Python.
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

787 lines
26 KiB

  1. # Parse wiktionary.xml with pure python, such that it can be run with pypy (python just in time compiler)
  2. # optimization would be possible through cython and assembler loops etc
  3. # on a linux system, get the first n lines of a document with:
  4. # head -n1000000 dewiktionary-20181201-pages-articles.xml > wiktionaryFirstMio.xml
  5. import sys
  6. import os
  7. import re
  8. class Parser(object):
  9. def __init__(self, InputDokument, OutputDokument):
  10. self.Indok = InputDokument
  11. self.Outdok = OutputDokument
  12. def GetSeparators(self):
  13. with open(self.Indok) as xmldok:
  14. with open(self.Outdok , 'w') as getsepdok:
  15. seperators = []
  16. counter = 0
  17. for line in xmldok:
  18. counter += 1
  19. #print(counter)
  20. if (counter % 10) == 0:
  21. print(counter)
  22. seperator =[]
  23. val = 0
  24. #if counter == 10000:
  25. #seperatorsSet = []
  26. #getsepdok.write('[' + '\n')
  27. #for element in seperators:
  28. #seperatorsSet.append(''.join(element))
  29. #for element in set(seperatorsSet):
  30. #getsepdok.write(str(''.join(element)) + '\n')
  31. #getsepdok.write(']')
  32. for letter in line:
  33. #print(letter)
  34. if letter == '>':
  35. val = 0
  36. seperators.append(seperator)
  37. seperator = []
  38. if val == 1:
  39. seperator.append(letter)
  40. else:
  41. pass
  42. if letter == '<':
  43. val = 1
  44. seperatorsSet = []
  45. getsepdok.write('[' + '\n')
  46. for element in seperators:
  47. seperatorsSet.append(''.join(element))
  48. seperatorsSet = set(seperatorsSet)
  49. for element in set(seperatorsSet):
  50. getsepdok.write(str(''.join(element)) + '\n')
  51. getsepdok.write(']')
  52. return seperatorsSet
  53. def GetPayloadBetweenTwoSymbols(self, SymbolA, SymbolB , LogLineNumber=False, Doc = True):
  54. with open(self.Indok) as xmldok:
  55. with open(self.Outdok , 'w') as payloaddok:
  56. seperators = []
  57. counter = 0
  58. valA = 0
  59. valB = 0
  60. seperator =[]
  61. for line in xmldok:
  62. #print(line)
  63. counter += 1
  64. if LogLineNumber == True:
  65. if (counter % 10000) == 0:
  66. print(counter)
  67. wait1letterA = False
  68. wait1letterB = False
  69. #for letter in line.decode('utf-8'):
  70. for letter in line:
  71. #print(letter)
  72. #print(set(range(1, len(SymbolA))))
  73. if valA % len(SymbolA) in set(range(1, len(SymbolA) )):
  74. #print('jo')
  75. if wait1letterA == True:
  76. #print('joo')
  77. #print(letter)
  78. valA -= valA % len(SymbolA)
  79. wait1letterA = False
  80. wait1letterA = True
  81. if valB in set(range(1, len(SymbolB) )):
  82. if wait1letterB == True:
  83. valB = 0
  84. wait1letterB = False
  85. wait1letterB = True
  86. for n in range(len(SymbolB)):
  87. if valA >= len(SymbolA) and valB == n and letter == SymbolB[n]:
  88. valB = n + 1
  89. wait1letterB = False
  90. else:
  91. pass
  92. if valB == len(SymbolB) and valA >= len(SymbolA):
  93. valB = 0
  94. #print(letter)
  95. #print(valA)
  96. valA -= len(SymbolA)
  97. #print(valA)
  98. #print(seperators)
  99. if valA >= len(SymbolA):
  100. seperator.append(letter)
  101. else:
  102. pass
  103. #print(valA)
  104. #print(SymbolA[6])
  105. #print(len(SymbolA))
  106. #print(range(len(SymbolA)))
  107. if valA == 0:
  108. if len(seperator[:-(len(SymbolB)-1)]) >= 1:
  109. seperators.append(seperator[:-(len(SymbolB)-1)])
  110. seperator = []
  111. for n in range(len(SymbolA)):
  112. #print(n)
  113. if valA % len(SymbolA) == n and letter == SymbolA[n]:
  114. valA += 1
  115. #print(valA)
  116. wait1letterA = False
  117. break
  118. else:
  119. pass
  120. seperatorsSet = []
  121. #getsepdok.write('[' + '\n')
  122. for element in seperators:
  123. seperatorsSet.append(''.join(element))
  124. seperatorsSet = set(seperatorsSet)
  125. output = []
  126. ID = 0
  127. ## Set has a probabilistic factor in it!!!! thats why the nmbers change
  128. for element in seperatorsSet:
  129. output.append([element, ID])
  130. ID += 1
  131. return output
  def GetPayloadBetweenTwoSymbolsInPayload(self, Payload, SymbolA, SymbolB, LogElementNumber):
      """Extract the texts enclosed by SymbolA ... SymbolB from each payload element.

      Every element of ``Payload`` is indexed as ``element[0]`` (the text
      to scan) and ``element[1]`` (a value copied to each result).  The
      matching state is reset for every element.  ``valA`` counts matched
      SymbolA characters; a payload is open while it is at least
      ``len(SymbolA)``, and nested openers raise it further so that they
      need as many closers.

      Args:
          Payload: iterable of ``[text, ID]``-like sequences.
          SymbolA: opening marker (non-empty string).
          SymbolB: closing marker (non-empty string, may be one character).
          LogElementNumber: when True, print the element count every 1000
              elements.

      Returns:
          A list of ``[enclosed_text, element[1]]`` pairs in order of
          appearance (duplicates are kept).
      """
      lenA = len(SymbolA)
      lenB = len(SymbolB)
      results = []
      for counter, element in enumerate(Payload, 1):
          if LogElementNumber == True and counter % 1000 == 0:
              print(counter)
          current = []
          wait1letterA = False
          wait1letterB = False
          valA = 0
          valB = 0
          for letter in element[0]:
              # Roll back a partial SymbolA match that was not continued.
              if valA % lenA != 0:
                  if wait1letterA:
                      valA -= valA % lenA
                  wait1letterA = True
              # Roll back a partial SymbolB match that was not continued.
              if 0 < valB < lenB and valA >= lenA:
                  if wait1letterB:
                      valB = 0
                  wait1letterB = True
              if valA >= lenA and letter == SymbolB[valB % lenB]:
                  valB += 1
                  wait1letterB = False
              if valB == lenB and valA >= lenA:
                  valB = 0
                  valA -= lenA
              if valA >= lenA:
                  current.append(letter)
              if valA == 0:
                  # Cut off the buffered beginning of the closer.  An
                  # explicit end index is used because [:-0] would yield
                  # an empty list for one-character closers.
                  end = len(current) - (lenB - 1)
                  if end >= 1:
                      results.append([''.join(current[:end]), element[1]])
                  current = []
              # Checked last so the final opener character is not part of
              # the payload.
              if letter == SymbolA[valA % lenA]:
                  valA += 1
                  wait1letterA = False
      return results
  def GetPayloadBetweenTwoOneSymbolsInPayload(self, Payload, SymbolA, SymbolB, LogElementNumber, Payloadrow, IDrow):
      """Extract the texts between one-character markers from each payload element.

      A depth counter is raised on every ``SymbolA`` and lowered on every
      ``SymbolB``; characters seen while the depth is at least 1 are
      collected, so nested marker pairs stay inside the outer text.  A
      ``SymbolB`` seen while no text is open lowers the depth below zero.

      Args:
          Payload: iterable of sequences; ``payload[Payloadrow]`` is the
              text to scan and ``payload[IDrow]`` is copied to each result.
          SymbolA: opening character (compared to single letters).
          SymbolB: closing character (compared to single letters).
          LogElementNumber: when True, print the running letter count every
              10 letters.
          Payloadrow: index of the text inside each payload element.
          IDrow: index of the ID inside each payload element.

      Returns:
          A list of ``[enclosed_text, payload[IDrow]]`` pairs in order of
          appearance.
      """
      results = []
      counter = 0
      for payload in Payload:
          depth = 0
          # Start every element with an empty buffer: text left over from
          # an unterminated element must not be reported under the ID of
          # the following element.
          current = []
          for letter in payload[Payloadrow]:
              counter += 1
              if LogElementNumber == True and counter % 10 == 0:
                  print(counter)
              if letter == SymbolB:
                  depth -= 1
              if depth >= 1:
                  current.append(letter)
              if depth == 0 and current:
                  results.append([''.join(current), payload[IDrow]])
                  current = []
              # Checked last so the opener itself is not collected.
              if letter == SymbolA:
                  depth += 1
      return results
  def CutTextAtSymbol(self, text, symbol):
      """Return ``text`` cut off right after the first occurrence of ``symbol``.

      The symbol itself is kept at the end of the result.  When the symbol
      does not occur (or is empty) the whole text is returned.

      Args:
          text: string to cut.
          symbol: string marking the cut position.

      Returns:
          The prefix of ``text`` up to and including the first ``symbol``.
      """
      if not symbol:
          return text
      # str.find also locates occurrences that overlap a failed partial
      # match (e.g. 'aab' in 'aaab'), which a restart-from-zero matcher
      # misses.
      position = text.find(symbol)
      if position < 0:
          return text
      return text[:position + len(symbol)]
  def GetPayloadBetweenTwoSymbolsInText(self, text, SymbolA, SymbolB):
      """Extract the texts enclosed by SymbolA ... SymbolB from a string.

      ``valA`` counts matched SymbolA characters; a payload is open while
      it is at least ``len(SymbolA)``, and nested openers raise it further
      so that they need as many closers.  SymbolB is only matched while a
      payload is open, so a closer that appears before any opener is
      ignored instead of pushing the counter below zero.

      Args:
          text: string to scan.
          SymbolA: opening marker (non-empty string).
          SymbolB: closing marker (non-empty string, may be one character).

      Returns:
          A list of one-element lists ``[enclosed_text]`` in order of
          appearance.
      """
      lenA = len(SymbolA)
      lenB = len(SymbolB)
      results = []
      current = []
      wait1letterA = False
      wait1letterB = False
      valA = 0
      valB = 0
      for letter in text:
          # Roll back a partial SymbolA match that was not continued.
          if valA % lenA != 0:
              if wait1letterA:
                  valA -= valA % lenA
              wait1letterA = True
          # Roll back a partial SymbolB match that was not continued.
          if 0 < valB < lenB:
              if wait1letterB:
                  valB = 0
              wait1letterB = True
          if valA >= lenA and letter == SymbolB[valB % lenB]:
              valB += 1
              wait1letterB = False
          if valB == lenB and valA >= lenA:
              valB = 0
              valA -= lenA
          if valA >= lenA:
              current.append(letter)
          if valA == 0:
              # Cut off the buffered beginning of the closer.  An explicit
              # end index is used because [:-0] would yield an empty list
              # for one-character closers.
              end = len(current) - (lenB - 1)
              if end >= 1:
                  results.append([''.join(current[:end])])
              current = []
          # Checked last so the final opener character is not part of the
          # payload.
          if letter == SymbolA[valA % lenA]:
              valA += 1
              wait1letterA = False
      return results
  def GetPayloadBetweenTwoSameSymbolsInText(self, text, Symbol):
      """Collect the letters found between two occurrences of the same marker.

      While opening, every letter equal to the next expected character of
      ``Symbol`` raises a counter (other letters in between do not reset
      it).  Once the counter equals ``len(Symbol)`` each letter is
      buffered, starting with the letter that completed the opener.  While
      closing, matching letters lower the counter again; when it reaches 0
      the buffered letters are joined with single spaces and stored.

      Args:
          text: string to scan.
          Symbol: marker used both as opener and as closer.

      Returns:
          A list of space-joined letter strings, one per enclosed section.
      """
      width = len(Symbol)
      results = []
      buffered = []
      closing = False
      depth = 0
      for char in text:
          if char == Symbol[depth % width]:
              depth += -1 if closing else 1
          if depth == width:
              buffered.append(char)
              closing = True
          if depth == 0 and buffered:
              results.append(' '.join(buffered))
              buffered = []
              closing = False
      return results
  def ParseWordswithSymbolFromSymbolongoing(self, text, Symbol):
      """Return, for every word containing ``Symbol``, the word from the symbol onwards.

      ``text`` is split on whitespace.  For each word that contains
      ``Symbol`` the result starts at the LAST character of the first
      occurrence of the symbol and runs to the end of the word (for a
      one-character symbol that is the symbol itself plus the rest).
      Words without the symbol are skipped.  An empty ``Symbol`` matches
      every word and returns the words unchanged.

      Args:
          text: whitespace-separated words.
          Symbol: marker to look for inside each word.

      Returns:
          A list of word tails in order of appearance.
      """
      if not Symbol:
          return text.split()
      # str.find is used because a matcher that tests the mismatch against
      # the next symbol position right after a successful match never
      # finds symbols with two different adjacent characters such as '->'.
      keep_from = len(Symbol) - 1
      tails = []
      for word in text.split():
          start = word.find(Symbol)
          if start >= 0:
              tails.append(word[start + keep_from:])
      return tails
  # ParseWithHighestLetterAccordance(inputtext, Letters)
  # Scores the whitespace-separated words of the lowercased `inputtext`
  # against groups of letters built from `Letters` and returns the run of
  # consecutive words with the highest score as a list of strings.
  # The final loop indexes bestntupelscoresorted[0] unconditionally, so an
  # IndexError is raised when no word run was scored at all.
  # NOTE(review): the indentation of this listing is lost; the nesting of the
  # loops and branches below has to be restored from the original file and
  # is deliberately not guessed here.
  363. def ParseWithHighestLetterAccordance(self, inputtext, Letters):
  364. # first check if there is a word that has all letters
  365. short = False
  366. lettervect = []
  367. Lettervector = []
  368. wordscores = []
  369. text = inputtext.lower()
  # A '.' in Letters switches to the grouped form: letters are collected
  # until the next '.', and each such run becomes one group in Lettervector.
  370. if '.' in set(Letters):
  371. short = True
  372. if short == True:
  # Every character other than a-z, A-Z, ü, ä, ö and '.' is replaced by a space.
  373. for letter in re.sub("[^a-zA-Züäö.]", " ", Letters):
  374. letter = letter.lower()
  375. #print(re.sub("[^a-züäö.]", " ", Letters))
  376. if letter != '.' and letter != ' ':
  377. lettervect.append(letter)
  378. if letter == '.':
  379. Lettervector.append(lettervect)
  380. lettervect = []
  381. if len(lettervect) >= 1:
  382. Lettervector.append(lettervect)
  383. else:
  # Without a '.', each character of the cleaned string becomes a one-letter group.
  384. for letter in re.sub("[^a-zA-Züäö.]", " ", Letters):
  385. letter = letter.lower()
  386. Lettervector.append([letter])
  387. #print(text)
  388. #print(Lettervector)
  389. from copy import deepcopy
  390. for word in text.split():
  # Work on a private copy: letters are removed from the groups while scoring.
  391. lettervector = deepcopy(Lettervector)
  392. #print(word)
  393. #print(Lettervector)
  # One [word, score] entry per letter group.
  394. wordscore = []
  395. for n in range(len(lettervector)):
  396. wordscore.append([word, 0])
  397. #wordscore = len(lettervector) * [[word, 0 ]]
  398. #print(wordscore)
  399. firstletter = 0
  # usedletters is never appended to in the visible code.
  400. usedletters = []
  401. for letter in word:
  402. firstletter += 1
  403. #print(set(Letters))
  404. #print(wordscore)
  405. #print(lettervector[n])
  406. if firstletter == 1:
  407. if letter == lettervector[0][0]:
  408. #print('oi')
  409. #print(lettervector)
  410. #print(len(lettervector[2]))
  411. wordscore[0][1] += 1
  412. lettervector[0].remove(letter)
  413. #print(usedletters)
  414. else:
  415. lettervector[0].remove(lettervector[0][0])
  416. for n in range(len(lettervector)):
  417. #print('1' ,letter)
  418. #print(lettervector[n][0])
  419. if letter in set(lettervector[n]):
  420. #print('ooioi',usedletters)
  421. if letter not in set(usedletters):
  422. #print('something was added', letter)
  423. wordscore[n][1] += 1
  424. lettervector[n].remove(letter)
  425. #print('angesprungen')
  426. wordscores.append(wordscore)
  427. #print(wordscores)
  428. #checkbest_firstlettervector = []
  429. #for n in range(len(wordscores)):
  430. #checkbest_firstlettervector.append([ n , wordscores[n][0][1]])
  431. #print('wordscores', wordscores)
  432. #best_n_lettervectors = sorted(checkbest_firstlettervector[::-1], key=lambda tup: tup[1], reverse=True)
  433. #print(best_n_lettervectors)
  434. #for wordscore in wordscores:
  # Each ntupelscores entry is [[start word index, run length, lastletterexistentindex], score].
  435. ntupelscores = []
  # ntupelscoresm is only referenced from commented-out code below.
  436. ntupelscoresm = []
  437. for o in range(len(wordscores)):
  438. #print('newlettervectorindex')
  439. lastletterexistentindex = 1
  440. lastlettercame = False
  441. if wordscores[o][0][1] >= 1:
  442. for m in range(1, len(lettervector) + 1):
  443. #print(m)
  444. if o <= len(text.split()) - (m):
  445. triplescore = []
  446. for q in range(len(wordscores[o])):
  447. triplescore.append(0)
  448. #print(len(lettervector))
  449. for n in range(m):
  450. #print(wordscores[lettervectorindex[0] + n][n][1])
  451. #wordscores[lettervectorindex[0] + 1][1][1] + wordscores[lettervectorindex[0] + 2][2][1]
  452. for p in range(len(wordscores[o])):
  453. #print(wordscore[o + n][p][1])
  454. #print(len(Lettervector[p]))
  # A group only counts when the word matched every letter of that group.
  455. if wordscores[o + n][p][1] == len(Lettervector[p]):
  456. triplescore[p] += wordscores[o + n][p][1]
  457. letterlength = 0
  458. for r in range(len(lettervector)):
  459. letterlength += len(Lettervector[r])
  460. #print(wordscore)
  461. #print(sum(triplescore))
  462. if p == len(wordscores[o]) - 1 and wordscores[o + n][p][1] == len(Lettervector[p]) and lastlettercame == False and sum(triplescore) == letterlength:
  463. #print('oioioioioioioooioioioiiiiiiiiiiiiiiiiiiiiiiiiiii')
  464. lastletterexistentindex = n
  465. lastlettercame = True
  466. #triplescore += wordscores[o + n][p][1]
  467. ntupelscores.append([[o , m, lastletterexistentindex], sum(triplescore)])
  468. #ntupelscoresm.append([m , triplescore])
  469. #print(text.split())
  470. #print('bliblablub', ntupelscores)
  471. for tupel in ntupelscores:
  # +3 when the first word of the run starts with the first letter of the first group.
  472. if text.split()[tupel[0][0]][0] == Lettervector[0][0]:
  473. tupel[1] += 3
  474. #print('b',text.split()[tupel[0][0] + tupel[0][1] - 1][0])
  475. #print('a',Lettervector[-1][0])
  # +3 when the last word of the run starts with the first letter of the last group.
  476. if text.split()[tupel[0][0] + tupel[0][1] - 1][0] == Lettervector[-1][0]:
  477. tupel[1] += 3
  478. # Penalize longer tuples, i.e. when additional words do not add any score
  479. tupel[1] -= tupel[0][1] * 0.1
  # Highest score first; the list is reversed before the sort.
  480. bestntupelscoresorted = sorted(ntupelscores[::-1], key=lambda tup: tup[1], reverse=True)
  481. #bestntupelscoresortedm = sorted(ntupelscoresm[::-1], key=lambda tup: tup[1], reverse=True)
  482. #print('oioioioioioioioioioi',bestntupelscoresorted)
  483. outputntupel = []
  484. #print(bestntupelscoresorted)
  # Emit the words of the best run: run length is [0][0][1], start index is [0][0][0].
  485. for s in range(bestntupelscoresorted[0][0][1] ):
  486. outputntupel.append(text.split()[bestntupelscoresorted[0][0][0] + s])
  487. #print(outputntupel)
  488. return outputntupel
  489. #def parseWordsContainingCertainSymbols(self, text, symbols):
  490. #print()
  491. #fooSeparator = 'title'
  492. #cwd = os.getcwd()
  493. #with open('dewiktionary-20181201-pages-articles.xml') as xmldok:
  494. #with open(cwd + '/' + 'classes.txt', 'w') as Outdok:
  495. #n = 0
  496. #done = False
  497. #while done == False:
  498. #for line in xmldok:
  499. #n += 1
  500. ##print(line)
  501. ##print(dok_to_token(line))
  502. ##print(n)
  503. #for word in line:
  504. #print(word)
  505. #try:
  506. #if dok_to_token(line)[:(len(fooSeparator) + 2)] == '<' + fooSeparator + '>':
  507. #Outdok.write(dok_to_token(line)[len(fooSeperator):-len(fooSeperator)] + '\n')
  508. #except:
  509. #pass
  510. #if n >= 100000:
  511. #quit()