You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

889 lines
43 KiB

import ast
import json
import os
import subprocess
import time
import urllib.request, urllib.error, urllib.parse

import lxml.html
import lxml.html.soupparser
import requests
import yaml
from lxml import etree
from lxml import html
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextContainer
from trafilatura import extract
  15. class fdb_spider(object):
  16. def __init__(self, config_file):
  17. with open(config_file, "r") as stream:
  18. try:
  19. self.config = yaml.safe_load(stream)
  20. except yaml.YAMLError as exc:
  21. print(exc)
  22. # input list of funding databases in form of yaml file ['foerderinfo.bund.de', 'ausschreibungen.giz.de', .. , 'usw']
  23. def download_entry_list_pages_of_funding_databases(self, list_of_fdbs):
  24. # download only html pages of the funding databases specified in input
  25. for fdb in list_of_fdbs:
  26. for key in self.config:
  27. if key in list_of_fdbs:
  28. try:
  29. entry_list = self.config.get(key).get("entry-list")
  30. except Exception as e:
  31. print(
  32. "There is a problem with the configuration variable entryList in the config.yaml - the original error message is:",
  33. e,
  34. )
  35. try:
  36. entry_list_link1 = entry_list.get("link1")
  37. except Exception as e:
  38. print(
  39. "No link1 defined in config.yaml - the original error message is:",
  40. e,
  41. )
  42. try:
  43. entry_list_link2 = entry_list.get("link2")
  44. except Exception as e:
  45. print(
  46. "No link2 defined in config.yaml - the original error message is:",
  47. e,
  48. )
  49. try:
  50. entry_list_jslink1 = entry_list.get("jslink1")
  51. except Exception as e:
  52. print(
  53. "No jslink1 defined in config.yaml - the original error message is:",
  54. e,
  55. )
  56. entry_list_jslink1 = 'NONE'
  57. try:
  58. entry_list_jslink2 = entry_list.get("jslink2")
  59. except Exception as e:
  60. print(
  61. "No jslink2 defined in config.yaml - the original error message is:",
  62. e,
  63. )
  64. entry_list_jslink2 = 'NONE'
  65. try:
  66. entry_iteration_var_list = eval(entry_list.get("iteration-var-list"))
  67. except Exception as e:
  68. print(
  69. "No iteration-var-list defined in config.yaml - the original error message is:",
  70. e,
  71. )
  72. try:
  73. entry_jsiteration_var_list = eval(entry_list.get("jsiteration-var-list"))
  74. except Exception as e:
  75. print(
  76. "No jsiteration-var-list defined in config.yaml - the original error message is:",
  77. e,
  78. )
  79. try:
  80. entry_jsdomain = entry_list.get("jsdomain")
  81. except Exception as e:
  82. print(
  83. "No jsdomain defined in config.yaml - the original error message is:",
  84. e,
  85. )
  86. entry_jsdomain = 'NONE'
  87. if entry_jsdomain == 'NONE' or entry_jsdomain == 'None':
  88. for i in entry_iteration_var_list:
  89. # download the html page of the List of entrys
  90. response = urllib.request.urlopen(entry_list_link1 + str(i) + entry_list_link2)
  91. # web_content = response.read().decode("UTF-8")
  92. try:
  93. web_content = response.read().decode("UTF-8")
  94. except Exception as e:
  95. try:
  96. web_content = response.read().decode("latin-1")
  97. print(
  98. "decoding the respone in utf8 did not work, try to decode latin1 now - the original error message is:",
  99. e,
  100. )
  101. except Exception as ex:
  102. print(ex)
  103. # save interim results to files
  104. if (len(web_content)) < 10:
  105. print('getting the html page through urllib did not work, trying with requests librarys function get')
  106. try:
  107. res = requests.get(entry_list_link1 + str(i) + entry_list_link2)
  108. web_content = res.text
  109. except Exception as e:
  110. print('also requests library did not work, original error is:', e)
  111. # print(web_content)
  112. f = open("spiders/pages/" + key + str(i) + "entryList.html", "w+")
  113. f.write(web_content)
  114. f.close
  115. else:
  116. from selenium import webdriver
  117. from selenium.webdriver.chrome.service import Service
  118. from pyvirtualdisplay import Display
  119. display = Display(visible=0, size=(800, 800))
  120. display.start()
  121. ##outputdir = '.'
  122. ##service_log_path = "{}/chromedriver.log".format(outputdir)
  123. ##service_args = ['--verbose']
  124. ##driver = webdriver.Chrome('/usr/bin/chromium')
  125. options = webdriver.ChromeOptions()
  126. options.add_argument('headless')
  127. options.add_argument("--remote-debugging-port=9222")
  128. options.add_argument('--no-sandbox')
  129. options.add_argument('--disable-dev-shm-usage')
  130. service = Service(executable_path='/usr/bin/chromedriver')
  131. driver = webdriver.Chrome(options=options, service=service)
  132. # driver = webdriver.Chrome()
  133. driver.get(entry_jsdomain)
  134. for i in range(len(entry_jsiteration_var_list)):
  135. time.sleep(2)
  136. print('trying to get element')
  137. try:
  138. element = driver.find_element(
  139. "xpath",
  140. entry_list_jslink1
  141. + str(entry_jsiteration_var_list[i])
  142. + entry_list_jslink2
  143. )
  144. print(entry_iteration_var_list[i])
  145. time.sleep(2)
  146. print('clicking..')
  147. element.click()
  148. time.sleep(2)
  149. #window_after = driver.window_handles[1]
  150. print('length of the window handles', len(driver.window_handles))
  151. #driver.switch_to.window(window_after)
  152. web_content = driver.page_source
  153. f = open("spiders/pages/" + key + str(entry_iteration_var_list[i]) + "entryList.html", "w+")
  154. f.write(web_content)
  155. f.close
  156. except Exception as e:
  157. print('the iteration var element for clicking the pages was not found.. the original message is:',e )
  158. def find_config_parameter(self, list_of_fdbs):
  159. for fdb in list_of_fdbs:
  160. try:
  161. iteration_var_list = eval(self.config.get(fdb).get("entry-list").get("iteration-var-list"))
  162. except Exception as e:
  163. print(
  164. "There is a problem with the configuration variable entryList iteration var list in the config.yaml",
  165. e,
  166. )
  167. fdb_conf = self.config.get(fdb)
  168. fdb_domain = fdb_conf.get("domain")
  169. fdb_conf_entry_list = fdb_conf.get("entry-list")
  170. fdb_conf_entry_list_parent = fdb_conf_entry_list.get("parent")
  171. fdb_conf_entry_list_child_name = fdb_conf_entry_list.get("child-name")
  172. fdb_conf_entry_list_child_link = fdb_conf_entry_list.get("child-link")
  173. fdb_conf_entry_list_child_info = fdb_conf_entry_list.get("child-info")
  174. fdb_conf_entry_list_child_period = fdb_conf_entry_list.get("child-period")
  175. for i in iteration_var_list:
  176. print(i)
  177. try:
  178. # use soupparser to handle broken html
  179. tree = lxml.html.soupparser.parse(
  180. "spiders/pages/" + fdb + str(i) + "entryList.html"
  181. )
  182. except Exception as e:
  183. tree = html.parse("spiders/pages/" + fdb + str(i) + "entryList.html")
  184. print(
  185. "parsing the xml files did not work with the soupparser. Broken html will not be fixed as it could have been",
  186. e,
  187. )
  188. try:
  189. print('this is the n looped elements of the parent specified in config.yaml:')
  190. #print('entrylistparent', fdb_conf_entry_list_parent)
  191. #print(tree.xpath("//html//body//div//main//div//div[@class='row']//section[@class='l-search-result-list']"))
  192. #print(etree.tostring(tree.xpath(fdb_conf_entry_list_parent)).decode())
  193. for n in range(len(tree.xpath(fdb_conf_entry_list_parent))):
  194. print('-----------------------------------------------------------------------------------------------------------------------------------------')
  195. print(etree.tostring(tree.xpath(fdb_conf_entry_list_parent)[n]).decode())
  196. print('this is the name children:')
  197. name_element = tree.xpath(fdb_conf_entry_list_parent + fdb_conf_entry_list_child_name)
  198. print(name_element)
  199. #for name in name_element:
  200. # print(name)
  201. print(len(name_element))
  202. print('this is the link children:')
  203. link_element = tree.xpath(fdb_conf_entry_list_parent + fdb_conf_entry_list_child_link)
  204. print(link_element)
  205. #for link in link_element:
  206. # print(link)
  207. print(len(link_element))
  208. print('this is the info children:')
  209. info_element = tree.xpath(fdb_conf_entry_list_parent + fdb_conf_entry_list_child_info)
  210. print(info_element)
  211. print(len(info_element))
  212. print('this is the period children:')
  213. period_element = tree.xpath(fdb_conf_entry_list_parent + fdb_conf_entry_list_child_period)
  214. print(period_element)
  215. print(len(period_element))
  216. except Exception as e:
  217. print(
  218. "parsing the html did not work.",
  219. e,
  220. )
  221. def parse_entry_list_data2dictionary(self, list_of_fdbs):
  222. for fdb in list_of_fdbs:
  223. try:
  224. iteration_var_list = eval(self.config.get(fdb).get("entry-list").get("iteration-var-list"))
  225. except Exception as e:
  226. print(
  227. "There is a problem with the configuration variable entryList iteration var list in the config.yaml - the original error message is:",
  228. e,
  229. )
  230. for i in iteration_var_list:
  231. print(i)
  232. try:
  233. # use soupparser to handle broken html
  234. tree = lxml.html.soupparser.parse(
  235. "spiders/pages/" + fdb + str(i) + "entryList.html"
  236. )
  237. except Exception as e:
  238. tree = html.parse("spiders/pages/" + fdb + str(i) + "entryList.html")
  239. print(
  240. "parsing the xml files did not work with the soupparser. Broken html will not be fixed as it could have been, thanks to efficient particular html languages. The original error message is:",
  241. e,
  242. )
  243. try:
  244. #print('this is the n looped elements of the parent specified in config.yaml:')
  245. #for e in tree.iter():
  246. # print(e.tag)
  247. #
  248. #for e in tree.xpath("//html//body//div//main//div//div[@class='row']//section[@class='l-search-result-list']//div//div[@class='c-search-result__text-wrapper']//span[@class='c-search-result__title'][text()]"):
  249. #for e in tree.xpath("//html//body//div//main//div//div[@class='row']//section[@class='l-search-result-list']//div//div[@class='c-search-result__text-wrapper']//span[@class='c-search-result__title']"):
  250. # print(etree.tostring(e).decode())
  251. dictionary_entry_list = {}
  252. fdb_conf = self.config.get(fdb)
  253. fdb_domain = fdb_conf.get("domain")
  254. fdb_conf_entry_list = fdb_conf.get("entry-list")
  255. fdb_conf_entry_list_parent = fdb_conf_entry_list.get("parent")
  256. fdb_conf_entry_list_child_name = fdb_conf_entry_list.get("child-name")
  257. fdb_conf_entry_list_child_link = fdb_conf_entry_list.get("child-link")
  258. fdb_conf_entry_list_child_info = fdb_conf_entry_list.get("child-info")
  259. fdb_conf_entry_list_child_period = fdb_conf_entry_list.get("child-period")
  260. #print('blabliblub')
  261. #print('len', len(tree.xpath(fdb_conf_entry_list_parent)))
  262. for n in range(len(tree.xpath(fdb_conf_entry_list_parent))):
  263. try:
  264. name = tree.xpath(
  265. fdb_conf_entry_list_parent
  266. + "["
  267. + str(n+1)
  268. + "]"
  269. + fdb_conf_entry_list_child_name
  270. )[0]
  271. except Exception as e:
  272. print("name could not be parsed", e)
  273. name = 'NONE'
  274. try:
  275. info = tree.xpath(
  276. fdb_conf_entry_list_parent
  277. + "["
  278. + str(n+1)
  279. + "]"
  280. + fdb_conf_entry_list_child_info
  281. )[0]
  282. except Exception as e:
  283. print("info could not be parsed", e, info)
  284. info = 'NONE'
  285. try:
  286. period = tree.xpath(
  287. fdb_conf_entry_list_parent
  288. + "["
  289. + str(n+1)
  290. + "]"
  291. + fdb_conf_entry_list_child_period
  292. )[0]
  293. #print('period', period)
  294. except Exception as e:
  295. print("period could not be parsed", e, period)
  296. period = 'NONE'
  297. try:
  298. link = tree.xpath(
  299. fdb_conf_entry_list_parent
  300. + "["
  301. + str(n+1)
  302. + "]"
  303. + fdb_conf_entry_list_child_link
  304. )[0]
  305. if 'javascript:' in link:
  306. #from selenium import webdriver
  307. print('link is javascript element, not url to parse')
  308. #url = 'https://example.com'
  309. #driver = webdriver.Chrome()
  310. #driver.get(url)
  311. #links = [link.get_attribute('href') for link in driver.find_elements_by_tag_name('a')]
  312. #print('link', link)
  313. except Exception as e:
  314. print("link could not be parsed", e, link)
  315. link = 'NONE'
  316. if len(name) > 0 and name != 'NONE':
  317. dictionary_entry_list[n] = {}
  318. dictionary_entry_list[n]["name"] = name
  319. dictionary_entry_list[n]["info"] = info
  320. dictionary_entry_list[n]["period"] = period
  321. #print('linklink', link, fdb_domain)
  322. if fdb_domain in link:
  323. print('oi')
  324. dictionary_entry_list[n]["link"] = link
  325. if fdb_domain not in link and 'http:' in link:
  326. print('oiA')
  327. dictionary_entry_list[n]["link"] = link
  328. if fdb_domain not in link and 'www.' in link:
  329. dictionary_entry_list[n]["link"] = link
  330. if fdb_domain not in link and 'https:' in link:
  331. dictionary_entry_list[n]["link"] = link
  332. if 'javascript:' in link:
  333. dictionary_entry_list[n]["link"] = link
  334. if fdb_domain not in link:
  335. if 'http' not in link:
  336. if 'www' not in link:
  337. #print('oiB')
  338. if link[0] == '/':
  339. if fdb_domain[-1] != '/':
  340. dictionary_entry_list[n]["link"] = fdb_domain + link
  341. #print('got into D', dictionary_entry_list[n]["link"])
  342. if fdb_domain[-1] == '/':
  343. dictionary_entry_list[n]["link"] = fdb_domain + link[1:]
  344. #print('got into C', dictionary_entry_list[n]["link"])
  345. if link[0] == '.' and link[1] == '/':
  346. if fdb_domain[-1] != '/':
  347. dictionary_entry_list[n]["link"] = fdb_domain + link[1:]
  348. #print('got into B', dictionary_entry_list[n]["link"])
  349. if fdb_domain[-1] == '/':
  350. dictionary_entry_list[n]["link"] = fdb_domain + link[2:]
  351. #print('got into A', dictionary_entry_list[n]["link"])
  352. if link[0] != '/':
  353. dictionary_entry_list[n]["link"] = fdb_domain + '/' + link
  354. #print('got into last else', dictionary_entry_list[n]["link"])
  355. except Exception as e:
  356. print(
  357. "parsing the html did not work. Possibly you first have to run download_link_list_pages_of_funding_databases(). The original error message is:",
  358. e,
  359. )
  360. # save interim results to files
  361. f = open("spiders/output/" + fdb + str(i) + "entryList.txt", "w+")
  362. f.write(str(dictionary_entry_list))
  363. f.close
  364. def download_entry_data_htmls(self, list_of_fdbs):
  365. from selenium import webdriver
  366. from selenium.webdriver.chrome.service import Service
  367. from pyvirtualdisplay import Display
  368. display = Display(visible=0, size=(800, 800))
  369. display.start()
  370. #outputdir = '.'
  371. #service_log_path = "{}/chromedriver.log".format(outputdir)
  372. #service_args = ['--verbose']
  373. #driver = webdriver.Chrome('/usr/bin/chromium')
  374. options = webdriver.ChromeOptions()
  375. options.add_argument('headless')
  376. options.add_argument("--remote-debugging-port=9222")
  377. options.add_argument('--no-sandbox')
  378. options.add_argument('--disable-dev-shm-usage')
  379. service = Service(executable_path='/usr/bin/chromedriver')
  380. driver = webdriver.Chrome(options=options, service=service)
  381. #driver = webdriver.Chrome()
  382. for fdb in list_of_fdbs:
  383. try:
  384. iteration_var_list = eval(self.config.get(fdb).get("entry-list").get("iteration-var-list"))
  385. except Exception as e:
  386. print(
  387. "There is a problem with the configuration variable entryList iteration var list in the config.yaml - the original error message is:",
  388. e,
  389. )
  390. print('starting to download the entry html pages..')
  391. for i in iteration_var_list:
  392. print(i)
  393. f = open("spiders/output/" + fdb + str(i) + "entryList.txt")
  394. text = f.read()
  395. dictionary_entry_list = eval(text)
  396. fdb_conf = self.config.get(fdb)
  397. fdb_domain = fdb_conf.get("domain")
  398. fdb_conf_entry_list = fdb_conf.get("entry-list")
  399. fdb_conf_entry_list_parent = fdb_conf_entry_list.get("parent")
  400. fdb_conf_entry_list_child_name = fdb_conf_entry_list.get("child-name")
  401. try:
  402. fdb_conf_entry_list_javascript_link = fdb_conf_entry_list.get("javascript-link")
  403. except Exception as e:
  404. print('the javascript link in the config is missing, original error message is:', e)
  405. fdb_conf_entry_list_link1 = fdb_conf_entry_list.get("link1")
  406. fdb_conf_entry_list_link2 = fdb_conf_entry_list.get("link2")
  407. driver.get(fdb_conf_entry_list_link1 + str(i) + fdb_conf_entry_list_link2)
  408. for entry_id in dictionary_entry_list:
  409. print(entry_id)
  410. entry_link = dictionary_entry_list[entry_id]["link"]
  411. web_content = 'NONE'
  412. # download the html page of the entry
  413. if 'javascript' in entry_link:
  414. element = driver.find_element(
  415. "xpath",
  416. fdb_conf_entry_list_parent
  417. + "["
  418. + str(entry_id+1)
  419. + "]"
  420. + fdb_conf_entry_list_javascript_link
  421. )
  422. # to time.sleep was suggested for errors
  423. #import time
  424. #time.sleep(1)
  425. element.click()
  426. window_after = driver.window_handles[1]
  427. driver.switch_to.window(window_after)
  428. #element = driver.find_element("xpath", "//html")
  429. #web_content = element.text
  430. #entry_domain = driver.getCurrentUrl()
  431. entry_domain = driver.current_url
  432. dictionary_entry_list[entry_id]["domain"] = entry_domain
  433. web_content = driver.page_source
  434. file_name = "spiders/pages/" + fdb + str(i) + "/" + str(entry_id) + ".html"
  435. os.makedirs(os.path.dirname(file_name), exist_ok=True)
  436. f = open(file_name, "w+")
  437. f.write(web_content)
  438. f.close
  439. window_before = driver.window_handles[0]
  440. driver.switch_to.window(window_before)
  441. if 'javascript' not in entry_link and '.pdf' not in entry_link:
  442. print('blabuuuuuba')
  443. #print('oi')
  444. try:
  445. # defining cookie to not end up in endless loop because of cookie banners pointing to redirects
  446. url = entry_link
  447. req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0', 'Cookie':'myCookie=oioioioi'})
  448. response = urllib.request.urlopen(req)
  449. print('response from first one', response)
  450. except Exception as e:
  451. print('cookie giving then downloading did not work, original error is:', e)
  452. try:
  453. response = urllib.request.urlopen(entry_link.encode('ascii', errors='xmlcharrefreplace').decode('ascii'))
  454. print(
  455. "opening the link did not work, try to encode to ascii replacing xmlcharrefs now and reopen - the original error message is:",
  456. e,
  457. )
  458. except Exception as ex:
  459. print(entry_link, entry_link.encode('ascii', errors='xmlcharrefreplace').decode('ascii'), ex )
  460. try:
  461. web_content = response.read().decode("UTF-8")
  462. except Exception as e:
  463. try:
  464. web_content = response.read().decode("latin-1")
  465. print(
  466. "decoding the respone in utf8 did not work, try to decode latin1 now - the original error message is:",
  467. e,
  468. )
  469. except Exception as ex:
  470. print(ex)
  471. # save interim results to files
  472. if '.pdf' in entry_link:
  473. file_name = "spiders/pages/" + fdb + str(i) + "/" + str(entry_id) + ".html"
  474. response = requests.get(entry_link)
  475. os.makedirs(os.path.dirname(file_name), exist_ok=True)
  476. f = open(file_name, "bw")
  477. f.write(response.content)
  478. f.close
  479. file_name = "spiders/pages/" + fdb + str(i) + "/" + str(entry_id) + ".html"
  480. wget_wrote = False
  481. if web_content == 'NONE':
  482. print('other downloading approaches did not work, trying requests')
  483. try:
  484. from requests_html import HTMLSession
  485. session = HTMLSession()
  486. r = session.get(entry_link)
  487. r.html.render()
  488. web_content = r.text
  489. except Exception as e:
  490. print('requests_html HTMLSession did not work trying wget, ori error is:', e)
  491. try:
  492. os.makedirs(os.path.dirname(file_name), exist_ok=True)
  493. oi = subprocess.run(["wget", entry_link, '--output-document=' + file_name])
  494. wget_wrote = True
  495. except subprocess.CalledProcessError:
  496. print('wget downloading did not work.. saving NONE to file now')
  497. if wget_wrote == False:
  498. os.makedirs(os.path.dirname(file_name), exist_ok=True)
  499. f = open(file_name, "w+")
  500. f.write(web_content)
  501. f.close
  502. # save the entry_domain, implemented first for further downloads in javascript links
  503. f = open("spiders/output/" + fdb + str(i) + "entryList.txt", "w+")
  504. f.write(str(dictionary_entry_list))
  505. f.close
  506. def parse_entry_data2dictionary(self, list_of_fdbs):
  507. for fdb in list_of_fdbs:
  508. try:
  509. fdb_config = self.config.get(fdb)
  510. print('oi oi',fdb_config)
  511. fdb_config_entrylist = fdb_config.get("entry-list")
  512. iteration_var_list = eval(fdb_config_entrylist.get("iteration-var-list"))
  513. except Exception as e:
  514. print(
  515. "There is a problem with the configuration variable entryList iteration var list in the config.yaml - the original error message is:",
  516. e,
  517. )
  518. for i in iteration_var_list:
  519. print("started to parse data of entry of " + fdb + " ..")
  520. f = open("spiders/output/" + fdb + str(i) + "entryList.txt")
  521. text = f.read()
  522. dictionary_entry_list = eval(text)
  523. fdb_conf = self.config.get(fdb)
  524. fdb_domain = fdb_conf.get("domain")
  525. fdb_conf_entry = fdb_conf.get("entry")
  526. #print('balubaluba', fdb_conf_entry)
  527. fdb_conf_entry_general = fdb_conf_entry.get("general")
  528. #print(fdb_conf_entry_general)
  529. for entry_id in dictionary_entry_list:
  530. print(
  531. "started to parse data of entry with name "
  532. + dictionary_entry_list[entry_id]["name"]
  533. + " .."
  534. )
  535. file_name = "spiders/pages/" + fdb + str(i) + "/" + str(entry_id) + ".html"
  536. try:
  537. tree = lxml.html.soupparser.parse(file_name)
  538. except Exception as e:
  539. tree = html.parse(file_name)
  540. print(
  541. "parsing the xml files did not work with the soupparser. Broken html will not be fixed as it could have been, thanks to efficient particular html languages. The original error message is:",
  542. e,
  543. )
  544. if fdb_conf_entry_general["uniform"] == 'TRUE':
  545. fdb_conf_entry_unitrue = fdb_conf_entry.get("unitrue")
  546. for key in fdb_conf_entry_unitrue:
  547. fdb_conf_entry_unitrue_child = fdb_conf_entry_unitrue.get(key)
  548. child = tree.xpath(
  549. fdb_conf_entry_unitrue_child
  550. )[0]
  551. print("oi", child)
  552. if '.pdf' in child:
  553. print('child in entry data is pdf, downloading it..')
  554. file_name = "spiders/pages/" + fdb + str(i) + "/" + str(entry_id) + ".pdf"
  555. entry_link = dictionary_entry_list[entry_id]["link"]
  556. if 'http' not in child:
  557. if 'javascript' or 'js' not in entry_link and 'http' in entry_link:
  558. try:
  559. response = requests.get(entry_link + child)
  560. except Exception as e:
  561. print(entry_link + child + ' seems not a valid pdf link to download, orginal error message is:', e)
  562. if 'javascript' or 'js' in entry_link:
  563. entry_domain = dictionary_entry_list[entry_id]["domain"]
  564. if child[0] == '.' and child[1] == '/':
  565. if entry_domain[-1] == '/':
  566. pdf_link = entry_domain[:-1] + child[1:]
  567. if entry_domain[-1] != '/':
  568. for n in range(len(entry_domain)):
  569. if entry_domain[-n] != '/':
  570. entry_domain = entry_domain[:-1]
  571. else:
  572. break
  573. pdf_link = entry_domain + child[1:]
  574. if child[0] == '/':
  575. if entry_domain[-1] == '/':
  576. pdf_link = entry_domain[:-1] + child
  577. if entry_domain[-1] != '/':
  578. pdf_link = entry_domain + child
  579. print('pdf_link', pdf_link)
  580. try:
  581. response = requests.get(pdf_link)
  582. except Exception as e:
  583. print(pdf_link + ' seems not a valid pdf link to download, orginal error message is:', e)
  584. #response = requests.get(child)
  585. os.makedirs(os.path.dirname(file_name), exist_ok=True)
  586. f = open(file_name, "bw")
  587. f.write(response.content)
  588. f.close
  589. print('parsing a pdf', pdf_link, entry_id)
  590. try:
  591. generaltext = ''
  592. for page_layout in extract_pages(file_name):
  593. for element in page_layout:
  594. if isinstance(element, LTTextContainer):
  595. generaltext += element.get_text()
  596. except Exception as e:
  597. generaltext = 'NONE'
  598. print('parsing pdf did not work, the original error is:', e )
  599. dictionary_entry_list[entry_id][key] = generaltext
  600. if len(child) > 0 and '.pdf' not in child:
  601. dictionary_entry_list[entry_id][key] = child[
  602. 0
  603. ]
  604. else:
  605. fdb_conf_entry_unifalse = fdb_conf_entry.get("unifalse")
  606. fdb_conf_entry_unifalse_wordlist = fdb_conf_entry_unifalse.get("wordlist")
  607. if '.pdf' in dictionary_entry_list[entry_id]["link"]:
  608. print('parsing a pdf', dictionary_entry_list[entry_id]["link"], entry_id)
  609. try:
  610. generaltext = ''
  611. for page_layout in extract_pages(file_name):
  612. for element in page_layout:
  613. if isinstance(element, LTTextContainer):
  614. generaltext += element.get_text()
  615. except Exception as e:
  616. generaltext = 'NONE'
  617. print('parsing pdf did not work, the original error is:', e )
  618. else:
  619. p_text = tree.xpath(
  620. "//p//text()"
  621. )
  622. div_text = tree.xpath(
  623. "//div//text()"
  624. )
  625. #print("oi", text)
  626. generaltext = ''
  627. for n in range(len(p_text)):
  628. if len(p_text[n]) > 0:
  629. generaltext += p_text[n] + ' '
  630. for n in range(len(div_text)):
  631. if len(div_text[n]) > 0 and div_text[n] not in p_text:
  632. generaltext += div_text[n] + ' '
  633. generaltextlist = generaltext.split(' ')
  634. if len(generaltextlist) > 5000:
  635. print('text over 1000 words for entry id', entry_id, ' number of words:', len(generaltextlist))
  636. file_name = "spiders/pages/" + fdb + str(i) + "/" + str(entry_id) + ".html"
  637. try:
  638. with open(file_name , 'r', encoding='utf-8') as file:
  639. html_content = file.read()
  640. except Exception as e:
  641. with open(file_name , 'r', encoding='latin-1') as file:
  642. html_content = file.read()
  643. print('encoding utf8 in opening with trafilatura did not work, trying latin1, original error message is:', e)
  644. generaltext = extract(html_content)
  645. print('generaltext word count was: ', len(generaltextlist), 'but now trafilatura did the job and new wordcount is:', len(generaltext.split(' ')))
  646. if len(generaltextlist) < 2:
  647. print('no text parsed, the wc is', len(generaltextlist))
  648. print('text under 2 words for entry id', entry_id, ' number of words:', len(generaltextlist))
  649. file_name = "spiders/pages/" + fdb + str(i) + "/" + str(entry_id) + ".html"
  650. try:
  651. with open(file_name , 'r', encoding='utf-8') as file:
  652. html_content = file.read()
  653. except Exception as e:
  654. with open(file_name , 'r', encoding='latin-1') as file:
  655. html_content = file.read()
  656. print('encoding utf8 in opening with trafilatura did not work, trying latin1, original error message is:', e)
  657. generaltext = extract(html_content)
  658. try:
  659. if len(generaltext) > 2:
  660. print('generaltext word count was: ', len(generaltextlist), 'but now trafilatura did the job and new wordcount is:', len(generaltext.split(' ')))
  661. except:
  662. print('trafilatura got this out:', generaltext , 'setting generaltext to NONE')
  663. generaltext = 'NONE'
  664. dictionary_entry_list[entry_id]["text"] = generaltext
  665. dictionary_entry_list[entry_id]["text-word-count"] = len(generaltextlist)
  666. f = open("spiders/output/" + fdb + str(i) + "entryList.txt", "w+")
  667. f.write(str(dictionary_entry_list))
  668. f.close