Added further handling for `javascript:` links that are not real URLs; made the config for giz work

This commit is contained in:
alpcentaur 2023-11-28 15:27:39 +00:00
parent a0075e429d
commit 89dcca2031
3 changed files with 34 additions and 7 deletions

View file

@ -55,13 +55,13 @@ giz:
link2: '' link2: ''
iteration-var-list: '[1,2,3,4,5,6,7]' iteration-var-list: '[1,2,3,4,5,6,7]'
#parent: "//html//body//div//main//div//div[@class='row']//div[@class='large-12']//a[@class='c-teaser']" #parent: "//html//body//div//main//div//div[@class='row']//div[@class='large-12']//a[@class='c-teaser']"
parent: "//html//body//div//div//table[contains(@class, 'csx-new-table')]//tbody" parent: "//html//body//div//div//table[contains(@class, 'csx-new-table')]//tbody//tr"
child-name: "//tr//td[2]/text()" child-name: "//td[3]//text()"
child-link: "/tr//td[5]/a/@href" child-link: "//a/@href"
child-info: "/tr//td[3]/text()" child-info: "/td[4]/text()[1]"
child-period: "/tr/td[1]/text()" child-period: "//td[2]/abbr/text()"
#child-period: "//div[@class='c-teaser__text-wrapper']//small//span/time/text()" #child-period: "//div[@class='c-teaser__text-wrapper']//small//span/time/text()"
child-sponsor: "//tr/td[4]/text()" child-sponsor: "/tr/td[4]/text()"
entry: entry:
general: general:
uniform: 'FALSE' uniform: 'FALSE'

View file

@ -296,6 +296,15 @@ class fdb_spider(object):
+ "]" + "]"
+ fdb_conf_entry_list_child_link + fdb_conf_entry_list_child_link
)[0] )[0]
if 'javascript:' in link:
#from selenium import webdriver
print('link is javascript element, not url to parse')
#url = 'https://example.com'
#driver = webdriver.Chrome()
#driver.get(url)
#links = [link.get_attribute('href') for link in driver.find_elements_by_tag_name('a')]
#print('link', link) #print('link', link)
except Exception as e: except Exception as e:
@ -313,7 +322,8 @@ class fdb_spider(object):
dictionary_entry_list[n]["link"] = link dictionary_entry_list[n]["link"] = link
if fdb_domain not in link and ('http:' in link or 'www.' in link or 'https:' in link): if fdb_domain not in link and ('http:' in link or 'www.' in link or 'https:' in link):
dictionary_entry_list[n]["link"] = link dictionary_entry_list[n]["link"] = link
if 'javascript:' in link:
dictionary_entry_list[n]["link"] = link
else: else:
if link[-1] == '/': if link[-1] == '/':
dictionary_entry_list[n]["link"] = fdb_domain + link dictionary_entry_list[n]["link"] = fdb_domain + link
@ -397,7 +407,24 @@ class fdb_spider(object):
else: else:
file_name = "spiders/pages/" + fdb + str(i) + "/" + str(entry_id) + ".html" file_name = "spiders/pages/" + fdb + str(i) + "/" + str(entry_id) + ".html"
if not web_content:
print('other downloading approaches did not work, trying requests')
try:
from requests_html import HTMLSession
session = HTMLSession()
r = session.get(entry_link)
r.html.render()
web_content = r.text
except Exception as e:
print('requests_html HTMLSession did not work')
os.makedirs(os.path.dirname(file_name), exist_ok=True) os.makedirs(os.path.dirname(file_name), exist_ok=True)
f = open(file_name, "w+") f = open(file_name, "w+")
f.write(web_content) f.write(web_content)