added further handling for javascript links that are not URLs, made the giz config work
This commit is contained in:
parent
a0075e429d
commit
89dcca2031
3 changed files with 34 additions and 7 deletions
Binary file not shown.
@@ -55,13 +55,13 @@ giz:
link2: ''
iteration-var-list: '[1,2,3,4,5,6,7]'
#parent: "//html//body//div//main//div//div[@class='row']//div[@class='large-12']//a[@class='c-teaser']"
parent: "//html//body//div//div//table[contains(@class, 'csx-new-table')]//tbody"
child-name: "//tr//td[2]/text()"
child-link: "/tr//td[5]/a/@href"
child-info: "/tr//td[3]/text()"
child-period: "/tr/td[1]/text()"
parent: "//html//body//div//div//table[contains(@class, 'csx-new-table')]//tbody//tr"
child-name: "//td[3]//text()"
child-link: "//a/@href"
child-info: "/td[4]/text()[1]"
child-period: "//td[2]/abbr/text()"
#child-period: "//div[@class='c-teaser__text-wrapper']//small//span/time/text()"
child-sponsor: "//tr/td[4]/text()"
child-sponsor: "/tr/td[4]/text()"
entry:
general:
uniform: 'FALSE'
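For context: in the new giz block the parent XPath selects one table row per funding entry, and every child-* XPath is evaluated relative to that row. Below is a minimal sketch of how such a parent/child pair of selectors can be applied with lxml; the config dict and the extract_entries helper are illustrative names, not code from this repository.

from lxml import etree

# Hypothetical excerpt of the giz config shown above.
config = {
    "parent": "//html//body//div//div//table[contains(@class, 'csx-new-table')]//tbody//tr",
    "child-name": "//td[3]//text()",
    "child-link": "//a/@href",
    "child-info": "/td[4]/text()[1]",
    "child-period": "//td[2]/abbr/text()",
}

def extract_entries(html_text):
    tree = etree.HTML(html_text)
    entries = []
    for row in tree.xpath(config["parent"]):
        # Prefixing '.' makes each child XPath relative to the current row.
        entries.append({
            "name": row.xpath("." + config["child-name"]),
            "link": row.xpath("." + config["child-link"]),
            "info": row.xpath("." + config["child-info"]),
            "period": row.xpath("." + config["child-period"]),
        })
    return entries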
@@ -296,6 +296,15 @@ class fdb_spider(object):
+ "]"
+ fdb_conf_entry_list_child_link
)[0]

if 'javascript:' in link:
#from selenium import webdriver
print('link is javascript element, not url to parse')
#url = 'https://example.com'
#driver = webdriver.Chrome()
#driver.get(url)
#links = [link.get_attribute('href') for link in driver.find_elements_by_tag_name('a')]

#print('link', link)

except Exception as e:
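The new branch only reports that a 'javascript:' pseudo-link cannot be fetched as a URL; the Selenium-based alternative stays commented out in the hunk above. A rough sketch of that alternative, assuming Selenium 4 and a chromedriver on PATH (names such as collect_rendered_links are illustrative and not part of fdb_spider):

from selenium import webdriver
from selenium.webdriver.common.by import By

def collect_rendered_links(page_url):
    # Render the page in a real browser so links that are only produced by
    # JavaScript become ordinary hrefs.
    driver = webdriver.Chrome()
    try:
        driver.get(page_url)
        hrefs = [a.get_attribute('href') for a in driver.find_elements(By.TAG_NAME, 'a')]
        # Drop javascript: pseudo-links, which cannot be downloaded directly.
        return [h for h in hrefs if h and not h.startswith('javascript:')]
    finally:
        driver.quit()

Note that newer Selenium releases drop the find_elements_by_tag_name helper seen in the commented-out line above in favour of find_elements(By.TAG_NAME, ...).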
@@ -313,7 +322,8 @@ class fdb_spider(object):
dictionary_entry_list[n]["link"] = link
if fdb_domain not in link and ('http:' in link or 'www.' in link or 'https:' in link):
dictionary_entry_list[n]["link"] = link

if 'javascript:' in link:
dictionary_entry_list[n]["link"] = link
else:
if link[-1] == '/':
dictionary_entry_list[n]["link"] = fdb_domain + link
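This hunk keeps absolute links and javascript: pseudo-links untouched and only prefixes fdb_domain onto relative paths. For comparison, the same decision could be written with urllib.parse; this is an alternative sketch, not the code in this commit, and it assumes fdb_domain carries a scheme such as 'https://www.giz.de/':

from urllib.parse import urljoin, urlparse

def normalize_link(fdb_domain, link):
    # Keep javascript: pseudo-links as markers so later stages can skip or
    # render them instead of trying to download them.
    if link.startswith('javascript:'):
        return link
    # Absolute http(s) links are already complete.
    if urlparse(link).scheme in ('http', 'https'):
        return link
    # Treat everything else as a path relative to the funding-database domain.
    return urljoin(fdb_domain, link)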
@@ -398,6 +408,23 @@ class fdb_spider(object):
else:
file_name = "spiders/pages/" + fdb + str(i) + "/" + str(entry_id) + ".html"

if not web_content:
print('other downloading approaches did not work, trying requests')

try:
from requests_html import HTMLSession
session = HTMLSession()

r = session.get(entry_link)

r.html.render()
web_content = r.text

except Exception as e:
print('requests_html HTMLSession did not work')

os.makedirs(os.path.dirname(file_name), exist_ok=True)
f = open(file_name, "w+")
f.write(web_content)
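The new fallback fetches and renders the entry page with requests_html when the earlier download attempts produced no content. A small self-contained sketch of that fallback, assuming the requests_html package (and the Chromium it downloads on first render()) is installed; note that after render() the JavaScript-rendered markup is available as r.html.html, whereas r.text, which the hunk above stores, is the unrendered response body:

from requests_html import HTMLSession

def fetch_rendered(entry_link):
    session = HTMLSession()
    r = session.get(entry_link)
    # render() runs the page's JavaScript in a headless Chromium instance.
    r.html.render()
    # r.text holds the raw response; r.html.html holds the DOM after rendering,
    # which is usually what a scraper wants to persist.
    return r.html.html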