Added further handling for JavaScript links that are not URLs; made the config for giz work
This commit is contained in:
parent
a0075e429d
commit
89dcca2031
3 changed files with 34 additions and 7 deletions
Binary file not shown.
|
@ -55,13 +55,13 @@ giz:
|
||||||
link2: ''
|
link2: ''
|
||||||
iteration-var-list: '[1,2,3,4,5,6,7]'
|
iteration-var-list: '[1,2,3,4,5,6,7]'
|
||||||
#parent: "//html//body//div//main//div//div[@class='row']//div[@class='large-12']//a[@class='c-teaser']"
|
#parent: "//html//body//div//main//div//div[@class='row']//div[@class='large-12']//a[@class='c-teaser']"
|
||||||
parent: "//html//body//div//div//table[contains(@class, 'csx-new-table')]//tbody"
|
parent: "//html//body//div//div//table[contains(@class, 'csx-new-table')]//tbody//tr"
|
||||||
child-name: "//tr//td[2]/text()"
|
child-name: "//td[3]//text()"
|
||||||
child-link: "/tr//td[5]/a/@href"
|
child-link: "//a/@href"
|
||||||
child-info: "/tr//td[3]/text()"
|
child-info: "/td[4]/text()[1]"
|
||||||
child-period: "/tr/td[1]/text()"
|
child-period: "//td[2]/abbr/text()"
|
||||||
#child-period: "//div[@class='c-teaser__text-wrapper']//small//span/time/text()"
|
#child-period: "//div[@class='c-teaser__text-wrapper']//small//span/time/text()"
|
||||||
child-sponsor: "//tr/td[4]/text()"
|
child-sponsor: "/tr/td[4]/text()"
|
||||||
entry:
|
entry:
|
||||||
general:
|
general:
|
||||||
uniform: 'FALSE'
|
uniform: 'FALSE'
|
||||||
|
|
|
@ -296,6 +296,15 @@ class fdb_spider(object):
|
||||||
+ "]"
|
+ "]"
|
||||||
+ fdb_conf_entry_list_child_link
|
+ fdb_conf_entry_list_child_link
|
||||||
)[0]
|
)[0]
|
||||||
|
|
||||||
|
if 'javascript:' in link:
|
||||||
|
#from selenium import webdriver
|
||||||
|
print('link is javascript element, not url to parse')
|
||||||
|
#url = 'https://example.com'
|
||||||
|
#driver = webdriver.Chrome()
|
||||||
|
#driver.get(url)
|
||||||
|
#links = [link.get_attribute('href') for link in driver.find_elements_by_tag_name('a')]
|
||||||
|
|
||||||
#print('link', link)
|
#print('link', link)
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
@ -313,7 +322,8 @@ class fdb_spider(object):
|
||||||
dictionary_entry_list[n]["link"] = link
|
dictionary_entry_list[n]["link"] = link
|
||||||
if fdb_domain not in link and ('http:' in link or 'www.' in link or 'https:' in link):
|
if fdb_domain not in link and ('http:' in link or 'www.' in link or 'https:' in link):
|
||||||
dictionary_entry_list[n]["link"] = link
|
dictionary_entry_list[n]["link"] = link
|
||||||
|
if 'javascript:' in link:
|
||||||
|
dictionary_entry_list[n]["link"] = link
|
||||||
else:
|
else:
|
||||||
if link[-1] == '/':
|
if link[-1] == '/':
|
||||||
dictionary_entry_list[n]["link"] = fdb_domain + link
|
dictionary_entry_list[n]["link"] = fdb_domain + link
|
||||||
|
@ -397,7 +407,24 @@ class fdb_spider(object):
|
||||||
|
|
||||||
else:
|
else:
|
||||||
file_name = "spiders/pages/" + fdb + str(i) + "/" + str(entry_id) + ".html"
|
file_name = "spiders/pages/" + fdb + str(i) + "/" + str(entry_id) + ".html"
|
||||||
|
|
||||||
|
|
||||||
|
if not web_content:
|
||||||
|
print('other downloading approaches did not work, trying requests')
|
||||||
|
|
||||||
|
try:
|
||||||
|
from requests_html import HTMLSession
|
||||||
|
session = HTMLSession()
|
||||||
|
|
||||||
|
r = session.get(entry_link)
|
||||||
|
|
||||||
|
r.html.render()
|
||||||
|
web_content = r.text
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
print('requests_html HTMLSession did not work')
|
||||||
|
|
||||||
|
|
||||||
os.makedirs(os.path.dirname(file_name), exist_ok=True)
|
os.makedirs(os.path.dirname(file_name), exist_ok=True)
|
||||||
f = open(file_name, "w+")
|
f = open(file_name, "w+")
|
||||||
f.write(web_content)
|
f.write(web_content)
|
||||||
|
|
Loading…
Reference in a new issue