diff --git a/spiders/__pycache__/fdb_spider.cpython-39.pyc b/spiders/__pycache__/fdb_spider.cpython-39.pyc index 8d567f1..32a32f5 100644 Binary files a/spiders/__pycache__/fdb_spider.cpython-39.pyc and b/spiders/__pycache__/fdb_spider.cpython-39.pyc differ diff --git a/spiders/config.yaml b/spiders/config.yaml index 12cdb83..3627f8a 100644 --- a/spiders/config.yaml +++ b/spiders/config.yaml @@ -55,13 +55,13 @@ giz: link2: '' iteration-var-list: '[1,2,3,4,5,6,7]' #parent: "//html//body//div//main//div//div[@class='row']//div[@class='large-12']//a[@class='c-teaser']" - parent: "//html//body//div//div//table[contains(@class, 'csx-new-table')]//tbody" - child-name: "//tr//td[2]/text()" - child-link: "/tr//td[5]/a/@href" - child-info: "/tr//td[3]/text()" - child-period: "/tr/td[1]/text()" + parent: "//html//body//div//div//table[contains(@class, 'csx-new-table')]//tbody//tr" + child-name: "//td[3]//text()" + child-link: "//a/@href" + child-info: "/td[4]/text()[1]" + child-period: "//td[2]/abbr/text()" #child-period: "//div[@class='c-teaser__text-wrapper']//small//span/time/text()" - child-sponsor: "//tr/td[4]/text()" + child-sponsor: "/tr/td[4]/text()" entry: general: uniform: 'FALSE' diff --git a/spiders/fdb_spider.py b/spiders/fdb_spider.py index 3b5978e..e97ffcd 100644 --- a/spiders/fdb_spider.py +++ b/spiders/fdb_spider.py @@ -296,6 +296,15 @@ class fdb_spider(object): + "]" + fdb_conf_entry_list_child_link )[0] + + if 'javascript:' in link: + #from selenium import webdriver + print('link is javascript element, not url to parse') + #url = 'https://example.com' + #driver = webdriver.Chrome() + #driver.get(url) + #links = [link.get_attribute('href') for link in driver.find_elements_by_tag_name('a')] + #print('link', link) except Exception as e: @@ -313,7 +322,8 @@ class fdb_spider(object): dictionary_entry_list[n]["link"] = link if fdb_domain not in link and ('http:' in link or 'www.' in link or 'https:' in link): dictionary_entry_list[n]["link"] = link - + if 'javascript:' in link: + dictionary_entry_list[n]["link"] = link else: if link[-1] == '/': dictionary_entry_list[n]["link"] = fdb_domain + link @@ -397,7 +407,24 @@ class fdb_spider(object): else: file_name = "spiders/pages/" + fdb + str(i) + "/" + str(entry_id) + ".html" + + + if not web_content: + print('other downloading approaches did not work, trying requests') + + try: + from requests_html import HTMLSession + session = HTMLSession() + + r = session.get(entry_link) + r.html.render() + web_content = r.text + + except Exception as e: + print('requests_html HTMLSession did not work') + + os.makedirs(os.path.dirname(file_name), exist_ok=True) f = open(file_name, "w+") f.write(web_content)