added flow for selenium grabbing popup instead of links for entries

commit ec180bed0a (parent 99c74dcbad)
4 changed files with 83 additions and 31 deletions
main.py (7 changes)

@@ -4,7 +4,8 @@ from spiders.fdb_spider import *

 import sys

 config = "spiders/config.yaml"

-list_of_fdbs = sys.argv[2]
+list_of_fdbs = eval(sys.argv[1])
+
+print(list_of_fdbs)

 #list_of_fdbs = ["foerderinfo.bund.de-bekanntmachungen"]

@@ -12,11 +13,11 @@ list_of_fdbs = sys.argv[2]

 spider = fdb_spider(config)

-spider.download_entry_list_pages_of_funding_databases(list_of_fdbs)
+#spider.download_entry_list_pages_of_funding_databases(list_of_fdbs)

 #spider.find_config_parameter(list_of_fdbs)

-spider.parse_entry_list_data2dictionary(list_of_fdbs)
+#spider.parse_entry_list_data2dictionary(list_of_fdbs)

 spider.download_entry_data_htmls(list_of_fdbs)
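The entry point now reads the list of funding databases as a Python list literal from the first command-line argument and parses it with eval(). A minimal sketch of the same parse using ast.literal_eval, which accepts the list literal but rejects arbitrary expressions; the example invocation and the "giz" key are only illustrative:

    # sketch: invoked e.g. as  python3 main.py '["giz"]'
    import ast
    import sys

    # literal_eval parses the same '["giz"]' argument that eval() parses in main.py,
    # but raises an error for anything that is not a plain Python literal
    list_of_fdbs = ast.literal_eval(sys.argv[1])
    print(list_of_fdbs)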
Binary file not shown.
spiders/config.yaml

@@ -58,6 +58,7 @@ giz:

     parent: "//html//body//div//div//table[contains(@class, 'csx-new-table')]//tbody//tr"
     child-name: "//td[3]//text()"
     child-link: "//a/@href"
+    javascript-link: "/td[6]/a"
     child-info: "/td[4]/text()[1]"
     child-period: "//td[2]/abbr/text()"
     #child-period: "//div[@class='c-teaser__text-wrapper']//small//span/time/text()"
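The new javascript-link value is an XPath fragment relative to one table row matched by parent; the spider concatenates parent, a 1-based row index, and this fragment to locate the cell it has to click (see the fdb_spider.py hunks below). A small sketch of that composition, with an illustrative row index:

    # sketch: build the XPath of the clickable cell for one entry row
    parent = "//html//body//div//div//table[contains(@class, 'csx-new-table')]//tbody//tr"
    javascript_link = "/td[6]/a"

    entry_id = 0  # illustrative zero-based entry index; XPath positions are 1-based
    row_xpath = parent + "[" + str(entry_id + 1) + "]" + javascript_link
    print(row_xpath)  # ...//tbody//tr[1]/td[6]/a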
spiders/fdb_spider.py

@@ -96,7 +96,7 @@ class fdb_spider(object):

                 print('also requests library did not work, original error is:', e)

-            print(web_content)
+            # print(web_content)

             f = open("spiders/pages/" + key + str(i) + "entryList.html", "w+")
             f.write(web_content)
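The listing page is written to spiders/pages/<key><i>entryList.html. A sketch of the same write using a context manager and an explicit encoding; key, i and web_content carry illustrative values here:

    # sketch: persist the fetched listing page; the directory may not exist yet
    import os

    key = "giz"                     # illustrative database key
    i = 0                           # illustrative page index
    web_content = "<html></html>"   # illustrative page content

    file_name = "spiders/pages/" + key + str(i) + "entryList.html"
    os.makedirs(os.path.dirname(file_name), exist_ok=True)
    with open(file_name, "w", encoding="utf-8") as f:
        f.write(web_content)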
@@ -343,6 +343,12 @@ class fdb_spider(object):

             f.close

     def download_entry_data_htmls(self, list_of_fdbs):

+        from selenium import webdriver
+
+        options = webdriver.ChromeOptions()
+        options.add_argument('headless')
+        driver = webdriver.Chrome(options=options)
+
         for fdb in list_of_fdbs:

             try:
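download_entry_data_htmls now starts a single headless Chrome session and reuses it for every entry. A minimal sketch of an equivalent setup; the '--headless=new' flag, the explicit quit() and the example URL are assumptions on top of what the commit itself does:

    # sketch: one headless Chrome session for the whole crawl
    from selenium import webdriver

    options = webdriver.ChromeOptions()
    options.add_argument('--headless=new')       # newer spelling of the 'headless' flag used in the commit
    driver = webdriver.Chrome(options=options)   # Selenium 4 locates chromedriver via Selenium Manager

    try:
        driver.get("https://example.org")        # illustrative page
        print(driver.title)
    finally:
        driver.quit()                            # release the browser process when done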
@@ -361,40 +367,84 @@ class fdb_spider(object):

             dictionary_entry_list = eval(text)

+            fdb_conf = self.config.get(fdb)
+            fdb_domain = fdb_conf.get("domain")
+            fdb_conf_entry_list = fdb_conf.get("entry-list")
+            fdb_conf_entry_list_parent = fdb_conf_entry_list.get("parent")
+            fdb_conf_entry_list_child_name = fdb_conf_entry_list.get("child-name")
+            try:
+                fdb_conf_entry_list_javascript_link = fdb_conf_entry_list.get("javascript-link")
+            except Exception as e:
+                print('the javascript link in the config is missing, original error message is:', e)
+            fdb_conf_entry_list_link1 = fdb_conf_entry_list.get("link1")
+            fdb_conf_entry_list_link2 = fdb_conf_entry_list.get("link2")
+
+            driver.get(fdb_conf_entry_list_link1 + str(i) + fdb_conf_entry_list_link2)

             for entry_id in dictionary_entry_list:
                 entry_link = dictionary_entry_list[entry_id]["link"]
+                web_content = 'NONE'

                 # download the html page of the entry

-                try:
-                    # defining cookie to not end up in endless loop because of cookie banners pointing to redirects
-                    url = entry_link
-                    req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0', 'Cookie': 'myCookie=lovely'})
-                    response = urllib.request.urlopen(req)
-                except Exception as e:
-                    try:
-                        response = urllib.request.urlopen(entry_link.encode('ascii', errors='xmlcharrefreplace').decode('ascii'))
-                        print(
-                            "opening the link did not work, try to encode to ascii replacing xmlcharrefs now and reopen - the original error message is:",
-                            e,
-                        )
-                    except Exception as ex:
-                        print(entry_link, entry_link.encode('ascii', errors='xmlcharrefreplace').decode('ascii'), ex)
-
-                try:
-                    web_content = response.read().decode("UTF-8")
-                except Exception as e:
-                    try:
-                        web_content = response.read().decode("latin-1")
-                        print(
-                            "decoding the response in utf8 did not work, try to decode latin1 now - the original error message is:",
-                            e,
-                        )
-                    except Exception as ex:
-                        print(ex)
+                # entries whose link is only a javascript handler: click the configured
+                # table cell and read the popup window that opens
+                if 'javascript' in entry_link:
+                    element = driver.find_element(
+                        "xpath",
+                        fdb_conf_entry_list_parent
+                        + "["
+                        + str(entry_id + 1)
+                        + "]"
+                        + fdb_conf_entry_list_javascript_link
+                    )
+
+                    element.click()
+                    window_after = driver.window_handles[1]
+                    driver.switch_to.window(window_after)
+                    element = driver.find_element("xpath", "//html")
+                    web_content = element.text
+
+                    file_name = "spiders/pages/" + fdb + str(i) + "/" + str(entry_id) + ".html"
+                    os.makedirs(os.path.dirname(file_name), exist_ok=True)
+                    f = open(file_name, "w+")
+                    f.write(web_content)
+                    f.close()
+
+                    window_before = driver.window_handles[0]
+                    driver.switch_to.window(window_before)
+
+                # plain http(s) links are still fetched with urllib as before
+                if ('http' in entry_link or 'www' in entry_link) and 'javascript' not in entry_link and 'js' not in entry_link and '.pdf' not in entry_link:
+
+                    try:
+                        # defining cookie to not end up in endless loop because of cookie banners pointing to redirects
+                        url = entry_link
+                        req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0', 'Cookie': 'myCookie=lovely'})
+                        response = urllib.request.urlopen(req)
+                    except Exception as e:
+                        try:
+                            response = urllib.request.urlopen(entry_link.encode('ascii', errors='xmlcharrefreplace').decode('ascii'))
+                            print(
+                                "opening the link did not work, try to encode to ascii replacing xmlcharrefs now and reopen - the original error message is:",
+                                e,
+                            )
+                        except Exception as ex:
+                            print(entry_link, entry_link.encode('ascii', errors='xmlcharrefreplace').decode('ascii'), ex)
+
+                    try:
+                        web_content = response.read().decode("UTF-8")
+                    except Exception as e:
+                        try:
+                            web_content = response.read().decode("latin-1")
+                            print(
+                                "decoding the response in utf8 did not work, try to decode latin1 now - the original error message is:",
+                                e,
+                            )
+                        except Exception as ex:
+                            print(ex)

                 # save interim results to files

                 if '.pdf' in entry_link:
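The popup branch indexes driver.window_handles[1] right after the click, which assumes the new window is already open. A sketch of the same click, switch, read, switch-back flow with an explicit wait; the URL, the XPath and the timeout are illustrative, not part of the commit:

    # sketch: open a popup by clicking, read its text, then return to the listing window
    from selenium import webdriver
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC

    options = webdriver.ChromeOptions()
    options.add_argument('--headless=new')
    driver = webdriver.Chrome(options=options)

    driver.get("https://example.org")                          # illustrative listing page
    listing_window = driver.current_window_handle

    element = driver.find_element("xpath", "//tr[1]/td[6]/a")  # illustrative clickable cell
    element.click()

    # wait until the popup window exists instead of assuming window_handles[1]
    WebDriverWait(driver, 10).until(EC.number_of_windows_to_be(2))
    popup_window = [h for h in driver.window_handles if h != listing_window][0]
    driver.switch_to.window(popup_window)

    web_content = driver.find_element("xpath", "//html").text

    driver.close()                           # close the popup
    driver.switch_to.window(listing_window)  # back to the listing for the next entry
    driver.quit()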
@@ -409,7 +459,7 @@ class fdb_spider(object):

                 file_name = "spiders/pages/" + fdb + str(i) + "/" + str(entry_id) + ".html"

-                if not web_content:
+                if web_content == 'NONE':
                     print('other downloading approaches did not work, trying requests')

                     try:
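If neither the popup branch nor the urllib branch filled web_content, it is still the 'NONE' sentinel and the code falls back to the requests library. A minimal sketch of such a fallback; the URL is illustrative and the headers simply mirror the urllib call above:

    # sketch: last-resort download with requests when web_content is still 'NONE'
    import requests

    entry_link = "https://example.org/entry"   # illustrative entry URL
    web_content = 'NONE'

    if web_content == 'NONE':
        try:
            resp = requests.get(
                entry_link,
                headers={'User-Agent': 'Mozilla/5.0', 'Cookie': 'myCookie=lovely'},
                timeout=30,
            )
            web_content = resp.text
        except Exception as e:
            print('requests fallback did not work either, original error is:', e)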