added flow for selenium grabbing popup instead of links for entries

This commit is contained in:
alpcentaur 2023-12-05 22:16:07 +00:00
parent 99c74dcbad
commit ec180bed0a
4 changed files with 83 additions and 31 deletions

View file

@ -4,7 +4,8 @@ from spiders.fdb_spider import *
import sys import sys
config = "spiders/config.yaml" config = "spiders/config.yaml"
list_of_fdbs = sys.argv[2] list_of_fdbs = eval(sys.argv[1])
print(list_of_fdbs)
#list_of_fdbs = ["foerderinfo.bund.de-bekanntmachungen"] #list_of_fdbs = ["foerderinfo.bund.de-bekanntmachungen"]
@ -12,11 +13,11 @@ list_of_fdbs = sys.argv[2]
spider = fdb_spider(config) spider = fdb_spider(config)
spider.download_entry_list_pages_of_funding_databases(list_of_fdbs) #spider.download_entry_list_pages_of_funding_databases(list_of_fdbs)
#spider.find_config_parameter(list_of_fdbs) #spider.find_config_parameter(list_of_fdbs)
spider.parse_entry_list_data2dictionary(list_of_fdbs) #spider.parse_entry_list_data2dictionary(list_of_fdbs)
spider.download_entry_data_htmls(list_of_fdbs) spider.download_entry_data_htmls(list_of_fdbs)

View file

@ -58,6 +58,7 @@ giz:
parent: "//html//body//div//div//table[contains(@class, 'csx-new-table')]//tbody//tr" parent: "//html//body//div//div//table[contains(@class, 'csx-new-table')]//tbody//tr"
child-name: "//td[3]//text()" child-name: "//td[3]//text()"
child-link: "//a/@href" child-link: "//a/@href"
javascript-link: "/td[6]/a"
child-info: "/td[4]/text()[1]" child-info: "/td[4]/text()[1]"
child-period: "//td[2]/abbr/text()" child-period: "//td[2]/abbr/text()"
#child-period: "//div[@class='c-teaser__text-wrapper']//small//span/time/text()" #child-period: "//div[@class='c-teaser__text-wrapper']//small//span/time/text()"

View file

@ -96,7 +96,7 @@ class fdb_spider(object):
print('also requests library did not work, original error is:', e) print('also requests library did not work, original error is:', e)
print(web_content) # print(web_content)
f = open("spiders/pages/" + key + str(i) + "entryList.html", "w+") f = open("spiders/pages/" + key + str(i) + "entryList.html", "w+")
f.write(web_content) f.write(web_content)
@ -343,6 +343,12 @@ class fdb_spider(object):
f.close f.close
def download_entry_data_htmls(self, list_of_fdbs): def download_entry_data_htmls(self, list_of_fdbs):
from selenium import webdriver
options = webdriver.ChromeOptions()
options.add_argument('headless')
driver = webdriver.Chrome(options=options)
for fdb in list_of_fdbs: for fdb in list_of_fdbs:
try: try:
@ -361,40 +367,84 @@ class fdb_spider(object):
dictionary_entry_list = eval(text) dictionary_entry_list = eval(text)
fdb_conf = self.config.get(fdb)
fdb_domain = fdb_conf.get("domain")
fdb_conf_entry_list = fdb_conf.get("entry-list")
fdb_conf_entry_list_parent = fdb_conf_entry_list.get("parent")
fdb_conf_entry_list_child_name = fdb_conf_entry_list.get("child-name")
try:
fdb_conf_entry_list_javascript_link = fdb_conf_entry_list.get("javascript-link")
except Exception as e:
print('the javascript link in the config is missing, original error message is:', e)
fdb_conf_entry_list_link1 = fdb_conf_entry_list.get("link1")
fdb_conf_entry_list_link2 = fdb_conf_entry_list.get("link2")
driver.get(fdb_conf_entry_list_link1 + str(i) + fdb_conf_entry_list_link2)
for entry_id in dictionary_entry_list: for entry_id in dictionary_entry_list:
entry_link = dictionary_entry_list[entry_id]["link"] entry_link = dictionary_entry_list[entry_id]["link"]
web_content = 'NONE'
# download the html page of the entry # download the html page of the entry
try:
# defining cookie to not end up in endless loop because of cookie banners pointing to redirects if 'javascript' in entry_link:
url = entry_link
req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0', 'Cookie':'myCookie=lovely'}) element = driver.find_element(
response = urllib.request.urlopen(req) "xpath",
except Exception as e: fdb_conf_entry_list_parent
+ "["
+ str(entry_id+1)
+ "]"
+ fdb_conf_entry_list_javascript_link
)
element.click()
window_after = driver.window_handles[1]
driver.switch_to.window(window_after)
element = driver.find_element("xpath", "//html")
web_content = element.text
file_name = "spiders/pages/" + fdb + str(i) + "/" + str(entry_id) + ".html"
os.makedirs(os.path.dirname(file_name), exist_ok=True)
f = open(file_name, "w+")
f.write(web_content)
f.close
window_before = driver.window_handles[0]
driver.switch_to.window(window_before)
if ('http' or 'www') in entry_link and ('javascript' or 'js' or '.pdf') not in entry_link:
try: try:
response = urllib.request.urlopen(entry_link.encode('ascii', errors='xmlcharrefreplace').decode('ascii')) # defining cookie to not end up in endless loop because of cookie banners pointing to redirects
print( url = entry_link
"opening the link did not work, try to encode to ascii replacing xmlcharrefs now and reopen - the original error message is:", req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0', 'Cookie':'myCookie=lovely'})
e, response = urllib.request.urlopen(req)
) except Exception as e:
except Exception as ex: try:
print(entry_link, entry_link.encode('ascii', errors='xmlcharrefreplace').decode('ascii'), ex ) response = urllib.request.urlopen(entry_link.encode('ascii', errors='xmlcharrefreplace').decode('ascii'))
print(
"opening the link did not work, try to encode to ascii replacing xmlcharrefs now and reopen - the original error message is:",
e,
)
except Exception as ex:
print(entry_link, entry_link.encode('ascii', errors='xmlcharrefreplace').decode('ascii'), ex )
try:
web_content = response.read().decode("UTF-8")
except Exception as e:
try: try:
web_content = response.read().decode("latin-1") web_content = response.read().decode("UTF-8")
print( except Exception as e:
"decoding the respone in utf8 did not work, try to decode latin1 now - the original error message is:", try:
e, web_content = response.read().decode("latin-1")
) print(
except Exception as ex: "decoding the respone in utf8 did not work, try to decode latin1 now - the original error message is:",
print(ex) e,
)
except Exception as ex:
print(ex)
# save interim results to files # save interim results to files
if '.pdf' in entry_link: if '.pdf' in entry_link:
@ -409,7 +459,7 @@ class fdb_spider(object):
file_name = "spiders/pages/" + fdb + str(i) + "/" + str(entry_id) + ".html" file_name = "spiders/pages/" + fdb + str(i) + "/" + str(entry_id) + ".html"
if not web_content: if web_content == 'NONE':
print('other downloading approaches did not work, trying requests') print('other downloading approaches did not work, trying requests')
try: try: