added new database ted.europa.eu, created new case of slow downloading, integrated scrolling into entrylistpagesdownload

2024-02-09 18:38:49 +00:00 · 2024-02-09 18:38:49 +00:00 · a0dd469f25
commit a0dd469f25
parent 094f092291
4 changed files with 109 additions and 36 deletions
--- a/main.py
+++ b/main.py
@ -5,8 +5,8 @@ import sys
 config = "spiders/config.yaml"
 #list_of_fdbs = eval(sys.argv[1])
-list_of_fdbs = ["giz","evergabe-online","foerderinfo.bund.de-bekanntmachungen"]
+#list_of_fdbs = ["giz","evergabe-online","foerderinfo.bund.de-bekanntmachungen"]
-#list_of_fdbs = ["giz"]
+list_of_fdbs = ["ted.europa.eu"]
 # doing the crawling of government websites
--- a/spiders/pycache/fdb_spider.cpython-311.pyc
+++ b/spiders/pycache/fdb_spider.cpython-311.pyc
--- a/spiders/config.yaml
+++ b/spiders/config.yaml
--- a/spiders/fdb_spider.py
+++ b/spiders/fdb_spider.py
@ -146,7 +146,7 @@ class fdb_spider(object):
                    else:
                        from selenium import webdriver
                        from selenium.webdriver.chrome.service import Service
-                        
+                        #from selenium.webdriver.common.action_chains import ActionChains
                        from pyvirtualdisplay import Display
                        display = Display(visible=0, size=(800, 800))  
                        display.start()
@ -164,6 +164,7 @@ class fdb_spider(object):
                        service = Service(executable_path='/usr/bin/chromedriver')
                        driver = webdriver.Chrome(options=options, service=service)
                        # driver = webdriver.Chrome()
                        driver.implicitly_wait(10) 
                        driver.get(entry_jsdomain)
                        for i in range(len(entry_jsiteration_var_list)):
                            time.sleep(2)
@ -176,8 +177,14 @@ class fdb_spider(object):
                                    + entry_list_jslink2
                                )
                                print(entry_iteration_var_list[i])
-                                time.sleep(2)
+                                time.sleep(1)
                                print('scrolling..')
                                # scroll into view, because otherwise with javascript generated elements
                                # it can be that clicking returns an error
                                driver.execute_script("arguments[0].scrollIntoView();", element)
                                print('clicking..')
                                time.sleep(1)
                                element.click()
                                time.sleep(2)
                                #window_after = driver.window_handles[1]
@ -476,6 +483,7 @@ class fdb_spider(object):
        options.add_argument('--disable-dev-shm-usage')
        service = Service(executable_path='/usr/bin/chromedriver')
        driver = webdriver.Chrome(options=options, service=service)
        driver.implicitly_wait(10) 
        #driver = webdriver.Chrome()
        for fdb in list_of_fdbs:
            print('spidering ' + fdb + ' ..')
@ -503,11 +511,21 @@ class fdb_spider(object):
                try:
                    fdb_conf_entry_list_javascript_link = fdb_conf_entry_list.get("javascript-link")
                except Exception as e:
                    fdb_conf_entry_list_javascript_link = 'NONE'
                    print('the javascript link in the config is missing, original error message is:', e)
                try:
                    fdb_conf_entry_list_slow_downloading = fdb_conf_entry_list.get("slow-downloading")
                except Exception as e:
                    print('the slow-downloading parameter is not set, original error message is:', e)    
                fdb_conf_entry_list_link1 = fdb_conf_entry_list.get("link1")
                fdb_conf_entry_list_link2 = fdb_conf_entry_list.get("link2")
-                driver.get(fdb_conf_entry_list_link1 + str(i) + fdb_conf_entry_list_link2)
+                if fdb_conf_entry_list_slow_downloading == 'FALSE':
                    driver.get(fdb_conf_entry_list_link1 + str(i) + fdb_conf_entry_list_link2)
                else:
                    pass
                for entry_id in dictionary_entry_list:
                    print(entry_id)
@ -516,7 +534,7 @@ class fdb_spider(object):
                    # download the html page of the entry
                    print(entry_link)
-                    if 'javascript' in entry_link:
+                    if 'javascript' in entry_link or fdb_conf_entry_list_javascript_link != 'NONE':
                        print('oioioi',fdb_conf_entry_list_parent, entry_id, fdb_conf_entry_list_javascript_link)
                        element = driver.find_element(
                            "xpath",
@ -556,42 +574,58 @@ class fdb_spider(object):
                        driver.switch_to.window(window_before)
-                    if 'javascript' not in entry_link and '.pdf' not in entry_link:
+                    if 'javascript' not in entry_link and '.pdf' not in entry_link and fdb_conf_entry_list_javascript_link == 'NONE':
                        print('blabuuuuuba')
                        #print('oi')
-                        try:
+                        
-                            # defining cookie to not end up in endless loop because of cookie banners pointing to redirects
+                        
-                            url = entry_link
+                        if fdb_conf_entry_list_slow_downloading == 'TRUE':
-                            req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0', 'Cookie':'myCookie=oioioioi'})
+                            
                            response = urllib.request.urlopen(req)
                            print('response from first one', response)
                        except Exception as e:
                            print('cookie giving then downloading did not work, original error is:', e)
                            try:
                                response = urllib.request.urlopen(entry_link.encode('ascii', errors='xmlcharrefreplace').decode('ascii'))
                                print(
                                    "opening the link did not work, try to encode to ascii replacing xmlcharrefs now and reopen - the original error message is:",
                                    e,
                                )
                            except Exception as ex:
                                print(entry_link, entry_link.encode('ascii', errors='xmlcharrefreplace').decode('ascii'), ex )
                                print("trying to get slowly entry link " , entry_link)
                                driver.get(entry_link)
                                time.sleep(3)
                                web_content = driver.page_source
                            except Exception as e:
                                print("getting the html behind the entry link did not work, ori message is:", e)
                        else:
                        try:
                            web_content = response.read().decode("UTF-8")
                        except Exception as e:
                            try:
-                                web_content = response.read().decode("latin-1")
+                                # defining cookie to not end up in endless loop because of cookie banners pointing to redirects
-                                print(
+                                url = entry_link
-                                    "decoding the respone in utf8 did not work, try to decode latin1 now - the original error message is:",
+                                req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0', 'Cookie':'myCookie=oioioioi'})
-                                    e,
+                                response = urllib.request.urlopen(req)
-                                )
+                                print('response from first one', response)
-                            except Exception as ex:
+                            except Exception as e:
-                                print(ex)
+                                print('cookie giving then downloading did not work, original error is:', e)
                                try:
                                    response = urllib.request.urlopen(entry_link.encode('ascii', errors='xmlcharrefreplace').decode('ascii'))
                                    print(
                                        "opening the link did not work, try to encode to ascii replacing xmlcharrefs now and reopen - the original error message is:",
                                        e,
                                    )
                                except Exception as ex:
                                    print(entry_link, entry_link.encode('ascii', errors='xmlcharrefreplace').decode('ascii'), ex )
                        # save interim results to files
-                    if '.pdf' in entry_link:
+                            try:
                                web_content = response.read().decode("UTF-8")
                            except Exception as e:
                                try:
                                    web_content = response.read().decode("latin-1")
                                    print(
                                        "decoding the respone in utf8 did not work, try to decode latin1 now - the original error message is:",
                                        e,
                                    )
                                except Exception as ex:
                                    print(ex)
                            # save interim results to files
                    if '.pdf' in entry_link and fdb_conf_entry_list_javascript_link == 'NONE':
                        file_name = "spiders/pages/" + fdb + str(i) + "/" + str(entry_id) + ".html"
                        response = requests.get(entry_link)
@ -606,6 +640,10 @@ class fdb_spider(object):
                    file_name = "spiders/pages/" + fdb + str(i) + "/" + str(entry_id) + ".html"
                    wget_wrote = False
                    if web_content == 'NONE':
                        print('other downloading approaches did not work, trying requests')