added new database ted.europa.eu, created new case for slow downloading, integrated scrolling into entrylistpagesdownload
This commit is contained in:
parent
094f092291
commit
a0dd469f25
4 changed files with 109 additions and 36 deletions
4
main.py
4
main.py
|
@ -5,8 +5,8 @@ import sys
|
|||
|
||||
config = "spiders/config.yaml"
|
||||
#list_of_fdbs = eval(sys.argv[1])
|
||||
list_of_fdbs = ["giz","evergabe-online","foerderinfo.bund.de-bekanntmachungen"]
|
||||
#list_of_fdbs = ["giz"]
|
||||
#list_of_fdbs = ["giz","evergabe-online","foerderinfo.bund.de-bekanntmachungen"]
|
||||
list_of_fdbs = ["ted.europa.eu"]
|
||||
|
||||
|
||||
# doing the crawling of government websites
|
||||
|
|
Binary file not shown.
File diff suppressed because one or more lines are too long
|
@ -146,7 +146,7 @@ class fdb_spider(object):
|
|||
else:
|
||||
from selenium import webdriver
|
||||
from selenium.webdriver.chrome.service import Service
|
||||
|
||||
#from selenium.webdriver.common.action_chains import ActionChains
|
||||
from pyvirtualdisplay import Display
|
||||
display = Display(visible=0, size=(800, 800))
|
||||
display.start()
|
||||
|
@ -164,6 +164,7 @@ class fdb_spider(object):
|
|||
service = Service(executable_path='/usr/bin/chromedriver')
|
||||
driver = webdriver.Chrome(options=options, service=service)
|
||||
# driver = webdriver.Chrome()
|
||||
driver.implicitly_wait(10)
|
||||
driver.get(entry_jsdomain)
|
||||
for i in range(len(entry_jsiteration_var_list)):
|
||||
time.sleep(2)
|
||||
|
@ -176,8 +177,14 @@ class fdb_spider(object):
|
|||
+ entry_list_jslink2
|
||||
)
|
||||
print(entry_iteration_var_list[i])
|
||||
time.sleep(2)
|
||||
time.sleep(1)
|
||||
print('scrolling..')
|
||||
|
||||
# scroll into view, because otherwise with javascript generated elements
|
||||
# it can be that clicking returns an error
|
||||
driver.execute_script("arguments[0].scrollIntoView();", element)
|
||||
print('clicking..')
|
||||
time.sleep(1)
|
||||
element.click()
|
||||
time.sleep(2)
|
||||
#window_after = driver.window_handles[1]
|
||||
|
@ -476,6 +483,7 @@ class fdb_spider(object):
|
|||
options.add_argument('--disable-dev-shm-usage')
|
||||
service = Service(executable_path='/usr/bin/chromedriver')
|
||||
driver = webdriver.Chrome(options=options, service=service)
|
||||
driver.implicitly_wait(10)
|
||||
#driver = webdriver.Chrome()
|
||||
for fdb in list_of_fdbs:
|
||||
print('spidering ' + fdb + ' ..')
|
||||
|
@ -503,12 +511,22 @@ class fdb_spider(object):
|
|||
try:
|
||||
fdb_conf_entry_list_javascript_link = fdb_conf_entry_list.get("javascript-link")
|
||||
except Exception as e:
|
||||
fdb_conf_entry_list_javascript_link = 'NONE'
|
||||
print('the javascript link in the config is missing, original error message is:', e)
|
||||
try:
|
||||
fdb_conf_entry_list_slow_downloading = fdb_conf_entry_list.get("slow-downloading")
|
||||
except Exception as e:
|
||||
print('the slow-downloading parameter is not set, original error message is:', e)
|
||||
fdb_conf_entry_list_link1 = fdb_conf_entry_list.get("link1")
|
||||
fdb_conf_entry_list_link2 = fdb_conf_entry_list.get("link2")
|
||||
|
||||
if fdb_conf_entry_list_slow_downloading == 'FALSE':
|
||||
|
||||
driver.get(fdb_conf_entry_list_link1 + str(i) + fdb_conf_entry_list_link2)
|
||||
|
||||
else:
|
||||
pass
|
||||
|
||||
for entry_id in dictionary_entry_list:
|
||||
print(entry_id)
|
||||
entry_link = dictionary_entry_list[entry_id]["link"]
|
||||
|
@ -516,7 +534,7 @@ class fdb_spider(object):
|
|||
# download the html page of the entry
|
||||
print(entry_link)
|
||||
|
||||
if 'javascript' in entry_link:
|
||||
if 'javascript' in entry_link or fdb_conf_entry_list_javascript_link != 'NONE':
|
||||
print('oioioi',fdb_conf_entry_list_parent, entry_id, fdb_conf_entry_list_javascript_link)
|
||||
element = driver.find_element(
|
||||
"xpath",
|
||||
|
@ -556,9 +574,25 @@ class fdb_spider(object):
|
|||
driver.switch_to.window(window_before)
|
||||
|
||||
|
||||
if 'javascript' not in entry_link and '.pdf' not in entry_link:
|
||||
if 'javascript' not in entry_link and '.pdf' not in entry_link and fdb_conf_entry_list_javascript_link == 'NONE':
|
||||
print('blabuuuuuba')
|
||||
#print('oi')
|
||||
|
||||
|
||||
if fdb_conf_entry_list_slow_downloading == 'TRUE':
|
||||
|
||||
try:
|
||||
|
||||
print("trying to get slowly entry link " , entry_link)
|
||||
driver.get(entry_link)
|
||||
time.sleep(3)
|
||||
web_content = driver.page_source
|
||||
|
||||
except Exception as e:
|
||||
print("getting the html behind the entry link did not work, ori message is:", e)
|
||||
|
||||
else:
|
||||
|
||||
try:
|
||||
# defining cookie to not end up in endless loop because of cookie banners pointing to redirects
|
||||
url = entry_link
|
||||
|
@ -591,7 +625,7 @@ class fdb_spider(object):
|
|||
|
||||
# save interim results to files
|
||||
|
||||
if '.pdf' in entry_link:
|
||||
if '.pdf' in entry_link and fdb_conf_entry_list_javascript_link == 'NONE':
|
||||
|
||||
file_name = "spiders/pages/" + fdb + str(i) + "/" + str(entry_id) + ".html"
|
||||
response = requests.get(entry_link)
|
||||
|
@ -606,6 +640,10 @@ class fdb_spider(object):
|
|||
file_name = "spiders/pages/" + fdb + str(i) + "/" + str(entry_id) + ".html"
|
||||
|
||||
wget_wrote = False
|
||||
|
||||
|
||||
|
||||
|
||||
if web_content == 'NONE':
|
||||
print('other downloading approaches did not work, trying requests')
|
||||
|
||||
|
|
Loading…
Reference in a new issue