Merged onlinkgen with master, and added a more universal Chrome driver initialization at the beginning of the function that goes through the JavaScript entries in download_entry_list_pages_of_funding_databases()

2023-12-14 12:38:14 +00:00 · 2023-12-14 12:38:14 +00:00 · 5627c80177
commit 5627c80177
parent 14b8db7941 fbee5d6229
4 changed files with 41 additions and 9 deletions
--- a/main.py
+++ b/main.py
@ -16,9 +16,9 @@ spider = fdb_spider(config)

 #spider.find_config_parameter(list_of_fdbs)

-#spider.parse_entry_list_data2dictionary(list_of_fdbs)
+spider.parse_entry_list_data2dictionary(list_of_fdbs)

-#spider.download_entry_data_htmls(list_of_fdbs)
+spider.download_entry_data_htmls(list_of_fdbs)

 spider.parse_entry_data2dictionary(list_of_fdbs)

--- a/requirements.txt
+++ b/requirements.txt
@ -20,6 +20,7 @@ pycryptodome==3.19.0
 PySocks==1.7.1
 python-dateutil==2.8.2
 pytz==2023.3.post1
+PyVirtualDisplay==3.0
 PyYAML==6.0.1
 regex==2023.10.3
 requests==2.31.0
--- a/spiders/config.yaml
+++ b/spiders/config.yaml
@ -9,7 +9,7 @@ foerderinfo.bund.de:
  entry-list:
    link1:  'https://www.foerderinfo.bund.de/SiteGlobals/Forms/foerderinfo/expertensuche/Servicesuche_Formular.html?gtp=33498_list%253D'
    link2:  '#searchResults'
-    iteration-var-list:  '[1,2,3,4,5,6,7,8]'
+    iteration-var-list:  '[1,2,3,4,5]'
    parent:  "//html//body//div//main//div//div[@class='row']//section[@class='l-search-result-list']//div[@class='l-search-result-list__item']"
    child-name:  "//div[@class='c-search-result__text-wrapper']//span[@class='c-search-result__title']/text()"
    child-link:  "/a[@class='c-search-result']/@href"
@ -28,7 +28,7 @@ foerderinfo.bund.de-bekanntmachungen:
  entry-list:
    link1:  'https://www.foerderinfo.bund.de/SiteGlobals/Forms/foerderinfo/bekanntmachungen/Bekanntmachungen_Formular.html?gtp=407348_list%253D'
    link2:  '#searchResults'
-    iteration-var-list:  '[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31]'
+    iteration-var-list:  '[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]'
    #parent:  "//html//body//div//main//div//div[@class='row']//div[@class='large-12']//a[@class='c-teaser']"
    parent:  "//html//body//div//main//div[@class='row']//a[contains(@class, 'c-teaser--announcement')]"
    child-name:  "//div[@class='c-teaser__text-wrapper']//span[@class='c-teaser__title']/text()"
--- a/spiders/fdb_spider.py
+++ b/spiders/fdb_spider.py
@ -133,10 +133,22 @@ class fdb_spider(object):
                            f.close
                    else:
                        from selenium import webdriver
+                        from selenium.webdriver.chrome.service import Service
+                        from pyvirtualdisplay import Display
+                        display = Display(visible=0, size=(800, 800))  
+                        display.start()

+                        #outputdir = '.'
+                        #service_log_path = "{}/chromedriver.log".format(outputdir)
+                        #service_args = ['--verbose']
+                        #driver = webdriver.Chrome('/usr/bin/chromium')
                        options = webdriver.ChromeOptions()
                        options.add_argument('headless')
-                        driver = webdriver.Chrome(options=options)
+                        options.add_argument("--remote-debugging-port=9222")
+                        options.add_argument('--no-sandbox')
+                        options.add_argument('--disable-dev-shm-usage')
+                        service = Service(executable_path='/usr/bin/chromedriver')
+                        driver = webdriver.Chrome(options=options, service=service)
                                        

    def find_config_parameter(self, list_of_fdbs):
@ -357,15 +369,21 @@ class fdb_spider(object):

                            if fdb_domain in link:
                                dictionary_entry_list[n]["link"] = link
-                            if fdb_domain not in link and ('http:' in link or 'www.' in link or 'https:' in link):
+                            if fdb_domain not in link and 'http:' in link:
+                                dictionary_entry_list[n]["link"] = link
+                            if fdb_domain not in link and 'www.' in link:
+                                dictionary_entry_list[n]["link"] = link
+                            if fdb_domain not in link and 'https:' in link:
                                dictionary_entry_list[n]["link"] = link
                            if 'javascript:' in link:
                                dictionary_entry_list[n]["link"] = link
-                            else:
+                            if fdb_domain not in link and ('http' or 'https' or 'www.') not in link:
                                if link[-1] == '/':
                                    dictionary_entry_list[n]["link"] = fdb_domain + link
                                else:
                                    dictionary_entry_list[n]["link"] = fdb_domain + '/' + link
+                                
+                            

                except Exception as e:
                    print(
@ -382,10 +400,23 @@ class fdb_spider(object):
    def download_entry_data_htmls(self, list_of_fdbs):
        
        from selenium import webdriver
+        from selenium.webdriver.chrome.service import Service
+        from pyvirtualdisplay import Display
+        display = Display(visible=0, size=(800, 800))  
+        display.start()

+        #outputdir = '.'
+        #service_log_path = "{}/chromedriver.log".format(outputdir)
+        #service_args = ['--verbose']
+        #driver = webdriver.Chrome('/usr/bin/chromium')
        options = webdriver.ChromeOptions()
        options.add_argument('headless')
-        driver = webdriver.Chrome(options=options)
+        options.add_argument("--remote-debugging-port=9222")
+        options.add_argument('--no-sandbox')
+        options.add_argument('--disable-dev-shm-usage')
+        service = Service(executable_path='/usr/bin/chromedriver')
+        driver = webdriver.Chrome(options=options, service=service)
+        #driver = webdriver.Chrome()
        for fdb in list_of_fdbs:
            
            try:
@ -464,7 +495,7 @@ class fdb_spider(object):
                        driver.switch_to.window(window_before)
                        
                    
-                    if ('http' or 'www') in entry_link and ('javascript' or 'js' or '.pdf') not in enry_link:
+                    if ('http' or 'www') in entry_link and 'javascript' not in entry_link and '.pdf' not in entry_link:
                    
                        try:
                            # defining cookie to not end up in endless loop because of cookie banners pointing to redirects