diff --git a/main.py b/main.py index 3cc9312..4a2d91f 100644 --- a/main.py +++ b/main.py @@ -16,9 +16,9 @@ spider = fdb_spider(config) #spider.find_config_parameter(list_of_fdbs) -#spider.parse_entry_list_data2dictionary(list_of_fdbs) +spider.parse_entry_list_data2dictionary(list_of_fdbs) -#spider.download_entry_data_htmls(list_of_fdbs) +spider.download_entry_data_htmls(list_of_fdbs) spider.parse_entry_data2dictionary(list_of_fdbs) diff --git a/requirements.txt b/requirements.txt index fe32fe3..434498f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -20,6 +20,7 @@ pycryptodome==3.19.0 PySocks==1.7.1 python-dateutil==2.8.2 pytz==2023.3.post1 +PyVirtualDisplay==3.0 PyYAML==6.0.1 regex==2023.10.3 requests==2.31.0 diff --git a/spiders/config.yaml b/spiders/config.yaml index 3afa2f7..dca8a01 100644 --- a/spiders/config.yaml +++ b/spiders/config.yaml @@ -9,7 +9,7 @@ foerderinfo.bund.de: entry-list: link1: 'https://www.foerderinfo.bund.de/SiteGlobals/Forms/foerderinfo/expertensuche/Servicesuche_Formular.html?gtp=33498_list%253D' link2: '#searchResults' - iteration-var-list: '[1,2,3,4,5,6,7,8]' + iteration-var-list: '[1,2,3,4,5]' parent: "//html//body//div//main//div//div[@class='row']//section[@class='l-search-result-list']//div[@class='l-search-result-list__item']" child-name: "//div[@class='c-search-result__text-wrapper']//span[@class='c-search-result__title']/text()" child-link: "/a[@class='c-search-result']/@href" @@ -28,7 +28,7 @@ foerderinfo.bund.de-bekanntmachungen: entry-list: link1: 'https://www.foerderinfo.bund.de/SiteGlobals/Forms/foerderinfo/bekanntmachungen/Bekanntmachungen_Formular.html?gtp=407348_list%253D' link2: '#searchResults' - iteration-var-list: '[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31]' + iteration-var-list: '[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]' #parent: "//html//body//div//main//div//div[@class='row']//div[@class='large-12']//a[@class='c-teaser']" parent: "//html//body//div//main//div[@class='row']//a[contains(@class, 'c-teaser--announcement')]" child-name: "//div[@class='c-teaser__text-wrapper']//span[@class='c-teaser__title']/text()" diff --git a/spiders/fdb_spider.py b/spiders/fdb_spider.py index e831f8d..023adf7 100644 --- a/spiders/fdb_spider.py +++ b/spiders/fdb_spider.py @@ -133,10 +133,22 @@ class fdb_spider(object): f.close else: from selenium import webdriver - + from selenium.webdriver.chrome.service import Service + from pyvirtualdisplay import Display + display = Display(visible=0, size=(800, 800)) + display.start() + + #outputdir = '.' + #service_log_path = "{}/chromedriver.log".format(outputdir) + #service_args = ['--verbose'] + #driver = webdriver.Chrome('/usr/bin/chromium') options = webdriver.ChromeOptions() options.add_argument('headless') - driver = webdriver.Chrome(options=options) + options.add_argument("--remote-debugging-port=9222") + options.add_argument('--no-sandbox') + options.add_argument('--disable-dev-shm-usage') + service = Service(executable_path='/usr/bin/chromedriver') + driver = webdriver.Chrome(options=options, service=service) def find_config_parameter(self, list_of_fdbs): @@ -357,15 +369,21 @@ class fdb_spider(object): if fdb_domain in link: dictionary_entry_list[n]["link"] = link - if fdb_domain not in link and ('http:' in link or 'www.' in link or 'https:' in link): + if fdb_domain not in link and 'http:' in link: + dictionary_entry_list[n]["link"] = link + if fdb_domain not in link and 'www.' in link: + dictionary_entry_list[n]["link"] = link + if fdb_domain not in link and 'https:' in link: dictionary_entry_list[n]["link"] = link if 'javascript:' in link: dictionary_entry_list[n]["link"] = link - else: + if fdb_domain not in link and ('http' or 'https' or 'www.') not in link: if link[-1] == '/': dictionary_entry_list[n]["link"] = fdb_domain + link else: dictionary_entry_list[n]["link"] = fdb_domain + '/' + link + + except Exception as e: print( @@ -382,10 +400,23 @@ class fdb_spider(object): def download_entry_data_htmls(self, list_of_fdbs): from selenium import webdriver - + from selenium.webdriver.chrome.service import Service + from pyvirtualdisplay import Display + display = Display(visible=0, size=(800, 800)) + display.start() + + #outputdir = '.' + #service_log_path = "{}/chromedriver.log".format(outputdir) + #service_args = ['--verbose'] + #driver = webdriver.Chrome('/usr/bin/chromium') options = webdriver.ChromeOptions() options.add_argument('headless') - driver = webdriver.Chrome(options=options) + options.add_argument("--remote-debugging-port=9222") + options.add_argument('--no-sandbox') + options.add_argument('--disable-dev-shm-usage') + service = Service(executable_path='/usr/bin/chromedriver') + driver = webdriver.Chrome(options=options, service=service) + #driver = webdriver.Chrome() for fdb in list_of_fdbs: try: @@ -464,7 +495,7 @@ class fdb_spider(object): driver.switch_to.window(window_before) - if ('http' or 'www') in entry_link and ('javascript' or 'js' or '.pdf') not in enry_link: + if ('http' or 'www') in entry_link and 'javascript' not in entry_link and '.pdf' not in entry_link: try: # defining cookie to not end up in endless loop because of cookie banners pointing to redirects