Added new lines to the chromedriver setup so that it works on other systems

This commit is contained in:
alpcentaur 2023-12-13 16:05:26 +01:00
parent d2324d265a
commit 953f85ee5b
3 changed files with 25 additions and 6 deletions

View file

@ -9,7 +9,7 @@ foerderinfo.bund.de:
entry-list: entry-list:
link1: 'https://www.foerderinfo.bund.de/SiteGlobals/Forms/foerderinfo/expertensuche/Servicesuche_Formular.html?gtp=33498_list%253D' link1: 'https://www.foerderinfo.bund.de/SiteGlobals/Forms/foerderinfo/expertensuche/Servicesuche_Formular.html?gtp=33498_list%253D'
link2: '#searchResults' link2: '#searchResults'
iteration-var-list: '[1,2,3,4,5,6,7,8]' iteration-var-list: '[1,2,3,4,5]'
parent: "//html//body//div//main//div//div[@class='row']//section[@class='l-search-result-list']//div[@class='l-search-result-list__item']" parent: "//html//body//div//main//div//div[@class='row']//section[@class='l-search-result-list']//div[@class='l-search-result-list__item']"
child-name: "//div[@class='c-search-result__text-wrapper']//span[@class='c-search-result__title']/text()" child-name: "//div[@class='c-search-result__text-wrapper']//span[@class='c-search-result__title']/text()"
child-link: "/a[@class='c-search-result']/@href" child-link: "/a[@class='c-search-result']/@href"
@ -28,7 +28,7 @@ foerderinfo.bund.de-bekanntmachungen:
entry-list: entry-list:
link1: 'https://www.foerderinfo.bund.de/SiteGlobals/Forms/foerderinfo/bekanntmachungen/Bekanntmachungen_Formular.html?gtp=407348_list%253D' link1: 'https://www.foerderinfo.bund.de/SiteGlobals/Forms/foerderinfo/bekanntmachungen/Bekanntmachungen_Formular.html?gtp=407348_list%253D'
link2: '#searchResults' link2: '#searchResults'
iteration-var-list: '[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31]' iteration-var-list: '[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]'
#parent: "//html//body//div//main//div//div[@class='row']//div[@class='large-12']//a[@class='c-teaser']" #parent: "//html//body//div//main//div//div[@class='row']//div[@class='large-12']//a[@class='c-teaser']"
parent: "//html//body//div//main//div[@class='row']//a[contains(@class, 'c-teaser--announcement')]" parent: "//html//body//div//main//div[@class='row']//a[contains(@class, 'c-teaser--announcement')]"
child-name: "//div[@class='c-teaser__text-wrapper']//span[@class='c-teaser__title']/text()" child-name: "//div[@class='c-teaser__text-wrapper']//span[@class='c-teaser__title']/text()"

View file

@ -320,15 +320,21 @@ class fdb_spider(object):
if fdb_domain in link: if fdb_domain in link:
dictionary_entry_list[n]["link"] = link dictionary_entry_list[n]["link"] = link
if fdb_domain not in link and ('http:' in link or 'www.' in link or 'https:' in link): if fdb_domain not in link and 'http:' in link:
dictionary_entry_list[n]["link"] = link
if fdb_domain not in link and 'www.' in link:
dictionary_entry_list[n]["link"] = link
if fdb_domain not in link and 'https:' in link:
dictionary_entry_list[n]["link"] = link dictionary_entry_list[n]["link"] = link
if 'javascript:' in link: if 'javascript:' in link:
dictionary_entry_list[n]["link"] = link dictionary_entry_list[n]["link"] = link
else: if fdb_domain not in link and ('http' or 'https' or 'www.') not in link:
if link[-1] == '/': if link[-1] == '/':
dictionary_entry_list[n]["link"] = fdb_domain + link dictionary_entry_list[n]["link"] = fdb_domain + link
else: else:
dictionary_entry_list[n]["link"] = fdb_domain + '/' + link dictionary_entry_list[n]["link"] = fdb_domain + '/' + link
except Exception as e: except Exception as e:
print( print(
@ -345,10 +351,23 @@ class fdb_spider(object):
def download_entry_data_htmls(self, list_of_fdbs): def download_entry_data_htmls(self, list_of_fdbs):
from selenium import webdriver from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from pyvirtualdisplay import Display
display = Display(visible=0, size=(800, 800))
display.start()
#outputdir = '.'
#service_log_path = "{}/chromedriver.log".format(outputdir)
#service_args = ['--verbose']
#driver = webdriver.Chrome('/usr/bin/chromium')
options = webdriver.ChromeOptions() options = webdriver.ChromeOptions()
options.add_argument('headless') options.add_argument('headless')
driver = webdriver.Chrome(options=options) options.add_argument("--remote-debugging-port=9222")
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
service = Service(executable_path='/usr/bin/chromedriver')
driver = webdriver.Chrome(options=options, service=service)
#driver = webdriver.Chrome()
for fdb in list_of_fdbs: for fdb in list_of_fdbs:
try: try:
@ -427,7 +446,7 @@ class fdb_spider(object):
driver.switch_to.window(window_before) driver.switch_to.window(window_before)
if ('http' or 'www') in entry_link and ('javascript' or 'js' or '.pdf') not in enry_link: if ('http' or 'www') in entry_link and 'javascript' not in entry_link and '.pdf' not in entry_link:
try: try:
# defining cookie to not end up in endless loop because of cookie banners pointing to redirects # defining cookie to not end up in endless loop because of cookie banners pointing to redirects