Merged onlinkgen with master, and added a more universal Chrome driver initialization at the beginning of the routine that goes through the JavaScript entries in download_entry_list_pages_of_funding_databases()
This commit is contained in:
commit
5627c80177
4 changed files with 41 additions and 9 deletions
4
main.py
4
main.py
|
@ -16,9 +16,9 @@ spider = fdb_spider(config)
|
||||||
|
|
||||||
#spider.find_config_parameter(list_of_fdbs)
|
#spider.find_config_parameter(list_of_fdbs)
|
||||||
|
|
||||||
#spider.parse_entry_list_data2dictionary(list_of_fdbs)
|
spider.parse_entry_list_data2dictionary(list_of_fdbs)
|
||||||
|
|
||||||
#spider.download_entry_data_htmls(list_of_fdbs)
|
spider.download_entry_data_htmls(list_of_fdbs)
|
||||||
|
|
||||||
spider.parse_entry_data2dictionary(list_of_fdbs)
|
spider.parse_entry_data2dictionary(list_of_fdbs)
|
||||||
|
|
||||||
|
|
|
@ -20,6 +20,7 @@ pycryptodome==3.19.0
|
||||||
PySocks==1.7.1
|
PySocks==1.7.1
|
||||||
python-dateutil==2.8.2
|
python-dateutil==2.8.2
|
||||||
pytz==2023.3.post1
|
pytz==2023.3.post1
|
||||||
|
PyVirtualDisplay==3.0
|
||||||
PyYAML==6.0.1
|
PyYAML==6.0.1
|
||||||
regex==2023.10.3
|
regex==2023.10.3
|
||||||
requests==2.31.0
|
requests==2.31.0
|
||||||
|
|
|
@ -9,7 +9,7 @@ foerderinfo.bund.de:
|
||||||
entry-list:
|
entry-list:
|
||||||
link1: 'https://www.foerderinfo.bund.de/SiteGlobals/Forms/foerderinfo/expertensuche/Servicesuche_Formular.html?gtp=33498_list%253D'
|
link1: 'https://www.foerderinfo.bund.de/SiteGlobals/Forms/foerderinfo/expertensuche/Servicesuche_Formular.html?gtp=33498_list%253D'
|
||||||
link2: '#searchResults'
|
link2: '#searchResults'
|
||||||
iteration-var-list: '[1,2,3,4,5,6,7,8]'
|
iteration-var-list: '[1,2,3,4,5]'
|
||||||
parent: "//html//body//div//main//div//div[@class='row']//section[@class='l-search-result-list']//div[@class='l-search-result-list__item']"
|
parent: "//html//body//div//main//div//div[@class='row']//section[@class='l-search-result-list']//div[@class='l-search-result-list__item']"
|
||||||
child-name: "//div[@class='c-search-result__text-wrapper']//span[@class='c-search-result__title']/text()"
|
child-name: "//div[@class='c-search-result__text-wrapper']//span[@class='c-search-result__title']/text()"
|
||||||
child-link: "/a[@class='c-search-result']/@href"
|
child-link: "/a[@class='c-search-result']/@href"
|
||||||
|
@ -28,7 +28,7 @@ foerderinfo.bund.de-bekanntmachungen:
|
||||||
entry-list:
|
entry-list:
|
||||||
link1: 'https://www.foerderinfo.bund.de/SiteGlobals/Forms/foerderinfo/bekanntmachungen/Bekanntmachungen_Formular.html?gtp=407348_list%253D'
|
link1: 'https://www.foerderinfo.bund.de/SiteGlobals/Forms/foerderinfo/bekanntmachungen/Bekanntmachungen_Formular.html?gtp=407348_list%253D'
|
||||||
link2: '#searchResults'
|
link2: '#searchResults'
|
||||||
iteration-var-list: '[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31]'
|
iteration-var-list: '[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]'
|
||||||
#parent: "//html//body//div//main//div//div[@class='row']//div[@class='large-12']//a[@class='c-teaser']"
|
#parent: "//html//body//div//main//div//div[@class='row']//div[@class='large-12']//a[@class='c-teaser']"
|
||||||
parent: "//html//body//div//main//div[@class='row']//a[contains(@class, 'c-teaser--announcement')]"
|
parent: "//html//body//div//main//div[@class='row']//a[contains(@class, 'c-teaser--announcement')]"
|
||||||
child-name: "//div[@class='c-teaser__text-wrapper']//span[@class='c-teaser__title']/text()"
|
child-name: "//div[@class='c-teaser__text-wrapper']//span[@class='c-teaser__title']/text()"
|
||||||
|
|
|
@ -133,10 +133,22 @@ class fdb_spider(object):
|
||||||
f.close
|
f.close
|
||||||
else:
|
else:
|
||||||
from selenium import webdriver
|
from selenium import webdriver
|
||||||
|
from selenium.webdriver.chrome.service import Service
|
||||||
|
from pyvirtualdisplay import Display
|
||||||
|
display = Display(visible=0, size=(800, 800))
|
||||||
|
display.start()
|
||||||
|
|
||||||
|
#outputdir = '.'
|
||||||
|
#service_log_path = "{}/chromedriver.log".format(outputdir)
|
||||||
|
#service_args = ['--verbose']
|
||||||
|
#driver = webdriver.Chrome('/usr/bin/chromium')
|
||||||
options = webdriver.ChromeOptions()
|
options = webdriver.ChromeOptions()
|
||||||
options.add_argument('headless')
|
options.add_argument('headless')
|
||||||
driver = webdriver.Chrome(options=options)
|
options.add_argument("--remote-debugging-port=9222")
|
||||||
|
options.add_argument('--no-sandbox')
|
||||||
|
options.add_argument('--disable-dev-shm-usage')
|
||||||
|
service = Service(executable_path='/usr/bin/chromedriver')
|
||||||
|
driver = webdriver.Chrome(options=options, service=service)
|
||||||
|
|
||||||
|
|
||||||
def find_config_parameter(self, list_of_fdbs):
|
def find_config_parameter(self, list_of_fdbs):
|
||||||
|
@ -357,16 +369,22 @@ class fdb_spider(object):
|
||||||
|
|
||||||
if fdb_domain in link:
|
if fdb_domain in link:
|
||||||
dictionary_entry_list[n]["link"] = link
|
dictionary_entry_list[n]["link"] = link
|
||||||
if fdb_domain not in link and ('http:' in link or 'www.' in link or 'https:' in link):
|
if fdb_domain not in link and 'http:' in link:
|
||||||
|
dictionary_entry_list[n]["link"] = link
|
||||||
|
if fdb_domain not in link and 'www.' in link:
|
||||||
|
dictionary_entry_list[n]["link"] = link
|
||||||
|
if fdb_domain not in link and 'https:' in link:
|
||||||
dictionary_entry_list[n]["link"] = link
|
dictionary_entry_list[n]["link"] = link
|
||||||
if 'javascript:' in link:
|
if 'javascript:' in link:
|
||||||
dictionary_entry_list[n]["link"] = link
|
dictionary_entry_list[n]["link"] = link
|
||||||
else:
|
if fdb_domain not in link and ('http' or 'https' or 'www.') not in link:
|
||||||
if link[-1] == '/':
|
if link[-1] == '/':
|
||||||
dictionary_entry_list[n]["link"] = fdb_domain + link
|
dictionary_entry_list[n]["link"] = fdb_domain + link
|
||||||
else:
|
else:
|
||||||
dictionary_entry_list[n]["link"] = fdb_domain + '/' + link
|
dictionary_entry_list[n]["link"] = fdb_domain + '/' + link
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(
|
print(
|
||||||
"parsing the html did not work. Possibly you first have to run download_link_list_pages_of_funding_databases(). The original error message is:",
|
"parsing the html did not work. Possibly you first have to run download_link_list_pages_of_funding_databases(). The original error message is:",
|
||||||
|
@ -382,10 +400,23 @@ class fdb_spider(object):
|
||||||
def download_entry_data_htmls(self, list_of_fdbs):
|
def download_entry_data_htmls(self, list_of_fdbs):
|
||||||
|
|
||||||
from selenium import webdriver
|
from selenium import webdriver
|
||||||
|
from selenium.webdriver.chrome.service import Service
|
||||||
|
from pyvirtualdisplay import Display
|
||||||
|
display = Display(visible=0, size=(800, 800))
|
||||||
|
display.start()
|
||||||
|
|
||||||
|
#outputdir = '.'
|
||||||
|
#service_log_path = "{}/chromedriver.log".format(outputdir)
|
||||||
|
#service_args = ['--verbose']
|
||||||
|
#driver = webdriver.Chrome('/usr/bin/chromium')
|
||||||
options = webdriver.ChromeOptions()
|
options = webdriver.ChromeOptions()
|
||||||
options.add_argument('headless')
|
options.add_argument('headless')
|
||||||
driver = webdriver.Chrome(options=options)
|
options.add_argument("--remote-debugging-port=9222")
|
||||||
|
options.add_argument('--no-sandbox')
|
||||||
|
options.add_argument('--disable-dev-shm-usage')
|
||||||
|
service = Service(executable_path='/usr/bin/chromedriver')
|
||||||
|
driver = webdriver.Chrome(options=options, service=service)
|
||||||
|
#driver = webdriver.Chrome()
|
||||||
for fdb in list_of_fdbs:
|
for fdb in list_of_fdbs:
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
@ -464,7 +495,7 @@ class fdb_spider(object):
|
||||||
driver.switch_to.window(window_before)
|
driver.switch_to.window(window_before)
|
||||||
|
|
||||||
|
|
||||||
if ('http' or 'www') in entry_link and ('javascript' or 'js' or '.pdf') not in enry_link:
|
if ('http' or 'www') in entry_link and 'javascript' not in entry_link and '.pdf' not in entry_link:
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# defining cookie to not end up in endless loop because of cookie banners pointing to redirects
|
# defining cookie to not end up in endless loop because of cookie banners pointing to redirects
|
||||||
|
|
Loading…
Reference in a new issue