diff --git a/spiders/config.yaml b/spiders/config.yaml index 16d2c41..1fa1174 100644 --- a/spiders/config.yaml +++ b/spiders/config.yaml @@ -38,8 +38,12 @@ foerderinfo.bund.de-bekanntmachungen: child-period: "//div[@class='c-teaser__text-wrapper']//small//span/time/text()" child-sponsor: "//div[@class='c-teaser__text-wrapper']//small[@class='c-topline']//span[@class='c-topline__item']/span[@class='c-topline__category']/text()" entry: - info-1: + general: + uniform: 'FALSE' + unitrue: parent: '//html//body//form//table' #child-name: '//html//body//form//table//tr[1]//td[2]//span' #child-sum: '//html//body//form//table//tr[2]//td[1]//span//img' #child-deadline: '//html/body/form/table/tr[2]/td[3]/span + label.1' + unifalse: + wordlist: "['Mobilität', 'Energie', 'Off-grid', 'regenerative Energien', 'Solar', 'Energienetze', 'Elektromobilität']" diff --git a/spiders/fdb_spider.py b/spiders/fdb_spider.py index 6ec37cf..b7e9e73 100644 --- a/spiders/fdb_spider.py +++ b/spiders/fdb_spider.py @@ -327,6 +327,7 @@ class fdb_spider(object): # download the html page of the entry try: + # send a dummy cookie so that cookie-banner redirects do not trap the request in an endless loop url = entry_link req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0', 'Cookie':'myCookie=lovely'}) response = urllib.request.urlopen(req)