Browse Source

Added first config parameters for searching non-uniform entries

onlinkgen
alpcentaur 11 months ago
parent
commit
b2cf4b67ce
2 changed files with 6 additions and 1 deletion
  1. +5
    -1
      spiders/config.yaml
  2. +1
    -0
      spiders/fdb_spider.py

+ 5
- 1
spiders/config.yaml View File

@ -38,8 +38,12 @@ foerderinfo.bund.de-bekanntmachungen:
child-period: "//div[@class='c-teaser__text-wrapper']//small//span/time/text()" child-period: "//div[@class='c-teaser__text-wrapper']//small//span/time/text()"
child-sponsor: "//div[@class='c-teaser__text-wrapper']//small[@class='c-topline']//span[@class='c-topline__item']/span[@class='c-topline__category']/text()" child-sponsor: "//div[@class='c-teaser__text-wrapper']//small[@class='c-topline']//span[@class='c-topline__item']/span[@class='c-topline__category']/text()"
entry: entry:
info-1:
general:
uniform: 'FALSE'
unitrue:
parent: '//html//body//form//table' parent: '//html//body//form//table'
#child-name: '//html//body//form//table//tr[1]//td[2]//span' #child-name: '//html//body//form//table//tr[1]//td[2]//span'
#child-sum: '//html//body//form//table//tr[2]//td[1]//span//img' #child-sum: '//html//body//form//table//tr[2]//td[1]//span//img'
#child-deadline: '//html/body/form/table/tr[2]/td[3]/span + label.1' #child-deadline: '//html/body/form/table/tr[2]/td[3]/span + label.1'
unifalse:
wordlist: "['Mobilität', 'Energie', 'Off-grid', 'regenerative Energien', 'Solar', 'Energienetze', 'Elektromobilität']"

+ 1
- 0
spiders/fdb_spider.py View File

@ -327,6 +327,7 @@ class fdb_spider(object):
# download the html page of the entry # download the html page of the entry
try: try:
# define a cookie to avoid ending up in an endless loop caused by cookie banners that redirect
url = entry_link url = entry_link
req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0', 'Cookie':'myCookie=lovely'}) req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0', 'Cookie':'myCookie=lovely'})
response = urllib.request.urlopen(req) response = urllib.request.urlopen(req)

Loading…
Cancel
Save