Browse Source

Added a working bund.de-bekanntmachungen config with a new example of the XPath contains() function

onlinkgen
alpcentaur 1 year ago
parent
commit
ff23c22e3c
4 changed files with 66 additions and 7 deletions
  1. +5
    -2
      main.py
  2. BIN
      spiders/__pycache__/fdb_spider.cpython-39.pyc
  3. +23
    -1
      spiders/config.yaml
  4. +38
    -4
      spiders/fdb_spider.py

+ 5
- 2
main.py View File

@ -1,7 +1,8 @@
from spiders.fdb_spider import * from spiders.fdb_spider import *
config = "spiders/config.yaml" config = "spiders/config.yaml"
list_of_fdbs = ["foerderinfo.bund.de"]
#list_of_fdbs = ["foerderinfo.bund.de"]
list_of_fdbs = ["foerderinfo.bund.de-bekanntmachungen"]
# doing the crawling of government websites # doing the crawling of government websites
@ -10,9 +11,11 @@ spider = fdb_spider(config)
# spider.download_entry_list_pages_of_funding_databases(list_of_fdbs) # spider.download_entry_list_pages_of_funding_databases(list_of_fdbs)
#spider.find_config_parameter(list_of_fdbs)
spider.parse_entry_list_data2dictionary(list_of_fdbs) spider.parse_entry_list_data2dictionary(list_of_fdbs)
spider.download_entry_data_htmls(list_of_fdbs)
# spider.download_entry_data_htmls(list_of_fdbs)
# spider.parse_entry_data2dictionary(list_of_fdbs) # spider.parse_entry_data2dictionary(list_of_fdbs)

BIN
spiders/__pycache__/fdb_spider.cpython-39.pyc View File


+ 23
- 1
spiders/config.yaml View File

@ -13,6 +13,9 @@ foerderinfo.bund.de:
parent: "//html//body//div//main//div//div[@class='row']//section[@class='l-search-result-list']//div[@class='l-search-result-list__item']" parent: "//html//body//div//main//div//div[@class='row']//section[@class='l-search-result-list']//div[@class='l-search-result-list__item']"
child-name: "//div[@class='c-search-result__text-wrapper']//span[@class='c-search-result__title']/text()" child-name: "//div[@class='c-search-result__text-wrapper']//span[@class='c-search-result__title']/text()"
child-link: "/a[@class='c-search-result']/@href" child-link: "/a[@class='c-search-result']/@href"
child-info: "//"
child-period: "/"
child-sponsor: "/"
entry: entry:
info-1: info-1:
parent: '//html//body//form//table' parent: '//html//body//form//table'
@ -20,4 +23,23 @@ foerderinfo.bund.de:
#child-sum: '//html//body//form//table//tr[2]//td[1]//span//img' #child-sum: '//html//body//form//table//tr[2]//td[1]//span//img'
#child-deadline: '//html/body/form/table/tr[2]/td[3]/span + label.1' #child-deadline: '//html/body/form/table/tr[2]/td[3]/span + label.1'
foerderinfo.bund.de-mobilitaet:
foerderinfo.bund.de-bekanntmachungen:
domain: 'http://foerderinfo.bund.de'
entry-list:
link1: 'https://www.foerderinfo.bund.de/SiteGlobals/Forms/foerderinfo/bekanntmachungen/Bekanntmachungen_Formular.html?gtp=407348_list%253D'
link2: '#searchResults'
iteration-var-list: '[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31]'
#parent: "//html//body//div//main//div//div[@class='row']//div[@class='large-12']//a[@class='c-teaser']"
parent: "//html//body//div//main//div[@class='row']//a[contains(@class, 'c-teaser--announcement')]"
child-name: "//div[@class='c-teaser__text-wrapper']//span[@class='c-teaser__title']/text()"
child-link: "/@href"
child-info: "//div[@class='c-teaser__text-wrapper']//div[@class='c-teaser__text']/p/text()"
#child-period: "//div[@class='c-teaser__text-wrapper']//small[@class='c-topline']/span[@class='c-topline__item']/text()"
child-period: "//div[@class='c-teaser__text-wrapper']//small//span/time/text()"
child-sponsor: "//div[@class='c-teaser__text-wrapper']//small[@class='c-topline']//span[@class='c-topline__item']/span[@class='c-topline__category']/text()"
entry:
info-1:
parent: '//html//body//form//table'
#child-name: '//html//body//form//table//tr[1]//td[2]//span'
#child-sum: '//html//body//form//table//tr[2]//td[1]//span//img'
#child-deadline: '//html/body/form/table/tr[2]/td[3]/span + label.1'

+ 38
- 4
spiders/fdb_spider.py View File

@ -89,6 +89,9 @@ class fdb_spider(object):
fdb_conf_entry_list_parent = fdb_conf_entry_list.get("parent") fdb_conf_entry_list_parent = fdb_conf_entry_list.get("parent")
fdb_conf_entry_list_child_name = fdb_conf_entry_list.get("child-name") fdb_conf_entry_list_child_name = fdb_conf_entry_list.get("child-name")
fdb_conf_entry_list_child_link = fdb_conf_entry_list.get("child-link") fdb_conf_entry_list_child_link = fdb_conf_entry_list.get("child-link")
fdb_conf_entry_list_child_info = fdb_conf_entry_list.get("child-info")
fdb_conf_entry_list_child_period = fdb_conf_entry_list.get("child-period")
for i in iteration_var_list: for i in iteration_var_list:
print(i) print(i)
@ -123,19 +126,33 @@ class fdb_spider(object):
print('-----------------------------------------------------------------------------------------------------------------------------------------') print('-----------------------------------------------------------------------------------------------------------------------------------------')
print(etree.tostring(tree.xpath(fdb_conf_entry_list_parent)[n]).decode()) print(etree.tostring(tree.xpath(fdb_conf_entry_list_parent)[n]).decode())
print('this is the first actual name element:')
print('this is the name children:')
name_element = tree.xpath(fdb_conf_entry_list_parent + fdb_conf_entry_list_child_name) name_element = tree.xpath(fdb_conf_entry_list_parent + fdb_conf_entry_list_child_name)
print(name_element) print(name_element)
for name in name_element:
print(name)
#for name in name_element:
# print(name)
print(len(name_element))
print('this is the first actual link element:')
print('this is the link children:')
link_element = tree.xpath(fdb_conf_entry_list_parent + fdb_conf_entry_list_child_link) link_element = tree.xpath(fdb_conf_entry_list_parent + fdb_conf_entry_list_child_link)
print(link_element) print(link_element)
#for link in link_element: #for link in link_element:
# print(link) # print(link)
print(len(link_element))
print('this is the info children:')
info_element = tree.xpath(fdb_conf_entry_list_parent + fdb_conf_entry_list_child_info)
print(info_element)
print(len(info_element))
print('this is the period children:')
period_element = tree.xpath(fdb_conf_entry_list_parent + fdb_conf_entry_list_child_period)
print(period_element)
print(len(period_element))
except Exception as e: except Exception as e:
print( print(
@ -194,6 +211,10 @@ class fdb_spider(object):
fdb_conf_entry_list_parent = fdb_conf_entry_list.get("parent") fdb_conf_entry_list_parent = fdb_conf_entry_list.get("parent")
fdb_conf_entry_list_child_name = fdb_conf_entry_list.get("child-name") fdb_conf_entry_list_child_name = fdb_conf_entry_list.get("child-name")
fdb_conf_entry_list_child_link = fdb_conf_entry_list.get("child-link") fdb_conf_entry_list_child_link = fdb_conf_entry_list.get("child-link")
fdb_conf_entry_list_child_info = fdb_conf_entry_list.get("child-info")
fdb_conf_entry_list_child_period = fdb_conf_entry_list.get("child-period")
print('blabliblub') print('blabliblub')
print('len', len(tree.xpath(fdb_conf_entry_list_parent))) print('len', len(tree.xpath(fdb_conf_entry_list_parent)))
for n in range(len(tree.xpath(fdb_conf_entry_list_parent))): for n in range(len(tree.xpath(fdb_conf_entry_list_parent))):
@ -202,6 +223,17 @@ class fdb_spider(object):
fdb_conf_entry_list_parent fdb_conf_entry_list_parent
+ fdb_conf_entry_list_child_name + fdb_conf_entry_list_child_name
)[n] )[n]
info = tree.xpath(
fdb_conf_entry_list_parent
+ fdb_conf_entry_list_child_info
)[n]
period = tree.xpath(
fdb_conf_entry_list_parent
+ fdb_conf_entry_list_child_period
)[n]
print('oi ', name) print('oi ', name)
print('blablidubbiduub') print('blablidubbiduub')
link = tree.xpath( link = tree.xpath(
@ -217,6 +249,8 @@ class fdb_spider(object):
if len(name) > 0: if len(name) > 0:
dictionary_entry_list[n] = {} dictionary_entry_list[n] = {}
dictionary_entry_list[n]["name"] = name dictionary_entry_list[n]["name"] = name
dictionary_entry_list[n]["info"] = info
dictionary_entry_list[n]["period"] = period
if fdb_domain in link: if fdb_domain in link:
dictionary_entry_list[n]["link"] = link dictionary_entry_list[n]["link"] = link

Loading…
Cancel
Save