Added a working foerderinfo.bund.de-bekanntmachungen config, including a new example of the XPath contains() function
This commit is contained in:
parent
06fa81e549
commit
ff23c22e3c
4 changed files with 66 additions and 7 deletions
7
main.py
7
main.py
|
@ -1,7 +1,8 @@
|
|||
from spiders.fdb_spider import *
|
||||
|
||||
config = "spiders/config.yaml"
|
||||
list_of_fdbs = ["foerderinfo.bund.de"]
|
||||
#list_of_fdbs = ["foerderinfo.bund.de"]
|
||||
list_of_fdbs = ["foerderinfo.bund.de-bekanntmachungen"]
|
||||
|
||||
|
||||
# doing the crawling of government websites
|
||||
|
@ -10,9 +11,11 @@ spider = fdb_spider(config)
|
|||
|
||||
# spider.download_entry_list_pages_of_funding_databases(list_of_fdbs)
|
||||
|
||||
#spider.find_config_parameter(list_of_fdbs)
|
||||
|
||||
spider.parse_entry_list_data2dictionary(list_of_fdbs)
|
||||
|
||||
spider.download_entry_data_htmls(list_of_fdbs)
|
||||
# spider.download_entry_data_htmls(list_of_fdbs)
|
||||
|
||||
# spider.parse_entry_data2dictionary(list_of_fdbs)
|
||||
|
||||
|
|
Binary file not shown.
|
@ -13,6 +13,9 @@ foerderinfo.bund.de:
|
|||
parent: "//html//body//div//main//div//div[@class='row']//section[@class='l-search-result-list']//div[@class='l-search-result-list__item']"
|
||||
child-name: "//div[@class='c-search-result__text-wrapper']//span[@class='c-search-result__title']/text()"
|
||||
child-link: "/a[@class='c-search-result']/@href"
|
||||
child-info: "//"
|
||||
child-period: "/"
|
||||
child-sponsor: "/"
|
||||
entry:
|
||||
info-1:
|
||||
parent: '//html//body//form//table'
|
||||
|
@ -20,4 +23,23 @@ foerderinfo.bund.de:
|
|||
#child-sum: '//html//body//form//table//tr[2]//td[1]//span//img'
|
||||
#child-deadline: '//html/body/form/table/tr[2]/td[3]/span + label.1'
|
||||
|
||||
foerderinfo.bund.de-mobilitaet:
|
||||
foerderinfo.bund.de-bekanntmachungen:
|
||||
domain: 'http://foerderinfo.bund.de'
|
||||
entry-list:
|
||||
link1: 'https://www.foerderinfo.bund.de/SiteGlobals/Forms/foerderinfo/bekanntmachungen/Bekanntmachungen_Formular.html?gtp=407348_list%253D'
|
||||
link2: '#searchResults'
|
||||
iteration-var-list: '[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31]'
|
||||
#parent: "//html//body//div//main//div//div[@class='row']//div[@class='large-12']//a[@class='c-teaser']"
|
||||
parent: "//html//body//div//main//div[@class='row']//a[contains(@class, 'c-teaser--announcement')]"
|
||||
child-name: "//div[@class='c-teaser__text-wrapper']//span[@class='c-teaser__title']/text()"
|
||||
child-link: "/@href"
|
||||
child-info: "//div[@class='c-teaser__text-wrapper']//div[@class='c-teaser__text']/p/text()"
|
||||
#child-period: "//div[@class='c-teaser__text-wrapper']//small[@class='c-topline']/span[@class='c-topline__item']/text()"
|
||||
child-period: "//div[@class='c-teaser__text-wrapper']//small//span/time/text()"
|
||||
child-sponsor: "//div[@class='c-teaser__text-wrapper']//small[@class='c-topline']//span[@class='c-topline__item']/span[@class='c-topline__category']/text()"
|
||||
entry:
|
||||
info-1:
|
||||
parent: '//html//body//form//table'
|
||||
#child-name: '//html//body//form//table//tr[1]//td[2]//span'
|
||||
#child-sum: '//html//body//form//table//tr[2]//td[1]//span//img'
|
||||
#child-deadline: '//html/body/form/table/tr[2]/td[3]/span + label.1'
|
||||
|
|
|
@ -89,6 +89,9 @@ class fdb_spider(object):
|
|||
fdb_conf_entry_list_parent = fdb_conf_entry_list.get("parent")
|
||||
fdb_conf_entry_list_child_name = fdb_conf_entry_list.get("child-name")
|
||||
fdb_conf_entry_list_child_link = fdb_conf_entry_list.get("child-link")
|
||||
fdb_conf_entry_list_child_info = fdb_conf_entry_list.get("child-info")
|
||||
fdb_conf_entry_list_child_period = fdb_conf_entry_list.get("child-period")
|
||||
|
||||
|
||||
for i in iteration_var_list:
|
||||
print(i)
|
||||
|
@ -123,19 +126,33 @@ class fdb_spider(object):
|
|||
print('-----------------------------------------------------------------------------------------------------------------------------------------')
|
||||
print(etree.tostring(tree.xpath(fdb_conf_entry_list_parent)[n]).decode())
|
||||
|
||||
print('this is the first actual name element:')
|
||||
print('this is the name children:')
|
||||
|
||||
name_element = tree.xpath(fdb_conf_entry_list_parent + fdb_conf_entry_list_child_name)
|
||||
print(name_element)
|
||||
for name in name_element:
|
||||
print(name)
|
||||
#for name in name_element:
|
||||
# print(name)
|
||||
print(len(name_element))
|
||||
|
||||
print('this is the first actual link element:')
|
||||
print('this is the link children:')
|
||||
|
||||
link_element = tree.xpath(fdb_conf_entry_list_parent + fdb_conf_entry_list_child_link)
|
||||
print(link_element)
|
||||
#for link in link_element:
|
||||
# print(link)
|
||||
print(len(link_element))
|
||||
|
||||
print('this is the info children:')
|
||||
|
||||
info_element = tree.xpath(fdb_conf_entry_list_parent + fdb_conf_entry_list_child_info)
|
||||
print(info_element)
|
||||
print(len(info_element))
|
||||
|
||||
print('this is the period children:')
|
||||
|
||||
period_element = tree.xpath(fdb_conf_entry_list_parent + fdb_conf_entry_list_child_period)
|
||||
print(period_element)
|
||||
print(len(period_element))
|
||||
|
||||
except Exception as e:
|
||||
print(
|
||||
|
@ -194,6 +211,10 @@ class fdb_spider(object):
|
|||
fdb_conf_entry_list_parent = fdb_conf_entry_list.get("parent")
|
||||
fdb_conf_entry_list_child_name = fdb_conf_entry_list.get("child-name")
|
||||
fdb_conf_entry_list_child_link = fdb_conf_entry_list.get("child-link")
|
||||
fdb_conf_entry_list_child_info = fdb_conf_entry_list.get("child-info")
|
||||
fdb_conf_entry_list_child_period = fdb_conf_entry_list.get("child-period")
|
||||
|
||||
|
||||
print('blabliblub')
|
||||
print('len', len(tree.xpath(fdb_conf_entry_list_parent)))
|
||||
for n in range(len(tree.xpath(fdb_conf_entry_list_parent))):
|
||||
|
@ -202,6 +223,17 @@ class fdb_spider(object):
|
|||
fdb_conf_entry_list_parent
|
||||
+ fdb_conf_entry_list_child_name
|
||||
)[n]
|
||||
|
||||
info = tree.xpath(
|
||||
fdb_conf_entry_list_parent
|
||||
+ fdb_conf_entry_list_child_info
|
||||
)[n]
|
||||
|
||||
period = tree.xpath(
|
||||
fdb_conf_entry_list_parent
|
||||
+ fdb_conf_entry_list_child_period
|
||||
)[n]
|
||||
|
||||
print('oi ', name)
|
||||
print('blablidubbiduub')
|
||||
link = tree.xpath(
|
||||
|
@ -217,6 +249,8 @@ class fdb_spider(object):
|
|||
if len(name) > 0:
|
||||
dictionary_entry_list[n] = {}
|
||||
dictionary_entry_list[n]["name"] = name
|
||||
dictionary_entry_list[n]["info"] = info
|
||||
dictionary_entry_list[n]["period"] = period
|
||||
|
||||
if fdb_domain in link:
|
||||
dictionary_entry_list[n]["link"] = link
|
||||
|
|
Loading…
Reference in a new issue