Added a working foerderinfo.bund.de-bekanntmachungen config, including a new example of the XPath contains() function
This commit is contained in:
parent
06fa81e549
commit
ff23c22e3c
4 changed files with 66 additions and 7 deletions
7
main.py
7
main.py
|
@ -1,7 +1,8 @@
|
||||||
from spiders.fdb_spider import *
|
from spiders.fdb_spider import *
|
||||||
|
|
||||||
config = "spiders/config.yaml"
|
config = "spiders/config.yaml"
|
||||||
list_of_fdbs = ["foerderinfo.bund.de"]
|
#list_of_fdbs = ["foerderinfo.bund.de"]
|
||||||
|
list_of_fdbs = ["foerderinfo.bund.de-bekanntmachungen"]
|
||||||
|
|
||||||
|
|
||||||
# doing the crawling of government websites
|
# doing the crawling of government websites
|
||||||
|
@ -10,9 +11,11 @@ spider = fdb_spider(config)
|
||||||
|
|
||||||
# spider.download_entry_list_pages_of_funding_databases(list_of_fdbs)
|
# spider.download_entry_list_pages_of_funding_databases(list_of_fdbs)
|
||||||
|
|
||||||
|
#spider.find_config_parameter(list_of_fdbs)
|
||||||
|
|
||||||
spider.parse_entry_list_data2dictionary(list_of_fdbs)
|
spider.parse_entry_list_data2dictionary(list_of_fdbs)
|
||||||
|
|
||||||
spider.download_entry_data_htmls(list_of_fdbs)
|
# spider.download_entry_data_htmls(list_of_fdbs)
|
||||||
|
|
||||||
# spider.parse_entry_data2dictionary(list_of_fdbs)
|
# spider.parse_entry_data2dictionary(list_of_fdbs)
|
||||||
|
|
||||||
|
|
Binary file not shown.
|
@ -13,6 +13,9 @@ foerderinfo.bund.de:
|
||||||
parent: "//html//body//div//main//div//div[@class='row']//section[@class='l-search-result-list']//div[@class='l-search-result-list__item']"
|
parent: "//html//body//div//main//div//div[@class='row']//section[@class='l-search-result-list']//div[@class='l-search-result-list__item']"
|
||||||
child-name: "//div[@class='c-search-result__text-wrapper']//span[@class='c-search-result__title']/text()"
|
child-name: "//div[@class='c-search-result__text-wrapper']//span[@class='c-search-result__title']/text()"
|
||||||
child-link: "/a[@class='c-search-result']/@href"
|
child-link: "/a[@class='c-search-result']/@href"
|
||||||
|
child-info: "//"
|
||||||
|
child-period: "/"
|
||||||
|
child-sponsor: "/"
|
||||||
entry:
|
entry:
|
||||||
info-1:
|
info-1:
|
||||||
parent: '//html//body//form//table'
|
parent: '//html//body//form//table'
|
||||||
|
@ -20,4 +23,23 @@ foerderinfo.bund.de:
|
||||||
#child-sum: '//html//body//form//table//tr[2]//td[1]//span//img'
|
#child-sum: '//html//body//form//table//tr[2]//td[1]//span//img'
|
||||||
#child-deadline: '//html/body/form/table/tr[2]/td[3]/span + label.1'
|
#child-deadline: '//html/body/form/table/tr[2]/td[3]/span + label.1'
|
||||||
|
|
||||||
foerderinfo.bund.de-mobilitaet:
|
foerderinfo.bund.de-bekanntmachungen:
|
||||||
|
domain: 'http://foerderinfo.bund.de'
|
||||||
|
entry-list:
|
||||||
|
link1: 'https://www.foerderinfo.bund.de/SiteGlobals/Forms/foerderinfo/bekanntmachungen/Bekanntmachungen_Formular.html?gtp=407348_list%253D'
|
||||||
|
link2: '#searchResults'
|
||||||
|
iteration-var-list: '[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31]'
|
||||||
|
#parent: "//html//body//div//main//div//div[@class='row']//div[@class='large-12']//a[@class='c-teaser']"
|
||||||
|
parent: "//html//body//div//main//div[@class='row']//a[contains(@class, 'c-teaser--announcement')]"
|
||||||
|
child-name: "//div[@class='c-teaser__text-wrapper']//span[@class='c-teaser__title']/text()"
|
||||||
|
child-link: "/@href"
|
||||||
|
child-info: "//div[@class='c-teaser__text-wrapper']//div[@class='c-teaser__text']/p/text()"
|
||||||
|
#child-period: "//div[@class='c-teaser__text-wrapper']//small[@class='c-topline']/span[@class='c-topline__item']/text()"
|
||||||
|
child-period: "//div[@class='c-teaser__text-wrapper']//small//span/time/text()"
|
||||||
|
child-sponsor: "//div[@class='c-teaser__text-wrapper']//small[@class='c-topline']//span[@class='c-topline__item']/span[@class='c-topline__category']/text()"
|
||||||
|
entry:
|
||||||
|
info-1:
|
||||||
|
parent: '//html//body//form//table'
|
||||||
|
#child-name: '//html//body//form//table//tr[1]//td[2]//span'
|
||||||
|
#child-sum: '//html//body//form//table//tr[2]//td[1]//span//img'
|
||||||
|
#child-deadline: '//html/body/form/table/tr[2]/td[3]/span + label.1'
|
||||||
|
|
|
@ -89,6 +89,9 @@ class fdb_spider(object):
|
||||||
fdb_conf_entry_list_parent = fdb_conf_entry_list.get("parent")
|
fdb_conf_entry_list_parent = fdb_conf_entry_list.get("parent")
|
||||||
fdb_conf_entry_list_child_name = fdb_conf_entry_list.get("child-name")
|
fdb_conf_entry_list_child_name = fdb_conf_entry_list.get("child-name")
|
||||||
fdb_conf_entry_list_child_link = fdb_conf_entry_list.get("child-link")
|
fdb_conf_entry_list_child_link = fdb_conf_entry_list.get("child-link")
|
||||||
|
fdb_conf_entry_list_child_info = fdb_conf_entry_list.get("child-info")
|
||||||
|
fdb_conf_entry_list_child_period = fdb_conf_entry_list.get("child-period")
|
||||||
|
|
||||||
|
|
||||||
for i in iteration_var_list:
|
for i in iteration_var_list:
|
||||||
print(i)
|
print(i)
|
||||||
|
@ -123,19 +126,33 @@ class fdb_spider(object):
|
||||||
print('-----------------------------------------------------------------------------------------------------------------------------------------')
|
print('-----------------------------------------------------------------------------------------------------------------------------------------')
|
||||||
print(etree.tostring(tree.xpath(fdb_conf_entry_list_parent)[n]).decode())
|
print(etree.tostring(tree.xpath(fdb_conf_entry_list_parent)[n]).decode())
|
||||||
|
|
||||||
print('this is the first actual name element:')
|
print('this is the name children:')
|
||||||
|
|
||||||
name_element = tree.xpath(fdb_conf_entry_list_parent + fdb_conf_entry_list_child_name)
|
name_element = tree.xpath(fdb_conf_entry_list_parent + fdb_conf_entry_list_child_name)
|
||||||
print(name_element)
|
print(name_element)
|
||||||
for name in name_element:
|
#for name in name_element:
|
||||||
print(name)
|
# print(name)
|
||||||
|
print(len(name_element))
|
||||||
|
|
||||||
print('this is the first actual link element:')
|
print('this is the link children:')
|
||||||
|
|
||||||
link_element = tree.xpath(fdb_conf_entry_list_parent + fdb_conf_entry_list_child_link)
|
link_element = tree.xpath(fdb_conf_entry_list_parent + fdb_conf_entry_list_child_link)
|
||||||
print(link_element)
|
print(link_element)
|
||||||
#for link in link_element:
|
#for link in link_element:
|
||||||
# print(link)
|
# print(link)
|
||||||
|
print(len(link_element))
|
||||||
|
|
||||||
|
print('this is the info children:')
|
||||||
|
|
||||||
|
info_element = tree.xpath(fdb_conf_entry_list_parent + fdb_conf_entry_list_child_info)
|
||||||
|
print(info_element)
|
||||||
|
print(len(info_element))
|
||||||
|
|
||||||
|
print('this is the period children:')
|
||||||
|
|
||||||
|
period_element = tree.xpath(fdb_conf_entry_list_parent + fdb_conf_entry_list_child_period)
|
||||||
|
print(period_element)
|
||||||
|
print(len(period_element))
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(
|
print(
|
||||||
|
@ -194,6 +211,10 @@ class fdb_spider(object):
|
||||||
fdb_conf_entry_list_parent = fdb_conf_entry_list.get("parent")
|
fdb_conf_entry_list_parent = fdb_conf_entry_list.get("parent")
|
||||||
fdb_conf_entry_list_child_name = fdb_conf_entry_list.get("child-name")
|
fdb_conf_entry_list_child_name = fdb_conf_entry_list.get("child-name")
|
||||||
fdb_conf_entry_list_child_link = fdb_conf_entry_list.get("child-link")
|
fdb_conf_entry_list_child_link = fdb_conf_entry_list.get("child-link")
|
||||||
|
fdb_conf_entry_list_child_info = fdb_conf_entry_list.get("child-info")
|
||||||
|
fdb_conf_entry_list_child_period = fdb_conf_entry_list.get("child-period")
|
||||||
|
|
||||||
|
|
||||||
print('blabliblub')
|
print('blabliblub')
|
||||||
print('len', len(tree.xpath(fdb_conf_entry_list_parent)))
|
print('len', len(tree.xpath(fdb_conf_entry_list_parent)))
|
||||||
for n in range(len(tree.xpath(fdb_conf_entry_list_parent))):
|
for n in range(len(tree.xpath(fdb_conf_entry_list_parent))):
|
||||||
|
@ -202,6 +223,17 @@ class fdb_spider(object):
|
||||||
fdb_conf_entry_list_parent
|
fdb_conf_entry_list_parent
|
||||||
+ fdb_conf_entry_list_child_name
|
+ fdb_conf_entry_list_child_name
|
||||||
)[n]
|
)[n]
|
||||||
|
|
||||||
|
info = tree.xpath(
|
||||||
|
fdb_conf_entry_list_parent
|
||||||
|
+ fdb_conf_entry_list_child_info
|
||||||
|
)[n]
|
||||||
|
|
||||||
|
period = tree.xpath(
|
||||||
|
fdb_conf_entry_list_parent
|
||||||
|
+ fdb_conf_entry_list_child_period
|
||||||
|
)[n]
|
||||||
|
|
||||||
print('oi ', name)
|
print('oi ', name)
|
||||||
print('blablidubbiduub')
|
print('blablidubbiduub')
|
||||||
link = tree.xpath(
|
link = tree.xpath(
|
||||||
|
@ -217,6 +249,8 @@ class fdb_spider(object):
|
||||||
if len(name) > 0:
|
if len(name) > 0:
|
||||||
dictionary_entry_list[n] = {}
|
dictionary_entry_list[n] = {}
|
||||||
dictionary_entry_list[n]["name"] = name
|
dictionary_entry_list[n]["name"] = name
|
||||||
|
dictionary_entry_list[n]["info"] = info
|
||||||
|
dictionary_entry_list[n]["period"] = period
|
||||||
|
|
||||||
if fdb_domain in link:
|
if fdb_domain in link:
|
||||||
dictionary_entry_list[n]["link"] = link
|
dictionary_entry_list[n]["link"] = link
|
||||||
|
|
Loading…
Reference in a new issue