diff --git a/main.py b/main.py index 519b63d..e83d588 100644 --- a/main.py +++ b/main.py @@ -1,7 +1,8 @@ from spiders.fdb_spider import * config = "spiders/config.yaml" -list_of_fdbs = ["foerderinfo.bund.de"] +#list_of_fdbs = ["foerderinfo.bund.de"] +list_of_fdbs = ["foerderinfo.bund.de-bekanntmachungen"] # doing the crawling of government websites @@ -10,9 +11,11 @@ spider = fdb_spider(config) # spider.download_entry_list_pages_of_funding_databases(list_of_fdbs) +#spider.find_config_parameter(list_of_fdbs) + spider.parse_entry_list_data2dictionary(list_of_fdbs) -spider.download_entry_data_htmls(list_of_fdbs) +# spider.download_entry_data_htmls(list_of_fdbs) # spider.parse_entry_data2dictionary(list_of_fdbs) diff --git a/spiders/__pycache__/fdb_spider.cpython-39.pyc b/spiders/__pycache__/fdb_spider.cpython-39.pyc index 2b5f1a9..3c02d28 100644 Binary files a/spiders/__pycache__/fdb_spider.cpython-39.pyc and b/spiders/__pycache__/fdb_spider.cpython-39.pyc differ diff --git a/spiders/config.yaml b/spiders/config.yaml index 6ddc76f..16d2c41 100644 --- a/spiders/config.yaml +++ b/spiders/config.yaml @@ -13,6 +13,9 @@ foerderinfo.bund.de: parent: "//html//body//div//main//div//div[@class='row']//section[@class='l-search-result-list']//div[@class='l-search-result-list__item']" child-name: "//div[@class='c-search-result__text-wrapper']//span[@class='c-search-result__title']/text()" child-link: "/a[@class='c-search-result']/@href" + child-info: "//" + child-period: "/" + child-sponsor: "/" entry: info-1: parent: '//html//body//form//table' @@ -20,4 +23,23 @@ foerderinfo.bund.de: #child-sum: '//html//body//form//table//tr[2]//td[1]//span//img' #child-deadline: '//html/body/form/table/tr[2]/td[3]/span + label.1' -foerderinfo.bund.de-mobilitaet: +foerderinfo.bund.de-bekanntmachungen: + domain: 'http://foerderinfo.bund.de' + entry-list: + link1: 'https://www.foerderinfo.bund.de/SiteGlobals/Forms/foerderinfo/bekanntmachungen/Bekanntmachungen_Formular.html?gtp=407348_list%253D' + link2: '#searchResults' + iteration-var-list: '[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31]' + #parent: "//html//body//div//main//div//div[@class='row']//div[@class='large-12']//a[@class='c-teaser']" + parent: "//html//body//div//main//div[@class='row']//a[contains(@class, 'c-teaser--announcement')]" + child-name: "//div[@class='c-teaser__text-wrapper']//span[@class='c-teaser__title']/text()" + child-link: "/@href" + child-info: "//div[@class='c-teaser__text-wrapper']//div[@class='c-teaser__text']/p/text()" + #child-period: "//div[@class='c-teaser__text-wrapper']//small[@class='c-topline']/span[@class='c-topline__item']/text()" + child-period: "//div[@class='c-teaser__text-wrapper']//small//span/time/text()" + child-sponsor: "//div[@class='c-teaser__text-wrapper']//small[@class='c-topline']//span[@class='c-topline__item']/span[@class='c-topline__category']/text()" + entry: + info-1: + parent: '//html//body//form//table' + #child-name: '//html//body//form//table//tr[1]//td[2]//span' + #child-sum: '//html//body//form//table//tr[2]//td[1]//span//img' + #child-deadline: '//html/body/form/table/tr[2]/td[3]/span + label.1' diff --git a/spiders/fdb_spider.py b/spiders/fdb_spider.py index 703688f..8aa6ae9 100644 --- a/spiders/fdb_spider.py +++ b/spiders/fdb_spider.py @@ -89,6 +89,9 @@ class fdb_spider(object): fdb_conf_entry_list_parent = fdb_conf_entry_list.get("parent") fdb_conf_entry_list_child_name = fdb_conf_entry_list.get("child-name") fdb_conf_entry_list_child_link = fdb_conf_entry_list.get("child-link") + fdb_conf_entry_list_child_info = fdb_conf_entry_list.get("child-info") + fdb_conf_entry_list_child_period = fdb_conf_entry_list.get("child-period") + for i in iteration_var_list: print(i) @@ -123,19 +126,33 @@ class fdb_spider(object): print('-----------------------------------------------------------------------------------------------------------------------------------------') print(etree.tostring(tree.xpath(fdb_conf_entry_list_parent)[n]).decode()) - print('this is the first actual name element:') + print('this is the name children:') name_element = tree.xpath(fdb_conf_entry_list_parent + fdb_conf_entry_list_child_name) print(name_element) - for name in name_element: - print(name) + #for name in name_element: + # print(name) + print(len(name_element)) - print('this is the first actual link element:') + print('this is the link children:') link_element = tree.xpath(fdb_conf_entry_list_parent + fdb_conf_entry_list_child_link) print(link_element) #for link in link_element: # print(link) + print(len(link_element)) + + print('this is the info children:') + + info_element = tree.xpath(fdb_conf_entry_list_parent + fdb_conf_entry_list_child_info) + print(info_element) + print(len(info_element)) + + print('this is the period children:') + + period_element = tree.xpath(fdb_conf_entry_list_parent + fdb_conf_entry_list_child_period) + print(period_element) + print(len(period_element)) except Exception as e: print( @@ -194,6 +211,10 @@ class fdb_spider(object): fdb_conf_entry_list_parent = fdb_conf_entry_list.get("parent") fdb_conf_entry_list_child_name = fdb_conf_entry_list.get("child-name") fdb_conf_entry_list_child_link = fdb_conf_entry_list.get("child-link") + fdb_conf_entry_list_child_info = fdb_conf_entry_list.get("child-info") + fdb_conf_entry_list_child_period = fdb_conf_entry_list.get("child-period") + + print('blabliblub') print('len', len(tree.xpath(fdb_conf_entry_list_parent))) for n in range(len(tree.xpath(fdb_conf_entry_list_parent))): @@ -202,6 +223,17 @@ class fdb_spider(object): fdb_conf_entry_list_parent + fdb_conf_entry_list_child_name )[n] + + info = tree.xpath( + fdb_conf_entry_list_parent + + fdb_conf_entry_list_child_info + )[n] + + period = tree.xpath( + fdb_conf_entry_list_parent + + fdb_conf_entry_list_child_period + )[n] + print('oi ', name) print('blablidubbiduub') link = tree.xpath( @@ -217,6 +249,8 @@ class fdb_spider(object): if len(name) > 0: dictionary_entry_list[n] = {} dictionary_entry_list[n]["name"] = name + dictionary_entry_list[n]["info"] = info + dictionary_entry_list[n]["period"] = period if fdb_domain in link: dictionary_entry_list[n]["link"] = link