diff --git a/main.py b/main.py new file mode 100644 index 0000000..fdce8be --- /dev/null +++ b/main.py @@ -0,0 +1,18 @@ +from spiders.fdb_spider import * + +config = "spiders/config.yaml" +list_of_fdbs = ["foerderinfo.bund.de"] + + +# Crawl the government funding-database websites named in list_of_fdbs; every step below is currently commented out + +# spider = fdb_spider(config) + +# spider.download_entry_list_pages_of_funding_databases(list_of_fdbs) + +# spider.parse_entry_list_data2dictionary(list_of_fdbs) + +# spider.download_entry_data_htmls(list_of_fdbs) + +# spider.parse_entry_data2dictionary(list_of_fdbs) + diff --git a/spiders/fdb_spider.py b/spiders/fdb_spider.py index a4930b8..13e4635 100644 --- a/spiders/fdb_spider.py +++ b/spiders/fdb_spider.py @@ -20,7 +20,7 @@ class fdb_spider(object): # input list of funding databases in form of yaml file ['foerderinfo.bund.de', 'ausschreibungen.giz.de', .. , 'usw'] - def download_link_list_pages_of_funding_databases(self, list_of_fdbs): + def download_entry_list_pages_of_funding_databases(self, list_of_fdbs): # download only html pages of the funding databases specified in input for fdb in list_of_fdbs: