diff --git a/main.py b/main.py index 15dcd94..94f1f88 100644 --- a/main.py +++ b/main.py @@ -1,15 +1,15 @@ from spiders.fdb_spider import * config = "spiders/config.yaml" -#list_of_fdbs = ["foerderinfo.bund.de"] -list_of_fdbs = ["foerderinfo.bund.de-bekanntmachungen"] +list_of_fdbs = ["giz"] +#list_of_fdbs = ["foerderinfo.bund.de-bekanntmachungen"] # doing the crawling of government websites spider = fdb_spider(config) -# spider.download_entry_list_pages_of_funding_databases(list_of_fdbs) +spider.download_entry_list_pages_of_funding_databases(list_of_fdbs) #spider.find_config_parameter(list_of_fdbs) @@ -17,5 +17,5 @@ spider = fdb_spider(config) #spider.download_entry_data_htmls(list_of_fdbs) -spider.parse_entry_data2dictionary(list_of_fdbs) +#spider.parse_entry_data2dictionary(list_of_fdbs) diff --git a/spiders/__pycache__/fdb_spider.cpython-39.pyc b/spiders/__pycache__/fdb_spider.cpython-39.pyc index 767558c..8d567f1 100644 Binary files a/spiders/__pycache__/fdb_spider.cpython-39.pyc and b/spiders/__pycache__/fdb_spider.cpython-39.pyc differ diff --git a/spiders/config.yaml b/spiders/config.yaml index 1fa1174..12cdb83 100644 --- a/spiders/config.yaml +++ b/spiders/config.yaml @@ -47,3 +47,29 @@ foerderinfo.bund.de-bekanntmachungen: #child-deadline: '//html/body/form/table/tr[2]/td[3]/span + label.1' unifalse: wordlist: "['Mobilität', 'Energie', 'Off-grid', 'regenerative Energien', 'Solar', 'Energienetze', 'Elektromobilität']" + +giz: + domain: 'https://ausschreibungen.giz.de' + entry-list: + link1: 'https://ausschreibungen.giz.de/Satellite/company/welcome.do?method=showTable&fromSearch=1&tableSortPROJECT_RESULT=2&tableSortAttributePROJECT_RESULT=publicationDate&selectedTablePagePROJECT_RESULT=' + link2: '' + iteration-var-list: '[1,2,3,4,5,6,7]' + #parent: "//html//body//div//main//div//div[@class='row']//div[@class='large-12']//a[@class='c-teaser']" + parent: "//html//body//div//div//table[contains(@class, 'csx-new-table')]//tbody" + child-name: "//tr//td[2]/text()" + child-link: "/tr//td[5]/a/@href" + child-info: "/tr//td[3]/text()" + child-period: "/tr/td[1]/text()" + #child-period: "//div[@class='c-teaser__text-wrapper']//small//span/time/text()" + child-sponsor: "//tr/td[4]/text()" + entry: + general: + uniform: 'FALSE' + unitrue: + parent: '//html//body//form//table' + #child-name: '//html//body//form//table//tr[1]//td[2]//span' + #child-sum: '//html//body//form//table//tr[2]//td[1]//span//img' + #child-deadline: '//html/body/form/table/tr[2]/td[3]/span + label.1' + unifalse: + wordlist: "['Mobilität', 'Energie', 'Off-grid', 'regenerative Energien', 'Solar', 'Energienetze', 'Elektromobilität']" + diff --git a/spiders/fdb_spider.py b/spiders/fdb_spider.py index 4f97c90..3b5978e 100644 --- a/spiders/fdb_spider.py +++ b/spiders/fdb_spider.py @@ -69,10 +69,35 @@ class fdb_spider(object): # download the html page of the List of entrys response = urllib.request.urlopen(entry_list_link1 + str(i) + entry_list_link2) - web_content = response.read().decode("UTF-8") - + # web_content = response.read().decode("UTF-8") + + try: + web_content = response.read().decode("UTF-8") + except Exception as e: + try: + web_content = response.read().decode("latin-1") + print( + "decoding the respone in utf8 did not work, try to decode latin1 now - the original error message is:", + e, + ) + except Exception as ex: + print(ex) + + + + # save interim results to files - + if (len(web_content)) < 10: + print('getting the html page through urllib did not work, trying with requests librarys function get') + try: + res = requests.get(entry_list_link1 + str(i) + entry_list_link2) + web_content = res.text + except Exception as e: + print('also requests library did not work, original error is:', e) + + + print(web_content) + f = open("spiders/pages/" + key + str(i) + "entryList.html", "w+") f.write(web_content) f.close