Added a further database to config.yaml; added new exception handling for downloading JS-generated HTML pages
This commit is contained in:
parent
df4a8289b8
commit
a0075e429d
4 changed files with 58 additions and 7 deletions
8
main.py
8
main.py
|
@ -1,15 +1,15 @@
|
||||||
from spiders.fdb_spider import *
|
from spiders.fdb_spider import *
|
||||||
|
|
||||||
config = "spiders/config.yaml"
|
config = "spiders/config.yaml"
|
||||||
#list_of_fdbs = ["foerderinfo.bund.de"]
|
list_of_fdbs = ["giz"]
|
||||||
list_of_fdbs = ["foerderinfo.bund.de-bekanntmachungen"]
|
#list_of_fdbs = ["foerderinfo.bund.de-bekanntmachungen"]
|
||||||
|
|
||||||
|
|
||||||
# doing the crawling of government websites
|
# doing the crawling of government websites
|
||||||
|
|
||||||
spider = fdb_spider(config)
|
spider = fdb_spider(config)
|
||||||
|
|
||||||
# spider.download_entry_list_pages_of_funding_databases(list_of_fdbs)
|
spider.download_entry_list_pages_of_funding_databases(list_of_fdbs)
|
||||||
|
|
||||||
#spider.find_config_parameter(list_of_fdbs)
|
#spider.find_config_parameter(list_of_fdbs)
|
||||||
|
|
||||||
|
@ -17,5 +17,5 @@ spider = fdb_spider(config)
|
||||||
|
|
||||||
#spider.download_entry_data_htmls(list_of_fdbs)
|
#spider.download_entry_data_htmls(list_of_fdbs)
|
||||||
|
|
||||||
spider.parse_entry_data2dictionary(list_of_fdbs)
|
#spider.parse_entry_data2dictionary(list_of_fdbs)
|
||||||
|
|
||||||
|
|
Binary file not shown.
|
@ -47,3 +47,29 @@ foerderinfo.bund.de-bekanntmachungen:
|
||||||
#child-deadline: '//html/body/form/table/tr[2]/td[3]/span + label.1'
|
#child-deadline: '//html/body/form/table/tr[2]/td[3]/span + label.1'
|
||||||
unifalse:
|
unifalse:
|
||||||
wordlist: "['Mobilität', 'Energie', 'Off-grid', 'regenerative Energien', 'Solar', 'Energienetze', 'Elektromobilität']"
|
wordlist: "['Mobilität', 'Energie', 'Off-grid', 'regenerative Energien', 'Solar', 'Energienetze', 'Elektromobilität']"
|
||||||
|
|
||||||
|
giz:
|
||||||
|
domain: 'https://ausschreibungen.giz.de'
|
||||||
|
entry-list:
|
||||||
|
link1: 'https://ausschreibungen.giz.de/Satellite/company/welcome.do?method=showTable&fromSearch=1&tableSortPROJECT_RESULT=2&tableSortAttributePROJECT_RESULT=publicationDate&selectedTablePagePROJECT_RESULT='
|
||||||
|
link2: ''
|
||||||
|
iteration-var-list: '[1,2,3,4,5,6,7]'
|
||||||
|
#parent: "//html//body//div//main//div//div[@class='row']//div[@class='large-12']//a[@class='c-teaser']"
|
||||||
|
parent: "//html//body//div//div//table[contains(@class, 'csx-new-table')]//tbody"
|
||||||
|
child-name: "//tr//td[2]/text()"
|
||||||
|
child-link: "/tr//td[5]/a/@href"
|
||||||
|
child-info: "/tr//td[3]/text()"
|
||||||
|
child-period: "/tr/td[1]/text()"
|
||||||
|
#child-period: "//div[@class='c-teaser__text-wrapper']//small//span/time/text()"
|
||||||
|
child-sponsor: "//tr/td[4]/text()"
|
||||||
|
entry:
|
||||||
|
general:
|
||||||
|
uniform: 'FALSE'
|
||||||
|
unitrue:
|
||||||
|
parent: '//html//body//form//table'
|
||||||
|
#child-name: '//html//body//form//table//tr[1]//td[2]//span'
|
||||||
|
#child-sum: '//html//body//form//table//tr[2]//td[1]//span//img'
|
||||||
|
#child-deadline: '//html/body/form/table/tr[2]/td[3]/span + label.1'
|
||||||
|
unifalse:
|
||||||
|
wordlist: "['Mobilität', 'Energie', 'Off-grid', 'regenerative Energien', 'Solar', 'Energienetze', 'Elektromobilität']"
|
||||||
|
|
||||||
|
|
|
@ -69,10 +69,35 @@ class fdb_spider(object):
|
||||||
# download the html page of the List of entrys
|
# download the html page of the List of entrys
|
||||||
|
|
||||||
response = urllib.request.urlopen(entry_list_link1 + str(i) + entry_list_link2)
|
response = urllib.request.urlopen(entry_list_link1 + str(i) + entry_list_link2)
|
||||||
web_content = response.read().decode("UTF-8")
|
# web_content = response.read().decode("UTF-8")
|
||||||
|
|
||||||
|
try:
|
||||||
|
web_content = response.read().decode("UTF-8")
|
||||||
|
except Exception as e:
|
||||||
|
try:
|
||||||
|
web_content = response.read().decode("latin-1")
|
||||||
|
print(
|
||||||
|
"decoding the respone in utf8 did not work, try to decode latin1 now - the original error message is:",
|
||||||
|
e,
|
||||||
|
)
|
||||||
|
except Exception as ex:
|
||||||
|
print(ex)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# save interim results to files
|
# save interim results to files
|
||||||
|
if (len(web_content)) < 10:
|
||||||
|
print('getting the html page through urllib did not work, trying with requests librarys function get')
|
||||||
|
try:
|
||||||
|
res = requests.get(entry_list_link1 + str(i) + entry_list_link2)
|
||||||
|
web_content = res.text
|
||||||
|
except Exception as e:
|
||||||
|
print('also requests library did not work, original error is:', e)
|
||||||
|
|
||||||
|
|
||||||
|
print(web_content)
|
||||||
|
|
||||||
f = open("spiders/pages/" + key + str(i) + "entryList.html", "w+")
|
f = open("spiders/pages/" + key + str(i) + "entryList.html", "w+")
|
||||||
f.write(web_content)
|
f.write(web_content)
|
||||||
f.close
|
f.close
|
||||||
|
|
Loading…
Reference in a new issue