First function works; the actual XML parser still has problems with certain XML types

This commit is contained in:
alpcentaur 2023-11-06 19:17:45 +00:00
parent 8b20bc178f
commit c078ee4b1b
14 changed files with 17458 additions and 19 deletions

3
.gitignore vendored
View file

@ -1,2 +1,3 @@
/venv
/spiders/pages/*
/spiders/output/*

View file

@ -6,13 +6,13 @@ list_of_fdbs = ["foerderinfo.bund.de"]
# doing the crawling of government websites
# spider = fdb_spider(config)
spider = fdb_spider(config)
# spider.download_entry_list_pages_of_funding_databases(list_of_fdbs)
# spider.parse_entry_list_data2dictionary(list_of_fdbs)
spider.parse_entry_list_data2dictionary(list_of_fdbs)
# spider.download_entry_data_htmls(list_of_fdbs)
spider.download_entry_data_htmls(list_of_fdbs)
# spider.parse_entry_data2dictionary(list_of_fdbs)

Binary file not shown.

View file

@ -9,10 +9,10 @@ foerderinfo.bund.de:
entry-list:
link1: 'https://www.foerderinfo.bund.de/SiteGlobals/Forms/foerderinfo/expertensuche/Servicesuche_Formular.html?gtp=33498_list%253D'
link2: '#searchResults'
iteration-var-list: [1,2,3,4,5,6,7,8]
parent: '//html//body//form//table//tr//td//table//tr'
child-name: '//td//a/text()'
child-link: '//td//a/@href'
iteration-var-list: '[1,2,3,4,5,6,7,8]'
parent: '//html//body//form//table//tr//td//column//div.row//section.l-search-result-list'
child-name: '//div.l-search-result-list_item//a//span.c-search-result__title'
child-link: '//div.l-search-result-list_item//a/@href'
entry:
info-1:
parent: '//html//body//form//table'

View file

@ -61,13 +61,13 @@ class fdb_spider(object):
# download the html page of the list of entries
response = urllib.request.urlopen(entry_list_link1 + string(i) + entry_list_link2)
response = urllib.request.urlopen(entry_list_link1 + str(i) + entry_list_link2)
web_content = response.read().decode("UTF-8")
# save interim results to files
f = open("spiders/pages/" + key + str(i) + "entryList.html", "w+")
f.write(webContent)
f.write(web_content)
f.close
def parse_entry_list_data2dictionary(self, list_of_fdbs):
@ -82,6 +82,7 @@ class fdb_spider(object):
)
for i in iteration_var_list:
print(i)
try:
# use soupparser to handle broken html
@ -89,13 +90,17 @@ class fdb_spider(object):
"spiders/pages/" + fdb + str(i) + "entryList.html"
)
# for e in tree.iter():
print('oioioioioioioioioioioiOIOI')
for e in tree.iter():
print(e.tag)
#
# print(e.tag)
#
# for e in tree.xpath('//html//body//form//table//tr//td//table//tr'):
#
# #print(etree.tostring(e).decode())
for e in tree.xpath('//html//body//form//table//tr//td//table//tr'):
print(etree.tostring(e).decode())
dictionary_entry_list = {}
@ -122,6 +127,8 @@ class fdb_spider(object):
+ fdb_conf_entry_list_child_link
)
print('oi' + name)
if len(name) > 0:
dictionary_entry_list[n] = {}
dictionary_entry_list[n]["name"] = name[0]

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long