First function works; the actual XML parser still has problems with certain XML types

alpcentaur 2023-11-06 19:17:45 +00:00
parent 8b20bc178f
commit c078ee4b1b
14 changed files with 17458 additions and 19 deletions

.gitignore vendored (3)

@@ -1,2 +1,3 @@
 /venv
+/spiders/pages/*
+/spiders/output/*


@@ -6,13 +6,13 @@ list_of_fdbs = ["foerderinfo.bund.de"]
 # doing the crawling of government websites
-# spider = fdb_spider(config)
+spider = fdb_spider(config)
 # spider.download_entry_list_pages_of_funding_databases(list_of_fdbs)
-# spider.parse_entry_list_data2dictionary(list_of_fdbs)
+spider.parse_entry_list_data2dictionary(list_of_fdbs)
-# spider.download_entry_data_htmls(list_of_fdbs)
+spider.download_entry_data_htmls(list_of_fdbs)
 # spider.parse_entry_data2dictionary(list_of_fdbs)
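For orientation, the calls uncommented here run the crawl pipeline end to end. A minimal runnable sketch of such a driver follows; the import path and the YAML loading are assumptions for illustration, since how main.py actually obtains config is not visible in this hunk. The sketch passes a parsed dict because fdb_spider.py below accesses the config with .get(), but whether the constructor instead expects a path to the YAML file is not shown in this diff.

# Sketch of a driver along the lines of what this commit enables in main.py.
# The import path and the config loading are assumed, not taken from this diff.
import yaml

from spiders.fdb_spider import fdb_spider  # assumed module location

with open("spiders/fdb_spider.yaml") as stream:  # assumed config file name
    config = yaml.safe_load(stream)

list_of_fdbs = ["foerderinfo.bund.de"]

spider = fdb_spider(config)
# spider.download_entry_list_pages_of_funding_databases(list_of_fdbs)
spider.parse_entry_list_data2dictionary(list_of_fdbs)
spider.download_entry_data_htmls(list_of_fdbs)
# spider.parse_entry_data2dictionary(list_of_fdbs)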

Binary file not shown.


@@ -7,12 +7,12 @@
 foerderinfo.bund.de:
 domain: 'http://foerderinfo.bund.de'
 entry-list:
 link1: 'https://www.foerderinfo.bund.de/SiteGlobals/Forms/foerderinfo/expertensuche/Servicesuche_Formular.html?gtp=33498_list%253D'
 link2: '#searchResults'
-iteration-var-list: [1,2,3,4,5,6,7,8]
+iteration-var-list: '[1,2,3,4,5,6,7,8]'
-parent: '//html//body//form//table//tr//td//table//tr'
+parent: '//html//body//form//table//tr//td//column//div.row//section.l-search-result-list'
-child-name: '//td//a/text()'
+child-name: '//div.l-search-result-list_item//a//span.c-search-result__title'
-child-link: '//td//a/@href'
+child-link: '//div.l-search-result-list_item//a/@href'
 entry:
 info-1:
 parent: '//html//body//form//table'
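The quoting change on iteration-var-list works together with the eval() call visible in fdb_spider.py below: quoted, the value reaches Python as a string that eval() turns into a list; unquoted, the YAML loader already yields a list and eval() would fail with a TypeError. A small self-contained sketch of that round trip, with pyyaml assumed as the loader:

# Round trip of the quoted iteration-var-list value; pyyaml assumed as loader,
# the snippet below is a trimmed stand-in for the real config file.
import yaml

snippet = """entry-list:
  iteration-var-list: '[1,2,3,4,5,6,7,8]'
"""

conf = yaml.safe_load(snippet)
raw = conf["entry-list"]["iteration-var-list"]   # "[1,2,3,4,5,6,7,8]" -- a plain string
iteration_var_list = eval(raw)                   # [1, 2, 3, 4, 5, 6, 7, 8]
# ast.literal_eval(raw) would do the same conversion without executing arbitrary code

for i in iteration_var_list:
    print(i)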


@@ -61,13 +61,13 @@ class fdb_spider(object):
 # download the html page of the List of entrys
-response = urllib.request.urlopen(entry_list_link1 + string(i) + entry_list_link2)
+response = urllib.request.urlopen(entry_list_link1 + str(i) + entry_list_link2)
 web_content = response.read().decode("UTF-8")
 # save interim results to files
 f = open("spiders/pages/" + key + str(i) + "entryList.html", "w+")
-f.write(webContent)
+f.write(web_content)
 f.close
 def parse_entry_list_data2dictionary(self, list_of_fdbs):
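The string(i) and webContent fixes above repair the download-and-save step for the paginated entry list. A simplified, self-contained version of that step is sketched below; the URL parts come from the config file in this commit, while the hard-coded page range and the with-block are illustrative simplifications (the original's f.close without parentheses never actually closes the file).

# Simplified sketch of the fixed download step; URL parts come from the config above,
# the page range and the with-block are illustrative simplifications.
import urllib.request

entry_list_link1 = "https://www.foerderinfo.bund.de/SiteGlobals/Forms/foerderinfo/expertensuche/Servicesuche_Formular.html?gtp=33498_list%253D"
entry_list_link2 = "#searchResults"
key = "foerderinfo.bund.de"

for i in [1, 2, 3]:
    # str(i), not string(i): build the URL of page i of the entry list
    response = urllib.request.urlopen(entry_list_link1 + str(i) + entry_list_link2)
    web_content = response.read().decode("UTF-8")

    # spiders/pages/ must already exist (it is ignored by the new .gitignore rules)
    with open("spiders/pages/" + key + str(i) + "entryList.html", "w+") as f:
        f.write(web_content)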
@@ -82,6 +82,7 @@ class fdb_spider(object):
 )
 for i in iteration_var_list:
+print(i)
 try:
 # use soupparser to handle broken html
@@ -89,13 +90,17 @@ class fdb_spider(object):
 "spiders/pages/" + fdb + str(i) + "entryList.html"
 )
-# for e in tree.iter():
+print('oioioioioioioioioioioiOIOI')
+for e in tree.iter():
+print(e.tag)
 #
-# print(e.tag)
+for e in tree.xpath('//html//body//form//table//tr//td//table//tr'):
-#
-# for e in tree.xpath('//html//body//form//table//tr//td//table//tr'):
+print(etree.tostring(e).decode())
-#
+#print(etree.tostring(e).decode())
 dictionary_entry_list = {}
@@ -122,6 +127,8 @@ class fdb_spider(object):
 + fdb_conf_entry_list_child_link
 )
+print('oi' + name)
 if len(name) > 0:
 dictionary_entry_list[n] = {}
 dictionary_entry_list[n]["name"] = name[0]
@@ -183,7 +190,7 @@ class fdb_spider(object):
 def parse_entry_data2dictionary(self, list_of_fdbs):
 for fdb in list_of_fdbs:
 try:
 iteration_var_list = eval(self.config.get(fdb).get("entry-list").get("iteration-var-list"))
 except Exception as e:
 print(
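Pieced together from the hunks above and the config file, the entry-list parsing path reads each saved entryList.html with lxml's soupparser and applies the configured parent/child-name/child-link expressions to build dictionary_entry_list. The sketch below uses the pre-commit expression values, which are plain XPath; the '.' prefix on the child lookups, the enumerate counter and the 'link' key are illustrative additions, since the exact concatenation around fdb_conf_entry_list_child_link is not fully visible in this diff.

# Condensed sketch of parse_entry_list_data2dictionary for one saved page;
# the expressions are the pre-commit XPath values from the config, the '.' prefix
# and the 'link' key are illustrative additions. Requires beautifulsoup4 for soupparser.
from lxml.html import soupparser

fdb = "foerderinfo.bund.de"
i = 1

# soupparser tolerates the broken html in the saved page
tree = soupparser.parse("spiders/pages/" + fdb + str(i) + "entryList.html")

parent = "//html//body//form//table//tr//td//table//tr"
child_name = "//td//a/text()"
child_link = "//td//a/@href"

dictionary_entry_list = {}
for n, e in enumerate(tree.xpath(parent)):
    name = e.xpath("." + child_name)   # '.' keeps the lookup relative to this row
    link = e.xpath("." + child_link)
    if len(name) > 0:
        dictionary_entry_list[n] = {}
        dictionary_entry_list[n]["name"] = name[0]
        dictionary_entry_list[n]["link"] = link[0] if len(link) > 0 else ""

print(dictionary_entry_list)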

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long