First function works; the actual XML parser still has problems with certain XML types
This commit is contained in:
parent
8b20bc178f
commit
c078ee4b1b
14 changed files with 17458 additions and 19 deletions
3
.gitignore
vendored
3
.gitignore
vendored
|
@ -1,2 +1,3 @@
|
|||
/venv
|
||||
|
||||
/spiders/pages/*
|
||||
/spiders/output/*
|
||||
|
|
6
main.py
6
main.py
|
@ -6,13 +6,13 @@ list_of_fdbs = ["foerderinfo.bund.de"]
|
|||
|
||||
# doing the crawling of government websites
|
||||
|
||||
# spider = fdb_spider(config)
|
||||
spider = fdb_spider(config)
|
||||
|
||||
# spider.download_entry_list_pages_of_funding_databases(list_of_fdbs)
|
||||
|
||||
# spider.parse_entry_list_data2dictionary(list_of_fdbs)
|
||||
spider.parse_entry_list_data2dictionary(list_of_fdbs)
|
||||
|
||||
# spider.download_entry_data_htmls(list_of_fdbs)
|
||||
spider.download_entry_data_htmls(list_of_fdbs)
|
||||
|
||||
# spider.parse_entry_data2dictionary(list_of_fdbs)
|
||||
|
||||
|
|
BIN
spiders/__pycache__/fdb_spider.cpython-39.pyc
Normal file
BIN
spiders/__pycache__/fdb_spider.cpython-39.pyc
Normal file
Binary file not shown.
|
@ -7,12 +7,12 @@
|
|||
foerderinfo.bund.de:
|
||||
domain: 'http://foerderinfo.bund.de'
|
||||
entry-list:
|
||||
link1: 'https://www.foerderinfo.bund.de/SiteGlobals/Forms/foerderinfo/expertensuche/Servicesuche_Formular.html?gtp=33498_list%253D'
|
||||
link2: '#searchResults'
|
||||
iteration-var-list: [1,2,3,4,5,6,7,8]
|
||||
parent: '//html//body//form//table//tr//td//table//tr'
|
||||
child-name: '//td//a/text()'
|
||||
child-link: '//td//a/@href'
|
||||
link1: 'https://www.foerderinfo.bund.de/SiteGlobals/Forms/foerderinfo/expertensuche/Servicesuche_Formular.html?gtp=33498_list%253D'
|
||||
link2: '#searchResults'
|
||||
iteration-var-list: '[1,2,3,4,5,6,7,8]'
|
||||
parent: '//html//body//form//table//tr//td//column//div.row//section.l-search-result-list'
|
||||
child-name: '//div.l-search-result-list_item//a//span.c-search-result__title'
|
||||
child-link: '//div.l-search-result-list_item//a/@href'
|
||||
entry:
|
||||
info-1:
|
||||
parent: '//html//body//form//table'
|
||||
|
|
|
@ -61,13 +61,13 @@ class fdb_spider(object):
|
|||
|
||||
# download the html page of the List of entrys
|
||||
|
||||
response = urllib.request.urlopen(entry_list_link1 + string(i) + entry_list_link2)
|
||||
response = urllib.request.urlopen(entry_list_link1 + str(i) + entry_list_link2)
|
||||
web_content = response.read().decode("UTF-8")
|
||||
|
||||
# save interim results to files
|
||||
|
||||
f = open("spiders/pages/" + key + str(i) + "entryList.html", "w+")
|
||||
f.write(webContent)
|
||||
f.write(web_content)
|
||||
f.close
|
||||
|
||||
def parse_entry_list_data2dictionary(self, list_of_fdbs):
|
||||
|
@ -82,6 +82,7 @@ class fdb_spider(object):
|
|||
)
|
||||
|
||||
for i in iteration_var_list:
|
||||
print(i)
|
||||
try:
|
||||
# use soupparser to handle broken html
|
||||
|
||||
|
@ -89,13 +90,17 @@ class fdb_spider(object):
|
|||
"spiders/pages/" + fdb + str(i) + "entryList.html"
|
||||
)
|
||||
|
||||
# for e in tree.iter():
|
||||
|
||||
|
||||
print('oioioioioioioioioioioiOIOI')
|
||||
|
||||
for e in tree.iter():
|
||||
|
||||
print(e.tag)
|
||||
#
|
||||
# print(e.tag)
|
||||
#
|
||||
# for e in tree.xpath('//html//body//form//table//tr//td//table//tr'):
|
||||
#
|
||||
# #print(etree.tostring(e).decode())
|
||||
for e in tree.xpath('//html//body//form//table//tr//td//table//tr'):
|
||||
|
||||
print(etree.tostring(e).decode())
|
||||
|
||||
dictionary_entry_list = {}
|
||||
|
||||
|
@ -121,6 +126,8 @@ class fdb_spider(object):
|
|||
+ "]"
|
||||
+ fdb_conf_entry_list_child_link
|
||||
)
|
||||
|
||||
print('oi' + name)
|
||||
|
||||
if len(name) > 0:
|
||||
dictionary_entry_list[n] = {}
|
||||
|
@ -183,7 +190,7 @@ class fdb_spider(object):
|
|||
def parse_entry_data2dictionary(self, list_of_fdbs):
|
||||
for fdb in list_of_fdbs:
|
||||
|
||||
try:
|
||||
try:
|
||||
iteration_var_list = eval(self.config.get(fdb).get("entry-list").get("iteration-var-list"))
|
||||
except Exception as e:
|
||||
print(
|
||||
|
|
0
spiders/output/foerderinfo.bund.de1entryList.txt
Normal file
0
spiders/output/foerderinfo.bund.de1entryList.txt
Normal file
2351
spiders/pages/foerderinfo.bund.de1entryList.html
Normal file
2351
spiders/pages/foerderinfo.bund.de1entryList.html
Normal file
File diff suppressed because one or more lines are too long
2271
spiders/pages/foerderinfo.bund.de2entryList.html
Normal file
2271
spiders/pages/foerderinfo.bund.de2entryList.html
Normal file
File diff suppressed because one or more lines are too long
2179
spiders/pages/foerderinfo.bund.de3entryList.html
Normal file
2179
spiders/pages/foerderinfo.bund.de3entryList.html
Normal file
File diff suppressed because one or more lines are too long
2186
spiders/pages/foerderinfo.bund.de4entryList.html
Normal file
2186
spiders/pages/foerderinfo.bund.de4entryList.html
Normal file
File diff suppressed because one or more lines are too long
2185
spiders/pages/foerderinfo.bund.de5entryList.html
Normal file
2185
spiders/pages/foerderinfo.bund.de5entryList.html
Normal file
File diff suppressed because one or more lines are too long
2178
spiders/pages/foerderinfo.bund.de6entryList.html
Normal file
2178
spiders/pages/foerderinfo.bund.de6entryList.html
Normal file
File diff suppressed because one or more lines are too long
2173
spiders/pages/foerderinfo.bund.de7entryList.html
Normal file
2173
spiders/pages/foerderinfo.bund.de7entryList.html
Normal file
File diff suppressed because one or more lines are too long
1908
spiders/pages/foerderinfo.bund.de8entryList.html
Normal file
1908
spiders/pages/foerderinfo.bund.de8entryList.html
Normal file
File diff suppressed because one or more lines are too long
Loading…
Reference in a new issue