First function works; the actual XML parser still has problems with certain XML types
This commit is contained in:
parent
8b20bc178f
commit
c078ee4b1b
14 changed files with 17458 additions and 19 deletions
3
.gitignore
vendored
3
.gitignore
vendored
|
@ -1,2 +1,3 @@
|
||||||
/venv
|
/venv
|
||||||
|
/spiders/pages/*
|
||||||
|
/spiders/output/*
|
||||||
|
|
6
main.py
6
main.py
|
@ -6,13 +6,13 @@ list_of_fdbs = ["foerderinfo.bund.de"]
|
||||||
|
|
||||||
# doing the crawling of government websites
|
# doing the crawling of government websites
|
||||||
|
|
||||||
# spider = fdb_spider(config)
|
spider = fdb_spider(config)
|
||||||
|
|
||||||
# spider.download_entry_list_pages_of_funding_databases(list_of_fdbs)
|
# spider.download_entry_list_pages_of_funding_databases(list_of_fdbs)
|
||||||
|
|
||||||
# spider.parse_entry_list_data2dictionary(list_of_fdbs)
|
spider.parse_entry_list_data2dictionary(list_of_fdbs)
|
||||||
|
|
||||||
# spider.download_entry_data_htmls(list_of_fdbs)
|
spider.download_entry_data_htmls(list_of_fdbs)
|
||||||
|
|
||||||
# spider.parse_entry_data2dictionary(list_of_fdbs)
|
# spider.parse_entry_data2dictionary(list_of_fdbs)
|
||||||
|
|
||||||
|
|
BIN
spiders/__pycache__/fdb_spider.cpython-39.pyc
Normal file
BIN
spiders/__pycache__/fdb_spider.cpython-39.pyc
Normal file
Binary file not shown.
|
@ -9,10 +9,10 @@ foerderinfo.bund.de:
|
||||||
entry-list:
|
entry-list:
|
||||||
link1: 'https://www.foerderinfo.bund.de/SiteGlobals/Forms/foerderinfo/expertensuche/Servicesuche_Formular.html?gtp=33498_list%253D'
|
link1: 'https://www.foerderinfo.bund.de/SiteGlobals/Forms/foerderinfo/expertensuche/Servicesuche_Formular.html?gtp=33498_list%253D'
|
||||||
link2: '#searchResults'
|
link2: '#searchResults'
|
||||||
iteration-var-list: [1,2,3,4,5,6,7,8]
|
iteration-var-list: '[1,2,3,4,5,6,7,8]'
|
||||||
parent: '//html//body//form//table//tr//td//table//tr'
|
parent: '//html//body//form//table//tr//td//column//div.row//section.l-search-result-list'
|
||||||
child-name: '//td//a/text()'
|
child-name: '//div.l-search-result-list_item//a//span.c-search-result__title'
|
||||||
child-link: '//td//a/@href'
|
child-link: '//div.l-search-result-list_item//a/@href'
|
||||||
entry:
|
entry:
|
||||||
info-1:
|
info-1:
|
||||||
parent: '//html//body//form//table'
|
parent: '//html//body//form//table'
|
||||||
|
|
|
@ -61,13 +61,13 @@ class fdb_spider(object):
|
||||||
|
|
||||||
# download the html page of the List of entrys
|
# download the html page of the List of entrys
|
||||||
|
|
||||||
response = urllib.request.urlopen(entry_list_link1 + string(i) + entry_list_link2)
|
response = urllib.request.urlopen(entry_list_link1 + str(i) + entry_list_link2)
|
||||||
web_content = response.read().decode("UTF-8")
|
web_content = response.read().decode("UTF-8")
|
||||||
|
|
||||||
# save interim results to files
|
# save interim results to files
|
||||||
|
|
||||||
f = open("spiders/pages/" + key + str(i) + "entryList.html", "w+")
|
f = open("spiders/pages/" + key + str(i) + "entryList.html", "w+")
|
||||||
f.write(webContent)
|
f.write(web_content)
|
||||||
f.close
|
f.close
|
||||||
|
|
||||||
def parse_entry_list_data2dictionary(self, list_of_fdbs):
|
def parse_entry_list_data2dictionary(self, list_of_fdbs):
|
||||||
|
@ -82,6 +82,7 @@ class fdb_spider(object):
|
||||||
)
|
)
|
||||||
|
|
||||||
for i in iteration_var_list:
|
for i in iteration_var_list:
|
||||||
|
print(i)
|
||||||
try:
|
try:
|
||||||
# use soupparser to handle broken html
|
# use soupparser to handle broken html
|
||||||
|
|
||||||
|
@ -89,13 +90,17 @@ class fdb_spider(object):
|
||||||
"spiders/pages/" + fdb + str(i) + "entryList.html"
|
"spiders/pages/" + fdb + str(i) + "entryList.html"
|
||||||
)
|
)
|
||||||
|
|
||||||
# for e in tree.iter():
|
|
||||||
|
|
||||||
|
print('oioioioioioioioioioioiOIOI')
|
||||||
|
|
||||||
|
for e in tree.iter():
|
||||||
|
|
||||||
|
print(e.tag)
|
||||||
#
|
#
|
||||||
# print(e.tag)
|
for e in tree.xpath('//html//body//form//table//tr//td//table//tr'):
|
||||||
#
|
|
||||||
# for e in tree.xpath('//html//body//form//table//tr//td//table//tr'):
|
print(etree.tostring(e).decode())
|
||||||
#
|
|
||||||
# #print(etree.tostring(e).decode())
|
|
||||||
|
|
||||||
dictionary_entry_list = {}
|
dictionary_entry_list = {}
|
||||||
|
|
||||||
|
@ -122,6 +127,8 @@ class fdb_spider(object):
|
||||||
+ fdb_conf_entry_list_child_link
|
+ fdb_conf_entry_list_child_link
|
||||||
)
|
)
|
||||||
|
|
||||||
|
print('oi' + name)
|
||||||
|
|
||||||
if len(name) > 0:
|
if len(name) > 0:
|
||||||
dictionary_entry_list[n] = {}
|
dictionary_entry_list[n] = {}
|
||||||
dictionary_entry_list[n]["name"] = name[0]
|
dictionary_entry_list[n]["name"] = name[0]
|
||||||
|
|
0
spiders/output/foerderinfo.bund.de1entryList.txt
Normal file
0
spiders/output/foerderinfo.bund.de1entryList.txt
Normal file
2351
spiders/pages/foerderinfo.bund.de1entryList.html
Normal file
2351
spiders/pages/foerderinfo.bund.de1entryList.html
Normal file
File diff suppressed because one or more lines are too long
2271
spiders/pages/foerderinfo.bund.de2entryList.html
Normal file
2271
spiders/pages/foerderinfo.bund.de2entryList.html
Normal file
File diff suppressed because one or more lines are too long
2179
spiders/pages/foerderinfo.bund.de3entryList.html
Normal file
2179
spiders/pages/foerderinfo.bund.de3entryList.html
Normal file
File diff suppressed because one or more lines are too long
2186
spiders/pages/foerderinfo.bund.de4entryList.html
Normal file
2186
spiders/pages/foerderinfo.bund.de4entryList.html
Normal file
File diff suppressed because one or more lines are too long
2185
spiders/pages/foerderinfo.bund.de5entryList.html
Normal file
2185
spiders/pages/foerderinfo.bund.de5entryList.html
Normal file
File diff suppressed because one or more lines are too long
2178
spiders/pages/foerderinfo.bund.de6entryList.html
Normal file
2178
spiders/pages/foerderinfo.bund.de6entryList.html
Normal file
File diff suppressed because one or more lines are too long
2173
spiders/pages/foerderinfo.bund.de7entryList.html
Normal file
2173
spiders/pages/foerderinfo.bund.de7entryList.html
Normal file
File diff suppressed because one or more lines are too long
1908
spiders/pages/foerderinfo.bund.de8entryList.html
Normal file
1908
spiders/pages/foerderinfo.bund.de8entryList.html
Normal file
File diff suppressed because one or more lines are too long
Loading…
Reference in a new issue