
first function works, the actual xml parser still has problems with certain xml types

onlinkgen
alpcentaur, 1 year ago
parent commit: c078ee4b1b
14 changed files with 17458 additions and 19 deletions
  1. +2 -1      .gitignore
  2. +3 -3      main.py
  3. BIN        spiders/__pycache__/fdb_spider.cpython-39.pyc
  4. +6 -6      spiders/config.yaml
  5. +16 -9     spiders/fdb_spider.py
  6. +0 -0      spiders/output/foerderinfo.bund.de1entryList.txt
  7. +2351 -0   spiders/pages/foerderinfo.bund.de1entryList.html
  8. +2271 -0   spiders/pages/foerderinfo.bund.de2entryList.html
  9. +2179 -0   spiders/pages/foerderinfo.bund.de3entryList.html
  10. +2186 -0  spiders/pages/foerderinfo.bund.de4entryList.html
  11. +2185 -0  spiders/pages/foerderinfo.bund.de5entryList.html
  12. +2178 -0  spiders/pages/foerderinfo.bund.de6entryList.html
  13. +2173 -0  spiders/pages/foerderinfo.bund.de7entryList.html
  14. +1908 -0  spiders/pages/foerderinfo.bund.de8entryList.html

+2 -1  .gitignore

@@ -1,2 +1,3 @@
 /venv
+/spiders/pages/*
+/spiders/output/*

+3 -3  main.py

@@ -6,13 +6,13 @@ list_of_fdbs = ["foerderinfo.bund.de"]

 # doing the crawling of government websites

-# spider = fdb_spider(config)
+spider = fdb_spider(config)

 # spider.download_entry_list_pages_of_funding_databases(list_of_fdbs)

-# spider.parse_entry_list_data2dictionary(list_of_fdbs)
+spider.parse_entry_list_data2dictionary(list_of_fdbs)

-# spider.download_entry_data_htmls(list_of_fdbs)
+spider.download_entry_data_htmls(list_of_fdbs)

 # spider.parse_entry_data2dictionary(list_of_fdbs)
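
Read together, the lines enabled in main.py form a small pipeline: download the paginated entry-list pages, parse them into a dictionary of entries, then fetch the HTML of each entry. Below is a minimal sketch of the complete flow, assuming config is the parsed spiders/config.yaml and that fdb_spider is importable from the spiders package; neither detail is shown in this hunk, and in the commit itself the first download step and the final parse step remain commented out, presumably because those pages are already cached.

import yaml

from spiders.fdb_spider import fdb_spider  # assumption: module path follows the repo layout

# assumption: config is the parsed spiders/config.yaml
with open("spiders/config.yaml", "r") as stream:
    config = yaml.safe_load(stream)

list_of_fdbs = ["foerderinfo.bund.de"]

spider = fdb_spider(config)
# 1. fetch the paginated entry-list pages and cache them under spiders/pages/
spider.download_entry_list_pages_of_funding_databases(list_of_fdbs)
# 2. parse the cached list pages into a dictionary of entry names and links
spider.parse_entry_list_data2dictionary(list_of_fdbs)
# 3. download the html page of every entry found in step 2
spider.download_entry_data_htmls(list_of_fdbs)
# 4. still commented out in this commit: parse the entry pages themselves
# spider.parse_entry_data2dictionary(list_of_fdbs)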

BIN  spiders/__pycache__/fdb_spider.cpython-39.pyc


+6 -6  spiders/config.yaml

@@ -7,12 +7,12 @@
 foerderinfo.bund.de:
   domain: 'http://foerderinfo.bund.de'
   entry-list:
-    link1: 'https://www.foerderinfo.bund.de/SiteGlobals/Forms/foerderinfo/expertensuche/Servicesuche_Formular.html?gtp=33498_list%253D'
-    link2: '#searchResults'
-    iteration-var-list: [1,2,3,4,5,6,7,8]
-    parent: '//html//body//form//table//tr//td//table//tr'
-    child-name: '//td//a/text()'
-    child-link: '//td//a/@href'
+    link1: 'https://www.foerderinfo.bund.de/SiteGlobals/Forms/foerderinfo/expertensuche/Servicesuche_Formular.html?gtp=33498_list%253D'
+    link2: '#searchResults'
+    iteration-var-list: '[1,2,3,4,5,6,7,8]'
+    parent: '//html//body//form//table//tr//td//column//div.row//section.l-search-result-list'
+    child-name: '//div.l-search-result-list_item//a//span.c-search-result__title'
+    child-link: '//div.l-search-result-list_item//a/@href'
   entry:
     info-1:
       parent: '//html//body//form//table'
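
A side note on the new selectors: expressions such as //div.row and //section.l-search-result-list read as CSS class shorthand, but XPath treats div.row as an element literally named "div.row", so lxml will not match elements by class this way; that may be part of why the commit message says the parser still has problems with certain pages. A hedged sketch of the same class-based selection written as plain XPath follows; the rewritten selector strings are illustrative, not taken from the repository.

# Sketch only: class-based matching expressed in plain XPath with lxml.
from lxml.html import soupparser  # requires beautifulsoup4 to be installed

tree = soupparser.parse("spiders/pages/foerderinfo.bund.de1entryList.html")

# XPath has no CSS ".class" shorthand; match on the class attribute instead.
sections = tree.xpath('//section[contains(@class, "l-search-result-list")]')
for section in sections:
    names = section.xpath('.//a//span[contains(@class, "c-search-result__title")]/text()')
    links = section.xpath('.//a/@href')
    print(names, links)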

+16 -9  spiders/fdb_spider.py

@@ -61,13 +61,13 @@ class fdb_spider(object):
                 # download the html page of the List of entrys
-                response = urllib.request.urlopen(entry_list_link1 + string(i) + entry_list_link2)
+                response = urllib.request.urlopen(entry_list_link1 + str(i) + entry_list_link2)
                 web_content = response.read().decode("UTF-8")

                 # save interim results to files
                 f = open("spiders/pages/" + key + str(i) + "entryList.html", "w+")
-                f.write(webContent)
+                f.write(web_content)
                 f.close

    def parse_entry_list_data2dictionary(self, list_of_fdbs):
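
For context, the loop this hunk repairs builds one URL per page index and stores the raw HTML for later offline parsing. A self-contained sketch, under the assumption that the link parts, the key, and the iteration list are taken straight from the config.yaml values shown above:

import urllib.request

# assumptions: values copied from spiders/config.yaml as shown in this commit
entry_list_link1 = "https://www.foerderinfo.bund.de/SiteGlobals/Forms/foerderinfo/expertensuche/Servicesuche_Formular.html?gtp=33498_list%253D"
entry_list_link2 = "#searchResults"
key = "foerderinfo.bund.de"

for i in [1, 2, 3, 4, 5, 6, 7, 8]:
    # str(i), not string(i): append the page index to build the paginated URL
    response = urllib.request.urlopen(entry_list_link1 + str(i) + entry_list_link2)
    web_content = response.read().decode("UTF-8")
    # cache the raw page so parsing can run without re-downloading
    with open("spiders/pages/" + key + str(i) + "entryList.html", "w") as f:
        f.write(web_content)
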
@@ -82,6 +82,7 @@ class fdb_spider(object):
                )

            for i in iteration_var_list:
+                print(i)

                try:
                    # use soupparser to handle broken html

@@ -89,13 +90,17 @@ class fdb_spider(object):
                        "spiders/pages/" + fdb + str(i) + "entryList.html"
                    )

-                # for e in tree.iter():
-                #
-                #     print(e.tag)
-                #
-                # for e in tree.xpath('//html//body//form//table//tr//td//table//tr'):
+                print('oioioioioioioioioioioiOIOI')
+                for e in tree.iter():
+                    print(e.tag)
                 #
-                #
+                #print(etree.tostring(e).decode())
+                for e in tree.xpath('//html//body//form//table//tr//td//table//tr'):
+                    print(etree.tostring(e).decode())

                dictionary_entry_list = {}
@@ -121,6 +126,8 @@ class fdb_spider(object):
                        + "]"
                        + fdb_conf_entry_list_child_link
                    )
+                    print('oi' + name)

                    if len(name) > 0:
                        dictionary_entry_list[n] = {}

@@ -183,7 +190,7 @@ class fdb_spider(object):
    def parse_entry_data2dictionary(self, list_of_fdbs):
        for fdb in list_of_fdbs:
-            try:
+            try:
                iteration_var_list = eval(self.config.get(fdb).get("entry-list").get("iteration-var-list"))
            except Exception as e:
                print(
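
The parsing side of the commit leans on lxml's soupparser to cope with broken HTML and then walks the tree with XPath. A minimal sketch of that pattern against one of the cached list pages; the file name is taken from the commit, everything else is illustrative:

from lxml import etree
from lxml.html import soupparser  # requires beautifulsoup4 to be installed

# parse a cached entry-list page; soupparser tolerates malformed markup
tree = soupparser.parse("spiders/pages/foerderinfo.bund.de1entryList.html")

# debug aid used in the commit: list every tag the parser actually produced
for e in tree.iter():
    print(e.tag)

# dump the markup of the rows matched by an XPath expression
for e in tree.xpath('//html//body//form//table//tr//td//table//tr'):
    print(etree.tostring(e).decode())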

+0 -0  spiders/output/foerderinfo.bund.de1entryList.txt

+2351 -0  spiders/pages/foerderinfo.bund.de1entryList.html  (file diff suppressed because it is too large)

+2271 -0  spiders/pages/foerderinfo.bund.de2entryList.html  (file diff suppressed because it is too large)

+2179 -0  spiders/pages/foerderinfo.bund.de3entryList.html  (file diff suppressed because it is too large)

+2186 -0  spiders/pages/foerderinfo.bund.de4entryList.html  (file diff suppressed because it is too large)

+2185 -0  spiders/pages/foerderinfo.bund.de5entryList.html  (file diff suppressed because it is too large)

+2178 -0  spiders/pages/foerderinfo.bund.de6entryList.html  (file diff suppressed because it is too large)

+2173 -0  spiders/pages/foerderinfo.bund.de7entryList.html  (file diff suppressed because it is too large)

+1908 -0  spiders/pages/foerderinfo.bund.de8entryList.html  (file diff suppressed because it is too large)

