First function works; the actual XML parser still has problems with certain XML types

alpcentaur 2023-11-06 19:17:45 +00:00
parent 8b20bc178f
commit c078ee4b1b
14 changed files with 17458 additions and 19 deletions

.gitignore vendored (3)

@@ -1,2 +1,3 @@
 /venv
+/spiders/pages/*
+/spiders/output/*


@@ -6,13 +6,13 @@ list_of_fdbs = ["foerderinfo.bund.de"]
 # doing the crawling of government websites
-# spider = fdb_spider(config)
+spider = fdb_spider(config)
 # spider.download_entry_list_pages_of_funding_databases(list_of_fdbs)
-# spider.parse_entry_list_data2dictionary(list_of_fdbs)
+spider.parse_entry_list_data2dictionary(list_of_fdbs)
-# spider.download_entry_data_htmls(list_of_fdbs)
+spider.download_entry_data_htmls(list_of_fdbs)
 # spider.parse_entry_data2dictionary(list_of_fdbs)
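For orientation, the calls uncommented here run the crawl pipeline end to end. A minimal runnable sketch of such a driver follows; the import path and the YAML loading are assumptions for illustration, since how main.py actually obtains config is not visible in this hunk. The sketch passes a parsed dict because fdb_spider.py below accesses the config with .get(), but whether the constructor instead expects a path to the YAML file is not shown in this diff.

# Sketch of a driver along the lines of what this commit enables in main.py.
# The import path and the config loading are assumed, not taken from this diff.
import yaml

from spiders.fdb_spider import fdb_spider  # assumed module location

with open("spiders/fdb_spider.yaml") as stream:  # assumed config file name
    config = yaml.safe_load(stream)

list_of_fdbs = ["foerderinfo.bund.de"]

spider = fdb_spider(config)
# spider.download_entry_list_pages_of_funding_databases(list_of_fdbs)
spider.parse_entry_list_data2dictionary(list_of_fdbs)
spider.download_entry_data_htmls(list_of_fdbs)
# spider.parse_entry_data2dictionary(list_of_fdbs)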

Binary file not shown.


@@ -7,12 +7,12 @@
 foerderinfo.bund.de:
 domain: 'http://foerderinfo.bund.de'
 entry-list:
 link1: 'https://www.foerderinfo.bund.de/SiteGlobals/Forms/foerderinfo/expertensuche/Servicesuche_Formular.html?gtp=33498_list%253D'
 link2: '#searchResults'
-iteration-var-list: [1,2,3,4,5,6,7,8]
+iteration-var-list: '[1,2,3,4,5,6,7,8]'
-parent: '//html//body//form//table//tr//td//table//tr'
+parent: '//html//body//form//table//tr//td//column//div.row//section.l-search-result-list'
-child-name: '//td//a/text()'
+child-name: '//div.l-search-result-list_item//a//span.c-search-result__title'
-child-link: '//td//a/@href'
+child-link: '//div.l-search-result-list_item//a/@href'
 entry:
 info-1:
 parent: '//html//body//form//table'
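The quoting change on iteration-var-list works together with the eval() call visible in fdb_spider.py below: quoted, the value reaches Python as a string that eval() turns into a list; unquoted, the YAML loader already yields a list and eval() would fail with a TypeError. A small self-contained sketch of that round trip, with pyyaml assumed as the loader:

# Round trip of the quoted iteration-var-list value; pyyaml assumed as loader,
# the snippet below is a trimmed stand-in for the real config file.
import yaml

snippet = """entry-list:
  iteration-var-list: '[1,2,3,4,5,6,7,8]'
"""

conf = yaml.safe_load(snippet)
raw = conf["entry-list"]["iteration-var-list"]   # "[1,2,3,4,5,6,7,8]" -- a plain string
iteration_var_list = eval(raw)                   # [1, 2, 3, 4, 5, 6, 7, 8]
# ast.literal_eval(raw) would do the same conversion without executing arbitrary code

for i in iteration_var_list:
    print(i)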


@@ -61,13 +61,13 @@ class fdb_spider(object):
 # download the html page of the List of entrys
-response = urllib.request.urlopen(entry_list_link1 + string(i) + entry_list_link2)
+response = urllib.request.urlopen(entry_list_link1 + str(i) + entry_list_link2)
 web_content = response.read().decode("UTF-8")
 # save interim results to files
 f = open("spiders/pages/" + key + str(i) + "entryList.html", "w+")
-f.write(webContent)
+f.write(web_content)
 f.close
 def parse_entry_list_data2dictionary(self, list_of_fdbs):
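The string(i) and webContent fixes above repair the download-and-save step for the paginated entry list. A simplified, self-contained version of that step is sketched below; the URL parts come from the config file in this commit, while the hard-coded page range and the with-block are illustrative simplifications (the original's f.close without parentheses never actually closes the file).

# Simplified sketch of the fixed download step; URL parts come from the config above,
# the page range and the with-block are illustrative simplifications.
import urllib.request

entry_list_link1 = "https://www.foerderinfo.bund.de/SiteGlobals/Forms/foerderinfo/expertensuche/Servicesuche_Formular.html?gtp=33498_list%253D"
entry_list_link2 = "#searchResults"
key = "foerderinfo.bund.de"

for i in [1, 2, 3]:
    # str(i), not string(i): build the URL of page i of the entry list
    response = urllib.request.urlopen(entry_list_link1 + str(i) + entry_list_link2)
    web_content = response.read().decode("UTF-8")

    # spiders/pages/ must already exist (it is ignored by the new .gitignore rules)
    with open("spiders/pages/" + key + str(i) + "entryList.html", "w+") as f:
        f.write(web_content)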
@@ -82,6 +82,7 @@ class fdb_spider(object):
 )
 for i in iteration_var_list:
+print(i)
 try:
 # use soupparser to handle broken html
@@ -89,13 +90,17 @@ class fdb_spider(object):
 "spiders/pages/" + fdb + str(i) + "entryList.html"
 )
-# for e in tree.iter():
+print('oioioioioioioioioioioiOIOI')
+for e in tree.iter():
+print(e.tag)
 #
-# print(e.tag)
+for e in tree.xpath('//html//body//form//table//tr//td//table//tr'):
-#
-# for e in tree.xpath('//html//body//form//table//tr//td//table//tr'):
+print(etree.tostring(e).decode())
-#
+#print(etree.tostring(e).decode())
 dictionary_entry_list = {}
@@ -122,6 +127,8 @@ class fdb_spider(object):
 + fdb_conf_entry_list_child_link
 )
+print('oi' + name)
 if len(name) > 0:
 dictionary_entry_list[n] = {}
 dictionary_entry_list[n]["name"] = name[0]
@@ -183,7 +190,7 @@ class fdb_spider(object):
 def parse_entry_data2dictionary(self, list_of_fdbs):
 for fdb in list_of_fdbs:
 try:
 iteration_var_list = eval(self.config.get(fdb).get("entry-list").get("iteration-var-list"))
 except Exception as e:
 print(
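Pieced together from the hunks above and the config file, the entry-list parsing path reads each saved entryList.html with lxml's soupparser and applies the configured parent/child-name/child-link expressions to build dictionary_entry_list. The sketch below uses the pre-commit expression values, which are plain XPath; the '.' prefix on the child lookups, the enumerate counter and the 'link' key are illustrative additions, since the exact concatenation around fdb_conf_entry_list_child_link is not fully visible in this diff.

# Condensed sketch of parse_entry_list_data2dictionary for one saved page;
# the expressions are the pre-commit XPath values from the config, the '.' prefix
# and the 'link' key are illustrative additions. Requires beautifulsoup4 for soupparser.
from lxml.html import soupparser

fdb = "foerderinfo.bund.de"
i = 1

# soupparser tolerates the broken html in the saved page
tree = soupparser.parse("spiders/pages/" + fdb + str(i) + "entryList.html")

parent = "//html//body//form//table//tr//td//table//tr"
child_name = "//td//a/text()"
child_link = "//td//a/@href"

dictionary_entry_list = {}
for n, e in enumerate(tree.xpath(parent)):
    name = e.xpath("." + child_name)   # '.' keeps the lookup relative to this row
    link = e.xpath("." + child_link)
    if len(name) > 0:
        dictionary_entry_list[n] = {}
        dictionary_entry_list[n]["name"] = name[0]
        dictionary_entry_list[n]["link"] = link[0] if len(link) > 0 else ""

print(dictionary_entry_list)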

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long