Browse Source

Added a further database to config.yaml; added new exception handling for downloading JS-generated HTML pages.

onlinkgen
alpcentaur 9 months ago
parent
commit
a0075e429d
4 changed files with 58 additions and 7 deletions
  1. +4
    -4
      main.py
  2. BIN
      spiders/__pycache__/fdb_spider.cpython-39.pyc
  3. +26
    -0
      spiders/config.yaml
  4. +28
    -3
      spiders/fdb_spider.py

+ 4
- 4
main.py View File

@ -1,15 +1,15 @@
from spiders.fdb_spider import *
config = "spiders/config.yaml"
#list_of_fdbs = ["foerderinfo.bund.de"]
list_of_fdbs = ["foerderinfo.bund.de-bekanntmachungen"]
list_of_fdbs = ["giz"]
#list_of_fdbs = ["foerderinfo.bund.de-bekanntmachungen"]
# doing the crawling of government websites
spider = fdb_spider(config)
# spider.download_entry_list_pages_of_funding_databases(list_of_fdbs)
spider.download_entry_list_pages_of_funding_databases(list_of_fdbs)
#spider.find_config_parameter(list_of_fdbs)
@ -17,5 +17,5 @@ spider = fdb_spider(config)
#spider.download_entry_data_htmls(list_of_fdbs)
spider.parse_entry_data2dictionary(list_of_fdbs)
#spider.parse_entry_data2dictionary(list_of_fdbs)

BIN
spiders/__pycache__/fdb_spider.cpython-39.pyc View File


+ 26
- 0
spiders/config.yaml View File

@ -47,3 +47,29 @@ foerderinfo.bund.de-bekanntmachungen:
#child-deadline: '//html/body/form/table/tr[2]/td[3]/span + label.1'
unifalse:
wordlist: "['Mobilität', 'Energie', 'Off-grid', 'regenerative Energien', 'Solar', 'Energienetze', 'Elektromobilität']"
giz:
domain: 'https://ausschreibungen.giz.de'
entry-list:
link1: 'https://ausschreibungen.giz.de/Satellite/company/welcome.do?method=showTable&fromSearch=1&tableSortPROJECT_RESULT=2&tableSortAttributePROJECT_RESULT=publicationDate&selectedTablePagePROJECT_RESULT='
link2: ''
iteration-var-list: '[1,2,3,4,5,6,7]'
#parent: "//html//body//div//main//div//div[@class='row']//div[@class='large-12']//a[@class='c-teaser']"
parent: "//html//body//div//div//table[contains(@class, 'csx-new-table')]//tbody"
child-name: "//tr//td[2]/text()"
child-link: "/tr//td[5]/a/@href"
child-info: "/tr//td[3]/text()"
child-period: "/tr/td[1]/text()"
#child-period: "//div[@class='c-teaser__text-wrapper']//small//span/time/text()"
child-sponsor: "//tr/td[4]/text()"
entry:
general:
uniform: 'FALSE'
unitrue:
parent: '//html//body//form//table'
#child-name: '//html//body//form//table//tr[1]//td[2]//span'
#child-sum: '//html//body//form//table//tr[2]//td[1]//span//img'
#child-deadline: '//html/body/form/table/tr[2]/td[3]/span + label.1'
unifalse:
wordlist: "['Mobilität', 'Energie', 'Off-grid', 'regenerative Energien', 'Solar', 'Energienetze', 'Elektromobilität']"

+ 28
- 3
spiders/fdb_spider.py View File

@ -69,10 +69,35 @@ class fdb_spider(object):
# download the HTML page of the list of entries
response = urllib.request.urlopen(entry_list_link1 + str(i) + entry_list_link2)
web_content = response.read().decode("UTF-8")
# web_content = response.read().decode("UTF-8")
try:
web_content = response.read().decode("UTF-8")
except Exception as e:
try:
web_content = response.read().decode("latin-1")
print(
"decoding the respone in utf8 did not work, try to decode latin1 now - the original error message is:",
e,
)
except Exception as ex:
print(ex)
# save interim results to files
if (len(web_content)) < 10:
print('getting the html page through urllib did not work, trying with requests librarys function get')
try:
res = requests.get(entry_list_link1 + str(i) + entry_list_link2)
web_content = res.text
except Exception as e:
print('also requests library did not work, original error is:', e)
print(web_content)
f = open("spiders/pages/" + key + str(i) + "entryList.html", "w+")
f.write(web_content)
f.close

Loading…
Cancel
Save