Added a further database (giz) to config.yaml; added new exception handling for downloading JS-generated HTML pages

2023-11-27 15:10:11 +00:00 · 2023-11-27 15:10:11 +00:00 · a0075e429d
commit a0075e429d
parent df4a8289b8
4 changed files with 58 additions and 7 deletions
--- a/main.py
+++ b/main.py
@ -1,15 +1,15 @@
 from spiders.fdb_spider import *
 config = "spiders/config.yaml"
-#list_of_fdbs = ["foerderinfo.bund.de"]
+list_of_fdbs = ["giz"]
-list_of_fdbs = ["foerderinfo.bund.de-bekanntmachungen"]
+#list_of_fdbs = ["foerderinfo.bund.de-bekanntmachungen"]
 # doing the crawling of government websites
 spider = fdb_spider(config)
-# spider.download_entry_list_pages_of_funding_databases(list_of_fdbs)
+spider.download_entry_list_pages_of_funding_databases(list_of_fdbs)
 #spider.find_config_parameter(list_of_fdbs)
@ -17,5 +17,5 @@ spider = fdb_spider(config)
 #spider.download_entry_data_htmls(list_of_fdbs)
-spider.parse_entry_data2dictionary(list_of_fdbs)
+#spider.parse_entry_data2dictionary(list_of_fdbs)
--- a/spiders/pycache/fdb_spider.cpython-39.pyc
+++ b/spiders/pycache/fdb_spider.cpython-39.pyc
--- a/spiders/config.yaml
+++ b/spiders/config.yaml
@ -47,3 +47,29 @@ foerderinfo.bund.de-bekanntmachungen:
      #child-deadline:  '//html/body/form/table/tr[2]/td[3]/span + label.1'
    unifalse:
      wordlist:  "['Mobilität', 'Energie', 'Off-grid', 'regenerative Energien', 'Solar', 'Energienetze', 'Elektromobilität']"
 giz:
  domain: 'https://ausschreibungen.giz.de'
  entry-list:
    link1:  'https://ausschreibungen.giz.de/Satellite/company/welcome.do?method=showTable&fromSearch=1&tableSortPROJECT_RESULT=2&tableSortAttributePROJECT_RESULT=publicationDate&selectedTablePagePROJECT_RESULT='
    link2:  ''
    iteration-var-list:  '[1,2,3,4,5,6,7]'
    #parent:  "//html//body//div//main//div//div[@class='row']//div[@class='large-12']//a[@class='c-teaser']"
    parent:  "//html//body//div//div//table[contains(@class, 'csx-new-table')]//tbody"
    child-name:  "//tr//td[2]/text()"
    child-link:  "/tr//td[5]/a/@href"
    child-info:  "/tr//td[3]/text()"
    child-period:  "/tr/td[1]/text()"
    #child-period:  "//div[@class='c-teaser__text-wrapper']//small//span/time/text()"
    child-sponsor: "//tr/td[4]/text()"
  entry:
    general:
      uniform: 'FALSE'
    unitrue:
      parent:  '//html//body//form//table'
      #child-name:  '//html//body//form//table//tr[1]//td[2]//span'
      #child-sum:  '//html//body//form//table//tr[2]//td[1]//span//img'
      #child-deadline:  '//html/body/form/table/tr[2]/td[3]/span + label.1'
    unifalse:
      wordlist:  "['Mobilität', 'Energie', 'Off-grid', 'regenerative Energien', 'Solar', 'Energienetze', 'Elektromobilität']"
--- a/spiders/fdb_spider.py
+++ b/spiders/fdb_spider.py
@ -69,10 +69,35 @@ class fdb_spider(object):
                         # download the HTML page of the list of entries
                        response = urllib.request.urlopen(entry_list_link1 + str(i) + entry_list_link2)
-                        web_content = response.read().decode("UTF-8")
+                        # web_content = response.read().decode("UTF-8")
-
+                        
                        try:
                            web_content = response.read().decode("UTF-8")
                        except Exception as e:
                            try:
                                web_content = response.read().decode("latin-1")
                                print(
                                    "decoding the respone in utf8 did not work, try to decode latin1 now - the original error message is:",
                                    e,
                                )
                            except Exception as ex:
                                print(ex)
                        # save interim results to files
-
+                        if (len(web_content)) < 10:
                            print('getting the html page through urllib did not work, trying with requests librarys function get')
                            try:
                                res = requests.get(entry_list_link1 + str(i) + entry_list_link2)
                                web_content = res.text
                            except Exception as e:
                                print('also requests library did not work, original error is:', e)
                        print(web_content)
                        f = open("spiders/pages/" + key + str(i) + "entryList.html", "w+")
                        f.write(web_content)
                        f.close