Started adding JavaScript handling at the highest spider level

2023-12-14 12:07:14 +00:00 · 2023-12-14 12:07:14 +00:00 · 14b8db7941
commit 14b8db7941
parent d2324d265a
4 changed files with 97 additions and 33 deletions
--- a/spiders/pycache/fdb_spider.cpython-311.pyc
+++ b/spiders/pycache/fdb_spider.cpython-311.pyc
--- a/spiders/pycache/fdb_spider.cpython-39.pyc
+++ b/spiders/pycache/fdb_spider.cpython-39.pyc
--- a/spiders/config.yaml
+++ b/spiders/config.yaml
@ -74,3 +74,30 @@ giz:
    unifalse:
      wordlist:  "['Mobilität', 'Energie', 'Off-grid', 'regenerative Energien', 'Solar', 'Energienetze', 'Elektromobilität']"

+evergabe-online:
+  domain: 'https://www.evergabe-online.de/'
+  entry-list:
+    link1: 'https://www.evergabe-online.de/search.html?101-1.-searchPanel-results-searchResults-results-topToolbars-toolbars-1-span-navigator-navigation-'
+    link2: '-pageLink'
+    jsdomain: 'https://www.evergabe-online.de/search.html'
+    jslink1:  '/html/body/div[8]/main/div[4]/div/div/div[2]/table/thead/tr[1]/td/div[2]/div/span['
+    jslink2:  ']'
+    iteration-var-list: "[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102]"
+    parent:  "/html/body/div[8]/main/div[4]/div/div/div[2]/table/tbody"
+    child-name:  "//tr/td[1]/div/a/text()"
+    child-link:  "//tr/td[1]/div/a/@href"
+    javascript-link: "/td[6]/a"
+    child-info:  "/td[4]/text()[1]"
+    child-period:  "//td[2]/abbr/text()"
+    #child-period:  "//div[@class='c-teaser__text-wrapper']//small//span/time/text()"
+    child-sponsor: "/tr/td[4]/text()"
+  entry:
+    general:
+      uniform: 'TRUE'
+    unitrue:
+      #parent:  '//html//body//form//table'
+      text:  '/html/body/div[2]/div[4]/div/div[5]/div/table/tbody/tr/td[5]/a/@href'
+      #child-sum:  '//html//body//form//table//tr[2]//td[1]//span//img'
+      #child-deadline:  '//html/body/form/table/tr[2]/td[3]/span + label.1'
+    unifalse:
+      wordlist:  "['Mobilität', 'Energie', 'Off-grid', 'regenerative Energien', 'Solar', 'Energienetze', 'Elektromobilität']"
--- a/spiders/fdb_spider.py
+++ b/spiders/fdb_spider.py
@ -55,6 +55,23 @@ class fdb_spider(object):
                            "No link2 defined in config.yaml - the original error message is:",
                            e,
                        )
+                    
+                    try:
+                        entry_list_jslink1 = entry_list.get("jslink1")
+                    except Exception as e:
+                        print(
+                            "No jslink1 defined in config.yaml - the original error message is:",
+                            e,
+                        )
+                        entry_list_jslink1 = 'NONE'
+                    try:
+                        entry_list_jslink2 = entry_list.get("jslink2")
+                    except Exception as e:
+                        print(
+                            "No jslink2 defined in config.yaml - the original error message is:",
+                            e,
+                        )
+                        entry_list_jslink2 = 'NONE'

                    try:
                        entry_iteration_var_list = eval(entry_list.get("iteration-var-list"))
@ -63,44 +80,64 @@ class fdb_spider(object):
                            "No iteration-var-list defined in config.yaml - the original error message is:",
                            e,
                        )
+                    try:
+                        entry_jsdomain = eval(entry_list.get("jsdomain"))
+                    except Exception as e:
+                        print(
+                            "No iteration-var-list defined in config.yaml - the original error message is:",
+                            e,
+                        )
+                        entry_jsdomain = 'NONE'

-                    for i in entry_iteration_var_list:
+                    if entry_jsdomain == 'NONE':

-                        # download the html page of the List of entrys
+                        for i in entry_iteration_var_list:

-                        response = urllib.request.urlopen(entry_list_link1 + str(i) + entry_list_link2)
-                        # web_content = response.read().decode("UTF-8")
-                        
-                        try:
-                            web_content = response.read().decode("UTF-8")
-                        except Exception as e:
-                            try:
-                                web_content = response.read().decode("latin-1")
-                                print(
-                                    "decoding the respone in utf8 did not work, try to decode latin1 now - the original error message is:",
-                                    e,
-                                )
-                            except Exception as ex:
-                                print(ex)
                            
-                        
-                        
-                        
-                        # save interim results to files
-                        if (len(web_content)) < 10:
-                            print('getting the html page through urllib did not work, trying with requests librarys function get')
-                            try:
-                                res = requests.get(entry_list_link1 + str(i) + entry_list_link2)
-                                web_content = res.text
-                            except Exception as e:
-                                print('also requests library did not work, original error is:', e)
                                
-                        
-                       # print(web_content)
-                        
-                        f = open("spiders/pages/" + key + str(i) + "entryList.html", "w+")
-                        f.write(web_content)
-                        f.close
+
+                            # download the html page of the List of entrys
+
+                            response = urllib.request.urlopen(entry_list_link1 + str(i) + entry_list_link2)
+                            # web_content = response.read().decode("UTF-8")
+                            
+                            try:
+                                web_content = response.read().decode("UTF-8")
+                            except Exception as e:
+                                try:
+                                    web_content = response.read().decode("latin-1")
+                                    print(
+                                        "decoding the respone in utf8 did not work, try to decode latin1 now - the original error message is:",
+                                        e,
+                                    )
+                                except Exception as ex:
+                                    print(ex)
+                                
+                            
+                            
+                            
+                            # save interim results to files
+                            if (len(web_content)) < 10:
+                                print('getting the html page through urllib did not work, trying with requests librarys function get')
+                                try:
+                                    res = requests.get(entry_list_link1 + str(i) + entry_list_link2)
+                                    web_content = res.text
+                                except Exception as e:
+                                    print('also requests library did not work, original error is:', e)
+                                    
+                            
+                        # print(web_content)
+                            
+                            f = open("spiders/pages/" + key + str(i) + "entryList.html", "w+")
+                            f.write(web_content)
+                            f.close
+                    else:
+                        from selenium import webdriver
+
+                        options = webdriver.ChromeOptions()
+                        options.add_argument('headless')
+                        driver = webdriver.Chrome(options=options)
+                                        

    def find_config_parameter(self, list_of_fdbs):
        for fdb in list_of_fdbs: