diff --git a/spiders/__pycache__/fdb_spider.cpython-311.pyc b/spiders/__pycache__/fdb_spider.cpython-311.pyc index f3f6ff0..8c574fc 100644 Binary files a/spiders/__pycache__/fdb_spider.cpython-311.pyc and b/spiders/__pycache__/fdb_spider.cpython-311.pyc differ diff --git a/spiders/__pycache__/fdb_spider.cpython-39.pyc b/spiders/__pycache__/fdb_spider.cpython-39.pyc index f4a1860..a9293e6 100644 Binary files a/spiders/__pycache__/fdb_spider.cpython-39.pyc and b/spiders/__pycache__/fdb_spider.cpython-39.pyc differ diff --git a/spiders/config.yaml b/spiders/config.yaml index 2d1a81a..3afa2f7 100644 --- a/spiders/config.yaml +++ b/spiders/config.yaml @@ -74,3 +74,30 @@ giz: unifalse: wordlist: "['Mobilität', 'Energie', 'Off-grid', 'regenerative Energien', 'Solar', 'Energienetze', 'Elektromobilität']" +evergabe-online: + domain: 'https://www.evergabe-online.de/' + entry-list: + link1: 'https://www.evergabe-online.de/search.html?101-1.-searchPanel-results-searchResults-results-topToolbars-toolbars-1-span-navigator-navigation-' + link2: '-pageLink' + jsdomain: 'https://www.evergabe-online.de/search.html' + jslink1: '/html/body/div[8]/main/div[4]/div/div/div[2]/table/thead/tr[1]/td/div[2]/div/span[' + jslink2: ']' + iteration-var-list: "[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102]" + parent: "/html/body/div[8]/main/div[4]/div/div/div[2]/table/tbody" + child-name: "//tr/td[1]/div/a/text()" + child-link: "//tr/td[1]/div/a/@href" + javascript-link: "/td[6]/a" + child-info: "/td[4]/text()[1]" + child-period: "//td[2]/abbr/text()" + #child-period: "//div[@class='c-teaser__text-wrapper']//small//span/time/text()" + child-sponsor: "/tr/td[4]/text()" + entry: + general: + uniform: 'TRUE' + unitrue: + #parent: '//html//body//form//table' + text: '/html/body/div[2]/div[4]/div/div[5]/div/table/tbody/tr/td[5]/a/@href' + #child-sum: '//html//body//form//table//tr[2]//td[1]//span//img' + #child-deadline: '//html/body/form/table/tr[2]/td[3]/span + label.1' + unifalse: + wordlist: "['Mobilität', 'Energie', 'Off-grid', 'regenerative Energien', 'Solar', 'Energienetze', 'Elektromobilität']" diff --git a/spiders/fdb_spider.py b/spiders/fdb_spider.py index e27cecc..e831f8d 100644 --- a/spiders/fdb_spider.py +++ b/spiders/fdb_spider.py @@ -55,6 +55,23 @@ class fdb_spider(object): "No link2 defined in config.yaml - the original error message is:", e, ) + + try: + entry_list_jslink1 = entry_list.get("jslink1") + except Exception as e: + print( + "No jslink1 defined in config.yaml - the original error message is:", + e, + ) + entry_list_jslink1 = 'NONE' + try: + entry_list_jslink2 = entry_list.get("jslink2") + except Exception as e: + print( + "No jslink2 defined in config.yaml - the original error message is:", + e, + ) + entry_list_jslink2 = 'NONE' try: entry_iteration_var_list = eval(entry_list.get("iteration-var-list")) @@ -63,44 +80,64 @@ class fdb_spider(object): "No iteration-var-list defined in config.yaml - the original error message is:", e, ) + try: + entry_jsdomain = eval(entry_list.get("jsdomain")) + except Exception as e: + print( + "No iteration-var-list defined in config.yaml - the original error message is:", + e, + ) + entry_jsdomain = 'NONE' - for i in entry_iteration_var_list: + if entry_jsdomain == 'NONE': - # download the html page of the List of entrys + for i in entry_iteration_var_list: - response = urllib.request.urlopen(entry_list_link1 + str(i) + entry_list_link2) - # web_content = response.read().decode("UTF-8") - - try: - web_content = response.read().decode("UTF-8") - except Exception as e: - try: - web_content = response.read().decode("latin-1") - print( - "decoding the respone in utf8 did not work, try to decode latin1 now - the original error message is:", - e, - ) - except Exception as ex: - print(ex) - - - - # save interim results to files - if (len(web_content)) < 10: - print('getting the html page through urllib did not work, trying with requests librarys function get') + + + # download the html page of the List of entrys + + response = urllib.request.urlopen(entry_list_link1 + str(i) + entry_list_link2) + # web_content = response.read().decode("UTF-8") + try: - res = requests.get(entry_list_link1 + str(i) + entry_list_link2) - web_content = res.text + web_content = response.read().decode("UTF-8") except Exception as e: - print('also requests library did not work, original error is:', e) + try: + web_content = response.read().decode("latin-1") + print( + "decoding the respone in utf8 did not work, try to decode latin1 now - the original error message is:", + e, + ) + except Exception as ex: + print(ex) - - # print(web_content) - - f = open("spiders/pages/" + key + str(i) + "entryList.html", "w+") - f.write(web_content) - f.close + + + + # save interim results to files + if (len(web_content)) < 10: + print('getting the html page through urllib did not work, trying with requests librarys function get') + try: + res = requests.get(entry_list_link1 + str(i) + entry_list_link2) + web_content = res.text + except Exception as e: + print('also requests library did not work, original error is:', e) + + + # print(web_content) + + f = open("spiders/pages/" + key + str(i) + "entryList.html", "w+") + f.write(web_content) + f.close + else: + from selenium import webdriver + + options = webdriver.ChromeOptions() + options.add_argument('headless') + driver = webdriver.Chrome(options=options) + def find_config_parameter(self, list_of_fdbs): for fdb in list_of_fdbs: