From 8b20bc178f80ccc4c4a4a39365cd9808b777af61 Mon Sep 17 00:00:00 2001 From: alpcentaur Date: Mon, 6 Nov 2023 18:17:32 +0000 Subject: [PATCH] added multi pages configuration and code --- spiders/config.yaml | 9 +- spiders/fdb_spider.py | 263 +++++++++++++++++++++++++----------------- 2 files changed, 161 insertions(+), 111 deletions(-) diff --git a/spiders/config.yaml b/spiders/config.yaml index fc060a3..6ce8ef0 100644 --- a/spiders/config.yaml +++ b/spiders/config.yaml @@ -7,14 +7,13 @@ foerderinfo.bund.de: domain: 'http://foerderinfo.bund.de' entry-list: - link1: 'https://www.foerderinfo.bund.de/SiteGlobals/Forms/foerderinfo/expertensuche/Servicesuche_Formular.html#searchResults' - link2: 'https://www.foerderinfo.bund.de/SiteGlobals/Forms/foerderinfo/expertensuche/Servicesuche_Formular.html?gtp=33498_list%253D2#searchResults' - link3: 'https://www.foerderinfo.bund.de/SiteGlobals/Forms/foerderinfo/expertensuche/Servicesuche_Formular.html?gtp=33498_list%253D3#searchResults' - iteration-var-list: [1,1 + link1: 'https://www.foerderinfo.bund.de/SiteGlobals/Forms/foerderinfo/expertensuche/Servicesuche_Formular.html?gtp=33498_list%253D' + link2: '#searchResults' + iteration-var-list: [1,2,3,4,5,6,7,8] parent: '//html//body//form//table//tr//td//table//tr' child-name: '//td//a/text()' child-link: '//td//a/@href' - member: + entry: info-1: parent: '//html//body//form//table' #child-name: '//html//body//form//table//tr[1]//td[2]//span' diff --git a/spiders/fdb_spider.py b/spiders/fdb_spider.py index 8a5f397..dcd6c06 100644 --- a/spiders/fdb_spider.py +++ b/spiders/fdb_spider.py @@ -34,153 +34,204 @@ class fdb_spider(object): e, ) try: - entry_list_link = entry_list.get("link") + entry_list_link1 = entry_list.get("link1") except Exception as e: print( - "No entryListLink defined in config.yaml - the original error message is:", + "No link1 defined in config.yaml - the original error message is:", e, ) - # download the html page of the List of entrys + try: + entry_list_link2 = entry_list.get("link2") + except Exception as e: + print( + "No link2 defined in config.yaml - the original error message is:", + e, + ) - response = urllib.request.urlopen(entry_list_link) - web_content = response.read().decode("UTF-8") + try: + entry_iteration_var_list = eval(entry_list.get("iteration-var-list")) + except Exception as e: + print( + "No iteration-var-list defined in config.yaml - the original error message is:", + e, + ) - # save interim results to files + for i in entry_iteration_var_list: - f = open("spiders/pages/" + key + "entryList.html", "w+") - f.write(webContent) - f.close + # download the html page of the List of entrys + + response = urllib.request.urlopen(entry_list_link1 + string(i) + entry_list_link2) + web_content = response.read().decode("UTF-8") + + # save interim results to files + + f = open("spiders/pages/" + key + str(i) + "entryList.html", "w+") + f.write(webContent) + f.close def parse_entry_list_data2dictionary(self, list_of_fdbs): for fdb in list_of_fdbs: + try: - # use soupparser to handle broken html - - tree = lxml.html.soupparser.parse( - "spiders/pages/" + fdb + "entryList.html" + iteration_var_list = eval(self.config.get(fdb).get("entry-list").get("iteration-var-list")) + except Exception as e: + print( + "There is a problem with the configuration variable entryList iteration var list in the config.yaml - the original error message is:", + e, ) + + for i in iteration_var_list: + try: + # use soupparser to handle broken html - # for e in tree.iter(): - # - # print(e.tag) - # - # for e in tree.xpath('//html//body//form//table//tr//td//table//tr'): - # - # #print(etree.tostring(e).decode()) - - dictionary_entry_list = {} - - fdb_conf = self.config.get(fdb) - fdb_domain = fdb_conf.get("domain") - fdb_conf_entry_list = fdb_conf.get("entryList") - fdb_conf_entry_list_parent = fdb_conf_entry_list.get("parent") - fdb_conf_entry_list_child_name = fdb_conf_entry_list.get("child-name") - fdb_conf_entry_list_child_link = fdb_conf_entry_list.get("child-link") - - for n in range(len(tree.xpath(fdb_conf_entry_list_parent))): - name = tree.xpath( - fdb_conf_entry_list_parent - + "[" - + str(n) - + "]" - + fdb_conf_entry_list_child_name - ) - link = tree.xpath( - fdb_conf_entry_list_parent - + "[" - + str(n) - + "]" - + fdb_conf_entry_list_child_link + tree = lxml.html.soupparser.parse( + "spiders/pages/" + fdb + str(i) + "entryList.html" ) - if len(name) > 0: - dictionary_entry_list[n] = {} - dictionary_entry_list[n]["name"] = name[0] + # for e in tree.iter(): + # + # print(e.tag) + # + # for e in tree.xpath('//html//body//form//table//tr//td//table//tr'): + # + # #print(etree.tostring(e).decode()) + + dictionary_entry_list = {} + + fdb_conf = self.config.get(fdb) + fdb_domain = fdb_conf.get("domain") + fdb_conf_entry_list = fdb_conf.get("entry-list") + fdb_conf_entry_list_parent = fdb_conf_entry_list.get("parent") + fdb_conf_entry_list_child_name = fdb_conf_entry_list.get("child-name") + fdb_conf_entry_list_child_link = fdb_conf_entry_list.get("child-link") + + for n in range(len(tree.xpath(fdb_conf_entry_list_parent))): + name = tree.xpath( + fdb_conf_entry_list_parent + + "[" + + str(n) + + "]" + + fdb_conf_entry_list_child_name + ) + link = tree.xpath( + fdb_conf_entry_list_parent + + "[" + + str(n) + + "]" + + fdb_conf_entry_list_child_link + ) - if fdb_domain in link[0]: - dictionary_entry_list[n]["link"] = link[0] + if len(name) > 0: + dictionary_entry_list[n] = {} + dictionary_entry_list[n]["name"] = name[0] - if fdb_domain not in link[0]: - dictionary_entry_list[n]["link"] = fdb_domain + link[0] + if fdb_domain in link[0]: + dictionary_entry_list[n]["link"] = link[0] - except Exception as e: - print( - "parsing the html did not work. Possibly you first have to run download_link_list_pages_of_funding_databases(). The original error message is:", - e, - ) + if fdb_domain not in link[0]: + dictionary_entry_list[n]["link"] = fdb_domain + link[0] + + except Exception as e: + print( + "parsing the html did not work. Possibly you first have to run download_link_list_pages_of_funding_databases(). The original error message is:", + e, + ) - # save interim results to files + # save interim results to files - f = open("spiders/output/" + fdb + "entryList.txt", "w+") - f.write(str(dictionary_entry_list)) - f.close + f = open("spiders/output/" + fdb + str(i) + "entryList.txt", "w+") + f.write(str(dictionary_entry_list)) + f.close def download_entry_data_htmls(self, list_of_fdbs): for fdb in list_of_fdbs: - f = open("spiders/output/" + fdb + "entryList.txt") - text = f.read() + + try: + iteration_var_list = eval(self.config.get(fdb).get("entry-list").get("iteration-var-list")) + except Exception as e: + print( + "There is a problem with the configuration variable entryList iteration var list in the config.yaml - the original error message is:", + e, + ) + + for i in iteration_var_list: + + + f = open("spiders/output/" + fdb + str(i) + "entryList.txt") + text = f.read() - dictionary_entry_list = eval(text) + dictionary_entry_list = eval(text) - for entry_id in dictionary_entry_list: - entry_link = dictionary_entry_list[entry_id]["link"] + for entry_id in dictionary_entry_list: + entry_link = dictionary_entry_list[entry_id]["link"] - # download the html page of the entry + # download the html page of the entry - response = urllib.request.urlopen(entry_link) - web_content = response.read().decode("UTF-8") + response = urllib.request.urlopen(entry_link) + web_content = response.read().decode("UTF-8") - # save interim results to files + # save interim results to files - file_name = "spiders/pages/" + fdb + "/" + str(entry_id) + ".html" + file_name = "spiders/pages/" + fdb + str(i) + "/" + str(entry_id) + ".html" - os.makedirs(os.path.dirname(file_name), exist_ok=True) - f = open(file_name, "w+") - f.write(web_content) - f.close + os.makedirs(os.path.dirname(file_name), exist_ok=True) + f = open(file_name, "w+") + f.write(web_content) + f.close def parse_entry_data2dictionary(self, list_of_fdbs): for fdb in list_of_fdbs: - print("started to parse data of entry of " + fdb + " ..") - - f = open("spiders/output/" + fdb + "entryList.txt") - text = f.read() + + try: + iteration_var_list = eval(self.config.get(fdb).get("entry-list").get("iteration-var-list")) + except Exception as e: + print( + "There is a problem with the configuration variable entryList iteration var list in the config.yaml - the original error message is:", + e, + ) + + for i in iteration_var_list: + + print("started to parse data of entry of " + fdb + " ..") - dictionary_entry_list = eval(text) + f = open("spiders/output/" + fdb + str(i) + "entryList.txt") + text = f.read() - fdb_conf = self.config.get(fdb) - fdb_domain = fdb_conf.get("domain") - fdb_conf_entry = fdb_conf.get("entry") - fdb_conf_entry_info1 = fdb_conf_entry.get("info-1") - fdb_conf_entry_info1_parent = fdb_conf_entry_info1.get("parent") - fdb_conf_entry_info1_child_1 = fdb_conf_entry_info1.get( - "child-1" - ) + dictionary_entry_list = eval(text) - for entry_id in dictionary_entry_list: - print( - "started to parse data of entry with name " - + dictionary_entry_list[entry_id]["name"] - + " .." + fdb_conf = self.config.get(fdb) + fdb_domain = fdb_conf.get("domain") + fdb_conf_entry = fdb_conf.get("entry") + fdb_conf_entry_info1 = fdb_conf_entry.get("info-1") + fdb_conf_entry_info1_parent = fdb_conf_entry_info1.get("parent") + fdb_conf_entry_info1_child_1 = fdb_conf_entry_info1.get( + "child-1" ) - file_name = "spiders/pages/" + fdb + "/" + str(entry_id) + ".html" + for entry_id in dictionary_entry_list: + print( + "started to parse data of entry with name " + + dictionary_entry_list[entry_id]["name"] + + " .." + ) - tree = lxml.html.soupparser.parse(file_name) + file_name = "spiders/pages/" + fdb + str(i) + "/" + str(entry_id) + ".html" - child_1 = tree.xpath( - fdb_conf_entry_info1_parent - + fdb_conf_entry_info1_child_1 - ) + tree = lxml.html.soupparser.parse(file_name) - print("oi", child_1) + child_1 = tree.xpath( + fdb_conf_entry_info1_parent + + fdb_conf_entry_info1_child_1 + ) + + print("oi", child_1) - if len(child_1) > 0: - dictionary_entry_list[entry_id]["child_1"] = child_1[ - 0 - ] + if len(child_1) > 0: + dictionary_entry_list[entry_id]["child_1"] = child_1[ + 0 + ] - f = open("spiders/output/" + fdb + "entryList.txt", "w+") - f.write(str(dictionary_entry_list)) - f.close + f = open("spiders/output/" + fdb + str(i) + "entryList.txt", "w+") + f.write(str(dictionary_entry_list)) + f.close