From 687d40f156c50e8ed0fc9899ef51a9be77126d2f Mon Sep 17 00:00:00 2001
From: alpcentaur
Date: Mon, 30 Oct 2023 16:41:14 +0000
Subject: [PATCH] first change of naming, first commit for the actual spider based on importPEP

---
 fdb_spider.py | 190 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 190 insertions(+)
 create mode 100644 fdb_spider.py

diff --git a/fdb_spider.py b/fdb_spider.py
new file mode 100644
index 0000000..8c7d4c5
--- /dev/null
+++ b/fdb_spider.py
@@ -0,0 +1,190 @@
+import os
+
+import urllib.request
+
+import yaml
+
+import lxml.html.soupparser
+
+
+class fdb_spider(object):
+    def __init__(self, config_file):
+        with open(config_file, "r") as stream:
+            try:
+                self.config = yaml.safe_load(stream)
+            except yaml.YAMLError as exc:
+                print(exc)
+
+    # input: a list of funding databases that are configured in the
+    # yaml file, e.g. ['foerderinfo.bund.de', 'ausschreibungen.giz.de', ...]
+
+    def download_link_list_pages_of_funding_databases(self, list_of_fdbs):
+        # download only the html pages of the funding databases
+        # specified in the input
+
+        for fdb in list_of_fdbs:
+            try:
+                entry_list = self.config.get(fdb).get("entry_list")
+            except Exception as e:
+                print(
+                    "There is a problem with the configuration variable"
+                    " entry_list in config.yaml - the original error"
+                    " message is:",
+                    e,
+                )
+                continue
+            try:
+                entry_list_link = entry_list.get("link")
+            except Exception as e:
+                print(
+                    "No entry_list link defined in config.yaml - the"
+                    " original error message is:",
+                    e,
+                )
+                continue
+
+            # download the html page of the list of entries
+            response = urllib.request.urlopen(entry_list_link)
+            web_content = response.read().decode("UTF-8")
+
+            # save interim results to files
+            os.makedirs("spiders/pages", exist_ok=True)
+            f = open("spiders/pages/" + fdb + "entryList.html", "w+")
+            f.write(web_content)
+            f.close()
+
+    def parse_entry_list_data2dictionary(self, list_of_fdbs):
+        for fdb in list_of_fdbs:
+            dictionary_entry_list = {}
+
+            try:
+                # use the soupparser to handle broken html
+                tree = lxml.html.soupparser.parse(
+                    "spiders/pages/" + fdb + "entryList.html"
+                )
+
+                fdb_conf = self.config.get(fdb)
+                fdb_domain = fdb_conf.get("domain")
+                fdb_conf_entry_list = fdb_conf.get("entry_list")
+                fdb_conf_entry_list_parent = fdb_conf_entry_list.get("parent")
+                fdb_conf_entry_list_child_name = fdb_conf_entry_list.get(
+                    "child-name"
+                )
+                fdb_conf_entry_list_child_link = fdb_conf_entry_list.get(
+                    "child-link"
+                )
+
+                for n in range(len(tree.xpath(fdb_conf_entry_list_parent))):
+                    # xpath indices are 1-based; parenthesize the parent
+                    # expression so that [n + 1] selects the n-th match
+                    # of the whole expression
+                    name = tree.xpath(
+                        "("
+                        + fdb_conf_entry_list_parent
+                        + ")["
+                        + str(n + 1)
+                        + "]"
+                        + fdb_conf_entry_list_child_name
+                    )
+                    link = tree.xpath(
+                        "("
+                        + fdb_conf_entry_list_parent
+                        + ")["
+                        + str(n + 1)
+                        + "]"
+                        + fdb_conf_entry_list_child_link
+                    )
+
+                    if len(name) > 0 and len(link) > 0:
+                        dictionary_entry_list[n] = {}
+                        dictionary_entry_list[n]["name"] = name[0]
+
+                        if fdb_domain in link[0]:
+                            dictionary_entry_list[n]["link"] = link[0]
+                        else:
+                            dictionary_entry_list[n]["link"] = (
+                                fdb_domain + link[0]
+                            )
+
+            except Exception as e:
+                print(
+                    "parsing the html did not work. Possibly you first"
+                    " have to run"
+                    " download_link_list_pages_of_funding_databases()."
+                    " The original error message is:",
+                    e,
+                )
+                continue
+
+            # save interim results to files
+            os.makedirs("spiders/output", exist_ok=True)
+            f = open("spiders/output/" + fdb + "entryList.txt", "w+")
+            f.write(str(dictionary_entry_list))
+            f.close()
+
+    def download_entry_data_htmls(self, list_of_fdbs):
+        for fdb in list_of_fdbs:
+            f = open("spiders/output/" + fdb + "entryList.txt")
+            text = f.read()
+            f.close()
+
+            # the interim file is generated locally by
+            # parse_entry_list_data2dictionary, so eval is acceptable
+            dictionary_entry_list = eval(text)
+
+            for entry_id in dictionary_entry_list:
+                entry_link = dictionary_entry_list[entry_id]["link"]
+
+                # download the html page of the entry
+                response = urllib.request.urlopen(entry_link)
+                web_content = response.read().decode("UTF-8")
+
+                # save interim results to files
+                file_name = (
+                    "spiders/pages/" + fdb + "/" + str(entry_id) + ".html"
+                )
+                os.makedirs(os.path.dirname(file_name), exist_ok=True)
+
+                f = open(file_name, "w+")
+                f.write(web_content)
+                f.close()
+
+    def parse_entry_data2dictionary(self, list_of_fdbs):
+        for fdb in list_of_fdbs:
+            print("started to parse data of entries of " + fdb + " ..")
+
+            f = open("spiders/output/" + fdb + "entryList.txt")
+            text = f.read()
+            f.close()
+
+            dictionary_entry_list = eval(text)
+
+            fdb_conf = self.config.get(fdb)
+            fdb_conf_entry = fdb_conf.get("entry")
+            fdb_conf_entry_info1 = fdb_conf_entry.get("info-1")
+            fdb_conf_entry_info1_parent = fdb_conf_entry_info1.get("parent")
+            fdb_conf_entry_info1_child_1 = fdb_conf_entry_info1.get("child-1")
+
+            for entry_id in dictionary_entry_list:
+                print(
+                    "started to parse data of entry with name "
+                    + dictionary_entry_list[entry_id]["name"]
+                    + " .."
+                )
+
+                file_name = (
+                    "spiders/pages/" + fdb + "/" + str(entry_id) + ".html"
+                )
+                tree = lxml.html.soupparser.parse(file_name)
+
+                child_1 = tree.xpath(
+                    fdb_conf_entry_info1_parent + fdb_conf_entry_info1_child_1
+                )
+                print("child_1:", child_1)
+
+                if len(child_1) > 0:
+                    dictionary_entry_list[entry_id]["child_1"] = child_1[0]
+
+            f = open("spiders/output/" + fdb + "entryList.txt", "w+")
+            f.write(str(dictionary_entry_list))
+            f.close()
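
---

Usage note: the spider expects a config.yaml whose keys mirror the
getters in the code (domain, entry_list with link/parent/child-name/
child-link, and entry with info-1/parent/child-1). The sketch below only
illustrates that expected shape - the database name "foerderinfo.bund.de"
is taken from the comment in the code, while the URL and all XPath
expressions are made-up placeholders, not a real, tested configuration:

    foerderinfo.bund.de:
      domain: "https://www.foerderinfo.bund.de"          # assumed domain
      entry_list:
        link: "https://www.foerderinfo.bund.de/entries"  # placeholder URL
        parent: "//html//body//table//tr"                # placeholder xpath
        child-name: "//td[1]//text()"                    # placeholder xpath
        child-link: "//td[1]//a/@href"                   # placeholder xpath
      entry:
        info-1:
          parent: "//html//body"                         # placeholder xpath
          child-1: "//h1//text()"                        # placeholder xpath

With such a file in place, a minimal run of the four stages could look
like this (again a sketch under the same assumptions; the interim files
land in spiders/pages and spiders/output):

    from fdb_spider import fdb_spider

    spider = fdb_spider("config.yaml")
    fdbs = ["foerderinfo.bund.de"]

    # stage 1: fetch the html pages that list all entries
    spider.download_link_list_pages_of_funding_databases(fdbs)
    # stage 2: extract name/link pairs into a dictionary on disk
    spider.parse_entry_list_data2dictionary(fdbs)
    # stage 3: fetch the html page of every single entry
    spider.download_entry_data_htmls(fdbs)
    # stage 4: extract the configured info fields per entry
    spider.parse_entry_data2dictionary(fdbs)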