import os
import yaml
import json
import urllib.request, urllib.error, urllib.parse
from lxml import etree
import lxml.html
import lxml.html.soupparser
from lxml import html
class fdb_spider(object):
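    """Simple spider for funding databases (fdb).

    Reads per-database scraping rules (urls and xpath expressions) from a yaml
    config file, downloads the entry-list pages and the single entry pages, and
    stores interim results under spiders/pages/ and spiders/output/.
    """
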
    def __init__(self, config_file):
        with open(config_file, "r") as stream:
            try:
                self.config = yaml.safe_load(stream)
            except yaml.YAMLError as exc:
                print(exc)

    # input: a list of funding database keys as defined in the yaml config file,
    # e.g. ['foerderinfo.bund.de', 'ausschreibungen.giz.de', ...]
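    # A minimal sketch of the expected config.yaml structure, reconstructed from
    # the keys read in the methods below (domain, entry-list, entry). The concrete
    # values (urls, xpath expressions, page range) are illustrative assumptions only:
    #
    #   foerderinfo.bund.de:
    #     domain: 'https://www.foerderinfo.bund.de'
    #     entry-list:
    #       link1: 'https://www.foerderinfo.bund.de/suche?page='
    #       link2: '&sort=date'
    #       iteration-var-list: '[1, 2, 3]'
    #       parent: "//section[@class='l-search-result-list']//div[@class='c-search-result__text-wrapper']"
    #       child-name: "//span[@class='c-search-result__title']/text()"
    #       child-link: "//a/@href"
    #     entry:
    #       info-1:
    #         parent: "//main"
    #         child-1: "//h1/text()"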
    def download_entry_list_pages_of_funding_databases(self, list_of_fdbs):
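        """Download the html entry-list pages of the funding databases in list_of_fdbs.

        For every configured database, link1 + iteration variable + link2 is
        requested and the raw html is written to spiders/pages/<fdb><i>entryList.html.
        """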
        # download only the html list pages of the funding databases specified in the input
        for key in self.config:
            # skip databases that were not requested
            if key not in list_of_fdbs:
                continue
            try:
                entry_list = self.config.get(key).get("entry-list")
            except Exception as e:
                print(
                    "There is a problem with the configuration variable entry-list in the config.yaml - the original error message is:",
                    e,
                )
                continue
            try:
                entry_list_link1 = entry_list.get("link1")
            except Exception as e:
                print(
                    "No link1 defined in config.yaml - the original error message is:",
                    e,
                )
            try:
                entry_list_link2 = entry_list.get("link2")
            except Exception as e:
                print(
                    "No link2 defined in config.yaml - the original error message is:",
                    e,
                )
            try:
                entry_iteration_var_list = eval(entry_list.get("iteration-var-list"))
            except Exception as e:
                print(
                    "No iteration-var-list defined in config.yaml - the original error message is:",
                    e,
                )
                continue
            for i in entry_iteration_var_list:
                # download the html page of the list of entries
                response = urllib.request.urlopen(
                    entry_list_link1 + str(i) + entry_list_link2
                )
                web_content = response.read().decode("UTF-8")
                # save interim results to files
                with open(
                    "spiders/pages/" + key + str(i) + "entryList.html", "w+"
                ) as f:
                    f.write(web_content)

    def find_config_parameter(self, list_of_fdbs):
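        """Debug helper: print the elements matched by the configured xpath expressions.

        Parses the previously downloaded entry-list pages and prints the parent,
        name and link elements so the xpath values in config.yaml can be checked.
        """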
        for fdb in list_of_fdbs:
            try:
                iteration_var_list = eval(
                    self.config.get(fdb).get("entry-list").get("iteration-var-list")
                )
            except Exception as e:
                print(
                    "There is a problem with the configuration variable entry-list iteration-var-list in the config.yaml",
                    e,
                )
                continue
            fdb_conf = self.config.get(fdb)
            fdb_domain = fdb_conf.get("domain")
            fdb_conf_entry_list = fdb_conf.get("entry-list")
            fdb_conf_entry_list_parent = fdb_conf_entry_list.get("parent")
            fdb_conf_entry_list_child_name = fdb_conf_entry_list.get("child-name")
            fdb_conf_entry_list_child_link = fdb_conf_entry_list.get("child-link")
            for i in iteration_var_list:
                print(i)
                try:
                    # use the soupparser to handle broken html
                    tree = lxml.html.soupparser.parse(
                        "spiders/pages/" + fdb + str(i) + "entryList.html"
                    )
                except Exception as e:
                    # fall back to the standard parser; broken html will not be repaired
                    tree = html.parse("spiders/pages/" + fdb + str(i) + "entryList.html")
                    print(
                        "Parsing the html with the soupparser did not work, falling back to lxml.html.parse. The original error message is:",
                        e,
                    )
                try:
                    print("these are the parent elements matched by the xpath specified in config.yaml:")
                    for n in range(len(tree.xpath(fdb_conf_entry_list_parent))):
                        print("-" * 100)
                        print(etree.tostring(tree.xpath(fdb_conf_entry_list_parent)[n]).decode())
                    print("these are the name elements:")
                    name_element = tree.xpath(
                        fdb_conf_entry_list_parent + fdb_conf_entry_list_child_name
                    )
                    print(name_element)
                    for name in name_element:
                        print(name)
                    print("these are the link elements:")
                    link_element = tree.xpath(
                        fdb_conf_entry_list_parent + fdb_conf_entry_list_child_link
                    )
                    print(link_element)
                except Exception as e:
                    print(
                        "parsing the html did not work.",
                        e,
                    )

    def parse_entry_list_data2dictionary(self, list_of_fdbs):
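        """Parse the downloaded entry-list pages into a dictionary of names and links.

        The result is written as a python dict literal to
        spiders/output/<fdb><i>entryList.txt.
        """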
        for fdb in list_of_fdbs:
            try:
                iteration_var_list = eval(
                    self.config.get(fdb).get("entry-list").get("iteration-var-list")
                )
            except Exception as e:
                print(
                    "There is a problem with the configuration variable entry-list iteration-var-list in the config.yaml - the original error message is:",
                    e,
                )
                continue
            for i in iteration_var_list:
                print(i)
                try:
                    # use the soupparser to handle broken html
                    tree = lxml.html.soupparser.parse(
                        "spiders/pages/" + fdb + str(i) + "entryList.html"
                    )
                except Exception as e:
                    # fall back to the standard parser; broken html will not be repaired
                    tree = html.parse("spiders/pages/" + fdb + str(i) + "entryList.html")
                    print(
                        "Parsing the html with the soupparser did not work, falling back to lxml.html.parse. The original error message is:",
                        e,
                    )
                dictionary_entry_list = {}
                try:
                    fdb_conf = self.config.get(fdb)
                    fdb_domain = fdb_conf.get("domain")
                    fdb_conf_entry_list = fdb_conf.get("entry-list")
                    fdb_conf_entry_list_parent = fdb_conf_entry_list.get("parent")
                    fdb_conf_entry_list_child_name = fdb_conf_entry_list.get("child-name")
                    fdb_conf_entry_list_child_link = fdb_conf_entry_list.get("child-link")
                    number_of_parents = len(tree.xpath(fdb_conf_entry_list_parent))
                    print("number of parent elements found:", number_of_parents)
                    for n in range(number_of_parents):
                        name = tree.xpath(
                            fdb_conf_entry_list_parent
                            + fdb_conf_entry_list_child_name
                        )[n]
                        link = tree.xpath(
                            fdb_conf_entry_list_parent
                            + fdb_conf_entry_list_child_link
                        )[n]
                        print("name:", name, "link:", link)
                        if len(name) > 0:
                            dictionary_entry_list[n] = {}
                            dictionary_entry_list[n]["name"] = name
                            if fdb_domain in link:
                                dictionary_entry_list[n]["link"] = link
                            else:
                                # prepend the domain to relative links
                                if link[0] == "/":
                                    dictionary_entry_list[n]["link"] = fdb_domain + link
                                else:
                                    dictionary_entry_list[n]["link"] = fdb_domain + "/" + link
                except Exception as e:
                    print(
                        "parsing the html did not work. Possibly you first have to run download_entry_list_pages_of_funding_databases(). The original error message is:",
                        e,
                    )
                # save interim results to files
                with open("spiders/output/" + fdb + str(i) + "entryList.txt", "w+") as f:
                    f.write(str(dictionary_entry_list))

    def download_entry_data_htmls(self, list_of_fdbs):
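        """Download the html page of every entry collected in the entry-list dictionaries.

        The pages are stored as spiders/pages/<fdb><i>/<entry_id>.html.
        """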
        for fdb in list_of_fdbs:
            try:
                iteration_var_list = eval(
                    self.config.get(fdb).get("entry-list").get("iteration-var-list")
                )
            except Exception as e:
                print(
                    "There is a problem with the configuration variable entry-list iteration-var-list in the config.yaml - the original error message is:",
                    e,
                )
                continue
            for i in iteration_var_list:
                with open("spiders/output/" + fdb + str(i) + "entryList.txt") as f:
                    text = f.read()
                dictionary_entry_list = eval(text)
                for entry_id in dictionary_entry_list:
                    entry_link = dictionary_entry_list[entry_id]["link"]
                    # download the html page of the entry
                    response = urllib.request.urlopen(entry_link)
                    web_content = response.read().decode("UTF-8")
                    # save interim results to files
                    file_name = "spiders/pages/" + fdb + str(i) + "/" + str(entry_id) + ".html"
                    os.makedirs(os.path.dirname(file_name), exist_ok=True)
                    with open(file_name, "w+") as f:
                        f.write(web_content)

    def parse_entry_data2dictionary(self, list_of_fdbs):
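        """Parse detail information (entry > info-1 > child-1) from the downloaded entry pages.

        The parsed value is added to each entry in the dictionary, which is then
        written back to spiders/output/<fdb><i>entryList.txt.
        """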
        for fdb in list_of_fdbs:
            try:
                iteration_var_list = eval(
                    self.config.get(fdb).get("entry-list").get("iteration-var-list")
                )
            except Exception as e:
                print(
                    "There is a problem with the configuration variable entry-list iteration-var-list in the config.yaml - the original error message is:",
                    e,
                )
                continue
            for i in iteration_var_list:
                print("started to parse data of entries of " + fdb + " ..")
                with open("spiders/output/" + fdb + str(i) + "entryList.txt") as f:
                    text = f.read()
                dictionary_entry_list = eval(text)
                fdb_conf = self.config.get(fdb)
                fdb_domain = fdb_conf.get("domain")
                fdb_conf_entry = fdb_conf.get("entry")
                fdb_conf_entry_info1 = fdb_conf_entry.get("info-1")
                fdb_conf_entry_info1_parent = fdb_conf_entry_info1.get("parent")
                fdb_conf_entry_info1_child_1 = fdb_conf_entry_info1.get("child-1")
                for entry_id in dictionary_entry_list:
                    print(
                        "started to parse data of entry with name "
                        + dictionary_entry_list[entry_id]["name"]
                        + " .."
                    )
                    file_name = "spiders/pages/" + fdb + str(i) + "/" + str(entry_id) + ".html"
                    tree = lxml.html.soupparser.parse(file_name)
                    child_1 = tree.xpath(
                        fdb_conf_entry_info1_parent + fdb_conf_entry_info1_child_1
                    )
                    print("child_1:", child_1)
                    if len(child_1) > 0:
                        dictionary_entry_list[entry_id]["child_1"] = child_1[0]
                # write the enriched dictionary back to the interim output file
                with open("spiders/output/" + fdb + str(i) + "entryList.txt", "w+") as f:
                    f.write(str(dictionary_entry_list))
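

# A minimal usage sketch, assuming a config.yaml next to this file and the
# database key used as an example in the configuration comment above; the
# config path and the key are assumptions for illustration, not part of the class.
if __name__ == "__main__":
    spider = fdb_spider("config.yaml")
    fdbs = ["foerderinfo.bund.de"]

    # run the full pipeline: list pages -> entry dictionary -> entry pages -> details
    spider.download_entry_list_pages_of_funding_databases(fdbs)
    spider.parse_entry_list_data2dictionary(fdbs)
    spider.download_entry_data_htmls(fdbs)
    spider.parse_entry_data2dictionary(fdbs)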