import os
import json
import time
import subprocess
import urllib.request, urllib.error, urllib.parse

import yaml
import requests
import lxml.html
import lxml.html.soupparser
from lxml import etree
from lxml import html
from trafilatura import extract
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextContainer


class fdb_spider(object):
    def __init__(self, config_file):
        with open(config_file, "r") as stream:
            try:
                self.config = yaml.safe_load(stream)
            except yaml.YAMLError as exc:
                print(exc)
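
    # For reference: a minimal sketch of the config.yaml structure this spider
    # expects, inferred from the keys read in the methods below. All URLs and
    # xpaths here are hypothetical placeholders, not a real configuration.
    #
    # example.fdb.org:
    #     domain: 'https://example.fdb.org'
    #     entry-list:
    #         link1: 'https://example.fdb.org/search?page='
    #         link2: '&lang=de'
    #         iteration-var-list: '[1, 2, 3]'      # a python expression, it gets eval'd
    #         jsdomain: 'NONE'                     # or a url, if the list is rendered by javascript
    #         jslink1: 'NONE'                      # xpath prefix of the js pagination element
    #         jslink2: 'NONE'                      # xpath suffix of the js pagination element
    #         jsiteration-var-list: '[]'
    #         parent: "//div[@class='result']"     # xpath of one entry in the list
    #         child-name: "//span[@class='title']/text()"
    #         child-link: "//a/@href"
    #         child-info: "//span[@class='info']/text()"
    #         child-period: "//span[@class='period']/text()"
    #         javascript-link: 'NONE'
    #         slow-downloading: 'FALSE'
    #     entry:
    #         general:
    #             uniform: 'TRUE'
    #         unitrue:
    #             text: "//div[@class='content']//text()"
    #         unifalse:
    #             wordlist: "['deadline', 'foerderung']"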
    # input: a list of funding database names as configured in the yaml file,
    # e.g. ['foerderinfo.bund.de', 'ausschreibungen.giz.de', ...]
    def download_entry_list_pages_of_funding_databases(self, list_of_fdbs):
        # download only the html pages of the funding databases specified in the input
        for fdb in list_of_fdbs:
            for key in self.config:
                if key in list_of_fdbs:
                    try:
                        entry_list = self.config.get(key).get("entry-list")
                    except Exception as e:
                        print(
                            "There is a problem with the configuration variable entry-list in the config.yaml - the original error message is:",
                            e,
                        )
                    try:
                        entry_list_link1 = entry_list.get("link1")
                    except Exception as e:
                        print(
                            "No link1 defined in config.yaml - the original error message is:",
                            e,
                        )
                    try:
                        entry_list_link2 = entry_list.get("link2")
                    except Exception as e:
                        print(
                            "No link2 defined in config.yaml - the original error message is:",
                            e,
                        )
                    try:
                        entry_list_jslink1 = entry_list.get("jslink1")
                    except Exception as e:
                        print(
                            "No jslink1 defined in config.yaml - the original error message is:",
                            e,
                        )
                        entry_list_jslink1 = 'NONE'
                    try:
                        entry_list_jslink2 = entry_list.get("jslink2")
                    except Exception as e:
                        print(
                            "No jslink2 defined in config.yaml - the original error message is:",
                            e,
                        )
                        entry_list_jslink2 = 'NONE'
                    # the iteration var lists are python expressions in the config, they get eval'd
                    try:
                        entry_iteration_var_list = eval(entry_list.get("iteration-var-list"))
                    except Exception as e:
                        print(
                            "No iteration-var-list defined in config.yaml - the original error message is:",
                            e,
                        )
                    try:
                        entry_jsiteration_var_list = eval(entry_list.get("jsiteration-var-list"))
                    except Exception as e:
                        print(
                            "No jsiteration-var-list defined in config.yaml - the original error message is:",
                            e,
                        )
                    try:
                        entry_jsdomain = entry_list.get("jsdomain")
                    except Exception as e:
                        print(
                            "No jsdomain defined in config.yaml - the original error message is:",
                            e,
                        )
                        entry_jsdomain = 'NONE'
                    if entry_jsdomain == 'NONE' or entry_jsdomain == 'None':
                        for i in entry_iteration_var_list:
                            # download the html page of the list of entries
                            web_content = ''
                            response = urllib.request.urlopen(entry_list_link1 + str(i) + entry_list_link2)
                            # read the raw bytes only once; a second response.read()
                            # on the exhausted stream would return an empty string
                            raw_content = response.read()
                            try:
                                web_content = raw_content.decode("UTF-8")
                            except Exception as e:
                                try:
                                    web_content = raw_content.decode("latin-1")
                                    print(
                                        "decoding the response as utf-8 did not work, trying to decode latin-1 now - the original error message is:",
                                        e,
                                    )
                                except Exception as ex:
                                    print(ex)
                            if len(web_content) < 10:
                                print('getting the html page through urllib did not work, trying the get function of the requests library')
                                try:
                                    res = requests.get(entry_list_link1 + str(i) + entry_list_link2)
                                    web_content = res.text
                                except Exception as e:
                                    print('the requests library did not work either, the original error is:', e)
                            # save interim results to files
                            f = open("spiders/pages/" + key + str(i) + "entryList.html", "w+")
                            f.write(web_content)
                            f.close()
                    else:
                        from selenium import webdriver
                        from selenium.webdriver.chrome.service import Service
                        from pyvirtualdisplay import Display

                        # display width changed to 1200, because the element was not
                        # found in the "mobile version" served at 800 width
                        display = Display(visible=0, size=(1200, 800))
                        display.start()
                        options = webdriver.ChromeOptions()
                        options.add_argument("--remote-debugging-port=9222")
                        options.add_argument('--no-sandbox')
                        options.add_argument('--disable-dev-shm-usage')
                        service = Service(executable_path='/usr/bin/chromedriver')
                        driver = webdriver.Chrome(options=options, service=service)
                        driver.implicitly_wait(5)
                        driver.get(entry_jsdomain)
                        # accept possible cookie banners (german and english button variants)
                        try:
                            accept_button = driver.find_element("xpath", "//button[contains(text(), 'akzeptieren')]")
                            accept_button.click()
                        except Exception as e:
                            print(e, 'no cookies to accept..')
                        try:
                            accept_button = driver.find_element("xpath", "//button[contains(text(), 'Accept')]")
                            accept_button.click()
                        except Exception as e:
                            print(e, 'no cookies to accept..')
                        for i in range(len(entry_jsiteration_var_list)):
                            time.sleep(1)
                            print('trying to get element')
                            try:
                                # scroll down, so that the javascript view loads the elements
                                driver.execute_script("scroll(0, 600)")
                                element = driver.find_element(
                                    "xpath",
                                    entry_list_jslink1
                                    + str(entry_jsiteration_var_list[i])
                                    + entry_list_jslink2
                                )
                                print(entry_iteration_var_list[i])
                                time.sleep(1)
                                print('scrolling..')
                                # scroll the element into view, because clicking
                                # javascript-generated elements outside the viewport can fail
                                driver.execute_script("arguments[0].scrollIntoView();", element)
                                print('clicking..')
                                time.sleep(1)
                                element.click()
                                time.sleep(1)
                                print('length of the window handles', len(driver.window_handles))
                                web_content = driver.page_source
                                f = open("spiders/pages/" + key + str(entry_iteration_var_list[i]) + "entryList.html", "w+")
                                f.write(web_content)
                                f.close()
                            except Exception as e:
                                print('the iteration var element for clicking through the pages was not found.. the original message is:', e)
    def find_config_parameter(self, list_of_fdbs):
        for fdb in list_of_fdbs:
            try:
                iteration_var_list = eval(self.config.get(fdb).get("entry-list").get("iteration-var-list"))
            except Exception as e:
                print(
                    "There is a problem with the configuration variable entry-list iteration-var-list in the config.yaml",
                    e,
                )
            fdb_conf = self.config.get(fdb)
            fdb_domain = fdb_conf.get("domain")
            fdb_conf_entry_list = fdb_conf.get("entry-list")
            fdb_conf_entry_list_parent = fdb_conf_entry_list.get("parent")
            fdb_conf_entry_list_child_name = fdb_conf_entry_list.get("child-name")
            fdb_conf_entry_list_child_link = fdb_conf_entry_list.get("child-link")
            fdb_conf_entry_list_child_info = fdb_conf_entry_list.get("child-info")
            fdb_conf_entry_list_child_period = fdb_conf_entry_list.get("child-period")
            for i in iteration_var_list:
                print(i)
                try:
                    # use the soupparser to handle broken html
                    tree = lxml.html.soupparser.parse(
                        "spiders/pages/" + fdb + str(i) + "entryList.html"
                    )
                except Exception as e:
                    tree = html.parse("spiders/pages/" + fdb + str(i) + "entryList.html")
                    print(
                        "parsing the html files with the soupparser did not work, so broken html will not be fixed as it otherwise could have been. The original error message is:",
                        e,
                    )
                try:
                    print('these are the n looped elements of the parent specified in config.yaml:')
                    print('entry-list parent', fdb_conf_entry_list_parent)
                    print(tree.xpath("//html//body//div"))
                    print(etree.tostring(tree.xpath(fdb_conf_entry_list_parent)[0]).decode())
                    for n in range(len(tree.xpath(fdb_conf_entry_list_parent))):
                        print('-----------------------------------------------------------------------------------------------------------------------------------------')
                        print(etree.tostring(tree.xpath(fdb_conf_entry_list_parent)[n]).decode())
                    print('these are the name children:')
                    name_element = tree.xpath(fdb_conf_entry_list_parent + fdb_conf_entry_list_child_name)
                    print(name_element)
                    print(len(name_element))
                    print('these are the link children:')
                    link_element = tree.xpath(fdb_conf_entry_list_parent + fdb_conf_entry_list_child_link)
                    print(link_element)
                    print(len(link_element))
                    print('these are the info children:')
                    info_element = tree.xpath(fdb_conf_entry_list_parent + fdb_conf_entry_list_child_info)
                    print(info_element)
                    print(len(info_element))
                    print('these are the period children:')
                    period_element = tree.xpath(fdb_conf_entry_list_parent + fdb_conf_entry_list_child_period)
                    print(period_element)
                    print(len(period_element))
                except Exception as e:
                    print(
                        "parsing the html did not work.",
                        e,
                    )
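
    # Note on the xpath configuration: the parse methods below select the n-th
    # entry by concatenating parent + "[" + str(n+1) + "]" + child. With
    # hypothetical values like
    #     parent:     "//div[@class='result']"
    #     child-name: "//span[@class='title']/text()"
    # the expression for the first entry becomes
    #     //div[@class='result'][1]//span[@class='title']/text()
    # so every child xpath has to be written as a //-prefixed path relative to its parent.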
    def parse_entry_list_data2dictionary(self, list_of_fdbs):
        for fdb in list_of_fdbs:
            try:
                iteration_var_list = eval(self.config.get(fdb).get("entry-list").get("iteration-var-list"))
            except Exception as e:
                print(
                    "There is a problem with the configuration variable entry-list iteration-var-list in the config.yaml - the original error message is:",
                    e,
                )
            for i in iteration_var_list:
                print(i)
                try:
                    # use the soupparser to handle broken html
                    tree = lxml.html.soupparser.parse(
                        "spiders/pages/" + fdb + str(i) + "entryList.html"
                    )
                except Exception as e:
                    tree = html.parse("spiders/pages/" + fdb + str(i) + "entryList.html")
                    print(
                        "parsing the html files with the soupparser did not work, so broken html will not be fixed as it otherwise could have been. The original error message is:",
                        e,
                    )
                try:
                    dictionary_entry_list = {}
                    fdb_conf = self.config.get(fdb)
                    fdb_domain = fdb_conf.get("domain")
                    fdb_conf_entry_list = fdb_conf.get("entry-list")
                    fdb_conf_entry_list_parent = fdb_conf_entry_list.get("parent")
                    fdb_conf_entry_list_child_name = fdb_conf_entry_list.get("child-name")
                    fdb_conf_entry_list_child_link = fdb_conf_entry_list.get("child-link")
                    fdb_conf_entry_list_child_info = fdb_conf_entry_list.get("child-info")
                    fdb_conf_entry_list_child_period = fdb_conf_entry_list.get("child-period")
                    for n in range(len(tree.xpath(fdb_conf_entry_list_parent))):
                        try:
                            name = tree.xpath(
                                fdb_conf_entry_list_parent
                                + "["
                                + str(n + 1)
                                + "]"
                                + fdb_conf_entry_list_child_name
                            )[0]
                        except Exception as e:
                            print("name could not be parsed", e)
                            name = 'NONE'
                        try:
                            info = tree.xpath(
                                fdb_conf_entry_list_parent
                                + "["
                                + str(n + 1)
                                + "]"
                                + fdb_conf_entry_list_child_info
                            )[0]
                        except Exception as e:
                            print("info could not be parsed", e)
                            info = 'NONE'
                        try:
                            period = tree.xpath(
                                fdb_conf_entry_list_parent
                                + "["
                                + str(n + 1)
                                + "]"
                                + fdb_conf_entry_list_child_period
                            )[0]
                        except Exception as e:
                            print("period could not be parsed", e)
                            period = 'NONE'
                        try:
                            link = tree.xpath(
                                fdb_conf_entry_list_parent
                                + "["
                                + str(n + 1)
                                + "]"
                                + fdb_conf_entry_list_child_link
                            )[0]
                            if 'javascript:' in link:
                                print('link is a javascript element, not an url to parse')
                        except Exception as e:
                            print("link could not be parsed", e)
                            link = 'NONE'
                        if len(name) > 0 and name != 'NONE':
                            dictionary_entry_list[n] = {}
                            dictionary_entry_list[n]["name"] = name
                            dictionary_entry_list[n]["info"] = info
                            dictionary_entry_list[n]["period"] = period
                            print('link:', link, 'domain:', fdb_domain)
                            # absolute links and javascript pseudo-links are stored as they are
                            if fdb_domain in link:
                                dictionary_entry_list[n]["link"] = link
                            if fdb_domain not in link and 'http:' in link:
                                dictionary_entry_list[n]["link"] = link
                            if fdb_domain not in link and 'www.' in link:
                                dictionary_entry_list[n]["link"] = link
                            if fdb_domain not in link and 'https:' in link:
                                dictionary_entry_list[n]["link"] = link
                            if 'javascript:' in link:
                                dictionary_entry_list[n]["link"] = link
                            # relative links get prefixed with the domain from the config
                            if fdb_domain not in link:
                                if 'http' not in link:
                                    if 'www' not in link:
                                        if link[0] == '/':
                                            if fdb_domain[-1] != '/':
                                                dictionary_entry_list[n]["link"] = fdb_domain + link
                                            if fdb_domain[-1] == '/':
                                                dictionary_entry_list[n]["link"] = fdb_domain + link[1:]
                                        if link[0] == '.' and link[1] == '/':
                                            if fdb_domain[-1] != '/':
                                                dictionary_entry_list[n]["link"] = fdb_domain + link[1:]
                                            if fdb_domain[-1] == '/':
                                                dictionary_entry_list[n]["link"] = fdb_domain + link[2:]
                                        if link[0] != '/' and link[0] != '.':
                                            dictionary_entry_list[n]["link"] = fdb_domain + '/' + link
                except Exception as e:
                    print(
                        "parsing the html did not work. Possibly you first have to run download_entry_list_pages_of_funding_databases(). The original error message is:",
                        e,
                    )
                # save interim results to files
                f = open("spiders/output/" + fdb + str(i) + "entryList.txt", "w+")
                f.write(str(dictionary_entry_list))
                f.close()
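
    # For reference: after this step, the interim file spiders/output/<fdb><i>entryList.txt
    # holds the string representation of a dictionary shaped like the following
    # (the values are hypothetical examples; "domain", "text" and "text-word-count"
    # are added later by download_entry_data_htmls and parse_entry_data2dictionary):
    #
    # {
    #     0: {
    #         'name': 'Some funding programme',
    #         'info': 'Short teaser text',
    #         'period': '01.01.2024 - 31.12.2024',
    #         'link': 'https://example.fdb.org/programme.html',
    #     },
    #     1: {...},
    # }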
    def download_entry_data_htmls(self, list_of_fdbs):
        from selenium import webdriver
        from selenium.webdriver.chrome.service import Service
        from pyvirtualdisplay import Display

        display = Display(visible=0, size=(800, 800))
        display.start()
        options = webdriver.ChromeOptions()
        options.add_argument("--remote-debugging-port=9222")
        options.add_argument('--no-sandbox')
        options.add_argument('--disable-dev-shm-usage')
        service = Service(executable_path='/usr/bin/chromedriver')
        driver = webdriver.Chrome(options=options, service=service)
        driver.implicitly_wait(10)
        for fdb in list_of_fdbs:
            print('spidering ' + fdb + ' ..')
            try:
                iteration_var_list = eval(self.config.get(fdb).get("entry-list").get("iteration-var-list"))
            except Exception as e:
                print(
                    "There is a problem with the configuration variable entry-list iteration-var-list in the config.yaml - the original error message is:",
                    e,
                )
            print('starting to download the entry html pages..')
            for i in iteration_var_list:
                print(i)
                f = open("spiders/output/" + fdb + str(i) + "entryList.txt")
                text = f.read()
                dictionary_entry_list = eval(text)
                f.close()
                fdb_conf = self.config.get(fdb)
                fdb_domain = fdb_conf.get("domain")
                fdb_conf_entry_list = fdb_conf.get("entry-list")
                fdb_conf_entry_list_parent = fdb_conf_entry_list.get("parent")
                fdb_conf_entry_list_child_name = fdb_conf_entry_list.get("child-name")
                try:
                    fdb_conf_entry_list_javascript_link = fdb_conf_entry_list.get("javascript-link")
                except Exception as e:
                    fdb_conf_entry_list_javascript_link = 'NONE'
                    print('the javascript-link in the config is missing, the original error message is:', e)
                try:
                    fdb_conf_entry_list_slow_downloading = fdb_conf_entry_list.get("slow-downloading")
                except Exception as e:
                    fdb_conf_entry_list_slow_downloading = 'NONE'
                    print('the slow-downloading parameter is not set, the original error message is:', e)
                fdb_conf_entry_list_link1 = fdb_conf_entry_list.get("link1")
                fdb_conf_entry_list_link2 = fdb_conf_entry_list.get("link2")
                if fdb_conf_entry_list_slow_downloading == 'FALSE':
                    driver.get(fdb_conf_entry_list_link1 + str(i) + fdb_conf_entry_list_link2)
                for entry_id in dictionary_entry_list:
                    print(entry_id)
                    entry_link = dictionary_entry_list[entry_id]["link"]
                    web_content = 'NONE'
                    # download the html page of the entry
                    print(entry_link)
                    if 'javascript' in entry_link or fdb_conf_entry_list_javascript_link != 'NONE':
                        try:
                            accept_button = driver.find_element("xpath", "//button[contains(text(), 'akzeptieren')]")
                            accept_button.click()
                        except Exception as e:
                            print(e, 'no cookies to accept..')
                        driver.execute_script("scroll(0, 600)")
                        print('looking for', fdb_conf_entry_list_parent, entry_id, fdb_conf_entry_list_javascript_link)
                        element = driver.find_element(
                            "xpath",
                            fdb_conf_entry_list_parent
                            + "["
                            + str(entry_id + 1)
                            + "]"
                            + fdb_conf_entry_list_javascript_link
                        )
                        # a short sleep before clicking was suggested to avoid errors
                        time.sleep(1)
                        element.click()
                        window_after = driver.window_handles[1]
                        driver.switch_to.window(window_after)
                        # save the url of the newly opened window as the entry domain
                        entry_domain = driver.current_url
                        dictionary_entry_list[entry_id]["domain"] = entry_domain
                        web_content = driver.page_source
                        file_name = "spiders/pages/" + fdb + str(i) + "/" + str(entry_id) + ".html"
                        os.makedirs(os.path.dirname(file_name), exist_ok=True)
                        f = open(file_name, "w+")
                        f.write(web_content)
                        f.close()
                        window_before = driver.window_handles[0]
                        driver.switch_to.window(window_before)
                    if 'javascript' not in entry_link and '.pdf' not in entry_link and fdb_conf_entry_list_javascript_link == 'NONE':
                        if fdb_conf_entry_list_slow_downloading == 'TRUE':
                            try:
                                print("trying to get the entry link slowly", entry_link)
                                driver.get(entry_link)
                                time.sleep(3)
                                web_content = driver.page_source
                            except Exception as e:
                                print("getting the html behind the entry link did not work, the original message is:", e)
                        else:
                            try:
                                # set a cookie, to not end up in an endless loop of
                                # cookie banners pointing to redirects
                                url = entry_link
                                req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0', 'Cookie': 'myCookie=oioioioi'})
                                response = urllib.request.urlopen(req)
                                print('response from the first request', response)
                            except Exception as e:
                                print('downloading with a preset cookie did not work, the original error is:', e)
                                try:
                                    print(
                                        "opening the link did not work, trying to encode it to ascii replacing xmlcharrefs and to reopen it - the original error message is:",
                                        e,
                                    )
                                    response = urllib.request.urlopen(entry_link.encode('ascii', errors='xmlcharrefreplace').decode('ascii'))
                                except Exception as ex:
                                    print(entry_link, entry_link.encode('ascii', errors='xmlcharrefreplace').decode('ascii'), ex)
                            # read the raw bytes only once; a second response.read()
                            # on the exhausted stream would return an empty string
                            raw_content = response.read()
                            try:
                                web_content = raw_content.decode("UTF-8")
                            except Exception as e:
                                try:
                                    web_content = raw_content.decode("latin-1")
                                    print(
                                        "decoding the response as utf-8 did not work, trying to decode latin-1 now - the original error message is:",
                                        e,
                                    )
                                except Exception as ex:
                                    print(ex)
                    if '.pdf' in entry_link and fdb_conf_entry_list_javascript_link == 'NONE':
                        # the entry links directly to a pdf: save the raw bytes under the
                        # .html file name, where the parse step expects to find them
                        file_name = "spiders/pages/" + fdb + str(i) + "/" + str(entry_id) + ".html"
                        response = requests.get(entry_link)
                        os.makedirs(os.path.dirname(file_name), exist_ok=True)
                        f = open(file_name, "bw")
                        f.write(response.content)
                        f.close()
                    file_name = "spiders/pages/" + fdb + str(i) + "/" + str(entry_id) + ".html"
                    wget_wrote = False
                    if web_content == 'NONE':
                        print('the other downloading approaches did not work, trying requests_html')
                        try:
                            from requests_html import HTMLSession
                            session = HTMLSession()
                            r = session.get(entry_link)
                            r.html.render()
                            web_content = r.text
                        except Exception as e:
                            print('requests_html HTMLSession did not work, trying wget, the original error is:', e)
                            try:
                                os.makedirs(os.path.dirname(file_name), exist_ok=True)
                                # check=True makes a failing wget raise CalledProcessError
                                subprocess.run(["wget", entry_link, '--output-document=' + file_name], check=True)
                                wget_wrote = True
                            except subprocess.CalledProcessError:
                                print('wget downloading did not work.. saving NONE to the file now')
                    if wget_wrote == False:
                        os.makedirs(os.path.dirname(file_name), exist_ok=True)
                        f = open(file_name, "w+")
                        f.write(web_content)
                        f.close()
                # save the interim results, mainly the entry_domain added for javascript links
                f = open("spiders/output/" + fdb + str(i) + "entryList.txt", "w+")
                f.write(str(dictionary_entry_list))
                f.close()
    def parse_entry_data2dictionary(self, list_of_fdbs):
        for fdb in list_of_fdbs:
            try:
                fdb_config = self.config.get(fdb)
                print('config of the fdb:', fdb_config)
                fdb_config_entrylist = fdb_config.get("entry-list")
                iteration_var_list = eval(fdb_config_entrylist.get("iteration-var-list"))
            except Exception as e:
                print(
                    "There is a problem with the configuration variable entry-list iteration-var-list in the config.yaml - the original error message is:",
                    e,
                )
            for i in iteration_var_list:
                print("started to parse the data of the entries of " + fdb + " ..")
                f = open("spiders/output/" + fdb + str(i) + "entryList.txt")
                text = f.read()
                dictionary_entry_list = eval(text)
                f.close()
                fdb_conf = self.config.get(fdb)
                fdb_domain = fdb_conf.get("domain")
                fdb_conf_entry = fdb_conf.get("entry")
                fdb_conf_entry_general = fdb_conf_entry.get("general")
                for entry_id in dictionary_entry_list:
                    print(
                        "started to parse the data of the entry with name "
                        + dictionary_entry_list[entry_id]["name"]
                        + " .."
                    )
                    file_name = "spiders/pages/" + fdb + str(i) + "/" + str(entry_id) + ".html"
                    try:
                        # use the soupparser to handle broken html
                        tree = lxml.html.soupparser.parse(file_name)
                    except Exception as e:
                        tree = html.parse(file_name)
                        print(
                            "parsing the html files with the soupparser did not work, so broken html will not be fixed as it otherwise could have been. The original error message is:",
                            e,
                        )
                    if fdb_conf_entry_general["uniform"] == 'TRUE':
                        fdb_conf_entry_unitrue = fdb_conf_entry.get("unitrue")
                        for key in fdb_conf_entry_unitrue:
                            fdb_conf_entry_unitrue_child = fdb_conf_entry_unitrue.get(key)
                            print('unitrue child', fdb_conf_entry_unitrue_child)
                            try:
                                child = tree.xpath(
                                    fdb_conf_entry_unitrue_child
                                )[0]
                                print('unitrue child xpath result:', child)
                            except Exception:
                                print('getting the unitrue child did not work')
                                child = 'NONE'
                            if '.pdf' in child:
                                print('the child in the entry data is a pdf, downloading it..')
                                file_name = "spiders/pages/" + fdb + str(i) + "/" + str(entry_id) + ".pdf"
                                entry_link = dictionary_entry_list[entry_id]["link"]
                                print('that is the child: ' + child)
                                if 'http' in child:
                                    try:
                                        response = requests.get(child)
                                    except Exception as e:
                                        print(child + ' does not appear to be a valid pdf link to download, the original message is:', e)
                                if 'http' not in child:
                                    if 'javascript' not in entry_link and 'js' not in entry_link and 'http' in entry_link:
                                        try:
                                            response = requests.get(entry_link + child)
                                        except Exception as e:
                                            print(entry_link + child + ' seems not to be a valid pdf link to download, the original error message is:', e)
                                    if 'javascript' in entry_link or 'js' in entry_link:
                                        entry_domain = dictionary_entry_list[entry_id]["domain"]
                                        if child[0] == '.' and child[1] == '/':
                                            if entry_domain[-1] == '/':
                                                pdf_link = entry_domain[:-1] + child[1:]
                                            if entry_domain[-1] != '/':
                                                # cut the domain back to (but excluding) the last slash,
                                                # so that prepending it to child[1:] keeps exactly one slash
                                                cut_value = 0
                                                for n in range(len(entry_domain)):
                                                    if entry_domain[-n] != '/':
                                                        cut_value += 1
                                                    else:
                                                        break
                                                entry_domain = entry_domain[:-cut_value]
                                                pdf_link = entry_domain + child[1:]
                                        if child[0] == '/':
                                            if entry_domain[-1] == '/':
                                                pdf_link = entry_domain[:-1] + child
                                            if entry_domain[-1] != '/':
                                                pdf_link = entry_domain + child
                                        print('pdf_link', pdf_link)
                                        try:
                                            response = requests.get(pdf_link)
                                        except Exception as e:
                                            print(pdf_link + ' seems not to be a valid pdf link to download, the original error message is:', e)
                                os.makedirs(os.path.dirname(file_name), exist_ok=True)
                                f = open(file_name, "bw")
                                f.write(response.content)
                                f.close()
                                print('parsing the pdf', file_name, entry_id)
                                try:
                                    generaltext = ''
                                    for page_layout in extract_pages(file_name):
                                        for element in page_layout:
                                            if isinstance(element, LTTextContainer):
                                                generaltext += element.get_text()
                                except Exception as e:
                                    generaltext = 'NONE'
                                    print('parsing the pdf did not work, the original error is:', e)
                                dictionary_entry_list[entry_id][key] = generaltext
                            if len(child) > 0 and '.pdf' not in child:
                                dictionary_entry_list[entry_id][key] = child
                    else:
                        fdb_conf_entry_unifalse = fdb_conf_entry.get("unifalse")
                        fdb_conf_entry_unifalse_wordlist = fdb_conf_entry_unifalse.get("wordlist")
                        if '.pdf' in dictionary_entry_list[entry_id]["link"]:
                            print('parsing a pdf', dictionary_entry_list[entry_id]["link"], entry_id)
                            try:
                                generaltext = ''
                                for page_layout in extract_pages(file_name):
                                    for element in page_layout:
                                        if isinstance(element, LTTextContainer):
                                            generaltext += element.get_text()
                            except Exception as e:
                                generaltext = 'NONE'
                                print('parsing the pdf did not work, the original error is:', e)
                        else:
                            p_text = tree.xpath(
                                "//p//text()"
                            )
                            div_text = tree.xpath(
                                "//div//text()"
                            )
                            generaltext = ''
                            for n in range(len(p_text)):
                                if len(p_text[n]) > 0:
                                    generaltext += p_text[n] + ' '
                            for n in range(len(div_text)):
                                if len(div_text[n]) > 0 and div_text[n] not in p_text:
                                    generaltext += div_text[n] + ' '
                        generaltextlist = generaltext.split(' ')
                        if len(generaltextlist) > 5000:
                            print('text over 5000 words for entry id', entry_id, ' number of words:', len(generaltextlist))
                            file_name = "spiders/pages/" + fdb + str(i) + "/" + str(entry_id) + ".html"
                            try:
                                with open(file_name, 'r', encoding='utf-8') as file:
                                    html_content = file.read()
                            except Exception as e:
                                with open(file_name, 'r', encoding='latin-1') as file:
                                    html_content = file.read()
                                print('opening the file as utf-8 for trafilatura did not work, trying latin-1, the original error message is:', e)
                            generaltext = extract(html_content)
                            if generaltext is None:
                                generaltext = 'NONE'
                            print('generaltext word count was:', len(generaltextlist), 'but now trafilatura did the job and the new word count is:', len(generaltext.split(' ')))
                        if len(generaltextlist) < 2:
                            print('no text parsed, the word count is', len(generaltextlist))
                            print('text under 2 words for entry id', entry_id, ' number of words:', len(generaltextlist))
                            file_name = "spiders/pages/" + fdb + str(i) + "/" + str(entry_id) + ".html"
                            try:
                                with open(file_name, 'r', encoding='utf-8') as file:
                                    html_content = file.read()
                            except Exception as e:
                                with open(file_name, 'r', encoding='latin-1') as file:
                                    html_content = file.read()
                                print('opening the file as utf-8 for trafilatura did not work, trying latin-1, the original error message is:', e)
                            generaltext = extract(html_content)
                            try:
                                if len(generaltext) > 2:
                                    print('generaltext word count was:', len(generaltextlist), 'but now trafilatura did the job and the new word count is:', len(generaltext.split(' ')))
                            except Exception:
                                print('trafilatura got this out:', generaltext, 'setting generaltext to NONE')
                                generaltext = 'NONE'
                        dictionary_entry_list[entry_id]["text"] = generaltext
                        dictionary_entry_list[entry_id]["text-word-count"] = len(generaltextlist)
                f = open("spiders/output/" + fdb + str(i) + "entryList.txt", "w+")
                f.write(str(dictionary_entry_list))
                f.close()
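
    # A sketch of how the steps above are meant to be chained, assuming a config
    # file called config.yaml; the fdb name is the example from the comment on
    # download_entry_list_pages_of_funding_databases, not a tested configuration:
    #
    #     spider = fdb_spider('config.yaml')
    #     spider.download_entry_list_pages_of_funding_databases(['foerderinfo.bund.de'])
    #     spider.find_config_parameter(['foerderinfo.bund.de'])      # optional debugging aid
    #     spider.parse_entry_list_data2dictionary(['foerderinfo.bund.de'])
    #     spider.download_entry_data_htmls(['foerderinfo.bund.de'])
    #     spider.parse_entry_data2dictionary(['foerderinfo.bund.de'])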