import os

import yaml
import json

import urllib.request, urllib.error, urllib.parse

from lxml import etree
import lxml.html
import lxml.html.soupparser
from lxml import html

import requests

from trafilatura import extract

from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextContainer

class fdb_spider(object):
    def __init__(self, config_file):
        """Load the spider configuration from a YAML file.

        The parsed document is stored in ``self.config``; the other methods
        look up one sub-mapping per funding database in it.

        Args:
            config_file: Path of the YAML configuration file.

        Raises:
            OSError: If ``config_file`` cannot be opened
                (for example ``FileNotFoundError``).
        """
        # Start from an empty mapping so the attribute always exists, even if
        # the YAML below turns out to be malformed.  Previously a parse error
        # was printed but ``self.config`` was left unset.
        self.config = {}

        with open(config_file, "r") as stream:
            try:
                loaded = yaml.safe_load(stream)
            except yaml.YAMLError as exc:
                # Best effort: report the problem and keep the empty config.
                print(exc)
            else:
                # safe_load() returns None for an empty document; keep the
                # empty mapping in that case.
                if loaded is not None:
                    self.config = loaded

    # The methods below take a list of funding-database keys as they appear in the YAML config, e.g. ['foerderinfo.bund.de', 'ausschreibungen.giz.de', ...]

    def download_entry_list_pages_of_funding_databases(self, list_of_fdbs):
        """Download the entry-list HTML pages of the selected funding databases.

        For every key of ``self.config`` that is contained in ``list_of_fdbs``
        the URL ``link1 + str(i) + link2`` is fetched for each ``i`` of the
        configured ``iteration-var-list``.  Each page is written to
        ``spiders/pages/<key><i>entryList.html``.

        A database whose ``entry-list`` configuration cannot be read is
        reported on stdout and skipped.

        Args:
            list_of_fdbs: Collection of config keys selecting the databases
                to download.

        Returns:
            None.  The result is the set of files written to disk.
        """
        # Iterate the config once; selection happens through the membership
        # test.  (An additional outer loop over list_of_fdbs used to repeat
        # every download once per list element.)
        for key in self.config:
            if key not in list_of_fdbs:
                continue

            try:
                entry_list = self.config.get(key).get("entry-list")
            except Exception as e:
                print(
                    "There is a problem with the configuration variable entryList in the config.yaml - the original error message is:",
                    e,
                )
                continue

            try:
                entry_list_link1 = entry_list.get("link1")
            except Exception as e:
                print(
                    "No link1 defined in config.yaml - the original error message is:",
                    e,
                )
                continue

            try:
                entry_list_link2 = entry_list.get("link2")
            except Exception as e:
                print(
                    "No link2 defined in config.yaml - the original error message is:",
                    e,
                )
                continue

            try:
                # NOTE(security): eval() executes whatever expression the
                # config file contains.  Only use config files you trust.
                entry_iteration_var_list = eval(entry_list.get("iteration-var-list"))
            except Exception as e:
                print(
                    "No iteration-var-list defined in config.yaml - the original error message is:",
                    e,
                )
                continue

            for i in entry_iteration_var_list:
                # download the html page of the list of entries
                page_url = entry_list_link1 + str(i) + entry_list_link2

                response = urllib.request.urlopen(page_url)
                try:
                    # Read the body exactly once and keep the bytes: a second
                    # read() on the same response yields nothing, so the
                    # latin-1 fallback has to reuse them.
                    raw_content = response.read()
                except Exception as ex:
                    print(ex)
                    raw_content = b""
                finally:
                    response.close()

                try:
                    web_content = raw_content.decode("UTF-8")
                except UnicodeDecodeError as e:
                    print(
                        "decoding the respone in utf8 did not work, try to decode latin1 now - the original error message is:",
                        e,
                    )
                    # latin-1 maps every byte, so this decode cannot fail.
                    web_content = raw_content.decode("latin-1")

                # A nearly empty body is treated as a failed download.
                if len(web_content) < 10:
                    print('getting the html page through urllib did not work, trying with requests librarys function get')
                    try:
                        res = requests.get(page_url)
                        web_content = res.text
                    except Exception as e:
                        print('also requests library did not work, original error is:', e)

                print(web_content)

                # save interim results to files
                with open("spiders/pages/" + key + str(i) + "entryList.html", "w+") as f:
                    f.write(web_content)

    def find_config_parameter(self, list_of_fdbs):
        """Print what the configured entry-list XPaths match (debugging aid).

        For each database in ``list_of_fdbs`` and each ``i`` of its
        ``iteration-var-list`` the saved page
        ``spiders/pages/<fdb><i>entryList.html`` is parsed.  The serialized
        ``parent`` matches are printed, followed by the results (and their
        number) of ``parent + child-name``, ``child-link``, ``child-info``
        and ``child-period``.

        A database whose configuration cannot be read is reported and
        skipped.

        Args:
            list_of_fdbs: Iterable of config keys to inspect.

        Returns:
            None.  All output goes to stdout.
        """
        for fdb in list_of_fdbs:

            try:
                # NOTE(security): eval() executes whatever expression the
                # config file contains.  Only use config files you trust.
                iteration_var_list = eval(self.config.get(fdb).get("entry-list").get("iteration-var-list"))
            except Exception as e:
                print(
                    "There is a problem with the configuration variable entryList iteration var list in the config.yaml",
                    e,
                )
                # Without a usable iteration list there is nothing to
                # inspect; carrying on would fail on the missing config or
                # reuse the previous database's list.
                continue

            fdb_conf_entry_list = self.config.get(fdb).get("entry-list")
            parent_xpath = fdb_conf_entry_list.get("parent")
            # (label used in the output, XPath appended to the parent XPath)
            child_xpaths = (
                ("name", fdb_conf_entry_list.get("child-name")),
                ("link", fdb_conf_entry_list.get("child-link")),
                ("info", fdb_conf_entry_list.get("child-info")),
                ("period", fdb_conf_entry_list.get("child-period")),
            )

            for i in iteration_var_list:
                print(i)
                page_path = "spiders/pages/" + fdb + str(i) + "entryList.html"

                try:
                    # use soupparser to handle broken html
                    tree = lxml.html.soupparser.parse(page_path)
                except Exception as e:
                    tree = html.parse(page_path)
                    print(
                        "parsing the xml files did not work with the soupparser. Broken html will not be fixed as it could have been",
                        e,
                    )

                try:
                    print('this is the n looped elements of the parent specified in config.yaml:')

                    # Evaluate the parent XPath once instead of once per
                    # printed element.
                    for parent_element in tree.xpath(parent_xpath):
                        print('-----------------------------------------------------------------------------------------------------------------------------------------')
                        print(etree.tostring(parent_element).decode())

                    for label, child_xpath in child_xpaths:
                        print('this is the ' + label + ' children:')
                        matches = tree.xpath(parent_xpath + child_xpath)
                        print(matches)
                        print(len(matches))

                except Exception as e:
                    print(
                        "parsing the html did not work.",
                        e,
                    )


    def parse_entry_list_data2dictionary(self, list_of_fdbs):
        """Parse the saved entry-list pages into dictionaries and store them.

        For each database in ``list_of_fdbs`` and each ``i`` of its
        ``iteration-var-list`` the page
        ``spiders/pages/<fdb><i>entryList.html`` is parsed.  For the n-th
        match of the configured ``parent`` XPath the first result of each
        ``child-name`` / ``child-info`` / ``child-period`` / ``child-link``
        XPath is taken (``'NONE'`` when it cannot be obtained).  Entries
        with a non-empty name are collected as
        ``{n: {"name", "info", "period", "link"}}`` and the ``str()`` of
        that dictionary is written to
        ``spiders/output/<fdb><i>entryList.txt``.

        A database whose configuration cannot be read is reported and
        skipped.

        Args:
            list_of_fdbs: Iterable of config keys to process.

        Returns:
            None.  Results are written to disk.
        """
        for fdb in list_of_fdbs:

            try:
                # NOTE(security): eval() executes whatever expression the
                # config file contains.  Only use config files you trust.
                iteration_var_list = eval(self.config.get(fdb).get("entry-list").get("iteration-var-list"))
            except Exception as e:
                print(
                    "There is a problem with the configuration variable entryList iteration var list in the config.yaml - the original error message is:",
                    e,
                )
                # Do not fall through to an undefined (or stale) list.
                continue

            for i in iteration_var_list:
                print(i)
                page_path = "spiders/pages/" + fdb + str(i) + "entryList.html"

                try:
                    # use soupparser to handle broken html
                    tree = lxml.html.soupparser.parse(page_path)
                except Exception as e:
                    tree = html.parse(page_path)
                    print(
                        "parsing the xml files did not work with the soupparser. Broken html will not be fixed as it could have been, thanks to efficient particular html languages. The original error message is:",
                        e,
                    )

                # Created before the try block so that whatever was collected
                # can always be written out below.
                dictionary_entry_list = {}

                try:
                    fdb_conf = self.config.get(fdb)
                    fdb_domain = fdb_conf.get("domain")
                    fdb_conf_entry_list = fdb_conf.get("entry-list")
                    parent_xpath = fdb_conf_entry_list.get("parent")
                    child_name_xpath = fdb_conf_entry_list.get("child-name")
                    child_link_xpath = fdb_conf_entry_list.get("child-link")
                    child_info_xpath = fdb_conf_entry_list.get("child-info")
                    child_period_xpath = fdb_conf_entry_list.get("child-period")

                    # Count the parent matches once instead of per iteration.
                    parent_count = len(tree.xpath(parent_xpath))

                    for n in range(parent_count):
                        # XPath positional predicates are 1-based.
                        nth_parent_xpath = parent_xpath + "[" + str(n + 1) + "]"

                        name = self._first_entry_list_match(
                            tree, nth_parent_xpath, child_name_xpath, "name"
                        )
                        info = self._first_entry_list_match(
                            tree, nth_parent_xpath, child_info_xpath, "info"
                        )
                        period = self._first_entry_list_match(
                            tree, nth_parent_xpath, child_period_xpath, "period"
                        )

                        try:
                            link = tree.xpath(nth_parent_xpath + child_link_xpath)[0]

                            if 'javascript:' in link:
                                print('link is javascript element, not url to parse')

                        except Exception as e:
                            # The handler must not print ``link`` itself: it
                            # is unbound (or left over from the previous
                            # entry) when the lookup failed.
                            print("link could not be parsed", e)
                            link = 'NONE'

                        if len(name) > 0 and name != 'NONE':
                            dictionary_entry_list[n] = {}
                            dictionary_entry_list[n]["name"] = name
                            dictionary_entry_list[n]["info"] = info
                            dictionary_entry_list[n]["period"] = period
                            dictionary_entry_list[n]["link"] = self._resolve_entry_link(
                                fdb_domain, link
                            )

                except Exception as e:
                    print(
                        "parsing the html did not work. Possibly you first have to run download_entry_list_pages_of_funding_databases(). The original error message is:",
                        e,
                    )

                # save interim results to files
                with open("spiders/output/" + fdb + str(i) + "entryList.txt", "w+") as f:
                    f.write(str(dictionary_entry_list))

    @staticmethod
    def _first_entry_list_match(tree, parent_xpath, child_xpath, label):
        """Return the first result of ``parent_xpath + child_xpath`` or 'NONE'.

        Any failure (missing child XPath, invalid expression, no match) is
        printed with ``label`` and turned into the ``'NONE'`` placeholder.
        """
        try:
            return tree.xpath(parent_xpath + child_xpath)[0]
        except Exception as e:
            print(label + " could not be parsed", e)
            return 'NONE'

    @staticmethod
    def _resolve_entry_link(fdb_domain, link):
        """Return the link to store for an entry.

        The ``'NONE'`` placeholder, ``javascript:`` links and links that
        already contain the domain or look absolute (``http:``, ``https:``,
        ``www.``) are kept unchanged.  Everything else is treated as
        relative and joined to ``fdb_domain`` with exactly one slash.
        """
        if link == 'NONE' or 'javascript:' in link:
            return link
        if fdb_domain in link or 'http:' in link or 'https:' in link or 'www.' in link:
            return link
        # Normalize both sides so neither a doubled nor a missing slash can
        # appear, whichever side carries one.
        return fdb_domain.rstrip('/') + '/' + link.lstrip('/')

    def download_entry_data_htmls(self, list_of_fdbs):
        """Download the detail page (or PDF) of every entry in the entry lists.

        For each database in ``list_of_fdbs`` and each ``i`` of its
        ``iteration-var-list`` the dictionary stored in
        ``spiders/output/<fdb><i>entryList.txt`` is read.  The ``link`` of
        every entry is fetched and saved as
        ``spiders/pages/<fdb><i>/<entry_id>.html``.  Links containing
        ``.pdf`` are stored as raw bytes under the same ``.html`` name.

        A database whose configuration cannot be read is reported and
        skipped.

        Args:
            list_of_fdbs: Iterable of config keys to process.

        Returns:
            None.  Results are written to disk.
        """
        for fdb in list_of_fdbs:

            try:
                # NOTE(security): eval() executes whatever expression the
                # config file contains.  Only use config files you trust.
                iteration_var_list = eval(self.config.get(fdb).get("entry-list").get("iteration-var-list"))
            except Exception as e:
                print(
                    "There is a problem with the configuration variable entryList iteration var list in the config.yaml - the original error message is:",
                    e,
                )
                # Do not fall through to an undefined (or stale) list.
                continue

            for i in iteration_var_list:

                with open("spiders/output/" + fdb + str(i) + "entryList.txt") as f:
                    text = f.read()

                # NOTE(security): the interim file is evaluated as Python
                # code; it must only ever contain this program's own output.
                dictionary_entry_list = eval(text)

                for entry_id in dictionary_entry_list:
                    entry_link = dictionary_entry_list[entry_id]["link"]
                    file_name = "spiders/pages/" + fdb + str(i) + "/" + str(entry_id) + ".html"

                    if '.pdf' in entry_link:
                        # PDFs are fetched as bytes with requests only; an
                        # additional urllib download whose result was thrown
                        # away has been dropped.
                        response = requests.get(entry_link)
                        os.makedirs(os.path.dirname(file_name), exist_ok=True)
                        with open(file_name, "bw") as f:
                            f.write(response.content)
                        continue

                    # download the html page of the entry

                    # Reset per entry so a failed download can never reuse the
                    # response or content of the previous entry.
                    response = None
                    web_content = ''

                    try:
                        # defining cookie to not end up in endless loop because of cookie banners pointing to redirects
                        req = urllib.request.Request(entry_link, headers={'User-Agent': 'Mozilla/5.0', 'Cookie':'myCookie=lovely'})
                        response = urllib.request.urlopen(req)
                    except Exception as e:
                        ascii_link = entry_link.encode('ascii', errors='xmlcharrefreplace').decode('ascii')
                        print(
                            "opening the link did not work, try to encode to ascii replacing xmlcharrefs now and reopen - the original error message is:",
                            e,
                        )
                        try:
                            response = urllib.request.urlopen(ascii_link)
                        except Exception as ex:
                            print(entry_link, ascii_link, ex)

                    if response is not None:
                        try:
                            # Read the body exactly once and keep the bytes: a
                            # second read() on the same response yields
                            # nothing, so the latin-1 fallback reuses them.
                            raw_content = response.read()
                        except Exception as ex:
                            print(ex)
                            raw_content = b''
                        finally:
                            response.close()

                        try:
                            web_content = raw_content.decode("UTF-8")
                        except UnicodeDecodeError as e:
                            print(
                                "decoding the respone in utf8 did not work, try to decode latin1 now - the original error message is:",
                                e,
                            )
                            # latin-1 maps every byte, so this cannot fail.
                            web_content = raw_content.decode("latin-1")

                    if not web_content:
                        print('other downloading approaches did not work, trying requests')

                        try:
                            # Imported lazily: requests_html is only needed
                            # for this last-resort path, and a missing
                            # package is handled like any other failure.
                            from requests_html import HTMLSession
                            session = HTMLSession()

                            r = session.get(entry_link)

                            r.html.render()
                            # NOTE(review): r.text looks like the response
                            # body as received; the rendered markup is
                            # presumably r.html.html - confirm which one is
                            # wanted here.
                            web_content = r.text

                        except Exception as e:
                            print('requests_html HTMLSession did not work', e)

                    # save interim results to files
                    os.makedirs(os.path.dirname(file_name), exist_ok=True)
                    with open(file_name, "w+") as f:
                        f.write(web_content)

    def parse_entry_data2dictionary(self, list_of_fdbs):
        """Add the data parsed from each saved entry page to the entry lists.

        For each database in ``list_of_fdbs`` and each ``i`` of its
        ``iteration-var-list`` the dictionary stored in
        ``spiders/output/<fdb><i>entryList.txt`` is read, extended per entry
        and written back to the same file.

        * If ``entry.general.uniform`` equals ``'TRUE'``, every key of
          ``entry.unitrue`` is treated as an XPath; its first match on the
          entry page is stored under that key.
        * Otherwise ``"text"`` and ``"text-word-count"`` are stored.  PDFs
          (links containing ``.pdf``) are read with pdfminer.  HTML pages
          use the text nodes below ``<p>`` and ``<div>``; when that yields
          more than 5000 or fewer than 2 space-separated tokens,
          trafilatura's ``extract`` is used instead.

        A database whose configuration cannot be read is reported and
        skipped.

        Args:
            list_of_fdbs: Iterable of config keys to process.

        Returns:
            None.  Results are written to disk.
        """
        for fdb in list_of_fdbs:

            try:
                # NOTE(security): eval() executes whatever expression the
                # config file contains.  Only use config files you trust.
                iteration_var_list = eval(self.config.get(fdb).get("entry-list").get("iteration-var-list"))
            except Exception as e:
                print(
                    "There is a problem with the configuration variable entryList iteration var list in the config.yaml - the original error message is:",
                    e,
                )
                # Do not fall through to an undefined (or stale) list.
                continue

            for i in iteration_var_list:

                print("started to parse data of entry of " + fdb + " ..")

                entry_list_path = "spiders/output/" + fdb + str(i) + "entryList.txt"
                with open(entry_list_path) as f:
                    text = f.read()

                # NOTE(security): the interim file is evaluated as Python
                # code; it must only ever contain this program's own output.
                dictionary_entry_list = eval(text)

                fdb_conf_entry = self.config.get(fdb).get("entry")
                fdb_conf_entry_general = fdb_conf_entry.get("general")

                for entry_id in dictionary_entry_list:
                    entry = dictionary_entry_list[entry_id]
                    print(
                        "started to parse data of entry with name "
                        + entry["name"]
                        + " .."
                    )

                    file_name = "spiders/pages/" + fdb + str(i) + "/" + str(entry_id) + ".html"

                    # NOTE(review): an unquoted TRUE in YAML is loaded as the
                    # boolean True and would not match this string - confirm
                    # that the config quotes the value.
                    if fdb_conf_entry_general["uniform"] == 'TRUE':
                        tree = self._parse_entry_page_tree(file_name)
                        fdb_conf_entry_unitrue = fdb_conf_entry.get("unitrue")

                        for key in fdb_conf_entry_unitrue:
                            fdb_conf_entry_unitrue_child = fdb_conf_entry_unitrue.get(key)

                            # This lookup used to reference an undefined
                            # name, which made the whole branch raise
                            # NameError.
                            child = tree.xpath(fdb_conf_entry_unitrue_child)

                            if len(child) > 0:
                                entry[key] = child[0]

                        continue

                    if '.pdf' in entry["link"]:

                        print('parsing a pdf', entry["link"], entry_id)

                        try:
                            text_parts = []
                            for page_layout in extract_pages(file_name):
                                for element in page_layout:
                                    if isinstance(element, LTTextContainer):
                                        text_parts.append(element.get_text())
                            generaltext = ''.join(text_parts)

                        except Exception as e:
                            generaltext = 'NONE'
                            print('parsing pdf did not work, the original error is:', e )

                        # The word list has to be defined in this branch as
                        # well; it is needed for the word count stored below.
                        generaltextlist = generaltext.split(' ')

                    else:
                        # The page is parsed as HTML only here, so PDF bytes
                        # are never fed to the HTML parsers.
                        tree = self._parse_entry_page_tree(file_name)

                        p_text = tree.xpath("//p//text()")
                        div_text = tree.xpath("//div//text()")

                        # Set lookup instead of scanning the p_text list once
                        # per <div> text node.
                        p_text_seen = set(p_text)
                        pieces = [piece for piece in p_text if len(piece) > 0]
                        pieces += [
                            piece
                            for piece in div_text
                            if len(piece) > 0 and piece not in p_text_seen
                        ]
                        # Every piece is followed by one space (including the
                        # last one), exactly as when the text was built by
                        # repeated concatenation.
                        generaltext = ''.join(piece + ' ' for piece in pieces)

                        generaltextlist = generaltext.split(' ')
                        word_count = len(generaltextlist)

                        if word_count > 5000:
                            print('text over 5000 words for entry id', entry_id, ' number of words:', word_count)

                            extracted = extract(self._read_entry_page_text(file_name))
                            if extracted is None:
                                # Nothing usable came back: keep the XPath
                                # text instead of failing on None.
                                print('trafilatura got this out:', extracted, 'keeping the text parsed via xpath')
                            else:
                                generaltext = extracted
                                print('generaltext word count was: ', word_count, 'but now trafilatura did the job and new wordcount is:', len(generaltext.split(' ')))

                        elif word_count < 2:

                            print('no text parsed, the wc is', word_count)

                            print('text under 2 words for entry id', entry_id, ' number of words:', word_count)

                            generaltext = extract(self._read_entry_page_text(file_name))
                            if generaltext is None:
                                print('trafilatura got this out:', generaltext , 'setting generaltext to NONE')
                                generaltext = 'NONE'
                            elif len(generaltext) > 2:
                                print('generaltext word count was: ', word_count, 'but now trafilatura did the job and new wordcount is:', len(generaltext.split(' ')))

                    entry["text"] = generaltext
                    # NOTE(review): for HTML pages this is the token count of
                    # the XPath extraction, i.e. taken before a possible
                    # trafilatura replacement - confirm this is intended.
                    entry["text-word-count"] = len(generaltextlist)

                with open(entry_list_path, "w+") as f:
                    f.write(str(dictionary_entry_list))

    @staticmethod
    def _parse_entry_page_tree(file_name):
        """Parse a saved entry page, preferring the lenient soupparser."""
        try:
            return lxml.html.soupparser.parse(file_name)
        except Exception as e:
            tree = html.parse(file_name)
            print(
                "parsing the xml files did not work with the soupparser. Broken html will not be fixed as it could have been, thanks to efficient particular html languages. The original error message is:",
                e,
            )
            return tree

    @staticmethod
    def _read_entry_page_text(file_name):
        """Read a saved entry page as UTF-8 text, falling back to latin-1."""
        try:
            with open(file_name , 'r', encoding='utf-8') as file:
                return file.read()
        except UnicodeDecodeError as e:
            with open(file_name , 'r', encoding='latin-1') as file:
                html_content = file.read()
            print('encoding utf8 in opening with trafilatura did not work, trying latin1, original error message is:', e)
            return html_content