import os
import json
import urllib.request
import urllib.error
import urllib.parse

import yaml
from lxml import etree
import lxml.html
import lxml.html.soupparser
from lxml import html
import requests
from trafilatura import extract
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextContainer


class fdb_spider(object):
    """Download and parse funding-database pages described in a YAML config.

    The config maps a database name to a dict that this class reads the keys
    "domain", "entry-list" (with "link1", "link2", "iteration-var-list",
    "parent", "child-name", "child-link", "child-info", "child-period") and
    "entry" (with "general" and "unitrue") from.

    Interim results are stored on disk:
      * spiders/pages/<fdb><i>entryList.html    - downloaded list pages
      * spiders/output/<fdb><i>entryList.txt    - str() of the entry dictionary
      * spiders/pages/<fdb><i>/<entry_id>.html  - downloaded entry pages / pdfs
    """

    def __init__(self, config_file):
        """Load the YAML configuration from *config_file*.

        A YAML syntax error is printed and leaves an empty configuration
        instead of an instance without a ``config`` attribute.
        """
        self.config = {}
        with open(config_file, "r") as stream:
            try:
                self.config = yaml.safe_load(stream)
            except yaml.YAMLError as exc:
                print(exc)

    # ------------------------------------------------------------------
    # private helpers
    # ------------------------------------------------------------------

    def _load_iteration_var_list(self, fdb, error_message):
        """Return the evaluated "iteration-var-list" of *fdb*, or None.

        On any failure *error_message* and the exception are printed and
        None is returned so the caller can skip this database.
        """
        try:
            # NOTE(security): eval() executes whatever expression is stored
            # in config.yaml. Only use configuration files you trust.
            return eval(
                self.config.get(fdb).get("entry-list").get("iteration-var-list")
            )
        except Exception as e:
            print(error_message, e)
            return None

    @staticmethod
    def _decode_bytes(raw):
        """Decode *raw* bytes as UTF-8, falling back to latin-1."""
        try:
            return raw.decode("UTF-8")
        except UnicodeDecodeError as e:
            print(
                "decoding the respone in utf8 did not work, try to decode latin1 now - the original error message is:",
                e,
            )
            # latin-1 maps every byte value, so this decode cannot fail.
            return raw.decode("latin-1")

    @staticmethod
    def _parse_html_file(path, fallback_message):
        """Parse *path* with the soupparser, falling back to lxml.html.

        The soupparser is tried first because it copes with broken html.
        If it fails, *fallback_message* and the error are printed and the
        plain lxml.html parser is used; an error raised by that parser
        propagates to the caller.
        """
        try:
            return lxml.html.soupparser.parse(path)
        except Exception as e:
            print(fallback_message, e)
            return html.parse(path)

    @staticmethod
    def _first_xpath_match(tree, expression, label):
        """Return the first result of *expression* on *tree* or 'NONE'."""
        try:
            return tree.xpath(expression)[0]
        except Exception as e:
            print(label + " could not be parsed", e)
            return 'NONE'

    @staticmethod
    def _resolve_link(fdb_domain, link):
        """Return the link to store for an entry.

        Links that contain the domain, look like absolute urls, or are
        javascript pseudo links are kept unchanged. Everything else is
        treated as a path and joined to *fdb_domain* with exactly one
        slash in between.
        """
        if 'javascript:' in link:
            return link
        if fdb_domain in link:
            return link
        if 'http:' in link or 'www.' in link or 'https:' in link:
            return link
        return fdb_domain.rstrip('/') + '/' + link.lstrip('/')

    @staticmethod
    def _read_entry_dictionary(fdb, i):
        """Read the entry dictionary stored for page *i* of *fdb*."""
        with open("spiders/output/" + fdb + str(i) + "entryList.txt") as f:
            text = f.read()
        # NOTE(security): the file holds str(dict) built from scraped web
        # content and is evaluated with eval(). Only read files produced by
        # this spider.
        return eval(text)

    @staticmethod
    def _write_entry_dictionary(fdb, i, dictionary_entry_list):
        """Store str(dictionary_entry_list) for page *i* of *fdb*."""
        with open("spiders/output/" + fdb + str(i) + "entryList.txt", "w+") as f:
            f.write(str(dictionary_entry_list))

    @staticmethod
    def _fetch_entry_html(entry_link):
        """Download *entry_link* with urllib and return the decoded text.

        Returns an empty string when the page could not be fetched, so the
        caller can try another download strategy.
        """
        raw = None
        try:
            # defining cookie to not end up in endless loop because of
            # cookie banners pointing to redirects
            req = urllib.request.Request(
                entry_link,
                headers={'User-Agent': 'Mozilla/5.0', 'Cookie': 'myCookie=lovely'},
            )
            with urllib.request.urlopen(req) as response:
                raw = response.read()
        except Exception as e:
            ascii_link = entry_link.encode(
                'ascii', errors='xmlcharrefreplace'
            ).decode('ascii')
            try:
                with urllib.request.urlopen(ascii_link) as response:
                    raw = response.read()
                print(
                    "opening the link did not work, try to encode to ascii replacing xmlcharrefs now and reopen - the original error message is:",
                    e,
                )
            except Exception as ex:
                print(entry_link, ascii_link, ex)
        if raw is None:
            return ''
        return fdb_spider._decode_bytes(raw)

    @staticmethod
    def _extract_with_trafilatura(file_name):
        """Run trafilatura's extract() on the content of *file_name*.

        The file is read as UTF-8 and, if that fails, as latin-1. The
        result of extract() is returned unchanged, so it can be None.
        """
        try:
            with open(file_name, 'r', encoding='utf-8') as file:
                html_content = file.read()
        except UnicodeDecodeError as e:
            with open(file_name, 'r', encoding='latin-1') as file:
                html_content = file.read()
            print(
                'encoding utf8 in opening with trafilatura did not work, trying latin1, original error message is:',
                e,
            )
        return extract(html_content)

    # ------------------------------------------------------------------
    # public api
    # ------------------------------------------------------------------

    # input list of funding databases in form of yaml file ['', '', .. , 'etc.']
    def download_entry_list_pages_of_funding_databases(self, list_of_fdbs):
        """Download the entry-list pages of every database in *list_of_fdbs*.

        For each value ``i`` of the configured iteration-var-list the url
        ``link1 + str(i) + link2`` is fetched and written to
        ``spiders/pages/<fdb><i>entryList.html``. A database whose
        configuration cannot be read is reported and skipped.
        """
        # download only html pages of the funding databases specified in input
        for key in self.config:
            if key not in list_of_fdbs:
                continue
            try:
                entry_list = self.config.get(key).get("entry-list")
            except Exception as e:
                print(
                    "There is a problem with the configuration variable entryList in the config.yaml - the original error message is:",
                    e,
                )
                continue
            try:
                entry_list_link1 = entry_list.get("link1")
            except Exception as e:
                print(
                    "No link1 defined in config.yaml - the original error message is:",
                    e,
                )
                continue
            try:
                entry_list_link2 = entry_list.get("link2")
            except Exception as e:
                print(
                    "No link2 defined in config.yaml - the original error message is:",
                    e,
                )
                continue
            entry_iteration_var_list = self._load_iteration_var_list(
                key,
                "No iteration-var-list defined in config.yaml - the original error message is:",
            )
            if entry_iteration_var_list is None:
                continue

            for i in entry_iteration_var_list:
                # download the html page of the list of entries
                url = entry_list_link1 + str(i) + entry_list_link2
                web_content = ""
                try:
                    with urllib.request.urlopen(url) as response:
                        raw = response.read()
                    web_content = self._decode_bytes(raw)
                except Exception as ex:
                    print(ex)

                if len(web_content) < 10:
                    print('getting the html page through urllib did not work, trying with requests librarys function get')
                    try:
                        res = requests.get(url)
                        web_content = res.text
                    except Exception as e:
                        print('also requests library did not work, original error is:', e)
                    print(web_content)

                # save interim results to files
                with open("spiders/pages/" + key + str(i) + "entryList.html", "w+") as f:
                    f.write(web_content)

    def find_config_parameter(self, list_of_fdbs):
        """Print what the configured xpaths match on the downloaded list pages.

        Debugging aid for writing config.yaml: for every stored list page
        of every database in *list_of_fdbs* the serialized parent elements
        and the results (and result counts) of the name, link, info and
        period child xpaths are printed.
        """
        for fdb in list_of_fdbs:
            iteration_var_list = self._load_iteration_var_list(
                fdb,
                "There is a problem with the configuration variable entryList iteration var list in the config.yaml",
            )
            if iteration_var_list is None:
                continue

            fdb_conf_entry_list = self.config.get(fdb).get("entry-list")
            parent_xpath = fdb_conf_entry_list.get("parent")
            child_xpaths = (
                ('this is the name children:', fdb_conf_entry_list.get("child-name")),
                ('this is the link children:', fdb_conf_entry_list.get("child-link")),
                ('this is the info children:', fdb_conf_entry_list.get("child-info")),
                ('this is the period children:', fdb_conf_entry_list.get("child-period")),
            )

            for i in iteration_var_list:
                print(i)
                tree = self._parse_html_file(
                    "spiders/pages/" + fdb + str(i) + "entryList.html",
                    "parsing the xml files did not work with the soupparser. Broken html will not be fixed as it could have been",
                )
                try:
                    print('this is the n looped elements of the parent specified in config.yaml:')
                    for parent in tree.xpath(parent_xpath):
                        print('-----------------------------------------------------------------------------------------------------------------------------------------')
                        print(etree.tostring(parent).decode())

                    for headline, child_xpath in child_xpaths:
                        print(headline)
                        matches = tree.xpath(parent_xpath + child_xpath)
                        print(matches)
                        print(len(matches))
                except Exception as e:
                    print(
                        "parsing the html did not work.",
                        e,
                    )

    def parse_entry_list_data2dictionary(self, list_of_fdbs):
        """Parse the stored list pages into entry dictionaries.

        For every stored list page the n-th parent element yields one entry
        ``{n: {"name", "info", "period", "link"}}``. Entries whose name is
        empty or could not be parsed are left out; values that could not be
        parsed are stored as 'NONE'. The dictionary is written with str()
        to ``spiders/output/<fdb><i>entryList.txt``.
        """
        for fdb in list_of_fdbs:
            iteration_var_list = self._load_iteration_var_list(
                fdb,
                "There is a problem with the configuration variable entryList iteration var list in the config.yaml - the original error message is:",
            )
            if iteration_var_list is None:
                continue

            for i in iteration_var_list:
                print(i)
                tree = self._parse_html_file(
                    "spiders/pages/" + fdb + str(i) + "entryList.html",
                    "parsing the xml files did not work with the soupparser. The original error message is:",
                )

                dictionary_entry_list = {}
                try:
                    fdb_conf = self.config.get(fdb)
                    fdb_domain = fdb_conf.get("domain")
                    fdb_conf_entry_list = fdb_conf.get("entry-list")
                    parent_xpath = fdb_conf_entry_list.get("parent")
                    child_name = fdb_conf_entry_list.get("child-name")
                    child_link = fdb_conf_entry_list.get("child-link")
                    child_info = fdb_conf_entry_list.get("child-info")
                    child_period = fdb_conf_entry_list.get("child-period")

                    for n in range(len(tree.xpath(parent_xpath))):
                        # xpath positions are 1-based
                        nth_parent = parent_xpath + "[" + str(n + 1) + "]"

                        name = self._first_xpath_match(tree, nth_parent + child_name, "name")
                        info = self._first_xpath_match(tree, nth_parent + child_info, "info")
                        period = self._first_xpath_match(tree, nth_parent + child_period, "period")

                        try:
                            link = tree.xpath(nth_parent + child_link)[0]
                            if 'javascript:' in link:
                                print('link is javascript element, not url to parse')
                        except Exception as e:
                            print("link could not be parsed", e)
                            link = 'NONE'

                        if len(name) > 0 and name != 'NONE':
                            dictionary_entry_list[n] = {
                                "name": name,
                                "info": info,
                                "period": period,
                                "link": self._resolve_link(fdb_domain, link),
                            }
                except Exception as e:
                    print(
                        "parsing the html did not work. The original error message is:",
                        e,
                    )

                # save interim results to files
                self._write_entry_dictionary(fdb, i, dictionary_entry_list)

    def download_entry_data_htmls(self, list_of_fdbs):
        """Download the page (or pdf) behind the link of every stored entry.

        The content is written to ``spiders/pages/<fdb><i>/<entry_id>.html``.
        Links containing '.pdf' are fetched with requests and stored as
        bytes under that same name. Other links are fetched with urllib
        and, when that yields nothing, rendered with requests_html.
        """
        for fdb in list_of_fdbs:
            iteration_var_list = self._load_iteration_var_list(
                fdb,
                "There is a problem with the configuration variable entryList iteration var list in the config.yaml - the original error message is:",
            )
            if iteration_var_list is None:
                continue

            for i in iteration_var_list:
                dictionary_entry_list = self._read_entry_dictionary(fdb, i)

                for entry_id in dictionary_entry_list:
                    entry_link = dictionary_entry_list[entry_id]["link"]
                    file_name = "spiders/pages/" + fdb + str(i) + "/" + str(entry_id) + ".html"
                    os.makedirs(os.path.dirname(file_name), exist_ok=True)

                    # save interim results to files
                    if '.pdf' in entry_link:
                        # the pdf is stored under the .html name because the
                        # parsing step looks the file up by that name
                        try:
                            response = requests.get(entry_link)
                        except Exception as e:
                            print(entry_link, e)
                            continue
                        with open(file_name, "bw") as f:
                            f.write(response.content)
                        continue

                    # download the html page of the entry
                    web_content = self._fetch_entry_html(entry_link)
                    if not web_content:
                        print('other downloading approaches did not work, trying requests')
                        try:
                            from requests_html import HTMLSession
                            session = HTMLSession()
                            r = session.get(entry_link)
                            r.html.render()
                            web_content = r.text
                        except Exception as e:
                            print('requests_html HTMLSession did not work', e)

                    with open(file_name, "w+") as f:
                        f.write(web_content)

    def parse_entry_data2dictionary(self, list_of_fdbs):
        """Add the parsed content of every downloaded entry to its dictionary.

        When the config value entry/general/uniform is 'TRUE', each key of
        entry/unitrue is an xpath whose first match is stored under that
        key. Otherwise the text of the page (or pdf) is collected and
        stored under "text" together with "text-word-count". The updated
        dictionary is written back to
        ``spiders/output/<fdb><i>entryList.txt``.
        """
        for fdb in list_of_fdbs:
            iteration_var_list = self._load_iteration_var_list(
                fdb,
                "There is a problem with the configuration variable entryList iteration var list in the config.yaml - the original error message is:",
            )
            if iteration_var_list is None:
                continue

            for i in iteration_var_list:
                print("started to parse data of entry of " + fdb + " ..")
                dictionary_entry_list = self._read_entry_dictionary(fdb, i)

                fdb_conf_entry = self.config.get(fdb).get("entry")
                fdb_conf_entry_general = fdb_conf_entry.get("general")

                for entry_id in dictionary_entry_list:
                    entry = dictionary_entry_list[entry_id]
                    print(
                        "started to parse data of entry with name "
                        + entry["name"]
                        + " .."
                    )
                    file_name = "spiders/pages/" + fdb + str(i) + "/" + str(entry_id) + ".html"
                    tree = self._parse_html_file(
                        file_name,
                        "parsing the xml files did not work with the soupparser. The original error message is:",
                    )

                    if fdb_conf_entry_general["uniform"] == 'TRUE':
                        fdb_conf_entry_unitrue = fdb_conf_entry.get("unitrue")
                        for key in fdb_conf_entry_unitrue:
                            fdb_conf_entry_unitrue_child = fdb_conf_entry_unitrue.get(key)
                            child = tree.xpath(fdb_conf_entry_unitrue_child)
                            if len(child) > 0:
                                entry[key] = child[0]
                    else:
                        if '.pdf' in entry["link"]:
                            print('parsing a pdf', entry["link"], entry_id)
                            try:
                                generaltext = ''
                                for page_layout in extract_pages(file_name):
                                    for element in page_layout:
                                        if isinstance(element, LTTextContainer):
                                            generaltext += element.get_text()
                            except Exception as e:
                                generaltext = 'NONE'
                                print('parsing pdf did not work, the original error is:', e)
                        else:
                            p_text = tree.xpath("//p//text()")
                            div_text = tree.xpath("//div//text()")
                            # text below a <p> inside a <div> shows up in both
                            # results; take it only once
                            seen_in_p = set(p_text)
                            pieces = [t for t in p_text if len(t) > 0]
                            pieces += [
                                t for t in div_text
                                if len(t) > 0 and t not in seen_in_p
                            ]
                            generaltext = ''.join(t + ' ' for t in pieces)

                        generaltextlist = generaltext.split(' ')

                        if len(generaltextlist) > 5000:
                            print('text over 1000 words for entry id', entry_id, ' number of words:', len(generaltextlist))
                            generaltext = self._extract_with_trafilatura(file_name)
                            if generaltext is None:
                                print('trafilatura got this out:', generaltext, 'setting generaltext to NONE')
                                generaltext = 'NONE'
                            else:
                                print('generaltext word count was: ', len(generaltextlist), 'but now trafilatura did the job and new wordcount is:', len(generaltext.split(' ')))

                        if len(generaltextlist) < 2:
                            print('no text parsed, the wc is', len(generaltextlist))
                            print('text under 2 words for entry id', entry_id, ' number of words:', len(generaltextlist))
                            generaltext = self._extract_with_trafilatura(file_name)
                            if generaltext is None:
                                print('trafilatura got this out:', generaltext, 'setting generaltext to NONE')
                                generaltext = 'NONE'
                            elif len(generaltext) > 2:
                                print('generaltext word count was: ', len(generaltextlist), 'but now trafilatura did the job and new wordcount is:', len(generaltext.split(' ')))

                        entry["text"] = generaltext
                        # NOTE(review): this is the word count of the text
                        # collected before trafilatura was consulted, not of
                        # the text stored above - confirm this is intended.
                        entry["text-word-count"] = len(generaltextlist)

                    # write after every entry so the entries parsed so far
                    # are on disk if a later entry fails
                    self._write_entry_dictionary(fdb, i, dictionary_entry_list)