diff --git a/main.py b/main.py
index 9d6075b..3cc9312 100644
--- a/main.py
+++ b/main.py
@@ -14,11 +14,11 @@ spider = fdb_spider(config)
 
 #spider.download_entry_list_pages_of_funding_databases(list_of_fdbs)
 
-spider.find_config_parameter(list_of_fdbs)
+#spider.find_config_parameter(list_of_fdbs)
 
 #spider.parse_entry_list_data2dictionary(list_of_fdbs)
 
-spider.download_entry_data_htmls(list_of_fdbs)
+#spider.download_entry_data_htmls(list_of_fdbs)
 
 spider.parse_entry_data2dictionary(list_of_fdbs)
diff --git a/spiders/__pycache__/fdb_spider.cpython-39.pyc b/spiders/__pycache__/fdb_spider.cpython-39.pyc
index 7a05bea..f4a1860 100644
Binary files a/spiders/__pycache__/fdb_spider.cpython-39.pyc and b/spiders/__pycache__/fdb_spider.cpython-39.pyc differ
diff --git a/spiders/config.yaml b/spiders/config.yaml
index fb62bfe..2d1a81a 100644
--- a/spiders/config.yaml
+++ b/spiders/config.yaml
@@ -65,10 +65,10 @@ giz:
       child-sponsor: "/tr/td[4]/text()"
   entry:
     general:
-      uniform: 'FALSE'
+      uniform: 'TRUE'
     unitrue:
-      parent: '//html//body//form//table'
-      #child-name: '//html//body//form//table//tr[1]//td[2]//span'
+      #parent: '//html//body//form//table'
+      text: '/html/body/div[2]/div[4]/div/div[5]/div/table/tbody/tr/td[5]/a/@href'
      #child-sum: '//html//body//form//table//tr[2]//td[1]//span//img'
       #child-deadline: '//html/body/form/table/tr[2]/td[3]/span + label.1'
     unifalse:
diff --git a/spiders/fdb_spider.py b/spiders/fdb_spider.py
index bbd91d1..e27cecc 100644
--- a/spiders/fdb_spider.py
+++ b/spiders/fdb_spider.py
@@ -358,9 +358,9 @@ class fdb_spider(object):
                 "There is a problem with the configuration variable entryList iteration var list in the config.yaml - the original error message is:",
                 e,
             )
-
+        print('starting to download the entry html pages..')
         for i in iteration_var_list:
-
+            print(i)
             f = open("spiders/output/" + fdb + str(i) + "entryList.txt")
             text = f.read()
@@ -398,11 +398,24 @@
                         + fdb_conf_entry_list_javascript_link
                     )
 
+                    # a time.sleep here was suggested as a workaround for errors
+                    #import time
+                    #time.sleep(1)
+
                     element.click()
                     window_after = driver.window_handles[1]
                     driver.switch_to.window(window_after)
-                    element = driver.find_element("xpath", "//html")
-                    web_content = element.text
+                    #element = driver.find_element("xpath", "//html")
+                    #web_content = element.text
+
+                    #entry_domain = driver.getCurrentUrl()
+                    entry_domain = driver.current_url
+
+                    dictionary_entry_list[entry_id]["domain"] = entry_domain
+
+
+
+                    web_content = driver.page_source
                     file_name = "spiders/pages/" + fdb + str(i) + "/" + str(entry_id) + ".html"
                     os.makedirs(os.path.dirname(file_name), exist_ok=True)
@@ -479,6 +492,12 @@
                 f = open(file_name, "w+")
                 f.write(web_content)
                 f.close
+
+            # save the entry_domain, implemented first for further downloads in javascript links
+            f = open("spiders/output/" + fdb + str(i) + "entryList.txt", "w+")
+            f.write(str(dictionary_entry_list))
+            f.close()
+
 
     def parse_entry_data2dictionary(self, list_of_fdbs):
         for fdb in list_of_fdbs:
@@ -541,12 +560,80 @@
                     child = tree.xpath(
-                        fdb_conf_entry_unitrue_entry_child
-                    )
-
-                    #print("oi", child)
+                        fdb_conf_entry_unitrue_child
+                    )[0]
 
-                    if len(child) > 0:
+                    print("oi", child)
+
+                    if '.pdf' in child:
+
+                        print('child in entry data is pdf, downloading it..')
+
+                        file_name = "spiders/pages/" + fdb + str(i) + "/" + str(entry_id) + ".pdf"
+                        entry_link = dictionary_entry_list[entry_id]["link"]
+                        if 'http' not in child:
+                            if 'javascript' not in entry_link and 'js' not in entry_link and 'http' in entry_link:
+                                try:
+                                    response = requests.get(entry_link + child)
+                                except Exception as e:
+                                    print(entry_link + child + ' seems not to be a valid pdf link to download, original error message is:', e)
+
+                            if 'javascript' in entry_link or 'js' in entry_link:
+                                entry_domain = dictionary_entry_list[entry_id]["domain"]
+                                if child[0] == '.' and child[1] == '/':
+                                    if entry_domain[-1] == '/':
+                                        pdf_link = entry_domain[:-1] + child[1:]
+                                    if entry_domain[-1] != '/':
+                                        # cut characters off the end of entry_domain until it ends in '/'
+                                        for n in range(len(entry_domain)):
+                                            if entry_domain[-1] != '/':
+                                                entry_domain = entry_domain[:-1]
+                                            else:
+                                                break
+
+                                        pdf_link = entry_domain + child[1:]
+
+                                if child[0] == '/':
+                                    if entry_domain[-1] == '/':
+                                        pdf_link = entry_domain[:-1] + child
+                                    if entry_domain[-1] != '/':
+                                        pdf_link = entry_domain + child
+
+                                print('pdf_link', pdf_link)
+                                try:
+                                    response = requests.get(pdf_link)
+                                except Exception as e:
+                                    print(pdf_link + ' seems not to be a valid pdf link to download, original error message is:', e)
+
+                        #response = requests.get(child)
+                        os.makedirs(os.path.dirname(file_name), exist_ok=True)
+                        f = open(file_name, "wb")
+                        f.write(response.content)
+                        f.close()
+
+                        print('parsing a pdf', pdf_link, entry_id)
+
+                        try:
+                            generaltext = ''
+
+                            for page_layout in extract_pages(file_name):
+                                for element in page_layout:
+                                    if isinstance(element, LTTextContainer):
+                                        generaltext += element.get_text()
+
+                        except Exception as e:
+                            generaltext = 'NONE'
+                            print('parsing pdf did not work, the original error is:', e)
+
+                        dictionary_entry_list[entry_id][key] = generaltext
+
+                    if len(child) > 0 and '.pdf' not in child:
                         dictionary_entry_list[entry_id][key] = child[
                             0
                         ]
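
Note on the relative-link handling added in parse_entry_data2dictionary: the trailing-slash bookkeeping on entry_domain could be replaced by urllib.parse.urljoin from the Python standard library, which resolves './x.pdf', '/x.pdf' and absolute hrefs against the URL captured via driver.current_url. A minimal sketch, not part of the diff; resolve_pdf_link is a hypothetical helper name:

    from urllib.parse import urljoin

    def resolve_pdf_link(entry_domain, child):
        # hypothetical helper, not in the codebase: resolve the href found
        # on the entry page against the URL the selenium driver landed on
        return urljoin(entry_domain, child)

    # urljoin('https://example.org/calls/list.php', './doc.pdf')
    #   -> 'https://example.org/calls/doc.pdf'
    # urljoin('https://example.org/calls/list.php', '/files/doc.pdf')
    #   -> 'https://example.org/files/doc.pdf'

One behavioural difference to be aware of: urljoin resolves root-relative links (child starting with '/') against the host root instead of appending them to the full entry_domain, which is usually what such hrefs are meant to do.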