added pdf child text downloading and parsing to json, with exception cases for javascript entry data and normal data
This commit is contained in:
parent
885c210971
commit
d2324d265a
4 changed files with 101 additions and 14 deletions
main.py

@@ -14,11 +14,11 @@ spider = fdb_spider(config)
 #spider.download_entry_list_pages_of_funding_databases(list_of_fdbs)

-spider.find_config_parameter(list_of_fdbs)
+#spider.find_config_parameter(list_of_fdbs)

 #spider.parse_entry_list_data2dictionary(list_of_fdbs)

-spider.download_entry_data_htmls(list_of_fdbs)
+#spider.download_entry_data_htmls(list_of_fdbs)

 spider.parse_entry_data2dictionary(list_of_fdbs)
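Note: main.py drives the spider stage by stage, and this commit only re-toggles which stages run; earlier stages stay commented out once their output exists on disk. A minimal sketch of the full stage order, assuming the import path, config location, and 'giz' database key (none of which are shown in this diff):

# Sketch only: the method names come from this diff; the import path,
# config path and database key are assumptions.
from fdb_spider import fdb_spider

config = "spiders/config.yaml"   # assumed config location
list_of_fdbs = ["giz"]           # assumed database key

spider = fdb_spider(config)

spider.download_entry_list_pages_of_funding_databases(list_of_fdbs)  # stage 1: fetch entry list pages
spider.parse_entry_list_data2dictionary(list_of_fdbs)                # stage 2: list pages -> dictionary
spider.download_entry_data_htmls(list_of_fdbs)                       # stage 3: fetch per-entry pages
spider.parse_entry_data2dictionary(list_of_fdbs)                     # stage 4: entry pages -> dictionary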
Binary file not shown.

@@ -65,10 +65,10 @@ giz:
     child-sponsor: "/tr/td[4]/text()"
   entry:
     general:
-      uniform: 'FALSE'
+      uniform: 'TRUE'
     unitrue:
-      parent: '//html//body//form//table'
-      #child-name: '//html//body//form//table//tr[1]//td[2]//span'
+      #parent: '//html//body//form//table'
+      text: '/html/body/div[2]/div[4]/div/div[5]/div/table/tbody/tr/td[5]/a/@href'
       #child-sum: '//html//body//form//table//tr[2]//td[1]//span//img'
       #child-deadline: '//html/body/form/table/tr[2]/td[3]/span + label.1'
     unifalse:
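Note: the new text xpath ends in /a/@href, so it selects attribute strings rather than elements; that is what the `)` to `)[0]` change further down pairs with. A minimal lxml illustration (the HTML snippet is made up):

# Illustration: a trailing '@href' makes xpath() return attribute strings.
from lxml import etree

html = '<html><body><table><tr><td><a href="./docs/call.pdf">PDF</a></td></tr></table></body></html>'
tree = etree.HTML(html)

# xpath() always returns a list; [0] yields the bare href string,
# which the parser below then checks for '.pdf'.
child = tree.xpath('//table//tr/td/a/@href')[0]
print(child)  # -> ./docs/call.pdf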

@@ -358,9 +358,9 @@ class fdb_spider(object):
"There is a problem with the configuration variable entryList iteration var list in the config.yaml - the original error message is:",
|
"There is a problem with the configuration variable entryList iteration var list in the config.yaml - the original error message is:",
|
||||||
e,
|
e,
|
||||||
)
|
)
|
||||||
|
print('starting to download the entry html pages..')
|
||||||
for i in iteration_var_list:
|
for i in iteration_var_list:
|
||||||
|
print(i)
|
||||||
|
|
||||||
f = open("spiders/output/" + fdb + str(i) + "entryList.txt")
|
f = open("spiders/output/" + fdb + str(i) + "entryList.txt")
|
||||||
text = f.read()
|
text = f.read()
|
||||||

@@ -398,11 +398,24 @@ class fdb_spider(object):
                     + fdb_conf_entry_list_javascript_link
                 )

+                # a time.sleep here was suggested as a fix for intermittent errors
+                #import time
+                #time.sleep(1)

                 element.click()
                 window_after = driver.window_handles[1]
                 driver.switch_to.window(window_after)
-                element = driver.find_element("xpath", "//html")
-                web_content = element.text
+                #element = driver.find_element("xpath", "//html")
+                #web_content = element.text

+                #entry_domain = driver.getCurrentUrl()
+                entry_domain = driver.current_url

+                dictionary_entry_list[entry_id]["domain"] = entry_domain

+                web_content = driver.page_source

                 file_name = "spiders/pages/" + fdb + str(i) + "/" + str(entry_id) + ".html"
                 os.makedirs(os.path.dirname(file_name), exist_ok=True)
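Note: switching from element.text to driver.page_source captures the full rendered HTML of the popup, and current_url is the Python binding's property (getCurrentUrl() is the Java API, hence the commented-out line). A standalone sketch of the click-and-switch pattern, with placeholder URL and xpath:

# Sketch of the window-switch pattern; the URL and xpath are placeholders.
from selenium import webdriver
from selenium.webdriver.common.by import By

driver = webdriver.Firefox()
driver.get("https://example.org/entry-list")

element = driver.find_element(By.XPATH, "//a[1]")
element.click()                          # assume the link opens a second window

window_after = driver.window_handles[1]  # handle of the new window
driver.switch_to.window(window_after)

entry_domain = driver.current_url        # URL the javascript link resolved to
web_content = driver.page_source         # rendered HTML, unlike element.text

driver.quit()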

@@ -479,6 +492,12 @@ class fdb_spider(object):
             f = open(file_name, "w+")
             f.write(web_content)
             f.close()

+            # save the entry_domain, implemented first for further downloads in javascript links
+            f = open("spiders/output/" + fdb + str(i) + "entryList.txt", "w+")
+            f.write(str(dictionary_entry_list))
+            f.close()

     def parse_entry_data2dictionary(self, list_of_fdbs):
         for fdb in list_of_fdbs:
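Note: the dictionary is persisted as str(dict) into a .txt file, so whatever reads it back (the f.read() in download_entry_data_htmls above, and presumably the parser) has to evaluate a Python literal; ast.literal_eval does that without the risks of eval(). A round-trip sketch with an illustrative path:

# Round-trip sketch for the str(dict) persistence; file name illustrative.
import ast

dictionary_entry_list = {0: {"link": "https://example.org/entry/0",
                             "domain": "https://example.org/"}}

with open("entryList.txt", "w+") as f:
    f.write(str(dictionary_entry_list))

with open("entryList.txt") as f:
    restored = ast.literal_eval(f.read())  # safely parses the Python literal

assert restored == dictionary_entry_list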

@@ -541,12 +560,80 @@ class fdb_spider(object):
                 child = tree.xpath(
-                    fdb_conf_entry_unitrue_entry_child
-                )
+                    fdb_conf_entry_unitrue_child
+                )[0]

-                #print("oi", child)
+                print("oi", child)

-                if len(child) > 0:
+                if '.pdf' in child:
+
+                    print('child in entry data is pdf, downloading it..')
+
+                    file_name = "spiders/pages/" + fdb + str(i) + "/" + str(entry_id) + ".pdf"
+                    entry_link = dictionary_entry_list[entry_id]["link"]
+                    if 'http' not in child:
+                        if 'javascript' not in entry_link and 'js' not in entry_link and 'http' in entry_link:
+                            try:
+                                response = requests.get(entry_link + child)
+                            except Exception as e:
+                                print(entry_link + child + ' seems not a valid pdf link to download, original error message is:', e)
+
+                        if 'javascript' in entry_link or 'js' in entry_link:
+                            entry_domain = dictionary_entry_list[entry_id]["domain"]
+                            if child[0] == '.' and child[1] == '/':
+                                if entry_domain[-1] == '/':
+                                    pdf_link = entry_domain[:-1] + child[1:]
+                                if entry_domain[-1] != '/':
+                                    # cut the domain back to its last '/' before joining
+                                    for n in range(len(entry_domain)):
+                                        if entry_domain[-1] != '/':
+                                            entry_domain = entry_domain[:-1]
+                                        else:
+                                            break
+                                    # drop the trailing '/' so the join does not double it
+                                    pdf_link = entry_domain[:-1] + child[1:]
+
+                            if child[0] == '/':
+                                if entry_domain[-1] == '/':
+                                    pdf_link = entry_domain[:-1] + child
+                                if entry_domain[-1] != '/':
+                                    pdf_link = entry_domain + child
+
+                            print('pdf_link', pdf_link)
+                            try:
+                                response = requests.get(pdf_link)
+                            except Exception as e:
+                                print(pdf_link + ' seems not a valid pdf link to download, original error message is:', e)
+
+                    #response = requests.get(child)
+                    os.makedirs(os.path.dirname(file_name), exist_ok=True)
+                    f = open(file_name, "bw")
+                    f.write(response.content)
+                    f.close()
+
+                    print('parsing a pdf', pdf_link, entry_id)
+
+                    try:
+                        generaltext = ''
+                        for page_layout in extract_pages(file_name):
+                            for element in page_layout:
+                                if isinstance(element, LTTextContainer):
+                                    generaltext += element.get_text()
+                    except Exception as e:
+                        generaltext = 'NONE'
+                        print('parsing pdf did not work, the original error is:', e)
+
+                    dictionary_entry_list[entry_id][key] = generaltext
+
+                if len(child) > 0 and '.pdf' not in child:
                     dictionary_entry_list[entry_id][key] = child[
                         0
                     ]
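Note: the trailing-slash bookkeeping above for './' and '/' children could be replaced by the standard library's urllib.parse.urljoin, which covers relative, root-relative, and already-absolute links in one call; a sketch with illustrative inputs:

# urljoin handles the same cases as the slash-trimming branches above;
# entry_domain and the child values are illustrative.
from urllib.parse import urljoin

entry_domain = "https://example.org/funding/page.html"

for child in ("./docs/call.pdf", "/docs/call.pdf", "https://cdn.example.org/call.pdf"):
    print(urljoin(entry_domain, child))
# -> https://example.org/funding/docs/call.pdf
# -> https://example.org/docs/call.pdf
# -> https://cdn.example.org/call.pdf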
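Note: extract_pages and LTTextContainer come from pdfminer.six; the hunk does not show the import block at the top of the file. A self-contained sketch of the same extraction (file name illustrative):

# Standalone sketch of the pdfminer.six text extraction used above.
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextContainer

generaltext = ''
for page_layout in extract_pages("call.pdf"):   # illustrative file name
    for element in page_layout:
        # LTTextContainer matches text boxes/lines carrying extractable text
        if isinstance(element, LTTextContainer):
            generaltext += element.get_text()

print(generaltext)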