Added PDF child text downloading and parse-to-JSON handling, with exceptions/cases for JavaScript entry data and normal entry data
This commit is contained in:
parent
885c210971
commit
d2324d265a
4 changed files with 101 additions and 14 deletions
4
main.py
4
main.py
|
@ -14,11 +14,11 @@ spider = fdb_spider(config)
|
||||||
|
|
||||||
#spider.download_entry_list_pages_of_funding_databases(list_of_fdbs)
|
#spider.download_entry_list_pages_of_funding_databases(list_of_fdbs)
|
||||||
|
|
||||||
spider.find_config_parameter(list_of_fdbs)
|
#spider.find_config_parameter(list_of_fdbs)
|
||||||
|
|
||||||
#spider.parse_entry_list_data2dictionary(list_of_fdbs)
|
#spider.parse_entry_list_data2dictionary(list_of_fdbs)
|
||||||
|
|
||||||
spider.download_entry_data_htmls(list_of_fdbs)
|
#spider.download_entry_data_htmls(list_of_fdbs)
|
||||||
|
|
||||||
spider.parse_entry_data2dictionary(list_of_fdbs)
|
spider.parse_entry_data2dictionary(list_of_fdbs)
|
||||||
|
|
||||||
|
|
Binary file not shown.
|
@ -65,10 +65,10 @@ giz:
|
||||||
child-sponsor: "/tr/td[4]/text()"
|
child-sponsor: "/tr/td[4]/text()"
|
||||||
entry:
|
entry:
|
||||||
general:
|
general:
|
||||||
uniform: 'FALSE'
|
uniform: 'TRUE'
|
||||||
unitrue:
|
unitrue:
|
||||||
parent: '//html//body//form//table'
|
#parent: '//html//body//form//table'
|
||||||
#child-name: '//html//body//form//table//tr[1]//td[2]//span'
|
text: '/html/body/div[2]/div[4]/div/div[5]/div/table/tbody/tr/td[5]/a/@href'
|
||||||
#child-sum: '//html//body//form//table//tr[2]//td[1]//span//img'
|
#child-sum: '//html//body//form//table//tr[2]//td[1]//span//img'
|
||||||
#child-deadline: '//html/body/form/table/tr[2]/td[3]/span + label.1'
|
#child-deadline: '//html/body/form/table/tr[2]/td[3]/span + label.1'
|
||||||
unifalse:
|
unifalse:
|
||||||
|
|
|
@ -358,9 +358,9 @@ class fdb_spider(object):
|
||||||
"There is a problem with the configuration variable entryList iteration var list in the config.yaml - the original error message is:",
|
"There is a problem with the configuration variable entryList iteration var list in the config.yaml - the original error message is:",
|
||||||
e,
|
e,
|
||||||
)
|
)
|
||||||
|
print('starting to download the entry html pages..')
|
||||||
for i in iteration_var_list:
|
for i in iteration_var_list:
|
||||||
|
print(i)
|
||||||
|
|
||||||
f = open("spiders/output/" + fdb + str(i) + "entryList.txt")
|
f = open("spiders/output/" + fdb + str(i) + "entryList.txt")
|
||||||
text = f.read()
|
text = f.read()
|
||||||
|
@ -398,11 +398,24 @@ class fdb_spider(object):
|
||||||
+ fdb_conf_entry_list_javascript_link
|
+ fdb_conf_entry_list_javascript_link
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# a time.sleep was suggested as a workaround for errors here
|
||||||
|
#import time
|
||||||
|
#time.sleep(1)
|
||||||
|
|
||||||
element.click()
|
element.click()
|
||||||
window_after = driver.window_handles[1]
|
window_after = driver.window_handles[1]
|
||||||
driver.switch_to.window(window_after)
|
driver.switch_to.window(window_after)
|
||||||
element = driver.find_element("xpath", "//html")
|
#element = driver.find_element("xpath", "//html")
|
||||||
web_content = element.text
|
#web_content = element.text
|
||||||
|
|
||||||
|
#entry_domain = driver.getCurrentUrl()
|
||||||
|
entry_domain = driver.current_url
|
||||||
|
|
||||||
|
dictionary_entry_list[entry_id]["domain"] = entry_domain
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
web_content = driver.page_source
|
||||||
|
|
||||||
file_name = "spiders/pages/" + fdb + str(i) + "/" + str(entry_id) + ".html"
|
file_name = "spiders/pages/" + fdb + str(i) + "/" + str(entry_id) + ".html"
|
||||||
os.makedirs(os.path.dirname(file_name), exist_ok=True)
|
os.makedirs(os.path.dirname(file_name), exist_ok=True)
|
||||||
|
@ -480,6 +493,12 @@ class fdb_spider(object):
|
||||||
f.write(web_content)
|
f.write(web_content)
|
||||||
f.close
|
f.close
|
||||||
|
|
||||||
|
# save the entry_domain, implemented first for further downloads in javascript links
|
||||||
|
f = open("spiders/output/" + fdb + str(i) + "entryList.txt", "w+")
|
||||||
|
f.write(str(dictionary_entry_list))
|
||||||
|
f.close
|
||||||
|
|
||||||
|
|
||||||
def parse_entry_data2dictionary(self, list_of_fdbs):
|
def parse_entry_data2dictionary(self, list_of_fdbs):
|
||||||
for fdb in list_of_fdbs:
|
for fdb in list_of_fdbs:
|
||||||
|
|
||||||
|
@ -541,12 +560,80 @@ class fdb_spider(object):
|
||||||
|
|
||||||
|
|
||||||
child = tree.xpath(
|
child = tree.xpath(
|
||||||
fdb_conf_entry_unitrue_entry_child
|
fdb_conf_entry_unitrue_child
|
||||||
)
|
)[0]
|
||||||
|
|
||||||
#print("oi", child)
|
print("oi", child)
|
||||||
|
|
||||||
if len(child) > 0:
|
if '.pdf' in child:
|
||||||
|
|
||||||
|
print('child in entry data is pdf, downloading it..')
|
||||||
|
|
||||||
|
file_name = "spiders/pages/" + fdb + str(i) + "/" + str(entry_id) + ".pdf"
|
||||||
|
entry_link = dictionary_entry_list[entry_id]["link"]
|
||||||
|
if 'http' not in child:
|
||||||
|
if 'javascript' or 'js' not in entry_link and 'http' in entry_link:
|
||||||
|
try:
|
||||||
|
response = requests.get(entry_link + child)
|
||||||
|
except Exception as e:
|
||||||
|
print(entry_link + child + ' seems not a valid pdf link to download, orginal error message is:', e)
|
||||||
|
|
||||||
|
if 'javascript' or 'js' in entry_link:
|
||||||
|
entry_domain = dictionary_entry_list[entry_id]["domain"]
|
||||||
|
if child[0] == '.' and child[1] == '/':
|
||||||
|
if entry_domain[-1] == '/':
|
||||||
|
pdf_link = entry_domain[:-1] + child[1:]
|
||||||
|
if entry_domain[-1] != '/':
|
||||||
|
for n in range(len(entry_domain)):
|
||||||
|
if entry_domain[-1] != '/':
|
||||||
|
entry_domain = entry_domain[:-1]
|
||||||
|
else:
|
||||||
|
break
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
pdf_link = entry_domain + child[1:]
|
||||||
|
|
||||||
|
if child[0] == '/':
|
||||||
|
if entry_domain[-1] == '/':
|
||||||
|
pdf_link = entry_domain[:-1] + child
|
||||||
|
if entry_domain[-1] != '/':
|
||||||
|
pdf_link = entry_domain + child
|
||||||
|
|
||||||
|
print('pdf_link', pdf_link)
|
||||||
|
try:
|
||||||
|
response = requests.get(pdf_link)
|
||||||
|
except Exception as e:
|
||||||
|
print(pdf_link + ' seems not a valid pdf link to download, orginal error message is:', e)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
#response = requests.get(child)
|
||||||
|
os.makedirs(os.path.dirname(file_name), exist_ok=True)
|
||||||
|
f = open(file_name, "bw")
|
||||||
|
f.write(response.content)
|
||||||
|
f.close
|
||||||
|
|
||||||
|
print('parsing a pdf', pdf_link, entry_id)
|
||||||
|
|
||||||
|
try:
|
||||||
|
|
||||||
|
generaltext = ''
|
||||||
|
|
||||||
|
for page_layout in extract_pages(file_name):
|
||||||
|
for element in page_layout:
|
||||||
|
if isinstance(element, LTTextContainer):
|
||||||
|
generaltext += element.get_text()
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
generaltext = 'NONE'
|
||||||
|
print('parsing pdf did not work, the original error is:', e )
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
dictionary_entry_list[entry_id][key] = generaltext
|
||||||
|
|
||||||
|
if len(child) > 0 and '.pdf' not in child:
|
||||||
dictionary_entry_list[entry_id][key] = child[
|
dictionary_entry_list[entry_id][key] = child[
|
||||||
0
|
0
|
||||||
]
|
]
|
||||||
|
|
Loading…
Reference in a new issue