@@ -358,9 +358,9 @@ class fdb_spider(object):
                    "There is a problem with the configuration variable entryList iteration var list in the config.yaml - the original error message is:",
                    e,
                )
            print('starting to download the entry html pages..')

            for i in iteration_var_list:
                print(i)

                f = open("spiders/output/" + fdb + str(i) + "entryList.txt")
                text = f.read()
@@ -398,11 +398,24 @@ class fdb_spider(object):
                        + fdb_conf_entry_list_javascript_link
                    )

                    # a time.sleep(1) here was suggested as a workaround for errors
                    #import time
                    #time.sleep(1)

                    element.click()
                    window_after = driver.window_handles[1]
                    driver.switch_to.window(window_after)

                    entry_domain = driver.current_url
                    dictionary_entry_list[entry_id]["domain"] = entry_domain

                    web_content = driver.page_source

                    file_name = "spiders/pages/" + fdb + str(i) + "/" + str(entry_id) + ".html"
                    os.makedirs(os.path.dirname(file_name), exist_ok=True)
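                    # Relying on window_handles[1] directly after click() can race the
                    # opening popup. A minimal sketch of an explicit wait, assuming the
                    # standard selenium support helpers (not part of the original code):
                    #
                    #   from selenium.webdriver.support.ui import WebDriverWait
                    #   from selenium.webdriver.support import expected_conditions as EC
                    #   WebDriverWait(driver, 10).until(EC.number_of_windows_to_be(2))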

@@ -479,6 +492,12 @@ class fdb_spider(object):
                    f = open(file_name, "w+")
                    f.write(web_content)
                    f.close()

                # save the entry_domain, implemented first for further downloads in javascript links
                f = open("spiders/output/" + fdb + str(i) + "entryList.txt", "w+")
                f.write(str(dictionary_entry_list))
                f.close()
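                # A sketch of the same writes as context managers, which close the
                # files even when an exception occurs (equivalent behaviour, not part
                # of the original code):
                #
                #   with open(file_name, "w+") as f:
                #       f.write(web_content)
                #   with open("spiders/output/" + fdb + str(i) + "entryList.txt", "w+") as f:
                #       f.write(str(dictionary_entry_list))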

    def parse_entry_data2dictionary(self, list_of_fdbs):
        for fdb in list_of_fdbs:
@@ -541,12 +560,80 @@ class fdb_spider(object):
                    child = tree.xpath(
                        fdb_conf_entry_unitrue_entry_child
                    )

                    if len(child) > 0:
                        print("oi", child)

                    if '.pdf' in child:
                        print('child in entry data is pdf, downloading it..')

                        file_name = "spiders/pages/" + fdb + str(i) + "/" + str(entry_id) + ".pdf"
                        entry_link = dictionary_entry_list[entry_id]["link"]

                        if 'http' not in child:
                            if 'javascript' not in entry_link and 'js' not in entry_link and 'http' in entry_link:
                                try:
                                    response = requests.get(entry_link + child)
                                except Exception as e:
                                    print(entry_link + child + ' seems not a valid pdf link to download, original error message is:', e)
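                                # Note: requests.get raises only on connection-level
                                # failures; an HTTP error page would still be saved as
                                # a pdf below. One hedged guard (not in the original
                                # code):
                                #
                                #   response.raise_for_status()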

                            if 'javascript' in entry_link or 'js' in entry_link:
                                entry_domain = dictionary_entry_list[entry_id]["domain"]

                                if child.startswith('./'):
                                    # cut the page name off the current url, keeping the trailing slash
                                    if not entry_domain.endswith('/'):
                                        entry_domain = entry_domain[:entry_domain.rfind('/') + 1]
                                    pdf_link = entry_domain + child[2:]

                                if child.startswith('/'):
                                    if entry_domain.endswith('/'):
                                        pdf_link = entry_domain[:-1] + child
                                    else:
                                        pdf_link = entry_domain + child

                                print('pdf_link', pdf_link)

                                try:
                                    response = requests.get(pdf_link)
                                except Exception as e:
                                    print(pdf_link + ' seems not a valid pdf link to download, original error message is:', e)
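                                # A sketch of the same resolution with the standard
                                # library, assuming child is a relative href string
                                # (not part of the original code):
                                #
                                #   from urllib.parse import urljoin
                                #   pdf_link = urljoin(entry_domain, child)
                                #
                                # urljoin resolves './' against the current page's
                                # directory and '/' against the host root, whereas the
                                # hand-rolled '/' branch above appends to the full url.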

                        os.makedirs(os.path.dirname(file_name), exist_ok=True)
                        f = open(file_name, "bw")
                        f.write(response.content)
                        f.close()

                        print('parsing a pdf', pdf_link, entry_id)

                        try:
                            generaltext = ''
                            for page_layout in extract_pages(file_name):
                                for element in page_layout:
                                    if isinstance(element, LTTextContainer):
                                        generaltext += element.get_text()
                        except Exception as e:
                            generaltext = 'NONE'
                            print('parsing pdf did not work, the original error is:', e)

                        dictionary_entry_list[entry_id][key] = generaltext
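                        # The pdf parsing above assumes the pdfminer.six imports at
                        # module level (not visible in this hunk):
                        #
                        #   from pdfminer.high_level import extract_pages
                        #   from pdfminer.layout import LTTextContainer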

                    if len(child) > 0 and '.pdf' not in child:
                        dictionary_entry_list[entry_id][key] = child[
                            0
                        ]