|
@ -296,6 +296,15 @@ class fdb_spider(object): |
|
|
+ "]" |
|
|
+ "]" |
|
|
+ fdb_conf_entry_list_child_link |
|
|
+ fdb_conf_entry_list_child_link |
|
|
)[0] |
|
|
)[0] |
|
|
|
|
|
|
|
|
|
|
|
if 'javascript:' in link: |
|
|
|
|
|
#from selenium import webdriver |
|
|
|
|
|
print('link is javascript element, not url to parse') |
|
|
|
|
|
#url = 'https://example.com' |
|
|
|
|
|
#driver = webdriver.Chrome() |
|
|
|
|
|
#driver.get(url) |
|
|
|
|
|
#links = [link.get_attribute('href') for link in driver.find_elements_by_tag_name('a')] |
|
|
|
|
|
|
|
|
#print('link', link) |
|
|
#print('link', link) |
|
|
|
|
|
|
|
|
except Exception as e: |
|
|
except Exception as e: |
|
@ -313,7 +322,8 @@ class fdb_spider(object): |
|
|
dictionary_entry_list[n]["link"] = link |
|
|
dictionary_entry_list[n]["link"] = link |
|
|
if fdb_domain not in link and ('http:' in link or 'www.' in link or 'https:' in link): |
|
|
if fdb_domain not in link and ('http:' in link or 'www.' in link or 'https:' in link): |
|
|
dictionary_entry_list[n]["link"] = link |
|
|
dictionary_entry_list[n]["link"] = link |
|
|
|
|
|
|
|
|
|
|
|
if 'javascript:' in link: |
|
|
|
|
|
dictionary_entry_list[n]["link"] = link |
|
|
else: |
|
|
else: |
|
|
if link[-1] == '/': |
|
|
if link[-1] == '/': |
|
|
dictionary_entry_list[n]["link"] = fdb_domain + link |
|
|
dictionary_entry_list[n]["link"] = fdb_domain + link |
|
@ -397,7 +407,24 @@ class fdb_spider(object): |
|
|
|
|
|
|
|
|
else: |
|
|
else: |
|
|
file_name = "spiders/pages/" + fdb + str(i) + "/" + str(entry_id) + ".html" |
|
|
file_name = "spiders/pages/" + fdb + str(i) + "/" + str(entry_id) + ".html" |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if not web_content: |
|
|
|
|
|
print('other downloading approaches did not work, trying requests') |
|
|
|
|
|
|
|
|
|
|
|
try: |
|
|
|
|
|
from requests_html import HTMLSession |
|
|
|
|
|
session = HTMLSession() |
|
|
|
|
|
|
|
|
|
|
|
r = session.get(entry_link) |
|
|
|
|
|
|
|
|
|
|
|
r.html.render() |
|
|
|
|
|
web_content = r.text |
|
|
|
|
|
|
|
|
|
|
|
except Exception as e: |
|
|
|
|
|
print('requests_html HTMLSession did not work') |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
os.makedirs(os.path.dirname(file_name), exist_ok=True) |
|
|
os.makedirs(os.path.dirname(file_name), exist_ok=True) |
|
|
f = open(file_name, "w+") |
|
|
f = open(file_name, "w+") |
|
|
f.write(web_content) |
|
|
f.write(web_content) |
|
|