@@ -96,7 +96,7 @@ class fdb_spider(object):
print('also requests library did not work, original error is:', e)

print(web_content)
# print(web_content)

f = open("spiders/pages/" + key + str(i) + "entryList.html", "w+")
f.write(web_content)

@@ -343,6 +343,12 @@ class fdb_spider(object):

f.close()

def download_entry_data_htmls(self, list_of_fdbs):

from selenium import webdriver
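# a single headless Chrome instance is set up once and reused further below for
# entry links that can only be followed via javascript clicks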
options = webdriver.ChromeOptions()
options.add_argument('headless')
driver = webdriver.Chrome(options=options)

for fdb in list_of_fdbs:

try:

@@ -361,40 +367,84 @@ class fdb_spider(object):

dictionary_entry_list = eval(text)
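# eval() rebuilds the entry dictionary from the stored text; each value is expected
# to carry fields such as "link", which is read below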

fdb_conf = self.config.get(fdb)
fdb_domain = fdb_conf.get("domain")
fdb_conf_entry_list = fdb_conf.get("entry-list")
fdb_conf_entry_list_parent = fdb_conf_entry_list.get("parent")
fdb_conf_entry_list_child_name = fdb_conf_entry_list.get("child-name")

try:
fdb_conf_entry_list_javascript_link = fdb_conf_entry_list.get("javascript-link")
except Exception as e:
print('the javascript link in the config is missing, original error message is:', e)

fdb_conf_entry_list_link1 = fdb_conf_entry_list.get("link1")
fdb_conf_entry_list_link2 = fdb_conf_entry_list.get("link2")

driver.get(fdb_conf_entry_list_link1 + str(i) + fdb_conf_entry_list_link2)
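# the entry list page is now loaded in the headless browser; javascript-only entry
# links can later be clicked by their position in that list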

for entry_id in dictionary_entry_list:
entry_link = dictionary_entry_list[entry_id]["link"]

web_content = 'NONE'
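# 'NONE' serves as a sentinel: if none of the download attempts below succeeds, the
# later check web_content == 'NONE' triggers the requests fallback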

# download the html page of the entry

try:
# defining cookie to not end up in endless loop because of cookie banners pointing to redirects
url = entry_link
req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0', 'Cookie':'myCookie=lovely'})
response = urllib.request.urlopen(req)
except Exception as e:
try:
response = urllib.request.urlopen(entry_link.encode('ascii', errors='xmlcharrefreplace').decode('ascii'))
print(
"opening the link did not work, try to encode to ascii replacing xmlcharrefs now and reopen - the original error message is:",
e,
)
except Exception as ex:
print(entry_link, entry_link.encode('ascii', errors='xmlcharrefreplace').decode('ascii'), ex)

if 'javascript' in entry_link:

element = driver.find_element(
"xpath",
fdb_conf_entry_list_parent
+ "["
+ str(entry_id+1)
+ "]"
+ fdb_conf_entry_list_javascript_link
)

element.click()
window_after = driver.window_handles[1]
driver.switch_to.window(window_after)
element = driver.find_element("xpath", "//html")
web_content = element.text
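# note: element.text holds the rendered, visible text of the page rather than the raw
# html source, even though it is written to a .html file below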

file_name = "spiders/pages/" + fdb + str(i) + "/" + str(entry_id) + ".html"
os.makedirs(os.path.dirname(file_name), exist_ok=True)
f = open(file_name, "w+")
f.write(web_content)
f.close()

window_before = driver.window_handles[0]
driver.switch_to.window(window_before)
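# the driver is now back on the original list window, ready for the next javascript entry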

if ('http' in entry_link or 'www' in entry_link) and 'javascript' not in entry_link and 'js' not in entry_link and '.pdf' not in entry_link:

try:
web_content = response.read().decode("UTF-8")
except Exception as e:
try:
web_content = response.read().decode("latin-1")
print(
"decoding the response in utf8 did not work, try to decode latin1 now - the original error message is:",
e,
)
except Exception as ex:
print(ex)

# defining cookie to not end up in endless loop because of cookie banners pointing to redirects
url = entry_link
req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0', 'Cookie':'myCookie=lovely'})
response = urllib.request.urlopen(req)
except Exception as e:
try:
response = urllib.request.urlopen(entry_link.encode('ascii', errors='xmlcharrefreplace').decode('ascii'))
print(
"opening the link did not work, try to encode to ascii replacing xmlcharrefs now and reopen - the original error message is:",
e,
)
except Exception as ex:
print(entry_link, entry_link.encode('ascii', errors='xmlcharrefreplace').decode('ascii'), ex)

try:
web_content = response.read().decode("UTF-8")
except Exception as e:
try:
web_content = response.read().decode("latin-1")
print(
"decoding the response in utf8 did not work, try to decode latin1 now - the original error message is:",
e,
)
except Exception as ex:
print(ex)

# save interim results to files

if '.pdf' in entry_link:

@@ -409,7 +459,7 @@ class fdb_spider(object):

file_name = "spiders/pages/" + fdb + str(i) + "/" + str(entry_id) + ".html"

if not web_content:
if web_content == 'NONE':

print('other downloading approaches did not work, trying requests')

try: