|
@@ -19,6 +19,8 @@ from pdfminer.layout import LTTextContainer

 import time

+import subprocess
+

 class fdb_spider(object):

     def __init__(self, config_file):
|
@@ -99,7 +101,7 @@ class fdb_spider(object):
             )

         entry_jsdomain = 'NONE'

-        if entry_jsdomain == 'NONE':
+        if entry_jsdomain == 'NONE' or entry_jsdomain == 'None':

             for i in entry_iteration_var_list:
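Review note: the widened check now catches both 'NONE' and 'None', but any further spelling (e.g. 'none' after a config edit) would still slip through. A case-insensitive test is sturdier; a sketch, assuming entry_jsdomain is always a string:

    if entry_jsdomain.upper() == 'NONE':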
|
|
|
|
|
|
|
@@ -417,8 +419,17 @@ class fdb_spider(object):
                 if 'javascript:' in link:
                     dictionary_entry_list[n]["link"] = link

                 if fdb_domain not in link and ('http' or 'https' or 'www.') not in link:
-                    if link[-1] == '/':
-                        dictionary_entry_list[n]["link"] = fdb_domain + link
+                    if link[0] == '/':
+                        if fdb_domain[-1] != '/':
+                            dictionary_entry_list[n]["link"] = fdb_domain + link
+                        if fdb_domain[-1] == '/':
+                            dictionary_entry_list[n]["link"] = fdb_domain + link[1:]
+                    if link[0] == '.' and link[1] == '/':
+                        if fdb_domain[-1] != '/':
+                            dictionary_entry_list[n]["link"] = fdb_domain + link[1:]
+                        if fdb_domain[-1] == '/':
+                            dictionary_entry_list[n]["link"] = fdb_domain + link[2:]
                     else:
                         dictionary_entry_list[n]["link"] = fdb_domain + '/' + link
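Review note: in the context line above, `('http' or 'https' or 'www.') not in link` does not test three substrings; Python evaluates `('http' or 'https' or 'www.')` to `'http'`, so only `'http' not in link` runs (and `'http'` already matches `'https'`). The slash juggling in the new branch can also be collapsed with the standard library. A sketch, not part of this patch, assuming fdb_domain includes the scheme (e.g. 'https://example.org'):

    from urllib.parse import urljoin

    if fdb_domain not in link and not any(p in link for p in ('http', 'www.')):
        # urljoin resolves '/x', './x' and bare 'x' against the domain in one call
        dictionary_entry_list[n]["link"] = urljoin(fdb_domain.rstrip('/') + '/', link)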
|
|
|
|
|
|
|
@@ -534,14 +545,17 @@ class fdb_spider(object):
                     driver.switch_to.window(window_before)

-                if ('http' or 'www') in entry_link and 'javascript' not in entry_link and '.pdf' not in entry_link:
+                if 'javascript' not in entry_link and '.pdf' not in entry_link:

+                    #print('oi')
                     try:
                         # defining cookie to not end up in endless loop because of cookie banners pointing to redirects
                         url = entry_link
-                        req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0', 'Cookie':'myCookie=lovely'})
+                        req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0', 'Cookie':'myCookie=oioioioi'})
                         response = urllib.request.urlopen(req)
+                        #print('response from first one', response)
                     except Exception as e:
+                        print('cookie giving then downloading did not work, original error is:', e)
                         try:
                             response = urllib.request.urlopen(entry_link.encode('ascii', errors='xmlcharrefreplace').decode('ascii'))
                             print(
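Review note: the removed guard had the same short-circuit bug as above, since `('http' or 'www') in entry_link` reduces to `'http' in entry_link`. Dropping it widens this branch to relative links as well; if a scheme check is still wanted, `str.startswith` accepts a tuple. A sketch, not part of this patch:

    if entry_link.startswith(('http', 'www')) and 'javascript' not in entry_link and '.pdf' not in entry_link: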
|
@@ -575,30 +589,38 @@ class fdb_spider(object):
                         f.write(response.content)
                         f.close

                 else:

                     file_name = "spiders/pages/" + fdb + str(i) + "/" + str(entry_id) + ".html"

                     if web_content == 'NONE':
                         print('other downloading approaches did not work, trying requests')

                         try:
                             from requests_html import HTMLSession
                             session = HTMLSession()

                             r = session.get(entry_link)

                             r.html.render()
                             web_content = r.text
                         except Exception as e:
-                            print('requests_html HTMLSession did not work')
+                            print('requests_html HTMLSession did not work trying wget, ori error is:', e)
+
+                            try:
+                                os.makedirs(os.path.dirname(file_name), exist_ok=True)
+                                oi = subprocess.run(["wget", entry_link, '--output-document=' + file_name])
+                            except subprocess.CalledProcessError:
+                                print('wget downloading did not work.. saving NONE to file now')

                         os.makedirs(os.path.dirname(file_name), exist_ok=True)
                         f = open(file_name, "w+")
                         f.write(web_content)
                         f.close

         # save the entry_domain, implemented first for further downloads in javascript links
         f = open("spiders/output/" + fdb + str(i) + "entryList.txt", "w+")
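Review note: two details in this hunk. First, `subprocess.run` raises `CalledProcessError` only when called with `check=True`, so the new except branch can never fire and a failed wget passes silently. Second, `f.close` in the surrounding context lines merely references the method without calling it, so the handle stays open. A sketch of both fixes, assuming wget is on PATH:

    subprocess.run(["wget", entry_link, "--output-document=" + file_name], check=True)

    with open(file_name, "w+") as f:  # the context manager closes the file even on errors
        f.write(web_content)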
|
@@ -692,7 +714,7 @@ class fdb_spider(object):
                     pdf_link = entry_domain[:-1] + child[1:]

                 if entry_domain[-1] != '/':
                     for n in range(len(entry_domain)):
-                        if entry_domain[-1] != '/':
+                        if entry_domain[-n] != '/':
                             entry_domain = entry_domain[:-1]
                         else:
                             break
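Review note: `range(len(entry_domain))` starts at 0, and `entry_domain[-0]` is `entry_domain[0]`, the first character, so the first iteration inspects the wrong end of the string; the string also shrinks inside the loop while n grows, so the index keeps drifting. The loop's intent, trimming back to the last slash, fits in one line; a sketch, assuming entry_domain contains at least one '/':

    entry_domain = entry_domain[:entry_domain.rfind('/') + 1]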
|
|