@@ -510,6 +510,7 @@ class fdb_spider(object):

                driver.get(fdb_conf_entry_list_link1 + str(i) + fdb_conf_entry_list_link2)

        for entry_id in dictionary_entry_list:
            print(entry_id)
            entry_link = dictionary_entry_list[entry_id]["link"]
            web_content = 'NONE'

            # download the html page of the entry
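            # (fallback chain, as far as these hunks show: urllib with a preset
            # cookie first, then further attempts in the elided lines — the log
            # message below mentions requests — then wget as last resort; if
            # everything fails, the placeholder 'NONE' is written so that every
            # entry_id ends up with a file)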

@@ -556,14 +557,14 @@ class fdb_spider(object):

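            # 'javascript:' pseudo-links cannot be fetched over plain http, and
            # pdf links are not html entry pages, so both are skipped here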
            if 'javascript' not in entry_link and '.pdf' not in entry_link:
                try:
                    # defining a cookie up front so that cookie banners pointing
                    # to redirects do not end up in an endless redirect loop
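                    # (the cookie value itself is arbitrary; presumably any preset
                    # Cookie header is enough to get past banners that otherwise
                    # answer with a redirect)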
                    url = entry_link
                    req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0', 'Cookie': 'myCookie=oioioioi'})
                    response = urllib.request.urlopen(req)
                    print('response from first one', response)
                except Exception as e:
                    print('cookie giving then downloading did not work, original error is:', e)
                    try:

@@ -604,7 +605,7 @@ class fdb_spider(object):

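            # one html file per entry, under spiders/pages/<fdb><i>/<entry_id>.html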
            file_name = "spiders/pages/" + fdb + str(i) + "/" + str(entry_id) + ".html"

            wget_wrote = False
            if web_content == 'NONE':
                print('other downloading approaches did not work, trying requests')

@@ -623,14 +624,16 @@ class fdb_spider(object):

                try:
                    os.makedirs(os.path.dirname(file_name), exist_ok=True)
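                    # note: check=True (added in this edit) is what makes
                    # subprocess.run raise CalledProcessError on a non-zero wget
                    # exit status; without it the except branch below can never fire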
                    subprocess.run(["wget", entry_link, "--output-document=" + file_name], check=True)
                    wget_wrote = True
                except subprocess.CalledProcessError:
                    print('wget downloading did not work.. saving NONE to file now')

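            # last-resort write: if wget never produced the file, persist whatever
            # web_content holds (still 'NONE' unless an earlier attempt set it) so
            # downstream steps find a file for every entry_id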
            if not wget_wrote:
                os.makedirs(os.path.dirname(file_name), exist_ok=True)
                with open(file_name, "w+") as f:
                    f.write(web_content)

        # save the entry_domain, implemented first for further downloads in javascript links
f = open("spiders/output/" + fdb + str(i) + "entryList.txt", "w+") |
|
|
f = open("spiders/output/" + fdb + str(i) + "entryList.txt", "w+") |
|
|