corrected error which arose in the logic of the wget backup download

This commit is contained in:
alpcentaur 2023-12-15 14:36:08 +01:00
parent 92c238a2ed
commit 5d17f4e421
10 changed files with 11 additions and 17439 deletions

View file

@ -510,6 +510,7 @@ class fdb_spider(object):
driver.get(fdb_conf_entry_list_link1 + str(i) + fdb_conf_entry_list_link2) driver.get(fdb_conf_entry_list_link1 + str(i) + fdb_conf_entry_list_link2)
for entry_id in dictionary_entry_list: for entry_id in dictionary_entry_list:
print(entry_id)
entry_link = dictionary_entry_list[entry_id]["link"] entry_link = dictionary_entry_list[entry_id]["link"]
web_content = 'NONE' web_content = 'NONE'
# download the html page of the entry # download the html page of the entry
@ -556,14 +557,14 @@ class fdb_spider(object):
if 'javascript' not in entry_link and '.pdf' not in entry_link: if 'javascript' not in entry_link and '.pdf' not in entry_link:
print('blabuuuuuba')
#print('oi') #print('oi')
try: try:
# defining cookie to not end up in endless loop because of cookie banners pointing to redirects # defining cookie to not end up in endless loop because of cookie banners pointing to redirects
url = entry_link url = entry_link
req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0', 'Cookie':'myCookie=oioioioi'}) req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0', 'Cookie':'myCookie=oioioioi'})
response = urllib.request.urlopen(req) response = urllib.request.urlopen(req)
#print('response from first one', response) print('response from first one', response)
except Exception as e: except Exception as e:
print('cookie giving then downloading did not work, original error is:', e) print('cookie giving then downloading did not work, original error is:', e)
try: try:
@ -604,7 +605,7 @@ class fdb_spider(object):
file_name = "spiders/pages/" + fdb + str(i) + "/" + str(entry_id) + ".html" file_name = "spiders/pages/" + fdb + str(i) + "/" + str(entry_id) + ".html"
wget_wrote = False
if web_content == 'NONE': if web_content == 'NONE':
print('other downloading approaches did not work, trying requests') print('other downloading approaches did not work, trying requests')
@ -623,14 +624,16 @@ class fdb_spider(object):
try: try:
os.makedirs(os.path.dirname(file_name), exist_ok=True) os.makedirs(os.path.dirname(file_name), exist_ok=True)
oi = subprocess.run(["wget", entry_link, '--output-document=' + file_name]) oi = subprocess.run(["wget", entry_link, '--output-document=' + file_name])
wget_wrote = True
except subprocess.CalledProcessError: except subprocess.CalledProcessError:
print('wget downloading did not work.. saving NONE to file now') print('wget downloading did not work.. saving NONE to file now')
os.makedirs(os.path.dirname(file_name), exist_ok=True)
f = open(file_name, "w+") if wget_wrote == False:
f.write(web_content) os.makedirs(os.path.dirname(file_name), exist_ok=True)
f.close f = open(file_name, "w+")
f.write(web_content)
f.close
# save the entry_domain, implemented first for further downloads in javascript links # save the entry_domain, implemented first for further downloads in javascript links
f = open("spiders/output/" + fdb + str(i) + "entryList.txt", "w+") f = open("spiders/output/" + fdb + str(i) + "entryList.txt", "w+")

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long