corrected error that arose in the logic of the wget backup download
This commit is contained in:
parent 92c238a2ed
commit 5d17f4e421
10 changed files with 11 additions and 17439 deletions
Binary file not shown.
@@ -510,6 +510,7 @@ class fdb_spider(object):
             driver.get(fdb_conf_entry_list_link1 + str(i) + fdb_conf_entry_list_link2)
 
         for entry_id in dictionary_entry_list:
+            print(entry_id)
             entry_link = dictionary_entry_list[entry_id]["link"]
             web_content = 'NONE'
             # download the html page of the entry
@@ -556,14 +557,14 @@ class fdb_spider(object):
 
 
             if 'javascript' not in entry_link and '.pdf' not in entry_link:
-
+                print('blabuuuuuba')
                 #print('oi')
                 try:
                     # defining cookie to not end up in endless loop because of cookie banners pointing to redirects
                     url = entry_link
                     req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0', 'Cookie':'myCookie=oioioioi'})
                     response = urllib.request.urlopen(req)
-                    #print('response from first one', response)
+                    print('response from first one', response)
                 except Exception as e:
                     print('cookie giving then downloading did not work, original error is:', e)
                     try:
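For reference, the cookie-bearing request this hunk touches can be exercised on its own. Below is a minimal sketch using only the Python standard library; fetch_with_cookie is a hypothetical helper name, while the User-Agent and Cookie values are taken verbatim from the diff:

import urllib.request

def fetch_with_cookie(url):
    # hypothetical helper: send a fixed cookie so cookie-banner
    # redirects cannot loop forever (the rationale given in the diff)
    req = urllib.request.Request(
        url,
        headers={'User-Agent': 'Mozilla/5.0', 'Cookie': 'myCookie=oioioioi'},
    )
    with urllib.request.urlopen(req) as response:
        return response.read().decode('utf-8', errors='replace')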
@@ -604,7 +605,7 @@ class fdb_spider(object):
 
             file_name = "spiders/pages/" + fdb + str(i) + "/" + str(entry_id) + ".html"
 
-
+            wget_wrote = False
             if web_content == 'NONE':
                 print('other downloading approaches did not work, trying requests')
 
@@ -623,10 +624,12 @@ class fdb_spider(object):
                 try:
                     os.makedirs(os.path.dirname(file_name), exist_ok=True)
                     oi = subprocess.run(["wget", entry_link, '--output-document=' + file_name])
+                    wget_wrote = True
                 except subprocess.CalledProcessError:
                     print('wget downloading did not work.. saving NONE to file now')
 
 
+                if wget_wrote == False:
                     os.makedirs(os.path.dirname(file_name), exist_ok=True)
                     f = open(file_name, "w+")
                     f.write(web_content)
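Read together, the last two hunks implement the fix named in the commit message: a wget_wrote flag records whether wget actually produced a file, and the in-memory web_content (possibly still 'NONE') is written out only when it did not. A minimal sketch of that corrected fallback follows; download_with_fallback is a hypothetical name, and check=True is an addition here, since the diff calls subprocess.run without it and subprocess.run never raises CalledProcessError in that case:

import os
import subprocess

def download_with_fallback(entry_link, file_name, web_content='NONE'):
    wget_wrote = False
    try:
        os.makedirs(os.path.dirname(file_name), exist_ok=True)
        # as in the diff: fetch the page with wget straight into file_name
        # (check=True is an assumption added so the except branch can fire)
        subprocess.run(["wget", entry_link, "--output-document=" + file_name],
                       check=True)
        wget_wrote = True
    except subprocess.CalledProcessError:
        print('wget downloading did not work.. saving NONE to file now')
    if not wget_wrote:
        # before this commit the write below ran unconditionally and could
        # overwrite a page that wget had just downloaded successfully
        os.makedirs(os.path.dirname(file_name), exist_ok=True)
        with open(file_name, "w+") as f:
            f.write(web_content)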
File diff suppressed because one or more lines are too long (8 files)