added a last-resort exception for entry page downloading with wget; also implemented some further logic for getting the right links
This commit is contained in:
parent
16199256e3
commit
0e58756600
3 changed files with 50 additions and 28 deletions
main.py
@@ -16,9 +16,9 @@ spider = fdb_spider(config)

 #spider.find_config_parameter(list_of_fdbs)

-spider.parse_entry_list_data2dictionary(list_of_fdbs)
+#spider.parse_entry_list_data2dictionary(list_of_fdbs)

 spider.download_entry_data_htmls(list_of_fdbs)

-spider.parse_entry_data2dictionary(list_of_fdbs)
+#spider.parse_entry_data2dictionary(list_of_fdbs)
Binary file not shown.
@@ -19,6 +19,8 @@ from pdfminer.layout import LTTextContainer

 import time

+import subprocess
+

 class fdb_spider(object):
     def __init__(self, config_file):
@@ -99,7 +101,7 @@ class fdb_spider(object):
             )
         entry_jsdomain = 'NONE'

-        if entry_jsdomain == 'NONE':
+        if entry_jsdomain == 'NONE' or entry_jsdomain == 'None':

             for i in entry_iteration_var_list:
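A side note on the change above: 'NONE' and 'None' can be folded into one case-insensitive check. A minimal sketch, using a hypothetical helper name that is not part of the commit:

    def jsdomain_is_unset(entry_jsdomain):
        # Treat None as well as 'NONE'/'None'/'none' as "no JS domain configured".
        return entry_jsdomain is None or str(entry_jsdomain).strip().lower() == 'none'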
@@ -417,8 +419,17 @@ class fdb_spider(object):
                 if 'javascript:' in link:
                     dictionary_entry_list[n]["link"] = link
                 if fdb_domain not in link and ('http' or 'https' or 'www.') not in link:
-                    if link[-1] == '/':
-                        dictionary_entry_list[n]["link"] = fdb_domain + link
+                    if link[0] == '/':
+                        if fdb_domain[-1] != '/':
+                            dictionary_entry_list[n]["link"] = fdb_domain + link
+                        if fdb_domain[-1] == '/':
+                            dictionary_entry_list[n]["link"] = fdb_domain + link[1:]
+                    if link[0] == '.' and link[1] == '/':
+                        if fdb_domain[-1] != '/':
+                            dictionary_entry_list[n]["link"] = fdb_domain + link[1:]
+                        if fdb_domain[-1] == '/':
+                            dictionary_entry_list[n]["link"] = fdb_domain + link[2:]
+
                     else:
                         dictionary_entry_list[n]["link"] = fdb_domain + '/' + link
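The new branches above handle links starting with '/' or './' against fdb_domain with or without a trailing slash. For comparison, urllib.parse.urljoin from the standard library covers the same cases in one call; a minimal sketch with an assumed example base URL (note that urljoin resolves '/...' against the domain root, which can differ from the plain string concatenation in the commit when fdb_domain carries a path):

    from urllib.parse import urljoin

    base = 'https://example.org/funding/'      # assumed example value for fdb_domain
    print(urljoin(base, '/call.html'))         # https://example.org/call.html
    print(urljoin(base, './call.html'))        # https://example.org/funding/call.html
    print(urljoin(base, 'call.html'))          # https://example.org/funding/call.html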
@@ -534,14 +545,17 @@ class fdb_spider(object):
                     driver.switch_to.window(window_before)


-            if ('http' or 'www') in entry_link and 'javascript' not in entry_link and '.pdf' not in entry_link:
+            if 'javascript' not in entry_link and '.pdf' not in entry_link:

+                #print('oi')
                 try:
                     # defining cookie to not end up in endless loop because of cookie banners pointing to redirects
                     url = entry_link
-                    req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0', 'Cookie':'myCookie=lovely'})
+                    req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0', 'Cookie':'myCookie=oioioioi'})
                     response = urllib.request.urlopen(req)
+                    #print('response from first one', response)
                 except Exception as e:
+                    print('cookie giving then downloading did not work, original error is:', e)
                     try:
                         response = urllib.request.urlopen(entry_link.encode('ascii', errors='xmlcharrefreplace').decode('ascii'))
                         print(
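The request above sends a User-Agent plus a throwaway cookie so that cookie-banner redirects do not loop forever. A standalone sketch of that first download attempt, with an illustrative URL and timeout that are not part of the commit:

    import urllib.request

    url = 'https://example.org/entry.html'     # illustrative URL
    req = urllib.request.Request(
        url,
        headers={'User-Agent': 'Mozilla/5.0', 'Cookie': 'myCookie=oioioioi'},
    )
    with urllib.request.urlopen(req, timeout=30) as response:
        web_content = response.read().decode('utf-8', errors='replace')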
@@ -575,30 +589,38 @@ class fdb_spider(object):
                         f.write(response.content)
                         f.close

-            else:
-                file_name = "spiders/pages/" + fdb + str(i) + "/" + str(entry_id) + ".html"


-            if web_content == 'NONE':
-                print('other downloading approaches did not work, trying requests')
+            file_name = "spiders/pages/" + fdb + str(i) + "/" + str(entry_id) + ".html"


+            if web_content == 'NONE':
+                print('other downloading approaches did not work, trying requests')
+
+                try:
+                    from requests_html import HTMLSession
+                    session = HTMLSession()
+
+                    r = session.get(entry_link)
+
+                    r.html.render()
+                    web_content = r.text
+
+                except Exception as e:
+                    print('requests_html HTMLSession did not work trying wget, ori error is:', e)

                 try:
-                    from requests_html import HTMLSession
-                    session = HTMLSession()
-
-                    r = session.get(entry_link)
-
-                    r.html.render()
-                    web_content = r.text
-
-                except Exception as e:
-                    print('requests_html HTMLSession did not work')
-
-
-            os.makedirs(os.path.dirname(file_name), exist_ok=True)
-            f = open(file_name, "w+")
-            f.write(web_content)
-            f.close
+                    os.makedirs(os.path.dirname(file_name), exist_ok=True)
+                    oi = subprocess.run(["wget", entry_link, '--output-document=' + file_name])
+
+                except subprocess.CalledProcessError:
+                    print('wget downloading did not work.. saving NONE to file now')
+
+                    os.makedirs(os.path.dirname(file_name), exist_ok=True)
+                    f = open(file_name, "w+")
+                    f.write(web_content)
+                    f.close

             # save the entry_domain, implemented first for further downloads in javascript links
             f = open("spiders/output/" + fdb + str(i) + "entryList.txt", "w+")
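One caveat on the wget last resort above: subprocess.run only raises CalledProcessError when check=True is passed, so a failing wget as committed returns a CompletedProcess with a non-zero returncode instead of reaching the except branch. A hedged sketch of the fallback with that flag set (the helper name, timeout, and return value are illustrative, not from the commit):

    import os
    import subprocess

    def wget_fallback(entry_link, file_name):
        # Last resort: shell out to wget and write the page straight to file_name.
        os.makedirs(os.path.dirname(file_name), exist_ok=True)
        try:
            subprocess.run(
                ["wget", entry_link, "--output-document=" + file_name],
                check=True,      # raise CalledProcessError on a non-zero exit code
                timeout=60,
            )
            return True
        except (subprocess.CalledProcessError, subprocess.TimeoutExpired, FileNotFoundError):
            # wget failed, timed out, or is not installed; the caller can save 'NONE' instead.
            return False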
@@ -692,7 +714,7 @@ class fdb_spider(object):
                         pdf_link = entry_domain[:-1] + child[1:]
                     if entry_domain[-1] != '/':
                         for n in range(len(entry_domain)):
-                            if entry_domain[-1] != '/':
+                            if entry_domain[-n] != '/':
                                 entry_domain = entry_domain[:-1]
                             else:
                                 break
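The loop above strips entry_domain back to its last '/' so that a relative child path can be appended (note that n starts at 0, so entry_domain[-n] first checks entry_domain[0]). The same trimming can be done without index arithmetic; entry_domain below is an assumed example value, not taken from the commit:

    entry_domain = 'https://example.org/funding/calls/page.html'   # assumed example
    trimmed = entry_domain.rsplit('/', 1)[0] + '/'
    print(trimmed)   # https://example.org/funding/calls/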