Added exception handling for bad encoding and GET errors
This commit is contained in:
parent
317ef99720
commit
42841ee650
3 changed files with 32 additions and 6 deletions
4
main.py
4
main.py
|
@@ -13,9 +13,9 @@ spider = fdb_spider(config)
|
||||||
|
|
||||||
#spider.find_config_parameter(list_of_fdbs)
|
#spider.find_config_parameter(list_of_fdbs)
|
||||||
|
|
||||||
spider.parse_entry_list_data2dictionary(list_of_fdbs)
|
#spider.parse_entry_list_data2dictionary(list_of_fdbs)
|
||||||
|
|
||||||
# spider.download_entry_data_htmls(list_of_fdbs)
|
spider.download_entry_data_htmls(list_of_fdbs)
|
||||||
|
|
||||||
# spider.parse_entry_data2dictionary(list_of_fdbs)
|
# spider.parse_entry_data2dictionary(list_of_fdbs)
|
||||||
|
|
||||||
|
|
Binary file not shown.
|
@@ -281,8 +281,10 @@ class fdb_spider(object):
|
||||||
|
|
||||||
if fdb_domain in link:
|
if fdb_domain in link:
|
||||||
dictionary_entry_list[n]["link"] = link
|
dictionary_entry_list[n]["link"] = link
|
||||||
|
if fdb_domain not in link and ('http:' in link or 'www.' in link or 'https:' in link):
|
||||||
|
dictionary_entry_list[n]["link"] = link
|
||||||
|
|
||||||
if fdb_domain not in link:
|
else:
|
||||||
if link[-1] == '/':
|
if link[-1] == '/':
|
||||||
dictionary_entry_list[n]["link"] = fdb_domain + link
|
dictionary_entry_list[n]["link"] = fdb_domain + link
|
||||||
else:
|
else:
|
||||||
|
@@ -323,9 +325,33 @@ class fdb_spider(object):
|
||||||
entry_link = dictionary_entry_list[entry_id]["link"]
|
entry_link = dictionary_entry_list[entry_id]["link"]
|
||||||
|
|
||||||
# download the html page of the entry
|
# download the html page of the entry
|
||||||
|
|
||||||
response = urllib.request.urlopen(entry_link)
|
try:
|
||||||
web_content = response.read().decode("UTF-8")
|
url = entry_link
|
||||||
|
req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0', 'Cookie':'myCookie=lovely'})
|
||||||
|
response = urllib.request.urlopen(req)
|
||||||
|
except Exception as e:
|
||||||
|
try:
|
||||||
|
response = urllib.request.urlopen(entry_link.encode('ascii', errors='xmlcharrefreplace').decode('ascii'))
|
||||||
|
print(
|
||||||
|
"opening the link did not work, try to encode to ascii replacing xmlcharrefs now and reopen - the original error message is:",
|
||||||
|
e,
|
||||||
|
)
|
||||||
|
except Exception as ex:
|
||||||
|
print(entry_link, entry_link.encode('ascii', errors='xmlcharrefreplace').decode('ascii'), ex )
|
||||||
|
|
||||||
|
|
||||||
|
try:
|
||||||
|
web_content = response.read().decode("UTF-8")
|
||||||
|
except Exception as e:
|
||||||
|
try:
|
||||||
|
web_content = response.read().decode("latin-1")
|
||||||
|
print(
|
||||||
|
"decoding the respone in utf8 did not work, try to decode latin1 now - the original error message is:",
|
||||||
|
e,
|
||||||
|
)
|
||||||
|
except Exception as ex:
|
||||||
|
print(ex)
|
||||||
|
|
||||||
# save interim results to files
|
# save interim results to files
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue