|
@ -281,8 +281,10 @@ class fdb_spider(object): |
|
|
|
|
|
|
|
|
if fdb_domain in link: |
|
|
if fdb_domain in link: |
|
|
dictionary_entry_list[n]["link"] = link |
|
|
dictionary_entry_list[n]["link"] = link |
|
|
|
|
|
if fdb_domain not in link and ('http:' in link or 'www.' in link or 'https:' in link): |
|
|
|
|
|
dictionary_entry_list[n]["link"] = link |
|
|
|
|
|
|
|
|
if fdb_domain not in link: |
|
|
|
|
|
|
|
|
else: |
|
|
if link[-1] == '/': |
|
|
if link[-1] == '/': |
|
|
dictionary_entry_list[n]["link"] = fdb_domain + link |
|
|
dictionary_entry_list[n]["link"] = fdb_domain + link |
|
|
else: |
|
|
else: |
|
@ -323,9 +325,33 @@ class fdb_spider(object): |
|
|
entry_link = dictionary_entry_list[entry_id]["link"] |
|
|
entry_link = dictionary_entry_list[entry_id]["link"] |
|
|
|
|
|
|
|
|
# download the html page of the entry |
|
|
# download the html page of the entry |
|
|
|
|
|
|
|
|
response = urllib.request.urlopen(entry_link) |
|
|
|
|
|
web_content = response.read().decode("UTF-8") |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
try: |
|
|
|
|
|
url = entry_link |
|
|
|
|
|
req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0', 'Cookie':'myCookie=lovely'}) |
|
|
|
|
|
response = urllib.request.urlopen(req) |
|
|
|
|
|
except Exception as e: |
|
|
|
|
|
try: |
|
|
|
|
|
response = urllib.request.urlopen(entry_link.encode('ascii', errors='xmlcharrefreplace').decode('ascii')) |
|
|
|
|
|
print( |
|
|
|
|
|
"opening the link did not work, try to encode to ascii replacing xmlcharrefs now and reopen - the original error message is:", |
|
|
|
|
|
e, |
|
|
|
|
|
) |
|
|
|
|
|
except Exception as ex: |
|
|
|
|
|
print(entry_link, entry_link.encode('ascii', errors='xmlcharrefreplace').decode('ascii'), ex ) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
try: |
|
|
|
|
|
web_content = response.read().decode("UTF-8") |
|
|
|
|
|
except Exception as e: |
|
|
|
|
|
try: |
|
|
|
|
|
web_content = response.read().decode("latin-1") |
|
|
|
|
|
print( |
|
|
|
|
|
"decoding the respone in utf8 did not work, try to decode latin1 now - the original error message is:", |
|
|
|
|
|
e, |
|
|
|
|
|
) |
|
|
|
|
|
except Exception as ex: |
|
|
|
|
|
print(ex) |
|
|
|
|
|
|
|
|
# save interim results to files |
|
|
# save interim results to files |
|
|
|
|
|
|
|
|