diff --git a/main.py b/main.py index e83d588..8b8bd6f 100644 --- a/main.py +++ b/main.py @@ -13,9 +13,9 @@ spider = fdb_spider(config) #spider.find_config_parameter(list_of_fdbs) -spider.parse_entry_list_data2dictionary(list_of_fdbs) +#spider.parse_entry_list_data2dictionary(list_of_fdbs) -# spider.download_entry_data_htmls(list_of_fdbs) +spider.download_entry_data_htmls(list_of_fdbs) # spider.parse_entry_data2dictionary(list_of_fdbs) diff --git a/spiders/__pycache__/fdb_spider.cpython-39.pyc b/spiders/__pycache__/fdb_spider.cpython-39.pyc index ebbced9..e09169f 100644 Binary files a/spiders/__pycache__/fdb_spider.cpython-39.pyc and b/spiders/__pycache__/fdb_spider.cpython-39.pyc differ diff --git a/spiders/fdb_spider.py b/spiders/fdb_spider.py index ef76a6a..6ec37cf 100644 --- a/spiders/fdb_spider.py +++ b/spiders/fdb_spider.py @@ -281,8 +281,10 @@ class fdb_spider(object): if fdb_domain in link: dictionary_entry_list[n]["link"] = link + if fdb_domain not in link and ('http:' in link or 'www.' in link or 'https:' in link): + dictionary_entry_list[n]["link"] = link - if fdb_domain not in link: + else: if link[-1] == '/': dictionary_entry_list[n]["link"] = fdb_domain + link else: @@ -323,9 +325,33 @@ class fdb_spider(object): entry_link = dictionary_entry_list[entry_id]["link"] # download the html page of the entry - - response = urllib.request.urlopen(entry_link) - web_content = response.read().decode("UTF-8") + + try: + url = entry_link + req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0', 'Cookie':'myCookie=lovely'}) + response = urllib.request.urlopen(req) + except Exception as e: + try: + response = urllib.request.urlopen(entry_link.encode('ascii', errors='xmlcharrefreplace').decode('ascii')) + print( + "opening the link did not work, try to encode to ascii replacing xmlcharrefs now and reopen - the original error message is:", + e, + ) + except Exception as ex: + print(entry_link, entry_link.encode('ascii', errors='xmlcharrefreplace').decode('ascii'), ex ) + + + try: + web_content = response.read().decode("UTF-8") + except Exception as e: + try: + web_content = response.read().decode("latin-1") + print( + "decoding the respone in utf8 did not work, try to decode latin1 now - the original error message is:", + e, + ) + except Exception as ex: + print(ex) # save interim results to files