Added exception handling for bad encodings and failed GET (URL fetch) requests

2023-11-14 14:38:45 +00:00 · 2023-11-14 14:38:45 +00:00 · 42841ee650
commit 42841ee650
parent 317ef99720
3 changed files with 32 additions and 6 deletions
--- a/main.py
+++ b/main.py
@ -13,9 +13,9 @@ spider = fdb_spider(config)

 #spider.find_config_parameter(list_of_fdbs)

-spider.parse_entry_list_data2dictionary(list_of_fdbs)
+#spider.parse_entry_list_data2dictionary(list_of_fdbs)

-# spider.download_entry_data_htmls(list_of_fdbs)
+spider.download_entry_data_htmls(list_of_fdbs)

 # spider.parse_entry_data2dictionary(list_of_fdbs)

--- a/spiders/pycache/fdb_spider.cpython-39.pyc
+++ b/spiders/pycache/fdb_spider.cpython-39.pyc
--- a/spiders/fdb_spider.py
+++ b/spiders/fdb_spider.py
@ -281,8 +281,10 @@ class fdb_spider(object):

                            if fdb_domain in link:
                                dictionary_entry_list[n]["link"] = link
+                            if fdb_domain not in link and ('http:' in link or 'www.' in link or 'https:' in link):
+                                dictionary_entry_list[n]["link"] = link

-                            if fdb_domain not in link:
+                            else:
                                if link[-1] == '/':
                                    dictionary_entry_list[n]["link"] = fdb_domain + link
                                else:
@ -323,9 +325,33 @@ class fdb_spider(object):
                    entry_link = dictionary_entry_list[entry_id]["link"]

                    # download the html page of the entry
-
-                    response = urllib.request.urlopen(entry_link)
-                    web_content = response.read().decode("UTF-8")
+                    
+                    try:
+                        url = entry_link
+                        req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0', 'Cookie':'myCookie=lovely'})
+                        response = urllib.request.urlopen(req)
+                    except Exception as e:
+                        try:
+                            response = urllib.request.urlopen(entry_link.encode('ascii', errors='xmlcharrefreplace').decode('ascii'))
+                            print(
+                                "opening the link did not work, try to encode to ascii replacing xmlcharrefs now and reopen - the original error message is:",
+                                e,
+                            )
+                        except Exception as ex:
+                            print(entry_link, entry_link.encode('ascii', errors='xmlcharrefreplace').decode('ascii'), ex )
+                    
+                    
+                    try:
+                        web_content = response.read().decode("UTF-8")
+                    except Exception as e:
+                        try:
+                            web_content = response.read().decode("latin-1")
+                            print(
+                                "decoding the respone in utf8 did not work, try to decode latin1 now - the original error message is:",
+                                e,
+                            )
+                        except Exception as ex:
+                            print(ex)

                    # save interim results to files