Added a last-resort wget fallback for downloading entry pages; also implemented further logic for building the correct entry links

2023-12-15 11:33:50 +00:00 · 2023-12-15 11:33:50 +00:00 · 0e58756600
commit 0e58756600
parent 16199256e3
3 changed files with 50 additions and 28 deletions
--- a/main.py
+++ b/main.py
@ -16,9 +16,9 @@ spider = fdb_spider(config)

 #spider.find_config_parameter(list_of_fdbs)

-spider.parse_entry_list_data2dictionary(list_of_fdbs)
+#spider.parse_entry_list_data2dictionary(list_of_fdbs)

 spider.download_entry_data_htmls(list_of_fdbs)

-spider.parse_entry_data2dictionary(list_of_fdbs)
+#spider.parse_entry_data2dictionary(list_of_fdbs)

--- a/spiders/pycache/fdb_spider.cpython-39.pyc
+++ b/spiders/pycache/fdb_spider.cpython-39.pyc
--- a/spiders/fdb_spider.py
+++ b/spiders/fdb_spider.py
@ -19,6 +19,8 @@ from pdfminer.layout import LTTextContainer

 import time

+import subprocess
+

 class fdb_spider(object):
    def __init__(self, config_file):
@ -99,7 +101,7 @@ class fdb_spider(object):
                        )
                        entry_jsdomain = 'NONE'

-                    if entry_jsdomain == 'NONE':
+                    if entry_jsdomain == 'NONE' or entry_jsdomain == 'None':

                        for i in entry_iteration_var_list:

@ -417,8 +419,17 @@ class fdb_spider(object):
                            if 'javascript:' in link:
                                dictionary_entry_list[n]["link"] = link
                            if fdb_domain not in link and ('http' or 'https' or 'www.') not in link:
-                                if link[-1] == '/':
-                                    dictionary_entry_list[n]["link"] = fdb_domain + link
+                                if link[0] == '/':
+                                    if fdb_domain[-1] != '/':
+                                        dictionary_entry_list[n]["link"] = fdb_domain + link
+                                    if fdb_domain[-1] == '/':
+                                        dictionary_entry_list[n]["link"] = fdb_domain + link[1:]
+                                if link[0] == '.' and link[1] == '/':
+                                    if fdb_domain[-1] != '/':
+                                        dictionary_entry_list[n]["link"] = fdb_domain + link[1:]
+                                    if fdb_domain[-1] == '/':
+                                        dictionary_entry_list[n]["link"] = fdb_domain + link[2:]
+                                
                                else:
                                    dictionary_entry_list[n]["link"] = fdb_domain + '/' + link
                                
@ -534,14 +545,17 @@ class fdb_spider(object):
                        driver.switch_to.window(window_before)
                        
                    
-                    if ('http' or 'www') in entry_link and 'javascript' not in entry_link and '.pdf' not in entry_link:
+                    if 'javascript' not in entry_link and '.pdf' not in entry_link:
                    
+                        #print('oi')
                        try:
                            # defining cookie to not end up in endless loop because of cookie banners pointing to redirects
                            url = entry_link
-                            req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0', 'Cookie':'myCookie=lovely'})
+                            req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0', 'Cookie':'myCookie=oioioioi'})
                            response = urllib.request.urlopen(req)
+                            #print('response from first one', response)
                        except Exception as e:
+                            print('cookie giving then downloading did not work, original error is:', e)
                            try:
                                response = urllib.request.urlopen(entry_link.encode('ascii', errors='xmlcharrefreplace').decode('ascii'))
                                print(
@ -575,30 +589,38 @@ class fdb_spider(object):
                        f.write(response.content)
                        f.close
                    
-                    else:
-                        file_name = "spiders/pages/" + fdb + str(i) + "/" + str(entry_id) + ".html"
+                    
                        
                        
-                        if web_content == 'NONE':
-                            print('other downloading approaches did not work, trying requests')
+                    file_name = "spiders/pages/" + fdb + str(i) + "/" + str(entry_id) + ".html"
+                        
+                        
+                    if web_content == 'NONE':
+                        print('other downloading approaches did not work, trying requests')
+                            
+                        try:
+                            from requests_html import HTMLSession
+                            session = HTMLSession()
+
+                            r = session.get(entry_link)
+
+                            r.html.render()
+                            web_content = r.text
+                    
+                        except Exception as e:
+                            print('requests_html HTMLSession did not work trying wget, ori error is:', e)
                            
                            try:
-                                from requests_html import HTMLSession
-                                session = HTMLSession()
-
-                                r = session.get(entry_link)
-
-                                r.html.render()
-                                web_content = r.text
-                        
-                            except Exception as e:
-                                print('requests_html HTMLSession did not work')
-                        
-                        
-                        os.makedirs(os.path.dirname(file_name), exist_ok=True)
-                        f = open(file_name, "w+")
-                        f.write(web_content)
-                        f.close
+                                os.makedirs(os.path.dirname(file_name), exist_ok=True)
+                                oi = subprocess.run(["wget", entry_link, '--output-document=' + file_name])
+                            
+                            except subprocess.CalledProcessError:
+                                print('wget downloading did not work.. saving NONE to file now')
+                    
+                                os.makedirs(os.path.dirname(file_name), exist_ok=True)
+                                f = open(file_name, "w+")
+                                f.write(web_content)
+                                f.close
                
                # save the entry_domain, implemented first for further downloads in javascript links
                f = open("spiders/output/" + fdb + str(i) + "entryList.txt", "w+")
@ -692,7 +714,7 @@ class fdb_spider(object):
                                                pdf_link = entry_domain[:-1] + child[1:]
                                            if entry_domain[-1] != '/':
                                                for n in range(len(entry_domain)):
-                                                    if entry_domain[-1] != '/':
+                                                    if entry_domain[-n] != '/':
                                                        entry_domain = entry_domain[:-1]
                                                    else:
                                                        break