Added a last-resort exception fallback that downloads the entry page with wget; also implemented some further logic for getting the right links.

2023-12-15 11:33:50 +00:00 · 2023-12-15 11:33:50 +00:00 · 0e58756600
commit 0e58756600
parent 16199256e3
3 changed files with 50 additions and 28 deletions
--- a/main.py
+++ b/main.py
@ -16,9 +16,9 @@ spider = fdb_spider(config)
 #spider.find_config_parameter(list_of_fdbs)
-spider.parse_entry_list_data2dictionary(list_of_fdbs)
+#spider.parse_entry_list_data2dictionary(list_of_fdbs)
 spider.download_entry_data_htmls(list_of_fdbs)
-spider.parse_entry_data2dictionary(list_of_fdbs)
+#spider.parse_entry_data2dictionary(list_of_fdbs)
--- a/spiders/pycache/fdb_spider.cpython-39.pyc
+++ b/spiders/pycache/fdb_spider.cpython-39.pyc
--- a/spiders/fdb_spider.py
+++ b/spiders/fdb_spider.py
@ -19,6 +19,8 @@ from pdfminer.layout import LTTextContainer
 import time
 import subprocess
 class fdb_spider(object):
    def __init__(self, config_file):
@ -99,7 +101,7 @@ class fdb_spider(object):
                        )
                        entry_jsdomain = 'NONE'
-                    if entry_jsdomain == 'NONE':
+                    if entry_jsdomain == 'NONE' or entry_jsdomain == 'None':
                        for i in entry_iteration_var_list:
@ -417,8 +419,17 @@ class fdb_spider(object):
                            if 'javascript:' in link:
                                dictionary_entry_list[n]["link"] = link
                            if fdb_domain not in link and ('http' or 'https' or 'www.') not in link:
-                                if link[-1] == '/':
+                                if link[0] == '/':
-                                    dictionary_entry_list[n]["link"] = fdb_domain + link
+                                    if fdb_domain[-1] != '/':
                                        dictionary_entry_list[n]["link"] = fdb_domain + link
                                    if fdb_domain[-1] == '/':
                                        dictionary_entry_list[n]["link"] = fdb_domain + link[1:]
                                if link[0] == '.' and link[1] == '/':
                                    if fdb_domain[-1] != '/':
                                        dictionary_entry_list[n]["link"] = fdb_domain + link[1:]
                                    if fdb_domain[-1] == '/':
                                        dictionary_entry_list[n]["link"] = fdb_domain + link[2:]
                                else:
                                    dictionary_entry_list[n]["link"] = fdb_domain + '/' + link
@ -534,14 +545,17 @@ class fdb_spider(object):
                        driver.switch_to.window(window_before)
-                    if ('http' or 'www') in entry_link and 'javascript' not in entry_link and '.pdf' not in entry_link:
+                    if 'javascript' not in entry_link and '.pdf' not in entry_link:
                        #print('oi')
                        try:
                            # defining cookie to not end up in endless loop because of cookie banners pointing to redirects
                            url = entry_link
-                            req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0', 'Cookie':'myCookie=lovely'})
+                            req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0', 'Cookie':'myCookie=oioioioi'})
                            response = urllib.request.urlopen(req)
                            #print('response from first one', response)
                        except Exception as e:
                            print('cookie giving then downloading did not work, original error is:', e)
                            try:
                                response = urllib.request.urlopen(entry_link.encode('ascii', errors='xmlcharrefreplace').decode('ascii'))
                                print(
@ -575,30 +589,38 @@ class fdb_spider(object):
                        f.write(response.content)
                        f.close
-                    else:
+                    
                        file_name = "spiders/pages/" + fdb + str(i) + "/" + str(entry_id) + ".html"
-                        if web_content == 'NONE':
+                    file_name = "spiders/pages/" + fdb + str(i) + "/" + str(entry_id) + ".html"
-                            print('other downloading approaches did not work, trying requests')
+                        
                    if web_content == 'NONE':
                        print('other downloading approaches did not work, trying requests')
                        try:
                            from requests_html import HTMLSession
                            session = HTMLSession()
                            r = session.get(entry_link)
                            r.html.render()
                            web_content = r.text
                        except Exception as e:
                            print('requests_html HTMLSession did not work trying wget, ori error is:', e)
                            try:
-                                from requests_html import HTMLSession
+                                os.makedirs(os.path.dirname(file_name), exist_ok=True)
-                                session = HTMLSession()
+                                oi = subprocess.run(["wget", entry_link, '--output-document=' + file_name])
-
+                            
-                                r = session.get(entry_link)
+                            except subprocess.CalledProcessError:
-
+                                print('wget downloading did not work.. saving NONE to file now')
-                                r.html.render()
+                    
-                                web_content = r.text
+                                os.makedirs(os.path.dirname(file_name), exist_ok=True)
-                        
+                                f = open(file_name, "w+")
-                            except Exception as e:
+                                f.write(web_content)
-                                print('requests_html HTMLSession did not work')
+                                f.close
                        os.makedirs(os.path.dirname(file_name), exist_ok=True)
                        f = open(file_name, "w+")
                        f.write(web_content)
                        f.close
                # save the entry_domain, implemented first for further downloads in javascript links
                f = open("spiders/output/" + fdb + str(i) + "entryList.txt", "w+")
@ -692,7 +714,7 @@ class fdb_spider(object):
                                                pdf_link = entry_domain[:-1] + child[1:]
                                            if entry_domain[-1] != '/':
                                                for n in range(len(entry_domain)):
-                                                    if entry_domain[-1] != '/':
+                                                    if entry_domain[-n] != '/':
                                                        entry_domain = entry_domain[:-1]
                                                    else:
                                                        break