diff --git a/main.py b/main.py
index 4a2d91f..cab6c07 100644
--- a/main.py
+++ b/main.py
@@ -16,9 +16,9 @@ spider = fdb_spider(config)
 
 #spider.find_config_parameter(list_of_fdbs)
 
-spider.parse_entry_list_data2dictionary(list_of_fdbs)
+#spider.parse_entry_list_data2dictionary(list_of_fdbs)
 
 spider.download_entry_data_htmls(list_of_fdbs)
 
-spider.parse_entry_data2dictionary(list_of_fdbs)
+#spider.parse_entry_data2dictionary(list_of_fdbs)
 
diff --git a/spiders/__pycache__/fdb_spider.cpython-39.pyc b/spiders/__pycache__/fdb_spider.cpython-39.pyc
index f60df44..a8d26b3 100644
Binary files a/spiders/__pycache__/fdb_spider.cpython-39.pyc and b/spiders/__pycache__/fdb_spider.cpython-39.pyc differ
diff --git a/spiders/fdb_spider.py b/spiders/fdb_spider.py
index 9569fcf..7c197f0 100644
--- a/spiders/fdb_spider.py
+++ b/spiders/fdb_spider.py
@@ -19,6 +19,8 @@ from pdfminer.layout import LTTextContainer
 
 import time
 
+import subprocess
+
 
 class fdb_spider(object):
     def __init__(self, config_file):
@@ -99,7 +101,7 @@ class fdb_spider(object):
                 )
 
                 entry_jsdomain = 'NONE'
 
-            if entry_jsdomain == 'NONE':
+            if entry_jsdomain == 'NONE' or entry_jsdomain == 'None':
 
                 for i in entry_iteration_var_list:
@@ -417,8 +419,17 @@ class fdb_spider(object):
 
                 if 'javascript:' in link:
                     dictionary_entry_list[n]["link"] = link
                 if fdb_domain not in link and ('http' or 'https' or 'www.') not in link:
-                    if link[-1] == '/':
-                        dictionary_entry_list[n]["link"] = fdb_domain + link
+                    if link[0] == '/':
+                        if fdb_domain[-1] != '/':
+                            dictionary_entry_list[n]["link"] = fdb_domain + link
+                        if fdb_domain[-1] == '/':
+                            dictionary_entry_list[n]["link"] = fdb_domain + link[1:]
+                    if link[0] == '.' and link[1] == '/':
+                        if fdb_domain[-1] != '/':
+                            dictionary_entry_list[n]["link"] = fdb_domain + link[1:]
+                        if fdb_domain[-1] == '/':
+                            dictionary_entry_list[n]["link"] = fdb_domain + link[2:]
+
                     else:
                         dictionary_entry_list[n]["link"] = fdb_domain + '/' + link
@@ -534,14 +545,17 @@ class fdb_spider(object):
 
                 driver.switch_to.window(window_before)
 
-            if ('http' or 'www') in entry_link and 'javascript' not in entry_link and '.pdf' not in entry_link:
+            if 'javascript' not in entry_link and '.pdf' not in entry_link:
+                #print('oi')
                 try:
                     # defining cookie to not end up in endless loop because of cookie banners pointing to redirects
                     url = entry_link
-                    req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0', 'Cookie':'myCookie=lovely'})
+                    req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0', 'Cookie':'myCookie=oioioioi'})
                     response = urllib.request.urlopen(req)
+                    #print('response from first one', response)
                 except Exception as e:
+                    print('cookie giving then downloading did not work, original error is:', e)
                     try:
                         response = urllib.request.urlopen(entry_link.encode('ascii', errors='xmlcharrefreplace').decode('ascii'))
                         print(
@@ -575,30 +589,38 @@ class fdb_spider(object):
                         f.write(response.content)
                         f.close
 
-                else:
-                    file_name = "spiders/pages/" + fdb + str(i) + "/" + str(entry_id) + ".html"
+
+
+
+                file_name = "spiders/pages/" + fdb + str(i) + "/" + str(entry_id) + ".html"
 
-                    if web_content == 'NONE':
-                        print('other downloading approaches did not work, trying requests')
+                if web_content == 'NONE':
+                    print('other downloading approaches did not work, trying requests')
 
-                        try:
-                            from requests_html import HTMLSession
-                            session = HTMLSession()
+                    try:
+                        from requests_html import HTMLSession
+                        session = HTMLSession()
 
-                            r = session.get(entry_link)
+                        r = session.get(entry_link)
 
-                            r.html.render()
-                            web_content = r.text
-
-                        except Exception as e:
-                            print('requests_html HTMLSession did not work')
-
-
-                os.makedirs(os.path.dirname(file_name), exist_ok=True)
-                f = open(file_name, "w+")
-                f.write(web_content)
-                f.close
+                        r.html.render()
+                        web_content = r.text
+
+                    except Exception as e:
+                        print('requests_html HTMLSession did not work trying wget, ori error is:', e)
+
+                        try:
+                            os.makedirs(os.path.dirname(file_name), exist_ok=True)
+                            oi = subprocess.run(["wget", entry_link, '--output-document=' + file_name])
+
+                        except subprocess.CalledProcessError:
+                            print('wget downloading did not work.. saving NONE to file now')
+
+                os.makedirs(os.path.dirname(file_name), exist_ok=True)
+                f = open(file_name, "w+")
+                f.write(web_content)
+                f.close
 
                 # save the entry_domain, implemented first for further downloads in javascript links
                 f = open("spiders/output/" + fdb + str(i) + "entryList.txt", "w+")
@@ -692,7 +714,7 @@ class fdb_spider(object):
                     pdf_link = entry_domain[:-1] + child[1:]
                 if entry_domain[-1] != '/':
                     for n in range(len(entry_domain)):
-                        if entry_domain[-1] != '/':
+                        if entry_domain[-n] != '/':
                             entry_domain = entry_domain[:-1]
                         else:
                             break
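Two follow-up notes with minimal sketches; the example values below are hypothetical, and only the variable names (fdb_domain, link, entry_link, file_name) mirror the patch.

The link-normalization hunk (@@ -417,8 +419,17 @@) joins root-relative ("/...") and dot-relative ("./...") links onto fdb_domain by hand, special-casing a trailing slash on the domain. Assuming fdb_domain carries a scheme, the standard library's urllib.parse.urljoin covers the same cases:

    from urllib.parse import urljoin

    fdb_domain = "https://example.org/"  # hypothetical base

    # root-relative link: no doubled or missing slash either way
    print(urljoin(fdb_domain, "/calls/call1.html"))   # https://example.org/calls/call1.html

    # dot-relative link
    print(urljoin(fdb_domain, "./calls/call1.html"))  # https://example.org/calls/call1.html

    # bare relative link, matching the else-branch
    print(urljoin(fdb_domain, "calls/call1.html"))    # https://example.org/calls/call1.html

In the wget fallback (@@ -575,30 +589,38 @@), note that subprocess.run raises subprocess.CalledProcessError only when called with check=True; as written, a failing wget just returns a nonzero returncode and the except branch never fires (a missing wget binary raises FileNotFoundError instead). A sketch of the checked variant:

    import subprocess

    entry_link = "https://example.org/calls/call1.html"  # hypothetical
    file_name = "spiders/pages/example0/1.html"          # hypothetical

    try:
        # check=True turns a nonzero wget exit status into CalledProcessError
        subprocess.run(["wget", entry_link, "--output-document=" + file_name], check=True)
    except subprocess.CalledProcessError as e:
        print("wget downloading did not work, exit status:", e.returncode)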