diff --git a/main.py b/main.py
index f1f9f17..15dcd94 100644
--- a/main.py
+++ b/main.py
@@ -15,7 +15,7 @@ spider = fdb_spider(config)
 
 #spider.parse_entry_list_data2dictionary(list_of_fdbs)
 
-# spider.download_entry_data_htmls(list_of_fdbs)
+#spider.download_entry_data_htmls(list_of_fdbs)
 
 spider.parse_entry_data2dictionary(list_of_fdbs)
 
diff --git a/spiders/__pycache__/fdb_spider.cpython-39.pyc b/spiders/__pycache__/fdb_spider.cpython-39.pyc
index d093fac..767558c 100644
Binary files a/spiders/__pycache__/fdb_spider.cpython-39.pyc and b/spiders/__pycache__/fdb_spider.cpython-39.pyc differ
diff --git a/spiders/fdb_spider.py b/spiders/fdb_spider.py
index 5da1a6d..4f97c90 100644
--- a/spiders/fdb_spider.py
+++ b/spiders/fdb_spider.py
@@ -10,8 +10,12 @@
 import lxml.html
 import lxml.html.soupparser
 from lxml import html
 
+import requests
+
 from trafilatura import extract
+from pdfminer.high_level import extract_pages
+from pdfminer.layout import LTTextContainer
 
 class fdb_spider(object):
     def __init__(self, config_file):
@@ -356,13 +360,23 @@
                 print(ex)
 
             # save interim results to files
+
+            if '.pdf' in entry_link:
+
+                file_name = "spiders/pages/" + fdb + str(i) + "/" + str(entry_id) + ".html"
+                response = requests.get(entry_link)
+                os.makedirs(os.path.dirname(file_name), exist_ok=True)
+                f = open(file_name, "bw")
+                f.write(response.content)
+                f.close
+
+            else:
+                file_name = "spiders/pages/" + fdb + str(i) + "/" + str(entry_id) + ".html"
 
-            file_name = "spiders/pages/" + fdb + str(i) + "/" + str(entry_id) + ".html"
-
-            os.makedirs(os.path.dirname(file_name), exist_ok=True)
-            f = open(file_name, "w+")
-            f.write(web_content)
-            f.close
+                os.makedirs(os.path.dirname(file_name), exist_ok=True)
+                f = open(file_name, "w+")
+                f.write(web_content)
+                f.close
 
     def parse_entry_data2dictionary(self, list_of_fdbs):
         for fdb in list_of_fdbs:
@@ -441,74 +455,92 @@
 
                 fdb_conf_entry_unifalse_wordlist = fdb_conf_entry_unifalse.get("wordlist")
 
+                if '.pdf' in dictionary_entry_list[entry_id]["link"]:
+                    print('parsing a pdf', dictionary_entry_list[entry_id]["link"], entry_id)
+                    try:
-
-                p_text = tree.xpath(
-                    "//p//text()"
-                )
-
-                div_text = tree.xpath(
-                    "//div//text()"
-                )
-
-
-                #print("oi", text)
-                generaltext = ''
-                for n in range(len(p_text)):
-
-                    if len(p_text[n]) > 0:
-                        generaltext += p_text[n] + ' '
-
-                for n in range(len(div_text)):
-
-                    if len(div_text[n]) > 0 and div_text[n] not in p_text:
-                        generaltext += div_text[n] + ' '
+                        generaltext = ''
-
-                generaltextlist = generaltext.split(' ')
-                if len(generaltextlist) > 5000:
-                    print('text over 1000 words for entry id', entry_id, ' number of words:', len(generaltextlist))
-
-                    file_name = "spiders/pages/" + fdb + str(i) + "/" + str(entry_id) + ".html"
+                        for page_layout in extract_pages(file_name):
+                            for element in page_layout:
+                                if isinstance(element, LTTextContainer):
+                                    generaltext += element.get_text()
-                    try:
-                        with open(file_name , 'r', encoding='utf-8') as file:
-                            html_content = file.read()
                     except Exception as e:
+                        generaltext = 'NONE'
+                        print('parsing pdf did not work, the original error is:', e )
-                        with open(file_name , 'r', encoding='latin-1') as file:
-                            html_content = file.read()
-                        print('encoding utf8 in opening with trafilatura did not work, trying latin1, original error message is:', e)
-                    generaltext = extract(html_content)
-                    print('generaltext word count was: ', len(generaltextlist), 'but now trafilatura did the job and new wordcount is:', len(generaltext.split(' ')))
+                else:
+
+                    p_text = tree.xpath(
+                        "//p//text()"
+                    )
-                if len(generaltextlist) < 2:
-                    print('no text parsed, the wc is', len(generaltextlist))
+                    div_text = tree.xpath(
+                        "//div//text()"
+                    )
-                    print('text under 2 words for entry id', entry_id, ' number of words:', len(generaltextlist))
-                    file_name = "spiders/pages/" + fdb + str(i) + "/" + str(entry_id) + ".html"
+                    #print("oi", text)
+                    generaltext = ''
+                    for n in range(len(p_text)):
+
+                        if len(p_text[n]) > 0:
+                            generaltext += p_text[n] + ' '
-                    try:
-                        with open(file_name , 'r', encoding='utf-8') as file:
-                            html_content = file.read()
-                    except Exception as e:
+                    for n in range(len(div_text)):
-                        with open(file_name , 'r', encoding='latin-1') as file:
-                            html_content = file.read()
-                        print('encoding utf8 in opening with trafilatura did not work, trying latin1, original error message is:', e)
+                        if len(div_text[n]) > 0 and div_text[n] not in p_text:
+                            generaltext += div_text[n] + ' '
-                    generaltext = extract(html_content)
-                    try:
-                        if len(generaltext) > 2:
-                            print('generaltext word count was: ', len(generaltextlist), 'but now trafilatura did the job and new wordcount is:', len(generaltext.split(' ')))
-                    except:
+
+                    generaltextlist = generaltext.split(' ')
+                    if len(generaltextlist) > 5000:
+                        print('text over 1000 words for entry id', entry_id, ' number of words:', len(generaltextlist))
-                        print('trafilatura got this out:', generaltext , 'setting generaltext to NONE')
-                        generaltext = 'NONE'
+                        file_name = "spiders/pages/" + fdb + str(i) + "/" + str(entry_id) + ".html"
+
+                        try:
+                            with open(file_name , 'r', encoding='utf-8') as file:
+                                html_content = file.read()
+                        except Exception as e:
+
+                            with open(file_name , 'r', encoding='latin-1') as file:
+                                html_content = file.read()
+                            print('encoding utf8 in opening with trafilatura did not work, trying latin1, original error message is:', e)
+
+                        generaltext = extract(html_content)
+                        print('generaltext word count was: ', len(generaltextlist), 'but now trafilatura did the job and new wordcount is:', len(generaltext.split(' ')))
+
+                    if len(generaltextlist) < 2:
+
+
+                        print('no text parsed, the wc is', len(generaltextlist))
+
+                        print('text under 2 words for entry id', entry_id, ' number of words:', len(generaltextlist))
+
+                        file_name = "spiders/pages/" + fdb + str(i) + "/" + str(entry_id) + ".html"
+
+                        try:
+                            with open(file_name , 'r', encoding='utf-8') as file:
+                                html_content = file.read()
+                        except Exception as e:
+
+                            with open(file_name , 'r', encoding='latin-1') as file:
+                                html_content = file.read()
+                            print('encoding utf8 in opening with trafilatura did not work, trying latin1, original error message is:', e)
+
+                        generaltext = extract(html_content)
+                        try:
+                            if len(generaltext) > 2:
+                                print('generaltext word count was: ', len(generaltextlist), 'but now trafilatura did the job and new wordcount is:', len(generaltext.split(' ')))
+                        except:
+
+                            print('trafilatura got this out:', generaltext , 'setting generaltext to NONE')
+                            generaltext = 'NONE'
 
                 dictionary_entry_list[entry_id]["text"] = generaltext
                 dictionary_entry_list[entry_id]["text-word-count"] = len(generaltextlist)