|
|
@ -10,8 +10,12 @@ import lxml.html |
|
|
|
import lxml.html.soupparser |
|
|
|
from lxml import html |
|
|
|
|
|
|
|
import requests |
|
|
|
|
|
|
|
from trafilatura import extract |
|
|
|
|
|
|
|
from pdfminer.high_level import extract_pages |
|
|
|
from pdfminer.layout import LTTextContainer |
|
|
|
|
|
|
|
class fdb_spider(object): |
|
|
|
def __init__(self, config_file): |
|
|
@ -356,13 +360,23 @@ class fdb_spider(object): |
|
|
|
print(ex) |
|
|
|
|
|
|
|
# save interim results to files |
|
|
|
|
|
|
|
if '.pdf' in entry_link: |
|
|
|
|
|
|
|
file_name = "spiders/pages/" + fdb + str(i) + "/" + str(entry_id) + ".html" |
|
|
|
response = requests.get(entry_link) |
|
|
|
os.makedirs(os.path.dirname(file_name), exist_ok=True) |
|
|
|
f = open(file_name, "bw") |
|
|
|
f.write(response.content) |
|
|
|
f.close |
|
|
|
|
|
|
|
else: |
|
|
|
file_name = "spiders/pages/" + fdb + str(i) + "/" + str(entry_id) + ".html" |
|
|
|
|
|
|
|
file_name = "spiders/pages/" + fdb + str(i) + "/" + str(entry_id) + ".html" |
|
|
|
|
|
|
|
os.makedirs(os.path.dirname(file_name), exist_ok=True) |
|
|
|
f = open(file_name, "w+") |
|
|
|
f.write(web_content) |
|
|
|
f.close |
|
|
|
os.makedirs(os.path.dirname(file_name), exist_ok=True) |
|
|
|
f = open(file_name, "w+") |
|
|
|
f.write(web_content) |
|
|
|
f.close |
|
|
|
|
|
|
|
def parse_entry_data2dictionary(self, list_of_fdbs): |
|
|
|
for fdb in list_of_fdbs: |
|
|
@ -441,74 +455,92 @@ class fdb_spider(object): |
|
|
|
fdb_conf_entry_unifalse_wordlist = fdb_conf_entry_unifalse.get("wordlist") |
|
|
|
|
|
|
|
|
|
|
|
if '.pdf' in dictionary_entry_list[entry_id]["link"]: |
|
|
|
|
|
|
|
print('parsing a pdf', dictionary_entry_list[entry_id]["link"], entry_id) |
|
|
|
|
|
|
|
try: |
|
|
|
|
|
|
|
|
|
|
|
p_text = tree.xpath( |
|
|
|
"//p//text()" |
|
|
|
) |
|
|
|
|
|
|
|
div_text = tree.xpath( |
|
|
|
"//div//text()" |
|
|
|
) |
|
|
|
|
|
|
|
|
|
|
|
#print("oi", text) |
|
|
|
generaltext = '' |
|
|
|
for n in range(len(p_text)): |
|
|
|
|
|
|
|
if len(p_text[n]) > 0: |
|
|
|
generaltext += p_text[n] + ' ' |
|
|
|
|
|
|
|
for n in range(len(div_text)): |
|
|
|
|
|
|
|
if len(div_text[n]) > 0 and div_text[n] not in p_text: |
|
|
|
generaltext += div_text[n] + ' ' |
|
|
|
generaltext = '' |
|
|
|
|
|
|
|
|
|
|
|
generaltextlist = generaltext.split(' ') |
|
|
|
if len(generaltextlist) > 5000: |
|
|
|
print('text over 1000 words for entry id', entry_id, ' number of words:', len(generaltextlist)) |
|
|
|
|
|
|
|
file_name = "spiders/pages/" + fdb + str(i) + "/" + str(entry_id) + ".html" |
|
|
|
for page_layout in extract_pages(file_name): |
|
|
|
for element in page_layout: |
|
|
|
if isinstance(element, LTTextContainer): |
|
|
|
generaltext += element.get_text() |
|
|
|
|
|
|
|
try: |
|
|
|
with open(file_name , 'r', encoding='utf-8') as file: |
|
|
|
html_content = file.read() |
|
|
|
except Exception as e: |
|
|
|
generaltext = 'NONE' |
|
|
|
print('parsing pdf did not work, the original error is:', e ) |
|
|
|
|
|
|
|
with open(file_name , 'r', encoding='latin-1') as file: |
|
|
|
html_content = file.read() |
|
|
|
print('encoding utf8 in opening with trafilatura did not work, trying latin1, original error message is:', e) |
|
|
|
|
|
|
|
generaltext = extract(html_content) |
|
|
|
print('generaltext word count was: ', len(generaltextlist), 'but now trafilatura did the job and new wordcount is:', len(generaltext.split(' '))) |
|
|
|
else: |
|
|
|
|
|
|
|
p_text = tree.xpath( |
|
|
|
"//p//text()" |
|
|
|
) |
|
|
|
|
|
|
|
if len(generaltextlist) < 2: |
|
|
|
print('no text parsed, the wc is', len(generaltextlist)) |
|
|
|
div_text = tree.xpath( |
|
|
|
"//div//text()" |
|
|
|
) |
|
|
|
|
|
|
|
print('text under 2 words for entry id', entry_id, ' number of words:', len(generaltextlist)) |
|
|
|
|
|
|
|
file_name = "spiders/pages/" + fdb + str(i) + "/" + str(entry_id) + ".html" |
|
|
|
#print("oi", text) |
|
|
|
generaltext = '' |
|
|
|
for n in range(len(p_text)): |
|
|
|
|
|
|
|
if len(p_text[n]) > 0: |
|
|
|
generaltext += p_text[n] + ' ' |
|
|
|
|
|
|
|
try: |
|
|
|
with open(file_name , 'r', encoding='utf-8') as file: |
|
|
|
html_content = file.read() |
|
|
|
except Exception as e: |
|
|
|
for n in range(len(div_text)): |
|
|
|
|
|
|
|
with open(file_name , 'r', encoding='latin-1') as file: |
|
|
|
html_content = file.read() |
|
|
|
print('encoding utf8 in opening with trafilatura did not work, trying latin1, original error message is:', e) |
|
|
|
if len(div_text[n]) > 0 and div_text[n] not in p_text: |
|
|
|
generaltext += div_text[n] + ' ' |
|
|
|
|
|
|
|
generaltext = extract(html_content) |
|
|
|
try: |
|
|
|
if len(generaltext) > 2: |
|
|
|
print('generaltext word count was: ', len(generaltextlist), 'but now trafilatura did the job and new wordcount is:', len(generaltext.split(' '))) |
|
|
|
except: |
|
|
|
|
|
|
|
generaltextlist = generaltext.split(' ') |
|
|
|
if len(generaltextlist) > 5000: |
|
|
|
print('text over 1000 words for entry id', entry_id, ' number of words:', len(generaltextlist)) |
|
|
|
|
|
|
|
print('trafilatura got this out:', generaltext , 'setting generaltext to NONE') |
|
|
|
generaltext = 'NONE' |
|
|
|
file_name = "spiders/pages/" + fdb + str(i) + "/" + str(entry_id) + ".html" |
|
|
|
|
|
|
|
try: |
|
|
|
with open(file_name , 'r', encoding='utf-8') as file: |
|
|
|
html_content = file.read() |
|
|
|
except Exception as e: |
|
|
|
|
|
|
|
with open(file_name , 'r', encoding='latin-1') as file: |
|
|
|
html_content = file.read() |
|
|
|
print('encoding utf8 in opening with trafilatura did not work, trying latin1, original error message is:', e) |
|
|
|
|
|
|
|
generaltext = extract(html_content) |
|
|
|
print('generaltext word count was: ', len(generaltextlist), 'but now trafilatura did the job and new wordcount is:', len(generaltext.split(' '))) |
|
|
|
|
|
|
|
if len(generaltextlist) < 2: |
|
|
|
|
|
|
|
|
|
|
|
print('no text parsed, the wc is', len(generaltextlist)) |
|
|
|
|
|
|
|
print('text under 2 words for entry id', entry_id, ' number of words:', len(generaltextlist)) |
|
|
|
|
|
|
|
file_name = "spiders/pages/" + fdb + str(i) + "/" + str(entry_id) + ".html" |
|
|
|
|
|
|
|
try: |
|
|
|
with open(file_name , 'r', encoding='utf-8') as file: |
|
|
|
html_content = file.read() |
|
|
|
except Exception as e: |
|
|
|
|
|
|
|
with open(file_name , 'r', encoding='latin-1') as file: |
|
|
|
html_content = file.read() |
|
|
|
print('encoding utf8 in opening with trafilatura did not work, trying latin1, original error message is:', e) |
|
|
|
|
|
|
|
generaltext = extract(html_content) |
|
|
|
try: |
|
|
|
if len(generaltext) > 2: |
|
|
|
print('generaltext word count was: ', len(generaltextlist), 'but now trafilatura did the job and new wordcount is:', len(generaltext.split(' '))) |
|
|
|
except: |
|
|
|
|
|
|
|
print('trafilatura got this out:', generaltext , 'setting generaltext to NONE') |
|
|
|
generaltext = 'NONE' |
|
|
|
|
|
|
|
dictionary_entry_list[entry_id]["text"] = generaltext |
|
|
|
dictionary_entry_list[entry_id]["text-word-count"] = len(generaltextlist) |
|
|
|