added pdf parser if entry link is direct pdf

parent 677e54c0c2
commit df4a8289b8

3 changed files with 102 additions and 70 deletions

main.py (2 changes)
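In outline, the commit adds a second code path keyed on the entry link: if the link points directly at a PDF, the file is fetched with requests and its text is extracted with pdfminer.six; otherwise the existing lxml/trafilatura HTML path runs. A minimal sketch of that dispatch (the helper names are illustrative, not from the commit):

    def handle_entry(entry_link):
        # dispatch on the link type, as the commit does with '.pdf' in entry_link
        if '.pdf' in entry_link:
            return parse_pdf(entry_link)    # new pdfminer.six path
        return parse_html(entry_link)       # existing lxml/trafilatura path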
main.py

@@ -15,7 +15,7 @@ spider = fdb_spider(config)

 #spider.parse_entry_list_data2dictionary(list_of_fdbs)

-# spider.download_entry_data_htmls(list_of_fdbs)
+#spider.download_entry_data_htmls(list_of_fdbs)

 spider.parse_entry_data2dictionary(list_of_fdbs)
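The only change here is whitespace inside a commented-out line; the active stage remains parse_entry_data2dictionary. For reference, the full pipeline that main.py toggles via comments, in the order the calls appear in the file (whether this is the intended run order is an assumption):

    spider = fdb_spider(config)
    spider.parse_entry_list_data2dictionary(list_of_fdbs)   # build the entry list
    spider.download_entry_data_htmls(list_of_fdbs)          # fetch each entry page (or PDF)
    spider.parse_entry_data2dictionary(list_of_fdbs)        # extract text per entry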
Binary file not shown.
@@ -10,8 +10,12 @@ import lxml.html

 import lxml.html.soupparser
 from lxml import html

+import requests
+
 from trafilatura import extract

+from pdfminer.high_level import extract_pages
+from pdfminer.layout import LTTextContainer

 class fdb_spider(object):

     def __init__(self, config_file):
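The new imports bring in two dependencies: requests for fetching PDF bytes and pdfminer.six for layout-based text extraction. A minimal, self-contained sketch of how these pdfminer calls fit together ("sample.pdf" is a placeholder path, not from the commit):

    from pdfminer.high_level import extract_pages
    from pdfminer.layout import LTTextContainer

    text = ''
    for page_layout in extract_pages("sample.pdf"):   # yields one layout object per page
        for element in page_layout:                   # iterate layout elements on the page
            if isinstance(element, LTTextContainer):  # keep only text boxes
                text += element.get_text()
    print(text)

pdfminer.high_level also offers a one-call extract_text(), which could replace the loop when no per-element filtering is needed.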
@@ -357,12 +361,22 @@ class fdb_spider(object):

             # save interim results to files

-            file_name = "spiders/pages/" + fdb + str(i) + "/" + str(entry_id) + ".html"
-            os.makedirs(os.path.dirname(file_name), exist_ok=True)
-            f = open(file_name, "w+")
-            f.write(web_content)
-            f.close
+            if '.pdf' in entry_link:
+
+                file_name = "spiders/pages/" + fdb + str(i) + "/" + str(entry_id) + ".html"
+                response = requests.get(entry_link)
+                os.makedirs(os.path.dirname(file_name), exist_ok=True)
+                f = open(file_name, "bw")
+                f.write(response.content)
+                f.close
+
+            else:
+
+                file_name = "spiders/pages/" + fdb + str(i) + "/" + str(entry_id) + ".html"
+                os.makedirs(os.path.dirname(file_name), exist_ok=True)
+                f = open(file_name, "w+")
+                f.write(web_content)
+                f.close

     def parse_entry_data2dictionary(self, list_of_fdbs):

         for fdb in list_of_fdbs:
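Two caveats in this hunk, carried over faithfully above: f.close without parentheses only references the method and never closes the file, and the downloaded PDF bytes are written under an .html extension. A hedged rewrite of the same branch with context managers (save_entry is an illustrative helper, not from the commit):

    import os
    import requests

    def save_entry(entry_link, file_name, web_content):
        os.makedirs(os.path.dirname(file_name), exist_ok=True)
        if '.pdf' in entry_link:
            response = requests.get(entry_link)
            with open(file_name, "wb") as f:   # binary mode; closed automatically
                f.write(response.content)
        else:
            with open(file_name, "w") as f:    # text mode for the fetched HTML
                f.write(web_content)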
@@ -441,74 +455,92 @@ class fdb_spider(object):

                 fdb_conf_entry_unifalse_wordlist = fdb_conf_entry_unifalse.get("wordlist")

-                p_text = tree.xpath(
-                    "//p//text()"
-                )
-                div_text = tree.xpath(
-                    "//div//text()"
-                )
-
-                #print("oi", text)
-                generaltext = ''
-                for n in range(len(p_text)):
-                    if len(p_text[n]) > 0:
-                        generaltext += p_text[n] + ' '
-
-                for n in range(len(div_text)):
-                    if len(div_text[n]) > 0 and div_text[n] not in p_text:
-                        generaltext += div_text[n] + ' '
-
-                generaltextlist = generaltext.split(' ')
-                if len(generaltextlist) > 5000:
-                    print('text over 1000 words for entry id', entry_id, ' number of words:', len(generaltextlist))
-
-                    file_name = "spiders/pages/" + fdb + str(i) + "/" + str(entry_id) + ".html"
-
-                    try:
-                        with open(file_name , 'r', encoding='utf-8') as file:
-                            html_content = file.read()
-                    except Exception as e:
-                        with open(file_name , 'r', encoding='latin-1') as file:
-                            html_content = file.read()
-                        print('encoding utf8 in opening with trafilatura did not work, trying latin1, original error message is:', e)
-
-                    generaltext = extract(html_content)
-                    print('generaltext word count was: ', len(generaltextlist), 'but now trafilatura did the job and new wordcount is:', len(generaltext.split(' ')))
-
-                if len(generaltextlist) < 2:
-                    print('no text parsed, the wc is', len(generaltextlist))
-                    print('text under 2 words for entry id', entry_id, ' number of words:', len(generaltextlist))
-
-                    file_name = "spiders/pages/" + fdb + str(i) + "/" + str(entry_id) + ".html"
-
-                    try:
-                        with open(file_name , 'r', encoding='utf-8') as file:
-                            html_content = file.read()
-                    except Exception as e:
-                        with open(file_name , 'r', encoding='latin-1') as file:
-                            html_content = file.read()
-                        print('encoding utf8 in opening with trafilatura did not work, trying latin1, original error message is:', e)
-
-                    generaltext = extract(html_content)
-                    try:
-                        if len(generaltext) > 2:
-                            print('generaltext word count was: ', len(generaltextlist), 'but now trafilatura did the job and new wordcount is:', len(generaltext.split(' ')))
-                    except:
-                        print('trafilatura got this out:', generaltext , 'setting generaltext to NONE')
-                        generaltext = 'NONE'
+                if '.pdf' in dictionary_entry_list[entry_id]["link"]:
+
+                    print('parsing a pdf', dictionary_entry_list[entry_id]["link"], entry_id)
+
+                    try:
+                        generaltext = ''
+
+                        for page_layout in extract_pages(file_name):
+                            for element in page_layout:
+                                if isinstance(element, LTTextContainer):
+                                    generaltext += element.get_text()
+
+                    except Exception as e:
+                        generaltext = 'NONE'
+                        print('parsing pdf did not work, the original error is:', e )
+
+                else:
+
+                    p_text = tree.xpath(
+                        "//p//text()"
+                    )
+                    div_text = tree.xpath(
+                        "//div//text()"
+                    )
+
+                    #print("oi", text)
+                    generaltext = ''
+                    for n in range(len(p_text)):
+                        if len(p_text[n]) > 0:
+                            generaltext += p_text[n] + ' '
+
+                    for n in range(len(div_text)):
+                        if len(div_text[n]) > 0 and div_text[n] not in p_text:
+                            generaltext += div_text[n] + ' '
+
+                    generaltextlist = generaltext.split(' ')
+                    if len(generaltextlist) > 5000:
+                        print('text over 1000 words for entry id', entry_id, ' number of words:', len(generaltextlist))
+
+                        file_name = "spiders/pages/" + fdb + str(i) + "/" + str(entry_id) + ".html"
+
+                        try:
+                            with open(file_name , 'r', encoding='utf-8') as file:
+                                html_content = file.read()
+                        except Exception as e:
+                            with open(file_name , 'r', encoding='latin-1') as file:
+                                html_content = file.read()
+                            print('encoding utf8 in opening with trafilatura did not work, trying latin1, original error message is:', e)
+
+                        generaltext = extract(html_content)
+                        print('generaltext word count was: ', len(generaltextlist), 'but now trafilatura did the job and new wordcount is:', len(generaltext.split(' ')))
+
+                    if len(generaltextlist) < 2:
+                        print('no text parsed, the wc is', len(generaltextlist))
+                        print('text under 2 words for entry id', entry_id, ' number of words:', len(generaltextlist))
+
+                        file_name = "spiders/pages/" + fdb + str(i) + "/" + str(entry_id) + ".html"
+
+                        try:
+                            with open(file_name , 'r', encoding='utf-8') as file:
+                                html_content = file.read()
+                        except Exception as e:
+                            with open(file_name , 'r', encoding='latin-1') as file:
+                                html_content = file.read()
+                            print('encoding utf8 in opening with trafilatura did not work, trying latin1, original error message is:', e)
+
+                        generaltext = extract(html_content)
+                        try:
+                            if len(generaltext) > 2:
+                                print('generaltext word count was: ', len(generaltextlist), 'but now trafilatura did the job and new wordcount is:', len(generaltext.split(' ')))
+                        except:
+                            print('trafilatura got this out:', generaltext , 'setting generaltext to NONE')
+                            generaltext = 'NONE'

                 dictionary_entry_list[entry_id]["text"] = generaltext
                 dictionary_entry_list[entry_id]["text-word-count"] = len(generaltextlist)
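Two gaps in the new PDF branch are worth flagging: generaltextlist is only assigned in the HTML else-branch, so the "text-word-count" line after the branch reuses a stale or undefined value for PDFs, and extract_pages(file_name) relies on file_name already pointing at the saved file (which holds PDF bytes despite its .html extension). A hedged sketch that closes both gaps (parse_pdf_entry is an illustrative helper, not from the commit):

    from pdfminer.high_level import extract_pages
    from pdfminer.layout import LTTextContainer

    def parse_pdf_entry(file_name):
        # concatenate the text of every text container on every page
        generaltext = ''
        try:
            for page_layout in extract_pages(file_name):
                for element in page_layout:
                    if isinstance(element, LTTextContainer):
                        generaltext += element.get_text()
        except Exception as e:
            print('parsing pdf did not work, the original error is:', e)
            generaltext = 'NONE'
        # keep the word-count bookkeeping consistent with the HTML branch
        generaltextlist = generaltext.split(' ')
        return generaltext, generaltextlist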