Added a PDF parser for entries whose link points directly to a PDF

This commit is contained in:
alpcentaur 2023-11-22 17:03:15 +00:00
parent 677e54c0c2
commit df4a8289b8
3 changed files with 102 additions and 70 deletions

View file

@ -10,8 +10,12 @@ import lxml.html
import lxml.html.soupparser import lxml.html.soupparser
from lxml import html from lxml import html
import requests
from trafilatura import extract from trafilatura import extract
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextContainer
class fdb_spider(object): class fdb_spider(object):
def __init__(self, config_file): def __init__(self, config_file):
@ -357,6 +361,16 @@ class fdb_spider(object):
# save interim results to files # save interim results to files
if '.pdf' in entry_link:
file_name = "spiders/pages/" + fdb + str(i) + "/" + str(entry_id) + ".html"
response = requests.get(entry_link)
os.makedirs(os.path.dirname(file_name), exist_ok=True)
f = open(file_name, "bw")
f.write(response.content)
f.close
else:
file_name = "spiders/pages/" + fdb + str(i) + "/" + str(entry_id) + ".html" file_name = "spiders/pages/" + fdb + str(i) + "/" + str(entry_id) + ".html"
os.makedirs(os.path.dirname(file_name), exist_ok=True) os.makedirs(os.path.dirname(file_name), exist_ok=True)
@ -441,9 +455,25 @@ class fdb_spider(object):
fdb_conf_entry_unifalse_wordlist = fdb_conf_entry_unifalse.get("wordlist") fdb_conf_entry_unifalse_wordlist = fdb_conf_entry_unifalse.get("wordlist")
if '.pdf' in dictionary_entry_list[entry_id]["link"]:
print('parsing a pdf', dictionary_entry_list[entry_id]["link"], entry_id)
try:
generaltext = ''
for page_layout in extract_pages(file_name):
for element in page_layout:
if isinstance(element, LTTextContainer):
generaltext += element.get_text()
except Exception as e:
generaltext = 'NONE'
print('parsing pdf did not work, the original error is:', e )
else:
p_text = tree.xpath( p_text = tree.xpath(
"//p//text()" "//p//text()"
@ -486,6 +516,8 @@ class fdb_spider(object):
print('generaltext word count was: ', len(generaltextlist), 'but now trafilatura did the job and new wordcount is:', len(generaltext.split(' '))) print('generaltext word count was: ', len(generaltextlist), 'but now trafilatura did the job and new wordcount is:', len(generaltext.split(' ')))
if len(generaltextlist) < 2: if len(generaltextlist) < 2:
print('no text parsed, the wc is', len(generaltextlist)) print('no text parsed, the wc is', len(generaltextlist))
print('text under 2 words for entry id', entry_id, ' number of words:', len(generaltextlist)) print('text under 2 words for entry id', entry_id, ' number of words:', len(generaltextlist))