Browse Source

Added a PDF parser for entries whose link points directly to a PDF file

onlinkgen
alpcentaur 1 year ago
parent
commit
df4a8289b8
3 changed files with 92 additions and 60 deletions
  1. +1
    -1
      main.py
  2. BIN
      spiders/__pycache__/fdb_spider.cpython-39.pyc
  3. +91
    -59
      spiders/fdb_spider.py

+ 1
- 1
main.py View File

@ -15,7 +15,7 @@ spider = fdb_spider(config)
#spider.parse_entry_list_data2dictionary(list_of_fdbs) #spider.parse_entry_list_data2dictionary(list_of_fdbs)
# spider.download_entry_data_htmls(list_of_fdbs)
#spider.download_entry_data_htmls(list_of_fdbs)
spider.parse_entry_data2dictionary(list_of_fdbs) spider.parse_entry_data2dictionary(list_of_fdbs)

BIN
spiders/__pycache__/fdb_spider.cpython-39.pyc View File


+ 91
- 59
spiders/fdb_spider.py View File

@ -10,8 +10,12 @@ import lxml.html
import lxml.html.soupparser import lxml.html.soupparser
from lxml import html from lxml import html
import requests
from trafilatura import extract from trafilatura import extract
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextContainer
class fdb_spider(object): class fdb_spider(object):
def __init__(self, config_file): def __init__(self, config_file):
@ -356,13 +360,23 @@ class fdb_spider(object):
print(ex) print(ex)
# save interim results to files # save interim results to files
if '.pdf' in entry_link:
file_name = "spiders/pages/" + fdb + str(i) + "/" + str(entry_id) + ".html"
response = requests.get(entry_link)
os.makedirs(os.path.dirname(file_name), exist_ok=True)
f = open(file_name, "bw")
f.write(response.content)
f.close
else:
file_name = "spiders/pages/" + fdb + str(i) + "/" + str(entry_id) + ".html"
file_name = "spiders/pages/" + fdb + str(i) + "/" + str(entry_id) + ".html"
os.makedirs(os.path.dirname(file_name), exist_ok=True)
f = open(file_name, "w+")
f.write(web_content)
f.close
os.makedirs(os.path.dirname(file_name), exist_ok=True)
f = open(file_name, "w+")
f.write(web_content)
f.close
def parse_entry_data2dictionary(self, list_of_fdbs): def parse_entry_data2dictionary(self, list_of_fdbs):
for fdb in list_of_fdbs: for fdb in list_of_fdbs:
@ -441,74 +455,92 @@ class fdb_spider(object):
fdb_conf_entry_unifalse_wordlist = fdb_conf_entry_unifalse.get("wordlist") fdb_conf_entry_unifalse_wordlist = fdb_conf_entry_unifalse.get("wordlist")
if '.pdf' in dictionary_entry_list[entry_id]["link"]:
print('parsing a pdf', dictionary_entry_list[entry_id]["link"], entry_id)
try:
p_text = tree.xpath(
"//p//text()"
)
div_text = tree.xpath(
"//div//text()"
)
#print("oi", text)
generaltext = ''
for n in range(len(p_text)):
if len(p_text[n]) > 0:
generaltext += p_text[n] + ' '
for n in range(len(div_text)):
if len(div_text[n]) > 0 and div_text[n] not in p_text:
generaltext += div_text[n] + ' '
generaltext = ''
generaltextlist = generaltext.split(' ')
if len(generaltextlist) > 5000:
print('text over 1000 words for entry id', entry_id, ' number of words:', len(generaltextlist))
file_name = "spiders/pages/" + fdb + str(i) + "/" + str(entry_id) + ".html"
for page_layout in extract_pages(file_name):
for element in page_layout:
if isinstance(element, LTTextContainer):
generaltext += element.get_text()
try:
with open(file_name , 'r', encoding='utf-8') as file:
html_content = file.read()
except Exception as e: except Exception as e:
generaltext = 'NONE'
print('parsing pdf did not work, the original error is:', e )
with open(file_name , 'r', encoding='latin-1') as file:
html_content = file.read()
print('encoding utf8 in opening with trafilatura did not work, trying latin1, original error message is:', e)
generaltext = extract(html_content)
print('generaltext word count was: ', len(generaltextlist), 'but now trafilatura did the job and new wordcount is:', len(generaltext.split(' ')))
else:
p_text = tree.xpath(
"//p//text()"
)
if len(generaltextlist) < 2:
print('no text parsed, the wc is', len(generaltextlist))
div_text = tree.xpath(
"//div//text()"
)
print('text under 2 words for entry id', entry_id, ' number of words:', len(generaltextlist))
file_name = "spiders/pages/" + fdb + str(i) + "/" + str(entry_id) + ".html"
#print("oi", text)
generaltext = ''
for n in range(len(p_text)):
if len(p_text[n]) > 0:
generaltext += p_text[n] + ' '
try:
with open(file_name , 'r', encoding='utf-8') as file:
html_content = file.read()
except Exception as e:
for n in range(len(div_text)):
with open(file_name , 'r', encoding='latin-1') as file:
html_content = file.read()
print('encoding utf8 in opening with trafilatura did not work, trying latin1, original error message is:', e)
if len(div_text[n]) > 0 and div_text[n] not in p_text:
generaltext += div_text[n] + ' '
generaltext = extract(html_content)
try:
if len(generaltext) > 2:
print('generaltext word count was: ', len(generaltextlist), 'but now trafilatura did the job and new wordcount is:', len(generaltext.split(' ')))
except:
generaltextlist = generaltext.split(' ')
if len(generaltextlist) > 5000:
print('text over 1000 words for entry id', entry_id, ' number of words:', len(generaltextlist))
print('trafilatura got this out:', generaltext , 'setting generaltext to NONE')
generaltext = 'NONE'
file_name = "spiders/pages/" + fdb + str(i) + "/" + str(entry_id) + ".html"
try:
with open(file_name , 'r', encoding='utf-8') as file:
html_content = file.read()
except Exception as e:
with open(file_name , 'r', encoding='latin-1') as file:
html_content = file.read()
print('encoding utf8 in opening with trafilatura did not work, trying latin1, original error message is:', e)
generaltext = extract(html_content)
print('generaltext word count was: ', len(generaltextlist), 'but now trafilatura did the job and new wordcount is:', len(generaltext.split(' ')))
if len(generaltextlist) < 2:
print('no text parsed, the wc is', len(generaltextlist))
print('text under 2 words for entry id', entry_id, ' number of words:', len(generaltextlist))
file_name = "spiders/pages/" + fdb + str(i) + "/" + str(entry_id) + ".html"
try:
with open(file_name , 'r', encoding='utf-8') as file:
html_content = file.read()
except Exception as e:
with open(file_name , 'r', encoding='latin-1') as file:
html_content = file.read()
print('encoding utf8 in opening with trafilatura did not work, trying latin1, original error message is:', e)
generaltext = extract(html_content)
try:
if len(generaltext) > 2:
print('generaltext word count was: ', len(generaltextlist), 'but now trafilatura did the job and new wordcount is:', len(generaltext.split(' ')))
except:
print('trafilatura got this out:', generaltext , 'setting generaltext to NONE')
generaltext = 'NONE'
dictionary_entry_list[entry_id]["text"] = generaltext dictionary_entry_list[entry_id]["text"] = generaltext
dictionary_entry_list[entry_id]["text-word-count"] = len(generaltextlist) dictionary_entry_list[entry_id]["text-word-count"] = len(generaltextlist)

Loading…
Cancel
Save