added trafilatura exception
This commit is contained in:
parent
14ece9bceb
commit
d3335f203b
2 changed files with 69 additions and 13 deletions
Binary file not shown.
|
@ -10,6 +10,7 @@ import lxml.html
|
|||
import lxml.html.soupparser
|
||||
from lxml import html
|
||||
|
||||
from trafilatura import extract
|
||||
|
||||
|
||||
class fdb_spider(object):
|
||||
|
@ -215,8 +216,8 @@ class fdb_spider(object):
|
|||
fdb_conf_entry_list_child_period = fdb_conf_entry_list.get("child-period")
|
||||
|
||||
|
||||
print('blabliblub')
|
||||
print('len', len(tree.xpath(fdb_conf_entry_list_parent)))
|
||||
#print('blabliblub')
|
||||
#print('len', len(tree.xpath(fdb_conf_entry_list_parent)))
|
||||
for n in range(len(tree.xpath(fdb_conf_entry_list_parent))):
|
||||
|
||||
try:
|
||||
|
@ -253,7 +254,7 @@ class fdb_spider(object):
|
|||
+ "]"
|
||||
+ fdb_conf_entry_list_child_period
|
||||
)[0]
|
||||
print('period', period)
|
||||
#print('period', period)
|
||||
except Exception as e:
|
||||
print("period could not be parsed", e, period)
|
||||
period = 'NONE'
|
||||
|
@ -266,7 +267,7 @@ class fdb_spider(object):
|
|||
+ "]"
|
||||
+ fdb_conf_entry_list_child_link
|
||||
)[0]
|
||||
print('link', link)
|
||||
#print('link', link)
|
||||
|
||||
except Exception as e:
|
||||
print("link could not be parsed", e, link)
|
||||
|
@ -386,9 +387,9 @@ class fdb_spider(object):
|
|||
fdb_conf = self.config.get(fdb)
|
||||
fdb_domain = fdb_conf.get("domain")
|
||||
fdb_conf_entry = fdb_conf.get("entry")
|
||||
print('balubaluba', fdb_conf_entry)
|
||||
#print('balubaluba', fdb_conf_entry)
|
||||
fdb_conf_entry_general = fdb_conf_entry.get("general")
|
||||
print(fdb_conf_entry_general)
|
||||
#print(fdb_conf_entry_general)
|
||||
|
||||
|
||||
for entry_id in dictionary_entry_list:
|
||||
|
@ -424,7 +425,7 @@ class fdb_spider(object):
|
|||
fdb_conf_entry_unitrue_entry_child
|
||||
)
|
||||
|
||||
print("oi", child)
|
||||
#print("oi", child)
|
||||
|
||||
if len(child) > 0:
|
||||
dictionary_entry_list[entry_id][key] = child[
|
||||
|
@ -444,18 +445,73 @@ class fdb_spider(object):
|
|||
|
||||
|
||||
|
||||
text = tree.xpath(
|
||||
p_text = tree.xpath(
|
||||
"//p//text()"
|
||||
)
|
||||
|
||||
print("oi", text)
|
||||
generaltext = ''
|
||||
for n in range(len(text)):
|
||||
div_text = tree.xpath(
|
||||
"//div//text()"
|
||||
)
|
||||
|
||||
if len(text[n]) > 0:
|
||||
generaltext += text[n] + ' '
|
||||
|
||||
#print("oi", text)
|
||||
generaltext = ''
|
||||
for n in range(len(p_text)):
|
||||
|
||||
if len(p_text[n]) > 0:
|
||||
generaltext += p_text[n] + ' '
|
||||
|
||||
for n in range(len(div_text)):
|
||||
|
||||
if len(div_text[n]) > 0 and div_text[n] not in p_text:
|
||||
generaltext += div_text[n] + ' '
|
||||
|
||||
|
||||
generaltextlist = generaltext.split(' ')
|
||||
if len(generaltextlist) > 5000:
|
||||
print('text over 1000 words for entry id', entry_id, ' number of words:', len(generaltextlist))
|
||||
|
||||
file_name = "spiders/pages/" + fdb + str(i) + "/" + str(entry_id) + ".html"
|
||||
|
||||
try:
|
||||
with open(file_name , 'r', encoding='utf-8') as file:
|
||||
html_content = file.read()
|
||||
except Exception as e:
|
||||
|
||||
with open(file_name , 'r', encoding='latin-1') as file:
|
||||
html_content = file.read()
|
||||
print('encoding utf8 in opening with trafilatura did not work, trying latin1, original error message is:', e)
|
||||
|
||||
generaltext = extract(html_content)
|
||||
print('generaltext word count was: ', len(generaltextlist), 'but now trafilatura did the job and new wordcount is:', len(generaltext.split(' ')))
|
||||
|
||||
if len(generaltextlist) < 2:
|
||||
print('no text parsed, the wc is', len(generaltextlist))
|
||||
|
||||
print('text under 2 words for entry id', entry_id, ' number of words:', len(generaltextlist))
|
||||
|
||||
file_name = "spiders/pages/" + fdb + str(i) + "/" + str(entry_id) + ".html"
|
||||
|
||||
try:
|
||||
with open(file_name , 'r', encoding='utf-8') as file:
|
||||
html_content = file.read()
|
||||
except Exception as e:
|
||||
|
||||
with open(file_name , 'r', encoding='latin-1') as file:
|
||||
html_content = file.read()
|
||||
print('encoding utf8 in opening with trafilatura did not work, trying latin1, original error message is:', e)
|
||||
|
||||
generaltext = extract(html_content)
|
||||
try:
|
||||
if len(generaltext) > 2:
|
||||
print('generaltext word count was: ', len(generaltextlist), 'but now trafilatura did the job and new wordcount is:', len(generaltext.split(' ')))
|
||||
except:
|
||||
|
||||
print('trafilatura got this out:', generaltext , 'setting generaltext to NONE')
|
||||
generaltext = 'NONE'
|
||||
|
||||
dictionary_entry_list[entry_id]["text"] = generaltext
|
||||
dictionary_entry_list[entry_id]["text-word-count"] = len(generaltextlist)
|
||||
|
||||
|
||||
|
||||
|
|
Loading…
Reference in a new issue