@@ -10,6 +10,7 @@ import lxml.html
 import lxml.html.soupparser

 from lxml import html

+from trafilatura import extract

 class fdb_spider(object):
@@ -215,8 +216,8 @@ class fdb_spider(object):
                 fdb_conf_entry_list_child_period = fdb_conf_entry_list.get("child-period")

-                print('blabliblub')
-                print('len', len(tree.xpath(fdb_conf_entry_list_parent)))
+                #print('blabliblub')
+                #print('len', len(tree.xpath(fdb_conf_entry_list_parent)))

                 for n in range(len(tree.xpath(fdb_conf_entry_list_parent))):

                     try:
@@ -253,7 +254,7 @@ class fdb_spider(object):
                             + "]"
                             + fdb_conf_entry_list_child_period
                         )[0]
-                        print('period', period)
+                        #print('period', period)
                     except Exception as e:
                         print("period could not be parsed", e, period)
                         period = 'NONE'
@@ -266,7 +267,7 @@ class fdb_spider(object):
                             + "]"
                             + fdb_conf_entry_list_child_link
                         )[0]
-                        print('link', link)
+                        #print('link', link)

                     except Exception as e:
                         print("link could not be parsed", e, link)
@@ -386,9 +387,9 @@ class fdb_spider(object):
             fdb_conf = self.config.get(fdb)
             fdb_domain = fdb_conf.get("domain")
             fdb_conf_entry = fdb_conf.get("entry")
-            print('balubaluba', fdb_conf_entry)
+            #print('balubaluba', fdb_conf_entry)
             fdb_conf_entry_general = fdb_conf_entry.get("general")
-            print(fdb_conf_entry_general)
+            #print(fdb_conf_entry_general)


             for entry_id in dictionary_entry_list:
@@ -424,7 +425,7 @@ class fdb_spider(object):
                            fdb_conf_entry_unitrue_entry_child
                        )

-                       print("oi", child)
+                       #print("oi", child)

                       if len(child) > 0:
                           dictionary_entry_list[entry_id][key] = child[
@@ -444,18 +445,73 @@ class fdb_spider(object):

-                text = tree.xpath(
+                p_text = tree.xpath(
                     "//p//text()"
                 )

-                print("oi", text)
+                div_text = tree.xpath(
+                    "//div//text()"
+                )
+
+                #print("oi", text)

                 generaltext = ''

-                for n in range(len(text)):
-
-                    if len(text[n]) > 0:
-                        generaltext += text[n] + ' '
+                for n in range(len(p_text)):
+
+                    if len(p_text[n]) > 0:
+                        generaltext += p_text[n] + ' '
+
+                for n in range(len(div_text)):
+
+                    if len(div_text[n]) > 0 and div_text[n] not in p_text:
+                        generaltext += div_text[n] + ' '

                 generaltextlist = generaltext.split(' ')

+                if len(generaltextlist) > 5000:
+                    print('text over 5000 words for entry id', entry_id, ' number of words:', len(generaltextlist))
+
+                    file_name = "spiders/pages/" + fdb + str(i) + "/" + str(entry_id) + ".html"
+
+                    try:
+                        with open(file_name, 'r', encoding='utf-8') as file:
+                            html_content = file.read()
+                    except Exception as e:
+
+                        with open(file_name, 'r', encoding='latin-1') as file:
+                            html_content = file.read()
+                        print('encoding utf8 in opening with trafilatura did not work, trying latin1, original error message is:', e)
+
+                    generaltext = extract(html_content)
+                    print('generaltext word count was: ', len(generaltextlist), 'but now trafilatura did the job and new wordcount is:', len(generaltext.split(' ')))
+
+                if len(generaltextlist) < 2:
+                    print('no text parsed, the wc is', len(generaltextlist))
+
+                    print('text under 2 words for entry id', entry_id, ' number of words:', len(generaltextlist))
+
+                    file_name = "spiders/pages/" + fdb + str(i) + "/" + str(entry_id) + ".html"
+
+                    try:
+                        with open(file_name, 'r', encoding='utf-8') as file:
+                            html_content = file.read()
+                    except Exception as e:
+
+                        with open(file_name, 'r', encoding='latin-1') as file:
+                            html_content = file.read()
+                        print('encoding utf8 in opening with trafilatura did not work, trying latin1, original error message is:', e)
+
+                    generaltext = extract(html_content)
+
+                    try:
+                        if len(generaltext) > 2:
+                            print('generaltext word count was: ', len(generaltextlist), 'but now trafilatura did the job and new wordcount is:', len(generaltext.split(' ')))
+                    except:
+
+                        print('trafilatura got this out:', generaltext, 'setting generaltext to NONE')
+                        generaltext = 'NONE'
+
                 dictionary_entry_list[entry_id]["text"] = generaltext
                 dictionary_entry_list[entry_id]["text-word-count"] = len(generaltextlist)
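
Note on the indexed XPath pattern in the hunks at -253 and -266: the spider builds the expression as parent + "[" + str(n+1) + "]" + child, relying on XPath's 1-based positions to pick the n-th entry of the list. A small standalone illustration of that pattern; the <ul>/<li> markup and the class name are made up for the example, not taken from the spider's config:

    from lxml import html

    tree = html.fromstring("""
    <ul>
      <li><span class="period">01.01.2024 - 01.02.2024</span></li>
      <li><span class="period">01.03.2024 - 01.04.2024</span></li>
    </ul>
    """)

    parent = "//ul/li"
    child = "/span[@class='period']/text()"

    for n in range(len(tree.xpath(parent))):
        # XPath positions are 1-based, hence the n + 1
        period = tree.xpath(parent + "[" + str(n + 1) + "]" + child)[0]
        print(period)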
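
The last hunk repeats the same read-the-saved-page-and-run-trafilatura step in both the over-5000-words and the under-2-words branches. A minimal sketch of that step as a single helper, assuming the same spiders/pages layout; the function name extract_text_from_file is hypothetical and not part of the diff, and trafilatura's extract() returns the extracted main text or None when it finds nothing:

    from trafilatura import extract


    def extract_text_from_file(file_name):
        # read the stored page, trying utf-8 first and falling back to latin-1,
        # mirroring the encoding handling in the diff above
        try:
            with open(file_name, 'r', encoding='utf-8') as file:
                html_content = file.read()
        except UnicodeDecodeError:
            with open(file_name, 'r', encoding='latin-1') as file:
                html_content = file.read()

        # extract() returns None if trafilatura cannot find any main text
        text = extract(html_content)
        return text if text else 'NONE'


    # usage with the path scheme from the diff:
    # generaltext = extract_text_from_file(
    #     "spiders/pages/" + fdb + str(i) + "/" + str(entry_id) + ".html"
    # )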