diff --git a/spiders/__pycache__/fdb_spider.cpython-39.pyc b/spiders/__pycache__/fdb_spider.cpython-39.pyc index 31c7d9b..d093fac 100644 Binary files a/spiders/__pycache__/fdb_spider.cpython-39.pyc and b/spiders/__pycache__/fdb_spider.cpython-39.pyc differ diff --git a/spiders/fdb_spider.py b/spiders/fdb_spider.py index bb0b6bd..5da1a6d 100644 --- a/spiders/fdb_spider.py +++ b/spiders/fdb_spider.py @@ -10,6 +10,7 @@ import lxml.html import lxml.html.soupparser from lxml import html +from trafilatura import extract class fdb_spider(object): @@ -215,8 +216,8 @@ class fdb_spider(object): fdb_conf_entry_list_child_period = fdb_conf_entry_list.get("child-period") - print('blabliblub') - print('len', len(tree.xpath(fdb_conf_entry_list_parent))) + #print('blabliblub') + #print('len', len(tree.xpath(fdb_conf_entry_list_parent))) for n in range(len(tree.xpath(fdb_conf_entry_list_parent))): try: @@ -253,7 +254,7 @@ class fdb_spider(object): + "]" + fdb_conf_entry_list_child_period )[0] - print('period', period) + #print('period', period) except Exception as e: print("period could not be parsed", e, period) period = 'NONE' @@ -266,7 +267,7 @@ class fdb_spider(object): + "]" + fdb_conf_entry_list_child_link )[0] - print('link', link) + #print('link', link) except Exception as e: print("link could not be parsed", e, link) @@ -386,9 +387,9 @@ class fdb_spider(object): fdb_conf = self.config.get(fdb) fdb_domain = fdb_conf.get("domain") fdb_conf_entry = fdb_conf.get("entry") - print('balubaluba', fdb_conf_entry) + #print('balubaluba', fdb_conf_entry) fdb_conf_entry_general = fdb_conf_entry.get("general") - print(fdb_conf_entry_general) + #print(fdb_conf_entry_general) for entry_id in dictionary_entry_list: @@ -424,7 +425,7 @@ class fdb_spider(object): fdb_conf_entry_unitrue_entry_child ) - print("oi", child) + #print("oi", child) if len(child) > 0: dictionary_entry_list[entry_id][key] = child[ @@ -444,18 +445,73 @@ class fdb_spider(object): - text = tree.xpath( + p_text = tree.xpath( "//p//text()" ) - - print("oi", text) + + div_text = tree.xpath( + "//div//text()" + ) + + + #print("oi", text) generaltext = '' - for n in range(len(text)): + for n in range(len(p_text)): + + if len(p_text[n]) > 0: + generaltext += p_text[n] + ' ' + + for n in range(len(div_text)): + + if len(div_text[n]) > 0 and div_text[n] not in p_text: + generaltext += div_text[n] + ' ' + + + generaltextlist = generaltext.split(' ') + if len(generaltextlist) > 5000: + print('text over 1000 words for entry id', entry_id, ' number of words:', len(generaltextlist)) - if len(text[n]) > 0: - generaltext += text[n] + ' ' + file_name = "spiders/pages/" + fdb + str(i) + "/" + str(entry_id) + ".html" + + try: + with open(file_name , 'r', encoding='utf-8') as file: + html_content = file.read() + except Exception as e: + + with open(file_name , 'r', encoding='latin-1') as file: + html_content = file.read() + print('encoding utf8 in opening with trafilatura did not work, trying latin1, original error message is:', e) + + generaltext = extract(html_content) + print('generaltext word count was: ', len(generaltextlist), 'but now trafilatura did the job and new wordcount is:', len(generaltext.split(' '))) + + if len(generaltextlist) < 2: + print('no text parsed, the wc is', len(generaltextlist)) + + print('text under 2 words for entry id', entry_id, ' number of words:', len(generaltextlist)) + + file_name = "spiders/pages/" + fdb + str(i) + "/" + str(entry_id) + ".html" + + try: + with open(file_name , 'r', encoding='utf-8') as file: + html_content = file.read() + except Exception as e: + with open(file_name , 'r', encoding='latin-1') as file: + html_content = file.read() + print('encoding utf8 in opening with trafilatura did not work, trying latin1, original error message is:', e) + + generaltext = extract(html_content) + try: + if len(generaltext) > 2: + print('generaltext word count was: ', len(generaltextlist), 'but now trafilatura did the job and new wordcount is:', len(generaltext.split(' '))) + except: + + print('trafilatura got this out:', generaltext , 'setting generaltext to NONE') + generaltext = 'NONE' + dictionary_entry_list[entry_id]["text"] = generaltext + dictionary_entry_list[entry_id]["text-word-count"] = len(generaltextlist)