From d3335f203bd402a08dbd618d6d272d66743b0581 Mon Sep 17 00:00:00 2001 From: alpcentaur Date: Wed, 22 Nov 2023 00:02:29 +0000 Subject: [PATCH] added trafilatura exception --- spiders/__pycache__/fdb_spider.cpython-39.pyc | Bin 9449 -> 10628 bytes spiders/fdb_spider.py | 82 +++++++++++++++--- 2 files changed, 69 insertions(+), 13 deletions(-) diff --git a/spiders/__pycache__/fdb_spider.cpython-39.pyc b/spiders/__pycache__/fdb_spider.cpython-39.pyc index 31c7d9b9eeccf0d21724b4c8d1aef741fde0f916..d093faccc18ca32e9a5cc847940dbe2f3dca400b 100644 GIT binary patch delta 3692 zcma)9Z)hCH72nz0Tb=Ho?$6%;le9X?ie+1}?D$_)B~4?wP8vr^94Cs;R+W=xC9UtY zE6uJX>)7KA3Z5FeMn z5sJbPbD@WSEI&)~d;gHTe4#js69DyjJ|VrpZzPAf9y}XTP@Y10mH#GL-7AHbNGgXy zT%KhRvIs%`gK&xT@{hyAhxMG`Ct|l@$&2FYwo*AB06o5F9ck?D%H?$(`(s&o}!$zRfGyc(Yb5!HvBLSsSq5HvTuKMmryX1=;8Hv; z1-Q~p2xNR4!cvuX2%B=^G*N}Huqj<|x}4ag;KcDaC(tys6J9$5?WEVvLc7~hHejpR zAxH6~IZsMJ`X0@<$E}OB@Ss(6TNN1YR5DID3hjmE`l=$R-au;DyS52HYA8-O?E}dp z(jBrk1s;#Pd*R!<<9VEhz3AtEO7tgHM|4kw-#rZja3V@PnH=aIoM!EN!JkilOKx8U z{;T9|epAVcY@GjG84b6j>DQ;v)w8;>X!2hx3J<4>d@MB_ZON+|(@olvYPwP7mr|cJ z({QdwVr<{*gV*F3f)C+1!m|jy2qzFuBAi0N>&;Yz0f3S87j7CmjY9_zh7n!_Xr-vm zchd_0MS5`T%P9Q{!Yc@0LwFV80>TJF$7SUH%mMj_AhiC<qz$ja~m3Cb~pny zNC|UhmAP?J9v>&ye>X6FNS;S10F=8fYHRBn>!SGFX}aI&VU5_@%|Kv!8tGXIbj+Hf&^S1 z!hY&5lZ0hrPHak;J;QJGXPQw+CWI>PeP>RRDmkIOtP`IU;3BcLJFZ>|^b|({L(|Yo zIX$~!nt@i@ZDpaAp}9uZ?y;jzx}Ah)4>oexb8Va$cm2JchVbV2YcCezfA%h;u^?yV>AzjCYFOof|wB+(fXwAccQe9_EYsczVBum1*fpvbxUXzX9Oxp@WG^MB#uF$u|6Fa zi2Jz=2x({8OFJ)lX=mEw8rY9ZeT2+sm;CXipUUXM*UVwjTC{ObIogtMR}QNbBQ(k?x>1G z_P1HiG^+N=d~w#u_?TM^sqcAwFvKkIi+k!$62I+rIBcFEx)H_ z5KxUwc8dPp-2)Z2c>0YBrQ^bv*eCg2+Wg5;?KaD zMTPn*gh-}B{id6%s~+yTNh+iwgU|bv2N~oMS`IGs*5YVDmrNHA(x5I?h%V8PE^)6e z-6S*|m&&0tQei-mOxrS<|Bgsyg-8p{uuc427tJ1QKWw}=O#_C=I-esDojn7`E$#tuxW{t7s#z}<%uF+OlE;TK~)LTdm zFSWJ>AT4OzMN{B;%DYTTW01ye)(f)Duhowmg=DmeN2z4QN87n<6^q>}lnyACPO%nl z^LGui^=4VC)SIPKtSW6@R_hq3?zJz)Pwr{ap&j4ci`}ppVGBYZLO;S*fUJ8WKR%Jq zP2}e$*ns#@>myn5mDcL0sxt=56~(VwQmrcaTwz)~*3zOka(6KT^D;s9%#K9fZ3s4m z?Fc&%5(onbFCpwiK#^dr2q}O}w6S^7y9=Qcp$lOzKs7>h;%Zb=cEf{sKiZkuheJmY zUPBl{IE>JP(9rVYezZ&d1e~*vg{tpUZBC+*nD|Uj2&x}Y6o;RPPxTZU^6ek=ujS_T zEVKQ{^#;N)!eQ}o^B?vwmbfcCX3WyE#gw5ZP!MMLjRK@JabhsjcX=5Cca5@X41$l8`oGjj<}|Bd1JHy zLf4oW9EppSWSndl>Eucr-z(;(a$N{P#y|+q*yItX=xxbnp-|o2gI&u#t1<-z zEPG9bdr=v=6LgP^WXnE?@^UBlL3_XjC2U9C1MTr!#jWyMaic1#98?t|g5t-PR(-18 z5Gm6I`sxSv0C0uGyRqb=7qyZ$LAw!l$68&3@gNT#lt%WKLp-=D^ALE#Jh19DePE$> zMQYk*`r%G!)KZTn-bABu*e~vohX!HSrKQpXi&KYr8rR@#F|Z=YMdM&O7A-P|CCJU? zs2KoC2gpB&{YXz)$0x4G;h5GUGLw0<=AoN(X}464F3TnK)zXp}lu8dwJ5`@bhyxOl ztjJ{*q&WzwaaPDpOssl2L}F6uLP?<#{8qO#NCDd~?zi`~ZB!}tJ^Y3jOg0Z)J`N3T zT_V}>%2rD!jzR!CjexH@>zFI#ri$!JRylRz^dalfWOGJVF6PAbj`XRjv_?|wJjPpP z9)966f0iJi3}oBIg>+PW+vPiANrR;d8`U0rfoiYq=NN7ZC4s#x+S6M+BWPYg*elMa z6KV&T*@U&&O8Q9BbI~XmY&>r@%03Kh)QSG?lv*HA`&^vr?sO10e4RG&PWRT7;+FkH zCi3557kC{CSc>8%3JJ<#qRD;{?m4|XfP@)@R}qdQj3Q8kjg>6Rk3RUbAqB)oJ$esX z89#dqy$*n^a(T8`$ol749yGe2)kPvMe(On*ZQ@B!|G}zSoVaXEnik8m=&R1bD@8h& YH}=4xq*+uor|cw7IVO7v+=*BD2ku`e;{X5v diff --git a/spiders/fdb_spider.py b/spiders/fdb_spider.py index bb0b6bd..5da1a6d 100644 --- a/spiders/fdb_spider.py +++ b/spiders/fdb_spider.py @@ -10,6 +10,7 @@ import lxml.html import lxml.html.soupparser from lxml import html +from trafilatura import extract class fdb_spider(object): @@ -215,8 +216,8 @@ class fdb_spider(object): fdb_conf_entry_list_child_period = fdb_conf_entry_list.get("child-period") - print('blabliblub') - print('len', len(tree.xpath(fdb_conf_entry_list_parent))) + #print('blabliblub') + #print('len', len(tree.xpath(fdb_conf_entry_list_parent))) for n in range(len(tree.xpath(fdb_conf_entry_list_parent))): try: @@ -253,7 +254,7 @@ class fdb_spider(object): + "]" + fdb_conf_entry_list_child_period )[0] - print('period', period) + #print('period', period) except Exception as e: print("period could not be parsed", e, period) period = 'NONE' @@ -266,7 +267,7 @@ class fdb_spider(object): + "]" + fdb_conf_entry_list_child_link )[0] - print('link', link) + #print('link', link) except Exception as e: print("link could not be parsed", e, link) @@ -386,9 +387,9 @@ class fdb_spider(object): fdb_conf = self.config.get(fdb) fdb_domain = fdb_conf.get("domain") fdb_conf_entry = fdb_conf.get("entry") - print('balubaluba', fdb_conf_entry) + #print('balubaluba', fdb_conf_entry) fdb_conf_entry_general = fdb_conf_entry.get("general") - print(fdb_conf_entry_general) + #print(fdb_conf_entry_general) for entry_id in dictionary_entry_list: @@ -424,7 +425,7 @@ class fdb_spider(object): fdb_conf_entry_unitrue_entry_child ) - print("oi", child) + #print("oi", child) if len(child) > 0: dictionary_entry_list[entry_id][key] = child[ @@ -444,18 +445,73 @@ class fdb_spider(object): - text = tree.xpath( + p_text = tree.xpath( "//p//text()" ) - - print("oi", text) + + div_text = tree.xpath( + "//div//text()" + ) + + + #print("oi", text) generaltext = '' - for n in range(len(text)): + for n in range(len(p_text)): + + if len(p_text[n]) > 0: + generaltext += p_text[n] + ' ' + + for n in range(len(div_text)): + + if len(div_text[n]) > 0 and div_text[n] not in p_text: + generaltext += div_text[n] + ' ' + + + generaltextlist = generaltext.split(' ') + if len(generaltextlist) > 5000: + print('text over 1000 words for entry id', entry_id, ' number of words:', len(generaltextlist)) - if len(text[n]) > 0: - generaltext += text[n] + ' ' + file_name = "spiders/pages/" + fdb + str(i) + "/" + str(entry_id) + ".html" + + try: + with open(file_name , 'r', encoding='utf-8') as file: + html_content = file.read() + except Exception as e: + + with open(file_name , 'r', encoding='latin-1') as file: + html_content = file.read() + print('encoding utf8 in opening with trafilatura did not work, trying latin1, original error message is:', e) + + generaltext = extract(html_content) + print('generaltext word count was: ', len(generaltextlist), 'but now trafilatura did the job and new wordcount is:', len(generaltext.split(' '))) + + if len(generaltextlist) < 2: + print('no text parsed, the wc is', len(generaltextlist)) + + print('text under 2 words for entry id', entry_id, ' number of words:', len(generaltextlist)) + + file_name = "spiders/pages/" + fdb + str(i) + "/" + str(entry_id) + ".html" + + try: + with open(file_name , 'r', encoding='utf-8') as file: + html_content = file.read() + except Exception as e: + with open(file_name , 'r', encoding='latin-1') as file: + html_content = file.read() + print('encoding utf8 in opening with trafilatura did not work, trying latin1, original error message is:', e) + + generaltext = extract(html_content) + try: + if len(generaltext) > 2: + print('generaltext word count was: ', len(generaltextlist), 'but now trafilatura did the job and new wordcount is:', len(generaltext.split(' '))) + except: + + print('trafilatura got this out:', generaltext , 'setting generaltext to NONE') + generaltext = 'NONE' + dictionary_entry_list[entry_id]["text"] = generaltext + dictionary_entry_list[entry_id]["text-word-count"] = len(generaltextlist)