From df4a8289b895d2a294a545d1999ee97fef4128c8 Mon Sep 17 00:00:00 2001
From: alpcentaur
Date: Wed, 22 Nov 2023 17:03:15 +0000
Subject: [PATCH] added pdf parser if entry link is direct pdf

---
 main.py                                       |   2 +-
 spiders/__pycache__/fdb_spider.cpython-39.pyc | Bin 10628 -> 11209 bytes
 spiders/fdb_spider.py                         | 150 +++++++++++-------
 3 files changed, 92 insertions(+), 60 deletions(-)

diff --git a/main.py b/main.py
index f1f9f17..15dcd94 100644
--- a/main.py
+++ b/main.py
@@ -15,7 +15,7 @@ spider = fdb_spider(config)
 
 #spider.parse_entry_list_data2dictionary(list_of_fdbs)
 
-# spider.download_entry_data_htmls(list_of_fdbs)
+#spider.download_entry_data_htmls(list_of_fdbs)
 
 spider.parse_entry_data2dictionary(list_of_fdbs)
 
diff --git a/spiders/__pycache__/fdb_spider.cpython-39.pyc b/spiders/__pycache__/fdb_spider.cpython-39.pyc
index d093faccc18ca32e9a5cc847940dbe2f3dca400b..767558c00d875f8d726932e7f2d8360ad70e705c 100644
Binary files a/spiders/__pycache__/fdb_spider.cpython-39.pyc and b/spiders/__pycache__/fdb_spider.cpython-39.pyc differ
diff --git a/spiders/fdb_spider.py b/spiders/fdb_spider.py
index 5da1a6d..4f97c90 100644
--- a/spiders/fdb_spider.py
+++ b/spiders/fdb_spider.py
@@ -10,8 +10,12 @@ import lxml.html
 import lxml.html.soupparser
 from lxml import html
 
+import requests
+
 from trafilatura import extract
 
+from pdfminer.high_level import extract_pages
+from pdfminer.layout import LTTextContainer
 
 class fdb_spider(object):
     def __init__(self, config_file):
@@ -356,13 +360,23 @@ class fdb_spider(object):
                     print(ex)
 
                 # save interim results to files
-
-                file_name = "spiders/pages/" + fdb + str(i) + "/" + str(entry_id) + ".html"
-
-                os.makedirs(os.path.dirname(file_name), exist_ok=True)
-                f = open(file_name, "w+")
-                f.write(web_content)
-                f.close
+
+                if '.pdf' in entry_link:
+
+                    file_name = "spiders/pages/" + fdb + str(i) + "/" + str(entry_id) + ".html"  # pdf bytes are stored under the .html name the parser expects
+                    response = requests.get(entry_link)
+                    os.makedirs(os.path.dirname(file_name), exist_ok=True)
+                    f = open(file_name, "wb")
+                    f.write(response.content)
+                    f.close()
+
+                else:
+                    file_name = "spiders/pages/" + fdb + str(i) + "/" + str(entry_id) + ".html"
+
+                    os.makedirs(os.path.dirname(file_name), exist_ok=True)
+                    f = open(file_name, "w+")
+                    f.write(web_content)
+                    f.close()
 
     def parse_entry_data2dictionary(self, list_of_fdbs):
         for fdb in list_of_fdbs:
@@ -441,74 +455,92 @@ class fdb_spider(object):
 
                 fdb_conf_entry_unifalse_wordlist = fdb_conf_entry_unifalse.get("wordlist")
 
-
-                p_text = tree.xpath(
-                    "//p//text()"
-                )
-
-                div_text = tree.xpath(
-                    "//div//text()"
-                )
-
-
-                #print("oi", text)
-                generaltext = ''
-                for n in range(len(p_text)):
-
-                    if len(p_text[n]) > 0:
-                        generaltext += p_text[n] + ' '
-
-                for n in range(len(div_text)):
-
-                    if len(div_text[n]) > 0 and div_text[n] not in p_text:
-                        generaltext += div_text[n] + ' '
-
-                generaltextlist = generaltext.split(' ')
-                if len(generaltextlist) > 5000:
-                    print('text over 1000 words for entry id', entry_id, ' number of words:', len(generaltextlist))
-
-                    file_name = "spiders/pages/" + fdb + str(i) + "/" + str(entry_id) + ".html"
-
-                    try:
-                        with open(file_name , 'r', encoding='utf-8') as file:
-                            html_content = file.read()
-                    except Exception as e:
-
-                        with open(file_name , 'r', encoding='latin-1') as file:
-                            html_content = file.read()
-                        print('encoding utf8 in opening with trafilatura did not work, trying latin1, original error message is:', e)
-
-                    generaltext = extract(html_content)
-                    print('generaltext word count was: ', len(generaltextlist), 'but now trafilatura did the job and new wordcount is:', len(generaltext.split(' ')))
-
-                if len(generaltextlist) < 2:
-
-                    print('no text parsed, the wc is', len(generaltextlist))
-
-                    print('text under 2 words for entry id', entry_id, ' number of words:', len(generaltextlist))
-
-                    file_name = "spiders/pages/" + fdb + str(i) + "/" + str(entry_id) + ".html"
-
-                    try:
-                        with open(file_name , 'r', encoding='utf-8') as file:
-                            html_content = file.read()
-                    except Exception as e:
-
-                        with open(file_name , 'r', encoding='latin-1') as file:
-                            html_content = file.read()
-                        print('encoding utf8 in opening with trafilatura did not work, trying latin1, original error message is:', e)
-
-                    generaltext = extract(html_content)
-                    try:
-                        if len(generaltext) > 2:
-                            print('generaltext word count was: ', len(generaltextlist), 'but now trafilatura did the job and new wordcount is:', len(generaltext.split(' ')))
-                    except:
-
-                        print('trafilatura got this out:', generaltext , 'setting generaltext to NONE')
-                        generaltext = 'NONE'
+                if '.pdf' in dictionary_entry_list[entry_id]["link"]:
+
+                    print('parsing a pdf', dictionary_entry_list[entry_id]["link"], entry_id)
+                    try:
+                        generaltext = ''
+                        for page_layout in extract_pages(file_name):
+                            for element in page_layout:
+                                if isinstance(element, LTTextContainer):
+                                    generaltext += element.get_text()
+
+                    except Exception as e:
+                        generaltext = 'NONE'
+                        print('parsing pdf did not work, the original error is:', e)
+
+                    generaltextlist = generaltext.split(' ')  # defined here too, so the word count below does not raise a NameError
+
+                else:
+
+                    p_text = tree.xpath(
+                        "//p//text()"
+                    )
+
+                    div_text = tree.xpath(
+                        "//div//text()"
+                    )
+
+                    #print("oi", text)
+                    generaltext = ''
+                    for n in range(len(p_text)):
+
+                        if len(p_text[n]) > 0:
+                            generaltext += p_text[n] + ' '
+
+                    for n in range(len(div_text)):
+
+                        if len(div_text[n]) > 0 and div_text[n] not in p_text:
+                            generaltext += div_text[n] + ' '
+
+                    generaltextlist = generaltext.split(' ')
+                    if len(generaltextlist) > 5000:
+                        print('text over 5000 words for entry id', entry_id, ' number of words:', len(generaltextlist))
+
+                        file_name = "spiders/pages/" + fdb + str(i) + "/" + str(entry_id) + ".html"
+
+                        try:
+                            with open(file_name , 'r', encoding='utf-8') as file:
+                                html_content = file.read()
+                        except Exception as e:
+
+                            with open(file_name , 'r', encoding='latin-1') as file:
+                                html_content = file.read()
+                            print('encoding utf8 in opening with trafilatura did not work, trying latin1, original error message is:', e)
+
+                        generaltext = extract(html_content)
+                        print('generaltext word count was: ', len(generaltextlist), 'but now trafilatura did the job and new wordcount is:', len(generaltext.split(' ')))
+
+                    if len(generaltextlist) < 2:
+
+                        print('no text parsed, the wc is', len(generaltextlist))
+
+                        print('text under 2 words for entry id', entry_id, ' number of words:', len(generaltextlist))
+
+                        file_name = "spiders/pages/" + fdb + str(i) + "/" + str(entry_id) + ".html"
+
+                        try:
+                            with open(file_name , 'r', encoding='utf-8') as file:
+                                html_content = file.read()
+                        except Exception as e:
+
+                            with open(file_name , 'r', encoding='latin-1') as file:
+                                html_content = file.read()
+                            print('encoding utf8 in opening with trafilatura did not work, trying latin1, original error message is:', e)
+
+                        generaltext = extract(html_content)
+                        try:
+                            if len(generaltext) > 2:
+                                print('generaltext word count was: ', len(generaltextlist), 'but now trafilatura did the job and new wordcount is:', len(generaltext.split(' ')))
+                        except:
+
+                            print('trafilatura got this out:', generaltext , 'setting generaltext to NONE')
+                            generaltext = 'NONE'
 
                 dictionary_entry_list[entry_id]["text"] = generaltext
                 dictionary_entry_list[entry_id]["text-word-count"] = len(generaltextlist)
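
Note: the two new code paths above can be condensed into one standalone
function. The sketch below is illustrative only, not code from the
repository: the function name fetch_and_extract and the 30-second timeout
are assumptions, while the requests, pdfminer.six and trafilatura calls are
the same ones the patch uses.

    # Illustrative sketch (assumed helper, not repository code): fetch an
    # entry link, then extract plain text with pdfminer.six for direct PDF
    # links and with trafilatura for everything else.
    import os

    import requests
    from pdfminer.high_level import extract_pages
    from pdfminer.layout import LTTextContainer
    from trafilatura import extract

    def fetch_and_extract(entry_link, file_name):
        """Save entry_link to file_name and return its plain text ('NONE' on failure)."""
        response = requests.get(entry_link, timeout=30)  # timeout value is an assumption
        os.makedirs(os.path.dirname(file_name), exist_ok=True)
        with open(file_name, "wb") as f:  # binary mode is safe for PDF and HTML alike
            f.write(response.content)

        if '.pdf' in entry_link:
            # Walk pdfminer's per-page layout tree and keep only the text
            # containers, as parse_entry_data2dictionary now does for PDFs.
            text = ''
            for page_layout in extract_pages(file_name):
                for element in page_layout:
                    if isinstance(element, LTTextContainer):
                        text += element.get_text()
            return text or 'NONE'

        # HTML path: trafilatura strips boilerplate and returns None on failure.
        html_content = response.content.decode('utf-8', errors='replace')
        return extract(html_content) or 'NONE'

Called as, e.g., fetch_and_extract(entry_link, "spiders/pages/test0/0.html"),
this does in one pass what download_entry_data_htmls and
parse_entry_data2dictionary do in two.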