From d2324d265ac97f03a1fc3f5f3bf4826e46362431 Mon Sep 17 00:00:00 2001
From: alpcentaur
Date: Wed, 6 Dec 2023 13:46:54 +0000
Subject: [PATCH] Add PDF child-text downloading and parsing to JSON; handle
 exception cases for JavaScript entry data and normal entry data

---
 main.py                                       |   4 +-
 spiders/__pycache__/fdb_spider.cpython-39.pyc | Bin 13103 -> 14314 bytes
 spiders/config.yaml                           |   6 +-
 spiders/fdb_spider.py                         | 105 ++++++++++++++++--
 4 files changed, 101 insertions(+), 14 deletions(-)

diff --git a/main.py b/main.py
index 9d6075b..3cc9312 100644
--- a/main.py
+++ b/main.py
@@ -14,11 +14,11 @@ spider = fdb_spider(config)
 
 #spider.download_entry_list_pages_of_funding_databases(list_of_fdbs)
 
-spider.find_config_parameter(list_of_fdbs)
+#spider.find_config_parameter(list_of_fdbs)
 
 #spider.parse_entry_list_data2dictionary(list_of_fdbs)
 
-spider.download_entry_data_htmls(list_of_fdbs)
+#spider.download_entry_data_htmls(list_of_fdbs)
 
 spider.parse_entry_data2dictionary(list_of_fdbs)
 
diff --git a/spiders/__pycache__/fdb_spider.cpython-39.pyc b/spiders/__pycache__/fdb_spider.cpython-39.pyc
index 7a05bea6294753fd22b15441129ef700951fed70..f4a18608d2592d80ab827088384ccd371ffa433b 100644
GIT binary patch
(base85 delta payload omitted; the compiled .pyc is a build artifact with no reviewable content)
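For orientation, a minimal sketch of the full pipeline that main.py drives when every stage is uncommented. Only the class and method names come from the main.py diff above; the import path, the config file location, and the list_of_fdbs value are assumptions for illustration.

    # Hedged sketch, not part of the patch: import path, config path and
    # list_of_fdbs are assumptions; class and method names come from main.py.
    from spiders.fdb_spider import fdb_spider

    config = "spiders/config.yaml"          # assumed config location
    list_of_fdbs = ["example-funding-db"]   # hypothetical database key

    spider = fdb_spider(config)
    spider.download_entry_list_pages_of_funding_databases(list_of_fdbs)
    spider.find_config_parameter(list_of_fdbs)
    spider.parse_entry_list_data2dictionary(list_of_fdbs)
    spider.download_entry_data_htmls(list_of_fdbs)
    spider.parse_entry_data2dictionary(list_of_fdbs)
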
diff --git a/spiders/fdb_spider.py b/spiders/fdb_spider.py
--- a/spiders/fdb_spider.py
+++ b/spiders/fdb_spider.py
+                if len(child) > 0:
+                    print("oi", child)
+
+                if '.pdf' in child:
+
+                    print('child in entry data is a pdf, downloading it..')
+
+                    file_name = "spiders/pages/" + fdb + str(i) + "/" + str(entry_id) + ".pdf"
+                    entry_link = dictionary_entry_list[entry_id]["link"]
+                    response = None
+                    pdf_link = None
+                    if 'http' not in child:
+                        # normal case: the entry link is a plain http(s) url,
+                        # so the relative child path can be appended directly
+                        if 'javascript' not in entry_link and 'js' not in entry_link and 'http' in entry_link:
+                            try:
+                                response = requests.get(entry_link + child)
+                            except Exception as e:
+                                print(entry_link + child + ' seems not to be a valid pdf link to download, original error message is:', e)
+
+                        # javascript case: the entry link is a javascript/js url,
+                        # so the pdf link has to be built from the entry domain
+                        if 'javascript' in entry_link or 'js' in entry_link:
+                            entry_domain = dictionary_entry_list[entry_id]["domain"]
+                            if child[0] == '.' and child[1] == '/':
+                                if entry_domain[-1] != '/':
+                                    # strip trailing path segments until the
+                                    # domain ends in a slash
+                                    while len(entry_domain) > 0 and entry_domain[-1] != '/':
+                                        entry_domain = entry_domain[:-1]
+                                pdf_link = entry_domain[:-1] + child[1:]
+                            if child[0] == '/':
+                                if entry_domain[-1] == '/':
+                                    pdf_link = entry_domain[:-1] + child
+                                else:
+                                    pdf_link = entry_domain + child
+
+                            print('pdf_link', pdf_link)
+                            try:
+                                response = requests.get(pdf_link)
+                            except Exception as e:
+                                print(pdf_link + ' seems not to be a valid pdf link to download, original error message is:', e)
+
+                    #response = requests.get(child)
+                    os.makedirs(os.path.dirname(file_name), exist_ok=True)
+                    if response is not None:
+                        with open(file_name, "bw") as f:
+                            f.write(response.content)
+
+                    print('parsing a pdf', file_name, entry_id)
+
+                    try:
+                        generaltext = ''
+                        # collect the text of every text container on every page
+                        for page_layout in extract_pages(file_name):
+                            for element in page_layout:
+                                if isinstance(element, LTTextContainer):
+                                    generaltext += element.get_text()
+                    except Exception as e:
+                        generaltext = 'NONE'
+                        print('parsing pdf did not work, the original error is:', e)
+
+                    dictionary_entry_list[entry_id][key] = generaltext
+
+                if len(child) > 0 and '.pdf' not in child:
+                    dictionary_entry_list[entry_id][key] = child[0]
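The trailing-slash and './' handling above can be expressed more compactly with urllib.parse.urljoin. A hedged sketch under two assumptions: entry_domain is a full URL including the scheme (e.g. "https://example.org/"), and resolve_pdf_link / download_pdf are hypothetical helper names, not functions that exist in fdb_spider.

    import os
    import requests
    from urllib.parse import urljoin

    def resolve_pdf_link(entry_domain, child):
        # urljoin resolves './doc.pdf' against the base path and '/doc.pdf'
        # against the host root, covering the manual slash-stripping cases
        # above as long as entry_domain is the site root.
        return urljoin(entry_domain, child)

    def download_pdf(pdf_link, file_name):
        # Returns True on success so the caller can skip the pdfminer step
        # when the download failed.
        try:
            response = requests.get(pdf_link, timeout=30)
            response.raise_for_status()
        except Exception as e:
            print(pdf_link, 'seems not to be a valid pdf link to download, original error message is:', e)
            return False
        os.makedirs(os.path.dirname(file_name), exist_ok=True)
        with open(file_name, 'bw') as f:
            f.write(response.content)
        return True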