From 0e5875660038f3f3cc71d3d1192ffaae428c8a6c Mon Sep 17 00:00:00 2001
From: alpcentaur
Date: Fri, 15 Dec 2023 11:33:50 +0000
Subject: [PATCH] added last-resort exception for entry page downloading with
 wget; also implemented further logic for getting the right links

---
 main.py                                       |  4 +-
 spiders/__pycache__/fdb_spider.cpython-39.pyc | Bin 16356 -> 16864 bytes
 spiders/fdb_spider.py                         | 72 ++++++++++++------
 3 files changed, 49 insertions(+), 27 deletions(-)

diff --git a/main.py b/main.py
index 4a2d91f..cab6c07 100644
--- a/main.py
+++ b/main.py
@@ -16,9 +16,9 @@ spider = fdb_spider(config)
 
 #spider.find_config_parameter(list_of_fdbs)
 
-spider.parse_entry_list_data2dictionary(list_of_fdbs)
+#spider.parse_entry_list_data2dictionary(list_of_fdbs)
 
 spider.download_entry_data_htmls(list_of_fdbs)
 
-spider.parse_entry_data2dictionary(list_of_fdbs)
+#spider.parse_entry_data2dictionary(list_of_fdbs)
 
diff --git a/spiders/__pycache__/fdb_spider.cpython-39.pyc b/spiders/__pycache__/fdb_spider.cpython-39.pyc
index f60df44bcef7d627811872ce7b3b969b0c339ed3..a8d26b3005800c673ae14119ae4d445e31952ed8 100644
GIT binary patch
(base85-encoded binary deltas for the compiled .pyc omitted)
diff --git a/spiders/fdb_spider.py b/spiders/fdb_spider.py
index 9569fcf..7c197f0 100644
--- a/spiders/fdb_spider.py
+++ b/spiders/fdb_spider.py
@@ -19,6 +19,8 @@ from pdfminer.layout import LTTextContainer
 
 import time
 
+import subprocess
+
 
 class fdb_spider(object):
     def __init__(self, config_file):
@@ -99,7 +101,7 @@ class fdb_spider(object):
            )
 
        entry_jsdomain = 'NONE'
-       if entry_jsdomain == 'NONE':
+       if entry_jsdomain == 'NONE' or entry_jsdomain == 'None':
 
            for i in entry_iteration_var_list:
 
@@ -417,8 +419,17 @@ class fdb_spider(object):
                 if 'javascript:' in link:
                     dictionary_entry_list[n]["link"] = link
                 if fdb_domain not in link and ('http' or 'https' or 'www.') not in link:
-                    if link[-1] == '/':
-                        dictionary_entry_list[n]["link"] = fdb_domain + link
+                    if link[0] == '/':
+                        if fdb_domain[-1] != '/':
+                            dictionary_entry_list[n]["link"] = fdb_domain + link
+                        if fdb_domain[-1] == '/':
+                            dictionary_entry_list[n]["link"] = fdb_domain + link[1:]
+                    elif link[0] == '.' and link[1] == '/':
+                        if fdb_domain[-1] != '/':
+                            dictionary_entry_list[n]["link"] = fdb_domain + link[1:]
+                        if fdb_domain[-1] == '/':
+                            dictionary_entry_list[n]["link"] = fdb_domain + link[2:]
+
                     else:
                         dictionary_entry_list[n]["link"] = fdb_domain + '/' + link
@@ -534,14 +545,17 @@ class fdb_spider(object):
 
                     driver.switch_to.window(window_before)
 
-                if ('http' or 'www') in entry_link and 'javascript' not in entry_link and '.pdf' not in entry_link:
+                if 'javascript' not in entry_link and '.pdf' not in entry_link:
+                    #print('oi')
                     try:
                         # defining cookie to not end up in endless loop because of cookie banners pointing to redirects
                         url = entry_link
-                        req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0', 'Cookie':'myCookie=lovely'})
+                        req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0', 'Cookie':'myCookie=oioioioi'})
                         response = urllib.request.urlopen(req)
+                        #print('response from first one', response)
 
                     except Exception as e:
+                        print('setting the cookie and downloading did not work, original error is:', e)
                         try:
                             response = urllib.request.urlopen(entry_link.encode('ascii', errors='xmlcharrefreplace').decode('ascii'))
                             print(
@@ -575,30 +589,38 @@ class fdb_spider(object):
                             f.write(response.content)
                             f.close
 
-        else:
-            file_name = "spiders/pages/" + fdb + str(i) + "/" + str(entry_id) + ".html"
+
+
+
+                file_name = "spiders/pages/" + fdb + str(i) + "/" + str(entry_id) + ".html"
 
-            if web_content == 'NONE':
-                print('other downloading approaches did not work, trying requests')
+                if web_content == 'NONE':
+                    print('other downloading approaches did not work, trying requests')
 
-                try:
-                    from requests_html import HTMLSession
-                    session = HTMLSession()
+                    try:
+                        from requests_html import HTMLSession
+                        session = HTMLSession()
 
-                    r = session.get(entry_link)
+                        r = session.get(entry_link)
 
-                    r.html.render()
-                    web_content = r.text
-
-                except Exception as e:
-                    print('requests_html HTMLSession did not work')
-
-
-                os.makedirs(os.path.dirname(file_name), exist_ok=True)
-                f = open(file_name, "w+")
-                f.write(web_content)
-                f.close
+                        r.html.render()
+                        web_content = r.text
+
+                    except Exception as e:
+                        print('requests_html HTMLSession did not work, trying wget; original error is:', e)
+
+                        try:
+                            os.makedirs(os.path.dirname(file_name), exist_ok=True)
+                            subprocess.run(["wget", entry_link, '--output-document=' + file_name], check=True)
+
+                        except subprocess.CalledProcessError:
+                            print('wget downloading did not work, saving NONE to file now')
+
+                            os.makedirs(os.path.dirname(file_name), exist_ok=True)
+                            f = open(file_name, "w+")
+                            f.write(web_content)
+                            f.close()
 
                 # save the entry_domain, implemented first for further downloads in javascript links
                 f = open("spiders/output/" + fdb + str(i) + "entryList.txt", "w+")
@@ -692,7 +714,7 @@ class fdb_spider(object):
                                 pdf_link = entry_domain[:-1] + child[1:]
                             if entry_domain[-1] != '/':
                                 for n in range(len(entry_domain)):
-                                    if entry_domain[-1] != '/':
+                                    if entry_domain[-n] != '/':
                                         entry_domain = entry_domain[:-1]
                                     else:
                                         break
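
The link-resolution branches in the @@ -417 hunk reduce to one rule: strip a
leading './', then join fdb_domain and link with exactly one slash. A minimal
standalone sketch of that rule (the function name and example URL are
illustrative, not part of the patch):

    def resolve_link(fdb_domain, link):
        # './path' and '/path' both end up as fdb_domain + '/path';
        # any other relative link gets a single slash inserted between.
        if link.startswith('./'):
            link = link[1:]
        if link.startswith('/'):
            return fdb_domain.rstrip('/') + link
        return fdb_domain.rstrip('/') + '/' + link

    # e.g. resolve_link('https://example.org', './entry/1')
    #   -> 'https://example.org/entry/1'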
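
The download path now falls back through three stages: urllib with a dummy
cookie header, requests_html rendering, and finally wget via subprocess. Note
that subprocess.run() raises CalledProcessError only when called with
check=True, hence that flag on the wget call above; without it the except
branch could never fire. A sketch of the last-resort stage under that
assumption (the helper name and timeout value are illustrative):

    import subprocess

    def wget_download(entry_link, file_name):
        # check=True turns a non-zero wget exit status into
        # CalledProcessError; timeout guards against hanging hosts.
        try:
            subprocess.run(
                ["wget", entry_link, "--output-document=" + file_name],
                check=True,
                timeout=60,
            )
            return True
        except (subprocess.CalledProcessError, subprocess.TimeoutExpired):
            return False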
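
The @@ -692 hunk walks entry_domain back one character at a time until it ends
in '/'. The same trim can be written index-safely with rfind; a sketch (the
function name is illustrative):

    def trim_to_last_slash(entry_domain):
        # keep everything up to and including the final '/'
        cut = entry_domain.rfind('/')
        return entry_domain[:cut + 1] if cut != -1 else entry_domain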