From d284fef01568da9b0fe886796546634bfcff937f Mon Sep 17 00:00:00 2001 From: alpcentaur Date: Thu, 29 Feb 2024 13:15:01 +0000 Subject: [PATCH] changes for new database dtvp, new exceptions trying to click away cookie pop ups --- log.txt | 218 ++++++++++++++++++ main.py | 7 +- .../__pycache__/fdb_spider.cpython-311.pyc | Bin 42691 -> 43779 bytes spiders/config.yaml | 46 +++- spiders/fdb_spider.py | 30 ++- 5 files changed, 289 insertions(+), 12 deletions(-) create mode 100644 log.txt diff --git a/log.txt b/log.txt new file mode 100644 index 0000000..be992ec --- /dev/null +++ b/log.txt @@ -0,0 +1,218 @@ +trying to get element +0 +scrolling.. +clicking.. +length of the window handles 1 +trying to get element +1 +scrolling.. +clicking.. +length of the window handles 1 +trying to get element +2 +scrolling.. +clicking.. +length of the window handles 1 +trying to get element +3 +scrolling.. +clicking.. +length of the window handles 1 +trying to get element +4 +scrolling.. +clicking.. +length of the window handles 1 +trying to get element +5 +scrolling.. +clicking.. +length of the window handles 1 +trying to get element +6 +scrolling.. +clicking.. +length of the window handles 1 +trying to get element +7 +scrolling.. +clicking.. +length of the window handles 1 +trying to get element +8 +scrolling.. +clicking.. +length of the window handles 1 +trying to get element +9 +scrolling.. +clicking.. +length of the window handles 1 +trying to get element +10 +scrolling.. +clicking.. +length of the window handles 1 +trying to get element +11 +scrolling.. +clicking.. +length of the window handles 1 +trying to get element +12 +scrolling.. +clicking.. +length of the window handles 1 +trying to get element +13 +scrolling.. +clicking.. +length of the window handles 1 +trying to get element +14 +scrolling.. +clicking.. +length of the window handles 1 +trying to get element +15 +scrolling.. +clicking.. +length of the window handles 1 +trying to get element +16 +scrolling.. +clicking.. +length of the window handles 1 +trying to get element +17 +scrolling.. +clicking.. +length of the window handles 1 +trying to get element +18 +scrolling.. +clicking.. +length of the window handles 1 +trying to get element +19 +scrolling.. +clicking.. +length of the window handles 1 +trying to get element +20 +scrolling.. +clicking.. +length of the window handles 1 +trying to get element +21 +scrolling.. +clicking.. +trying to get element +0 +scrolling.. +clicking.. +length of the window handles 1 +trying to get element +1 +scrolling.. +clicking.. +length of the window handles 1 +trying to get element +2 +scrolling.. +clicking.. +length of the window handles 1 +trying to get element +3 +scrolling.. +clicking.. +length of the window handles 1 +trying to get element +4 +scrolling.. +clicking.. +length of the window handles 1 +trying to get element +5 +scrolling.. +clicking.. +length of the window handles 1 +trying to get element +6 +scrolling.. +clicking.. +length of the window handles 1 +trying to get element +7 +scrolling.. +clicking.. +length of the window handles 1 +trying to get element +8 +scrolling.. +clicking.. +length of the window handles 1 +trying to get element +9 +scrolling.. +clicking.. +length of the window handles 1 +trying to get element +10 +scrolling.. +clicking.. +length of the window handles 1 +trying to get element +11 +scrolling.. +clicking.. +length of the window handles 1 +trying to get element +12 +scrolling.. +clicking.. +length of the window handles 1 +trying to get element +13 +scrolling.. +clicking.. +length of the window handles 1 +trying to get element +14 +scrolling.. +clicking.. +length of the window handles 1 +trying to get element +15 +scrolling.. +clicking.. +length of the window handles 1 +trying to get element +16 +scrolling.. +clicking.. +length of the window handles 1 +trying to get element +17 +scrolling.. +clicking.. +length of the window handles 1 +trying to get element +18 +scrolling.. +clicking.. +length of the window handles 1 +trying to get element +19 +scrolling.. +clicking.. +length of the window handles 1 +trying to get element +20 +scrolling.. +clicking.. +length of the window handles 1 +trying to get element +21 +scrolling.. +clicking.. diff --git a/main.py b/main.py index f892b52..0578d81 100644 --- a/main.py +++ b/main.py @@ -6,7 +6,10 @@ import sys config = "spiders/config.yaml" #list_of_fdbs = eval(sys.argv[1]) #list_of_fdbs = ["giz","evergabe-online","foerderinfo.bund.de-bekanntmachungen"] +#list_of_fdbs = ["giz","evergabe-online"] +#list_of_fdbs = ["foerderinfo.bund.de-bekanntmachungen"] list_of_fdbs = ["ted.europa.eu"] +#list_of_fdbs = ["dtvp"] # doing the crawling of government websites @@ -19,7 +22,7 @@ spider.download_entry_list_pages_of_funding_databases(list_of_fdbs) spider.parse_entry_list_data2dictionary(list_of_fdbs) -spider.download_entry_data_htmls(list_of_fdbs) +#spider.download_entry_data_htmls(list_of_fdbs) -spider.parse_entry_data2dictionary(list_of_fdbs) +#spider.parse_entry_data2dictionary(list_of_fdbs) diff --git a/spiders/__pycache__/fdb_spider.cpython-311.pyc b/spiders/__pycache__/fdb_spider.cpython-311.pyc index 2d81f66759e264a86ea74205baadd24b3ba71db6..e84421e4082bee8d806366cfcff0fde2f8d6b752 100644 GIT binary patch delta 5426 zcmcJTdr(wYn!xYrcQ^Kf&@Y+>x`C#-O+y2k_yj~i#0bhmqaZ@VZ3X0^m#Y|ZJD752 zHpyTX{bn<6MmKE6m`rrmxV<~Io7t*~V=|kh*0EdLXgazzW0I_4!Ns%X{h?)h#D zq0ycFV`?w-ug~*4-#Pbu-}z2oxhgyMsm$<}Uaw>5^St_>L*)HShMkJiDLupdkP%58 zGp*|SMOY>>gV;iPq_Rj@F3Lr6nAY>>q4TgO^7o2_2J#K*a8~bAX&Etd#tAhA78}d? z&dC_&JpCk}utRjrWJAr?JQ-nNR*|O=m0U4gF0jB>jYTF@LR4R=ubywtFoP~I8&@e% zCcJO7TDj75gbJje(^f)`F7Gb*q#N>OlSVixopv*DeWjirPU zQU;tC&G{c0Dl4hGX=xX#qnA{G({!IKhiRn)%F=CVe><(3y38?c47(!Intn|&sicw< z$|DDYLjMCc5a>N52%{sNeWN2naA0JtNC+JfidetP8{9t;Iv@;$_|S-#?RsDx8FlrI zj_w}_jk$zTSFo>-)|Qt34%cA9BJj@+jO@dTeIdaW8VU`EMuZ23u|9ruXsD>n?|Q1N zjGaiM`N$sYy96ri7BJWgpuwI^oRN^-NHX7GRPaXTsE^Sy z%u!}m#h6{Q%!o|8Q8t(Sda?GlE$au*%`(F>Jr)i});ii$lLuq=^0>V`YA?TS&v{+< znr@~lX0M3bE28#_S%oswecNAg%U^xnU;Vy1x@j!t7vg^5RMTxId&?QP?hM47<#A_u zRQmtsZf?;mBg=Gui@Q$uj63S7j5({~&Z?-h>bBE$sxJBW4Sn>pO4^<#{!_!`dEuw7 zX5Ge^t0C@ch`JggO7|9>6@B*Lx)|JutX%>u0(bXRLeSVWn zF7NV1nhHA!{FQH+)*xzS{0&UMu(O|j6lYnaE)~s2A5Ge(j!%tue88zuMw`U zSqAr6gKS2R)&v)eOz`=!OgZ*XHmQQITaAB6!oi{}GT%8D-t(DBvg#b1;^s9`^BP)p#&f## zZSzIPJC5jz<`}y*&TfrvYl*Y1F-Ke6(RRzxaoy1obL@^gcE`+JadTJH-1Sv<-kZy3 zgtsQ%n21(viFupi-lkZ7b3DH}77158DYG`_Z3`%(0bPq(9bIi|F{)!mWs5oT(Tbz` zydyZRXAu*K=MW+yO!0tp^6Mjm8*+&3cQSan_E(cj7Y$pcBPJ0F#8Jcxh)sxvh2$$S zi#wTbMr@^EtxFb{KZaEtVhAxsk+5(BeYo5~9&@vzBV$B4r@N2_mTe_u8PslL;Z<+COF9Cz zUAbYcV540hKAb_+;&O3DIJ0d(Olpi^Yq3ejW*j%dmX-p#_YbzLhVxpZf!d6Yo|&4i z=flec8&vOc;Mw7xmqn$RVXnnXERfmy7W`~2HQ3fHD$@iHS{?9ys|gC)9I)j%BQ&*T zLsgpzCfe-jj<6*~qbxd}AFj1!!KdqtaKFt%vZa!z+cQZHgxXnfR2ofTyJ+QH`z1HU zv!Vv(wprmyd#&>O;Y_MqZkGc*+tV}LnG4#59h@VaMe8o@blSK)(Q%j=Q;QbPu^)XK zGfnspweT0)9hQ8qV97b}6lnx-)C#}&o&it#rWShAr*R%|w-?JbGZyr5xrIypLdvAP z@Y41`X1uB48qT5bHfM0*^>OUdF-01U zB=sz!FH19MpC5)}7cE3Cty#zddBG|ZbK%5}44B=i^SVTrXcRLS>KV?u4)xLuI_HIj zvuKoqFWL{?dpz0_)Hv)G)+O)$*tm6 zpO#%%v*e8yq_&>#3Be9)o>-8oTG*4K%v#X{|GH~ASqHft7E8*Yi^a%92b6c(ER|f< zh3eGNXNsg#xso1g*~V1i>mj&11MYR0(+d|5D$+>z4@+BKnX0r{c#fLYdC9a;EyC<# z=-F5JeBp>D`8`gct0xbGVGsOer-vk!s)4(mELl{NyCGFDn(+KQsmH0M3VX{_m3_n% zA3wBJ+7%4{6UhlAJ;{$ByR+M%q01xp{4qPcZhnU=M9-iP=2*K))DJG&>5B*6_0Xia z`mk5bq8=)#+`pyFd}Hdm5E{}_H_!+@w=p0?B^9zd=W<^$VlRw{SGKGDj zZ?I&(Zl)CDE)}&m-(UdNc9)P%xVy;zWzwF||1^JB7Gp_+U~h*;5=#_M+vc*O}vw9$jk5`z~@v zmx6)J#l19BN2j$@ZYg5h0{4T3!1h*?AHYCwxdm67e+#iF6oHGqzutgnbwW27JRGDq zmw^L9GJvSUrg&N7Yp`B5I)H!tt5~ug@k2x*;*H3=eR0|3a()};e25l=AJK{^MYJK> z5!(?v5YHf_z~*rgj}*j6(M=<^+lknP=t6WNb|dysB;ddHx=1(V?sJzd#65gHR-+e6 zs9Qq^4~1w5*?>7!XrynH3-MA6x&;pGb7uSlHpR#)p`wv3t>c^E^?hu!e^KZp%}tt? zG$S6n;EBSoMT{XZbX^)<9mKLG#M>}em!V3AQV{6>k!b`=1;imlC*16>3k+j&lp>)U zr&rDHaekDB!l+oGO>($4_$y7VTu(E#^2nL}w}^U=EZsH7?1ATo8z*N~|L=Fj`K#irhRMvm zWiPpIFNv1Y+iG>(Uag4QtG|K|hc6IQgdaJsnru*^uCo?Yf0nARwU4c&Q@i4*(bpBp zJ}A=FFH?R{VX9XvKU6De`Xkn|sao+-wXV^l{#g4&qf7m6#7Nt1C>+TJ5j3N}7)Q)@?6R-JC4a?a*s)R+x5Zw4Z8} zH2rCMjpZ4y>Xuj6u|j*R+|&`!-j>zab~!YkWfGeC%%SV>rhQhhx+5DvE?|V2u7YBm_tLLOh(G&QtYZRS<4W7Zx1D zB!?J6EHY@!eh2Yg$UfmOied65Vu~VR-OKfM)1RI8c1suMZn};Fe<;NF!od@)>pHfR zj6II2FR+KU?~p+6yF;|mad_`U4mkv$o%ou382R8Nn{GLb?M@cQv zH_B$2l&^khKkLotl`EL zz`nV$T=KD>&^Gh{=p7982~r21U{|TXmDWAbtQ#KX4h@B>cs|<-U zV_WR6YuDYjblb7Mx*a#O+p*j3_GPEKt<}m1FuxpL726-}CtYbN_SisO0r`B+1WewHk&#!hhfL{qm*L$?Ie#f0@NFhZ%t^W`-1< zfv`kidQd}r#JWvksUQ`|4%*JGgP-koM$X8hDtNIj1)kBmC44pPNUqh^#QHEyj}0Ew zS|FONg*Vz$6lz8=4`)GHjWN}IlrYRm`ak&)+5#TdBw-7oPS+r{@Jh%?$-{Rx#zJRaK7vy*ALSC*v5Ga^aWG}mI&Ky@% ze%Ce+zxA&;tw?>^uL&fX9fn!m{vg{m)h86WOMNG2wK9%vX#vmX56IOCIOvCNY^yEBD(wgL=jd7`Hf9f@%qKM96 zBDI$*8tK_+6uwaE;0d>mSYcw5F|O4Oo)jZwtxX2KTMt!* zI`F%63hE^>1KSH1!E^2;sJTxMgH_3}w@7Ey-{eyx-XVUdD1Th>F0BqNj*ONUN#tMX8C!0|U%66ZTIyI;Cc99kS#45YsL`)Z zQNEiZUu}xKJ$J7*het~Th#iQXh`SKGD83>cTwNsCXd@DCK0LhS&B2+2N6i$(ZiEc+ zLxh01)uiZ&!f>}ER#3Q1@$(E`l|j_l zi?|zc58__LeTW|+?ngX;0K`GWA;e>dAVtVTgUn2Wtc4Ur4lP@#COMJOJ5va}yv7U} zH98GpB#aU<%r=F_ODsfX1GYFrLh53jZv;8RU9 z{It0V-3$KEoJM^~M;%lLc`XiUs*nVi*16$?W)}Q*kDk6u(1<2Y@OVoxF~T)ZJ~6@P zEziQIYxQtqtwl6*d94-p59;6>`n$MQ4+X6m!~%`2`Ki{hF<~efQ!hB)Y9*O)wzWcP z6Ke_~uPq(6w&`J2o5g5PjFWJ>U}Cc&+~y^@aILLWmt&62jR{*=D`c(9&dn7pJD8AC zFtXNO^i*buaGeS7c0fgYso5!{vU#{1eD1UndbYFq@KkNu(gJJ|jL4((5IH%0kc2Z> z!ZHCvIFqh^W{)w}m(5QcD_yaSVG|zdZYrvJruTu(j~ld&wxq{Q44d)9X|&yBNY!JB z>4dX{EI}ujZ`fed=F>)M$2zTJWnJP1xk0_zSy_ma2;2GUSUdgNWyWN_DkkAKFecMa zA00{RjOAyGP8CC+C(~$WOJ*#JR?yOqVj7IFhKlT%l^OFUH^v;!zG>c$Ji@TJcVtMy ztzt{3q$Y76#nq?vi}*D7&pwNdS~js|y<#=DMa?8^Z#mR#SfGvHnhK~{p9#-w$dmp+ zbh32#c!SxT!!M4Fl+JzH35FdwcUKNu$==4!{kh~+)r`63CiVkuryb7xIC*y1kuZ#E z!2yRlD{|)vNr?ssLkFDjYHA`&Y8lQG^NkvI{;Ar;9&mbN8mWQpRhhcPchteIjrpn0 z`1cAHno4}9rLvkt2NRq}X`GxCV+9w4FzpQeU^w?UgR042@N9C(XiDUO(_K#Z)kYIp z0QLF=UzAkY%PN?5xOiDE7G~)I$-0>Tm%RL?Mv+9`7-buf<&Qizs z>br{)7dJfRyHZDw3tnw7>$E*_9vnZa_Nh^^4dFb&N{^RG|NCeB^0SK*7Z*319vF=Q zcVA&RUp%t&kJ3dxIqiDk0-*pVn(4v1IYs9ZTs=jxrr}~dJ`~j1T|Fi6@#f-&CAajY z|2wb;ZxPsui$b&6uGlbWaLkFx>{7wPE)(o{q->EUPBrXii^6$9M=rx}jOo+9Wii>o zaI>jhdJu}~JXW(!@VR$xbspO=wU%Aq>rQ{iDU}(w$25_4^3!EHs@Ugku)!#u^bQkr z_)5*wI)r{*_ovhUGh!CC9G>>&X`-sFJ`Wr4g+g%7w}3nX>h3b*Fr6j$6e2F6fz{m? zD)2)e)%19FcxZ0!-^NE3{eJwL%&`t&mpa6wh&;sOk*+NxlEHFrC6?TXRS0w#ZZ)C= zaR=f~#2Q31q6H!5L-i=dWqDW>b00A&T7t!PL@S~Vu^!Pu5mkqF`gwSv$4WN9<(}-W z>2!%(ggOnVqg>@p95H!p4*0zGeLu7=qZw;b^c#J6Q<(OfuP zW`M7I?@8)Kz3qrrcsQ`QvakWcw+ii5%e`QVjOWQE}XeeaC5Eb$Td^zU%eU zdhOy`$s4tr#%%dnrM@vkel|l+<+tjLO$OOHgQjVo@|F4;r5xot<8p)Syg{?P zN_pOUR<+S-NM#YFx(=tyzlBI8{RF723QLd{uwi*w-kgcin>ns!6dSW0!fF6oL!PHV2{Z^K9M@7{+2>8eYPH6?PasgR;vBd7X*uBex9s-CO(uv+7tt-dDJ zd$sCohI-v*hw6GRp@r)Xn^&Inkwik}k7NX8xfErs+&eqM@4jxNk=Zs>pvO~~8%9iq zCp1H#{RG-mZDYIXWg*zx!r`ZY`%Pr&qYn}? zAKrtz$a1*-VD{hwtiKVn3$ZvOQt=p`dASj