From 953f85ee5bffa8ce14c1c013e8f1a0eac83facfb Mon Sep 17 00:00:00 2001 From: alpcentaur Date: Wed, 13 Dec 2023 16:05:26 +0100 Subject: [PATCH 1/2] added new lines to chromedriver, to make it work on other systems --- .../__pycache__/fdb_spider.cpython-311.pyc | Bin 28138 -> 34536 bytes spiders/config.yaml | 4 +-- spiders/fdb_spider.py | 29 +++++++++++++++--- 3 files changed, 26 insertions(+), 7 deletions(-) diff --git a/spiders/__pycache__/fdb_spider.cpython-311.pyc b/spiders/__pycache__/fdb_spider.cpython-311.pyc index f3f6ff0dbf01bbd5a04eb68e5c9d483e2d81e48d..cf9c6504ae1c7eae5fd13a2cfe4c79b1d0d524aa 100644 GIT binary patch delta 10078 zcmcgR3s7U%b?@o>BmoivAtZheiI4b*@3P{%e7(yqy9+EVyX=O=z|(?QkjRsOHGNt& zok{9+whivf#MpHjoN;VyZ)#G~8ImR~oo+k$oA=lkqsqY{vE3$VXR1x?)^$6bp8JGA zg55ZsNvnA}I``aj&$;)UbML$7=ojzcKl?*T(hG@+Y79R4=e*93pZHdiO;+(0Ifgxn zF?a{IC>grzmoV5Q;t)R~ZQU{q(8|W8Xb%UHhUeQzHG6U&SWnr>*^!A|PR} zs=lClVo`GYyYT({wO^>(Wcbh4>`7mi$`Y`w+_ihj27S_7s(n?mp6Bl`{+X?NtUYk&4K=N_HxN*rCKVJ{ZUD#no%*Sb2zP~vQU$V zHrY@~KKqjrBe_SYq;f<)JCjP~nj(t% z#PitI{6b~^0^%)TWp<~=xhT8+QwrQ#yUv`O`9svIbuDao|dFBNgwlx$k= zm*H5p1Y=~-DZit59z?yoFU~J#SRO{3{Tg4^0fo{r z8aCWzFlugT+ysUB;ddUN$8Jgov3cB=#p+C18Z~Tn#SrCBfc7g5=5!qM<-~~4Gb*$v ze7UGITihNg2p(E&B9pkByk}UQG|6XR8%~&${MrXCN9NZtku~La>!ithIIK_R*E4#w z$P~jN@hK6yA?*-7RfR=*SbLAI(C{0WB>mT!WKX;4ke0MPy7>^@$PmayGA1oz9wjmi zV_aarWlX7Mv|w1-(0BaljFw4{If*h3aKDMl^qZ+H#sp_9PCFQ4S&xjEfl=8^3Y8PF z{DC)^bc$qmZN{8jQIkP*M0|v2Fd54!(FiYwNG2vzqXH;0Ms+6i?8n`5nmA(oZHHexb8Dh%driH(f}G#SbonKD~;*<3FY@(}cA`>LbkFl^Es#eG## z45%4Vd^uB`%=ksUBmYi8{KkooP9~cwjPLiOK+HnpPh-p>e}OR)C$pnk4w(jzF-^`< z({y7@PQ3Tw$g%DDZB_Ij4jz1&Dh3+Do`=XZ`AOf2s6JRB8O4`#(F|)ahNMcUQp)~} zu!|qy>#^$C%vs+4wE)jU&gwnqLEl%Y$bi%8|nym#s*m5u?hfiaV9@vslhU zeNyOSlk2mREpdlS73`$XH5#n@-V~3ZcE2^2i!uEs8iqha{CXJc^Kj_DEM5eV)o^SO zg9FTZ%PJmZ@OyBOi9HWI;=l)%{bNZhi7claqpIVT`OWURBgS(f<2_(_0Hcl_9v-`5 zs)o_TTM@_H1xLi24%x{XFZ1E+Et|(5zF;x@F!~#xvNcEJwq{o}EFnbE+gAKAhYvx^ z|JfEk9&fILEezplnJi#i?52#kjk_ql{moBV;g#4GGG;3ChIk2}nK)ND4jWyC?i96C zh0`{Fp7>(VdmgOu@}5oLAj@a+(UDIcF;4+gFj*AkTP{ZX2Hi0NaLg1;mP9)s(jF8F z@o(+A!?Sw0zdzL{iNE!8wc%6GRk%0*<-_j~r05y(4)O5ATZ%3m8+~TD=_DZ&#=Bo} z`}(5d_AkY|dHs1BdA0}-spg%d6z!UG($G-$Iq5mqm=mELF7M3L$YVv4pObK&(6fkL z5>n2&ysptHXGrFCEjUBOgmZ+Na(cZBnRYwvoc8#fcFH+AJ3bz8xyS7@9@^JbS5;ND zkYKmFJ$CPin;P}pTrk@0l*>B;a)5iz?wy#n&w5A3oePHYSub5a>T;KlP0*fc=l;yJ z&YRA$SsxlQJTu~(KvVI|_*@>hH>C7NW=*4)5uJ&^Fc!eDg%YoCg!aKiNS}wKJoD}; z&j>~OCY+?x?V}$fCw$XW&3PP)7VIqmX#K{va5HFKOs$32bS)${}c6o8QWKn8slQAZFM zMPLkpTm+^7d?GJHQ|*5d4-w9rF0XIcbAz^`UKfCnbbfyR6Is~|b)C&_`&YKIy@j1? zcLa(`Lxk5k<#fAdr)e57=0=-k`%W*5*tslfLNDUe%{(^eqJ6U?Q{W{O1hfy4<)S%5 ziS43my7#P~FhWtoBlP&}G#E9ch;-5g7|T`|*|*x0E`$=UyWG?;%tYjnk4?G8ZiJHN zf#{hZo`8^m01BzR^Df`m#IVm3Qbf8!nz31$2BE{V^i)WL{50(K%+g~{R?%UR(3R{z zw3oS*U~wPPoJc!AJL~lNyu-*=^y4sNT85^r02!2%rpuvGR7mSl2f9ItHlfjiBg<1p z7mKzU{W}0cib(v?=o-fU^^jTSK}cq$DG;(=r$&d7w}-`88lFOKIdCYn-8Vv}2Qi_Z z%7=GVfsavi4!fw3Ix5?JgGM*WkaEw8$3qEw_8YnGbJC%N2qWd09&x!ts_QP;Chn1G zXDD%xZPa<)Lp#}LI}Ao-8MJ30Iz4V$11)bMS{6Q>!|H|!mK;{F|K8CO7>1bpb%7ykr_CNBO75~W=H6V#g)3zu_OiaC7^udfmGHIFNUg!W5`e~|b^?QQMj+HeAv zlDVxf+R_*C`Vv83BKB_+nOj8WQs4?t*aX7H5jGeUA#zy_Jkcl+jU3UqPh7gj6QzQv zdY{H+9p?#$KsY$UAx?uUZsqiCyuMA)w}F;q^LBFXR&wr2;ktuM&gGNah2-|f6}zY; z(I=K9`lRp!HHoj-KT>0fdQRWWC$tC&EnGrN&`=o0PK%Wa1Q+ogondk03F6DdBEdfT z=t!WT?1ibXPpvlc1ua5B3#YRN?Je8(jxBq~#%11qUa+6%bj88Sj_u0Mt;){LOulkJ zs2t#QWnqal*@Xw|&u!QDZq@g47smK{N~ou}R0ljC?-&v@?3 zC0S)*1(svEuast4V0=y+eqSZa0w`Bn%^zX_mXu*LW+K0q_;lh^+Gn&&+F+qwC~REs z6$(!@FC_%ivxRiqc6#|%diiPspWY;-H*x7rVKrte6AJ5=5_UU!xODrl=4TIb@ikq6_uFdh+=EAZ>Qvlz!lrFXJSZvRgZCh%$EVb(?+{ICD`WAOG zfF4WjeGId zm3K~Tm$Yt`v~Cn{xAkqc^>O`IIclD3>*L#Q3T-#H+X7o{fiQ-(N;)N=upK{*G@>79 zm7YNjfXl=VnID!(u>4W>`#MViDZ@cQ1VZ2PhKH1o9SlUFCs38QO_q1KPWh9qhdeXWHM&TN67{BzIHP z9S-^3EL}&H{BD&T@Xb_1XR&OvSbaK4xp_=?TBUqdB?p|7waRQC8{mEy02cd<*4hc(cG($1LaP!;#Uw>|t{nJ7DXsFWCdE4@_0>kzJ67#g zhw2@N{Awfn<1#ihj1(`!rTXk4)Q-uHJzSnf&I54ALCc>rcOJg8MyM z8Pb1Lef)}}D=R*tA(Gw&5|65gc*p;+mB^9u$Xa84L*+t>2SN{u$N0k|U_V#ht7n1p zi;p-z>yYzJ`);?}(FJ+$GnpL4d}I`Za6^|p@v~Rmn`zZcEqAQ!cyo>+FMgta0_upV zC# z0<&VPckwfzjZ6lUwQP=N!Fdabu&FX`m)}0-E;?eEOuvFE^k=aD)?Y7SGFYxVRRR(9 zr`=YiLENf1yV0GehNM3QzCVwg2Ps8|wp%unjn0Xb;zRTt$_RV81kXo1n*BnLLH=7@ zmm;z*mh8=vMpj^BCwlEEC5I+KGRdf{Z7n@S&lQavQ?ZA2Cq;Hr8d$u@Z;s8Q8Im!x zGLhKoLt=A9vAK?P2CxIM3<-a?C4nR3sBy}U zL9salIr%C^*$=JF5;t{Ad@ogY17$R*=-m_-jOC1$s)!%So*($UsgkO?Qym|qHBb&@ ztmyw6i9e6s>@QFwGY5J3^o{=;}5WdBa~emtT4qKor|j5D~syebz!Zk&qvWl46lGKQA1j;HTR5?obYdNU2d)W-5wt~Le7m$xhOO^Qmu)p2bGc@dfeq6nIfS^z`mqT%J~E@i&SAyh=m9k zBxdvd^ev>|_u1_$U8x~r*6q4JGUbID*u00PQ2M`Sxq27Jv)F(7Olvt^hIqRWup{s! z0!s)yg}~DYR3PwG1S;9{L)IiHu*^)kd^FlaMfvO>4RzrjHZ*ix@eRcD5}S7IjEP2- zRk{Iz7ZLal0+#`>e{#(%r(ZzWqt_af6*I#==S^RTpj>kz#{TeHfh-?&NAi3HH7vurmff5osAcq;G#na>sU^W;T=yvUIk*=MPuz$9NfxvUD> z+cs(i``OI_!QQ{D4qA&=4V?8jZ#^EC$ntw6kUbamNOl~}-)1(>aJ>V3#{loRBseau zbgn?wR9*|o3oyjOozlwH;`Ld+ssl2jPW&v&fvV2}3wB=8jT-hINQeUH2WX!PQ4#~0 zHYh=e!km9n5_Fv1yev2dxJ$!=dLUTd zu-+z=H*ZvLm7nCwPX-%KZAv%wn^K`6aCUQ+yEwpIyadl1kMRwcSBb;yJ9Q_%@7-wR zn|p-j9=@(usOt@5vdR-8SZxhD+Bez+$EnTgEysDzaXxqo5H)n?8eUeuS{-vpAMsrI*A3*qu{WM>CQYbyi^+Nm&@Z^9HAO`^3 z$+NAPRy+8-Iw7xa84r@VFDSpRTp=# zJQJ2_a!2vqs$QT26;!f;OGux()f?%FV)HzMSXa7P}|G;%Z+#c%@bioQaeBWqSC1hRQU%99-*hw;&q+6t6uZEIjyTk^LmXO@E>HPc2^KTs8IJ9H9x3N>CtOAy&Ui@U8}q= zjkuSl?rYTCBlUd_%{_-4@VAp%<(CYKZG-w!qiQ=xf61ZRcE|z$QH_4+m|{n17_zB$ zG7}JPlMR(7?vzT0js+Au4{Kv_1;Qy&rKW0(@`m=g26l_I;}q;RQIt3sVr@EQbLf7+dw%ixvM^Y#7H`;zq4z z1|9gXAaIZEx-r&sRU!wg#=r&HxJZhD{qk8U;(9QogGibRK#_rdlKt-+cg@>~B9hTN n{1c*a+C$AwIZx2JMxsoLV_0~r?#hzZpTWz{+n_RX z-t{>t*&0e^;o%%Jd@;8qqRiD&%4Y=)r|qV&Rs7Siu;8#RgR=-}14KZ^S;b!oXMzoXzgewNP@O_saz=a(@q&P8AvhwWI;Qf==6{cZqr{_B z3vm>{TiF&;P@;`+kqZX@bG^nAQu_z)v!)YABU!fApdmwQxGC2GZ*8J$lcQ(6$9mN% zfXqVQ`SuNFnY5f%kfTH56;8J-ZS2ytg%o;SX4}W;8)hyyMt+;>x-m8&t>1Q*b>7ZR_NZzy%gK{`=AcInNrNRC!Ml;i;Uz`rm zQ5@d?)wVY-+AgRA!t!0_jLB95f+aW;XYNVw%?N1#K@x)I6k6rX;z~qFD`;UobX`pF6~7VkW*NKchVz z!QP&oc_1$HJ=z%vw)J(x^$k@tj}rwBHF%}G6QMja^lZz8?`sa!09~HrEONjIDIaR{G|S==Ocl&cW!0A^fqG#%#j0O+Yfvypt6(&b3p{ zwdb5MXT`L$VpKLNyK~d*N)R%nDT$I&;}E!+Q*`p!OiuY!PWid6nTn>Vil%7u!_obL zXhlBJX%9C%0Wp0puS3i+Qew z6mvv_lP96aiIou3D{uzxahivJWv!++IhhMid5wd;;Hl2o!pJVtpvKHzWaKqE#l_UM zH7dm=mPVXQDjIPv=_riNG@5Y9m9sU48BdYdRxYqZdjf!>TYnSr?B#)k??czw2%Q0c-l}I>cZGE596kO>22_~T&+Icv zg6Ncl&8`eeAGE?#?M@hLcgn&Kjeu~$!aPMqDwQszi#ogrXK)61j5EO=P6JE`t)R^z zL(mStIADU&gS62o7&j5Eo{%YskEewun1WgGZwJk^K`<>`I#ypAYY49hmyS^Wk756T zG%aUAiqKoYV&klEzpqATOTH^YWYs}Uh}2+APRBF5YE3xm#4}Eucd2rC81<(7EznC1pO{)63hdD$NH1F zN%~6#1AMbLJ&kkR!{JiWuJ8?*Fz5w zA-iD5wI^`Ec!viLbYvsBoNtg@4-d2#d4dH?F%<4KE_sLY=1o=SC&gFvi$rfYD;W#C z)~NxaKjs2_lZpO3mE=$1f-iRts$fNzT3Yn~^)Kt|{X^BT6&~MgGI0g-@#ia(to^TC zPKCek(XICg25v?2#sC6mc?;pE=8IF5A#y>zlsDW;d|@sGq^?)ZuEPBhI{aJshdt}k zx#D|j3-m%UmM#`OzwTWLw^gyJs3>E?AB{-ELt;Hq0#*KMX1<1iyZns|t{3JCr=hy?!51tr&t@?#yqsQRC292^ib7nlHFp@dcFHNhV@d61mUNb>p8c2t) zJYF(!1#BrRyVQ$t@-72Bz;aC`SdHsMYaC{%qFBN diff --git a/spiders/config.yaml b/spiders/config.yaml index 2d1a81a..8fcc262 100644 --- a/spiders/config.yaml +++ b/spiders/config.yaml @@ -9,7 +9,7 @@ foerderinfo.bund.de: entry-list: link1: 'https://www.foerderinfo.bund.de/SiteGlobals/Forms/foerderinfo/expertensuche/Servicesuche_Formular.html?gtp=33498_list%253D' link2: '#searchResults' - iteration-var-list: '[1,2,3,4,5,6,7,8]' + iteration-var-list: '[1,2,3,4,5]' parent: "//html//body//div//main//div//div[@class='row']//section[@class='l-search-result-list']//div[@class='l-search-result-list__item']" child-name: "//div[@class='c-search-result__text-wrapper']//span[@class='c-search-result__title']/text()" child-link: "/a[@class='c-search-result']/@href" @@ -28,7 +28,7 @@ foerderinfo.bund.de-bekanntmachungen: entry-list: link1: 'https://www.foerderinfo.bund.de/SiteGlobals/Forms/foerderinfo/bekanntmachungen/Bekanntmachungen_Formular.html?gtp=407348_list%253D' link2: '#searchResults' - iteration-var-list: '[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31]' + iteration-var-list: '[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]' #parent: "//html//body//div//main//div//div[@class='row']//div[@class='large-12']//a[@class='c-teaser']" parent: "//html//body//div//main//div[@class='row']//a[contains(@class, 'c-teaser--announcement')]" child-name: "//div[@class='c-teaser__text-wrapper']//span[@class='c-teaser__title']/text()" diff --git a/spiders/fdb_spider.py b/spiders/fdb_spider.py index e27cecc..e64737e 100644 --- a/spiders/fdb_spider.py +++ b/spiders/fdb_spider.py @@ -320,15 +320,21 @@ class fdb_spider(object): if fdb_domain in link: dictionary_entry_list[n]["link"] = link - if fdb_domain not in link and ('http:' in link or 'www.' in link or 'https:' in link): + if fdb_domain not in link and 'http:' in link: + dictionary_entry_list[n]["link"] = link + if fdb_domain not in link and 'www.' in link: + dictionary_entry_list[n]["link"] = link + if fdb_domain not in link and 'https:' in link: dictionary_entry_list[n]["link"] = link if 'javascript:' in link: dictionary_entry_list[n]["link"] = link - else: + if fdb_domain not in link and ('http' or 'https' or 'www.') not in link: if link[-1] == '/': dictionary_entry_list[n]["link"] = fdb_domain + link else: dictionary_entry_list[n]["link"] = fdb_domain + '/' + link + + except Exception as e: print( @@ -345,10 +351,23 @@ class fdb_spider(object): def download_entry_data_htmls(self, list_of_fdbs): from selenium import webdriver - + from selenium.webdriver.chrome.service import Service + from pyvirtualdisplay import Display + display = Display(visible=0, size=(800, 800)) + display.start() + + #outputdir = '.' + #service_log_path = "{}/chromedriver.log".format(outputdir) + #service_args = ['--verbose'] + #driver = webdriver.Chrome('/usr/bin/chromium') options = webdriver.ChromeOptions() options.add_argument('headless') - driver = webdriver.Chrome(options=options) + options.add_argument("--remote-debugging-port=9222") + options.add_argument('--no-sandbox') + options.add_argument('--disable-dev-shm-usage') + service = Service(executable_path='/usr/bin/chromedriver') + driver = webdriver.Chrome(options=options, service=service) + #driver = webdriver.Chrome() for fdb in list_of_fdbs: try: @@ -427,7 +446,7 @@ class fdb_spider(object): driver.switch_to.window(window_before) - if ('http' or 'www') in entry_link and ('javascript' or 'js' or '.pdf') not in enry_link: + if ('http' or 'www') in entry_link and 'javascript' not in entry_link and '.pdf' not in entry_link: try: # defining cookie to not end up in endless loop because of cookie banners pointing to redirects From fbee5d6229c64f6b6ffefaa4705e78b1c5fc679a Mon Sep 17 00:00:00 2001 From: alpcentaur Date: Wed, 13 Dec 2023 16:20:27 +0100 Subject: [PATCH 2/2] last commit in detached head --- main.py | 4 ++-- requirements.txt | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/main.py b/main.py index 3cc9312..4a2d91f 100644 --- a/main.py +++ b/main.py @@ -16,9 +16,9 @@ spider = fdb_spider(config) #spider.find_config_parameter(list_of_fdbs) -#spider.parse_entry_list_data2dictionary(list_of_fdbs) +spider.parse_entry_list_data2dictionary(list_of_fdbs) -#spider.download_entry_data_htmls(list_of_fdbs) +spider.download_entry_data_htmls(list_of_fdbs) spider.parse_entry_data2dictionary(list_of_fdbs) diff --git a/requirements.txt b/requirements.txt index fe32fe3..434498f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -20,6 +20,7 @@ pycryptodome==3.19.0 PySocks==1.7.1 python-dateutil==2.8.2 pytz==2023.3.post1 +PyVirtualDisplay==3.0 PyYAML==6.0.1 regex==2023.10.3 requests==2.31.0