From 16199256e3bb21e25d50a954487119e649a53d96 Mon Sep 17 00:00:00 2001 From: alpcentaur Date: Thu, 14 Dec 2023 23:37:10 +0000 Subject: [PATCH] javascript on highest level done better --- spiders/__pycache__/fdb_spider.cpython-39.pyc | Bin 14314 -> 16356 bytes spiders/config.yaml | 18 +++--- spiders/fdb_spider.py | 53 +++++++++++++++--- 3 files changed, 55 insertions(+), 16 deletions(-) diff --git a/spiders/__pycache__/fdb_spider.cpython-39.pyc b/spiders/__pycache__/fdb_spider.cpython-39.pyc index a9293e6b987678f62423bccb570866465a3ec4f6..f60df44bcef7d627811872ce7b3b969b0c339ed3 100644 GIT binary patch delta 5365 zcma)9TX0)P89wLec67CDOO_?e_OX16lgM^#CvJKrm(r$FoTdqBqSm;I_r#X0i@oRA zj?`7u;tmY7DQ>sGOzD7J2AJl7f##tPg-+=VJTTJ&4`mnu9fs)uZD*Jc1AR!L<^NZ* zqNFLsp7ZHHyZ`R~x3l~0JNdWDzNFXdlHl)y*hBgo-+j^-l`Bu~`f?|ch@5FpwBI2S z2T>oDh^hxREQ#QcU;U>nck&mNslhH7<%j1GJjTDRlEmc0(x!Fxy^_MDv?Sy|bHC*^ z{vC^b*s&#-)cd5|lvOg7O}P-0rIJmSiV@R7RA$>yWQjOSD!FxDw$s2OX1={2FnJ?D0J78unf5kG`>k~TeCY`@G>9l}OH`-YC!@8cvx=LZ9 zY2DVwx}b4dB4GV_Th3Mc9hb%><{{1{Wz#~>GZ*+S&+aREOGUe75QVvwX)W}ARl=lC%!*Pu))4y} z(twZxWRMJ9kV~DcJsn{o)=7q8!Y~Vir%)=gWOkWu=H7s!IV{b-2`P$Jy2%dFV<96V zE>DAY58267aL>)WVM(OU38_;`P_E2Xt~J7B&4YL0U>2$LqUBfh?1m-7X$=;IDhH_# zl6fV9uI(j0If{+SwHB~)o%rEkdB-suzD?^&V8y>;CDL(eWc1RatyK;t9BCty$}R0F z9-PT*b(tTq-_TLD&K#Y&eY=06-p_BgSGtRVw2?A(nlw}STzoA_q{UamUV^)rN~)^rLlj^jFAOU>&j9!?p{ znPlO4`W>* zHor*oS$*5GM2mh-pIB6cAgT2;Swb5km&;q2AXJD#UE3T#o8div@nM?I9IRW#MN#5w`%Y1M5LzIRJ(H!RWUB%+wQ9u>c7-^^u||AHvdhk7x&N z#jn_v9x=v$>2>EziX7r!^;~pE(W7pF;u1gBv2^L8cR}{=!$vp46vAHK*D}2KMih@A z+=4KPfOVfx&Aee!sC;@eyp9B@7rX93IL04o>6DN2XIfs9Pw+*5M*bOp!~Y|Xd4&rs z@+VvGvsh<6{N2`di={Zh^|mGX)l0u=`;T0HfJ9kf4bP+pSb#a0KMaS%%@T7$s|>A- zbl3aYb^D(sJ@x))oBK(y-ugmwYkgLk9wxn;mgQd(Eh>I+33(HmS%CC`0Woog|AY4i zgYg)g*a1AXw@urWR}RO>Z~ zowx8iC@!}sE+6p&V*$hK+|GU8Ew~;0BJl(&(JH71(oX9vD|yVak9n}kY$0t7h>PjR zcc{kp5RjRYmw79#;FS*y@xgn`6>~~{<_BW}KsZ~41jY$g=OICm+JWDBNC!xPqEE;n zkc0Ja!VA3q2{gN0x4ZLy*)4{LVR(m-A|QpB3Q_6UHkXBvE<;dTz%$FN39PP#b>c1& zmP%m~Ek$652*VDd&~L-YHrNl^NiVot_5<#i>nfUQVPcfumEt76zM;@K(Y9tcUL9y6Sdz|&QUvYss&326BO zWXo(}J!BY2F^0RFfq&qMH{An>%geBQZ+-b*>?1oimF4e~5#f=Qs=_yU6Q!NdCZr-r zCC%(jYw13wkzHgri9f55QK+y!Y^}r?(Bx7%1{rx8%V^V5c(`6j#p2GoN;ut5#>hC? zb3v&Lkg+WphQy$ZCbrX+J6Io^hOzyS%pInW0g1kL9P8hJZ>V0D{pb5dS;B7_>+<^5 z3Rj;P79T7;3Cl*nP`LVlq1J-EZYbV-wsodZ2Ta@BFim_N;MRL}(*j^xKNfqF8D+ee zfgPW0h6pKqE_C@*vpWbSiDer5c%B<+-q%QTEIqU(L;cq>oqP4ure_fng(JsC74UKz z{V0yZHgrQ~IJ=`X0P_rNNTorxZNv~8N{>`_iXt9L?-KE=MZ{KsYlqk|0`UEl{Y@dc zcEk8sy*B)~Zn?s#_wx02k}JgxA1qujuN3&HmA|Y!Dy}0p&WIZU?Zr(T85HeD7(~EL znvNjsMA(I}8zGJ`iZF&yJ2_FJJfrOio=35a&~$q87qrw#aa7=uS#_n8Ye{3Crk2eb znK4m+6=4lQ9Kj`!F55;I&1~ib+)Y&d98%5v62&8wJ_L}ktgo;0wZ6Z)JFzK@5aEll zhswL@L445=@RX?$%EKrjYo@m&d=}vjggX&F@kka&tvG{kLcO~YK8G*^kZ>C7DRX{t z&dew5HP)PP^J2gII6Vs8RV8m!)ogM}Cn;)Ft#}`)0;i?9WLCd&e$#23?*^RDaj(9z zsspRrkHhVHZaze#u1P| z(*(jXglkB`DQv^Z=)(xtrv-d3 z7XPaKYTC><2XpekrH=;VPIm}LQh;KNm&bYoa5F1fw9!%8R7yNOc3V(Tk4A0K4IF3i zm&XR>FY&j=CfY=SVPViO0wk2Q!AHjL+#~9WK}%I#CTHR1ky|=JA4Rpt0E*t_BsK8z znAD)VD$x%9-1txAFrVA=l0qkVbmCkYgHmiU^>b!?Jx@t|9xi*PD8!NOCbsUuNktv) zMd_Ntna4IvTobjM#MfI9jv(BEZ~@^7gs&lNN|Luu^`64_16-S&w%>!|F+MvvH1Pjq zk^f+Fx0~abZy@CPo0C!bUjD)4NbqTF5GnjRO3xrH^Zipj^8NhO)RvM|E`2cdj2wIb zJ5~@LLHH)ZbNu4IC>*np?=}EcR-I~D>Qhex_Q11?I-nWY=>`6`eZBHDSEn_3o{vnA a$+z;;)3+=1F8;0QNjbuQHQlACmj42RK{uZO delta 3637 zcmb7GTWnlM89sB)Ue4Zk?Y-IFn{Tn#_Bu{5aatR@X_JyPiQ^`PHcMQy@f^pyws-B> zV>@2vWThGjc!=a`P*tHw-F<=wgbEG<0TNN3P(=j1P_#&WKuFyB!bN>S0{;K(CN@dR z#nyi3pP7GV{`v2}t^Tp(&3Qad3I6_6S=TtU_ca?1L z+wc92kdXMXJl^jEZa@59fM2;FOA;rir9FB6bwlP-K{Dw%5mg?DHH(_G*U2t1)K{hA zm_^d%J+jk7B*RLiN?2E@DlRCoDpz3>nLlATbZfy@vvZrSa>p)-NZcNg>cntX+PJgk zs7o~`b->sK$FbIN@3oG5trC00<3t`f-o>rEGLe9Lw8ecFxZQAhP*=Enm*Cn}pG0Bu zmb>U|e#OtN@pG z*F`{`7HaT@tqX=Mw!ayCpZHTqyAbdkmK4{*1DE@NY*<;Yctc|YCVg`~r!OWB?j%d0k^UJzpv=Y~|R(Y96?cieH2ukTLO2WR>KFJNmAB z$kOs2%X~tt1g_W>a0F%kzUb-LR^-a>#hs27>rKO784_b%UHb!_ACTlajq%_Xu$3u| zKcW&%imi*eSSbEw9^`iJ?~rKMlvEt&L7D_QwIc5+>>HvM3uRK^TS4@%hg_id(=8`e z_LQB!fC(+8lUiuEs0U(U-9vl03?_7dQ-N#JUikHiV7%MWKg-(j+5Y3DCQ$g4BID;|H zPK$pIMU&@{Jdbb%;T43>Abb{KTx7b(ZTEpJet zR(pZtQSyjdC5aL$uS#pgfH>uHO|DCZ!j+mr6*DH}f`u#8a+??yGtF4$e`8o75Ur4E zFjH9Tq-Lj0w3(^K){1s@=MAu--L#>D+PNJwK^4LsquqYXZbmzN$D}c^DOEATxeff! zGJnfZ!OG^04he@_!Rjs?gTRcuWd}GsE0Y z-SsLld=T+=>VZjr1*6|iy+8+AzNbQhJb6fNUsLXf z%%AWuml;@FRU^n6?|qjiUC3iY87p~Fdp1eLsQ|5yd zc9#ZWyhOWA?9n^7NEH&m-O3OT88N-95G%xajP}w#+J8r;M{dhT9A|3@9tTw>z$9OV zvY|pc7-^ZLQWfmfjYq*;73Cb4OV=o?~ILNA~?V^LB$%hCqyJ;y+k=^pT|#g`NXj7w`YK<9V32 zLvFXQZ^0{%nx%RS^2*q1?^>Uc0&hv}N=7#aA*!YnJ-nyTUpr#@aZh2uR3J78>Yioi zn=J)`M(D9a8hJcp{GSdBkS~hRA(n@} zP#w&wWo=a}<~P^aJWOXjFD$dtns%uvX4ajfbRoxXY_4fVJ!@+&etfkvtC(5d`tdOL zQocwR;Cyqpb#XPnxRUj5!(?fDVL4Z%3aCX~98N7|UF*3U+CsUs$riPSTq-xzwcLtE z^Q_#kU@z0K16|CmX$M6z>(Gj92O5^*3hTxv_28ZM%i6}K20aYcuZB%4E|#cfwmxi7 z+#8NnSCADea@NMQBCN4A4AQ-aDlTe*y>7BJZD@8%kJsRFZkya@m}`UWHLZU8hT?qF-S-WGMhtJnK1DhQ2bDOTltO zVwZ(D`6Ch)bI<%x{(~$$&)%$}dMZ9$yQyckON?d~OPinr^v-nrP`_lhN-3CSNrZoQ zqo?*z&mrF+!a*aoio-R8B0@=$JguH;1INQ6FgamsHojtZ@<`t)oIU8No@!6=_T-3* zA>$UpOX6pfF)}JXoJ@zxIAPjs6R8Tqc`-DVAi6j=RhK^^`yWnyjf6gl3l|WsA#5Y; zirP$MfZ2f#!mo^tmP%BIdS1P%R;4KX66R_;#8$azNX3mJW ZX2xO|$}0GVDo15EfiD410dH}`_g^AFO|t+1 diff --git a/spiders/config.yaml b/spiders/config.yaml index dca8a01..c0be443 100644 --- a/spiders/config.yaml +++ b/spiders/config.yaml @@ -82,15 +82,15 @@ evergabe-online: jsdomain: 'https://www.evergabe-online.de/search.html' jslink1: '/html/body/div[8]/main/div[4]/div/div/div[2]/table/thead/tr[1]/td/div[2]/div/span[' jslink2: ']' - iteration-var-list: "[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102]" - parent: "/html/body/div[8]/main/div[4]/div/div/div[2]/table/tbody" - child-name: "//tr/td[1]/div/a/text()" - child-link: "//tr/td[1]/div/a/@href" - javascript-link: "/td[6]/a" - child-info: "/td[4]/text()[1]" - child-period: "//td[2]/abbr/text()" - #child-period: "//div[@class='c-teaser__text-wrapper']//small//span/time/text()" - child-sponsor: "/tr/td[4]/text()" + jsiteration-var-list: "[1,2, 3, 4, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,6,7,8,9,10]" + iteration-var-list: "[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102]" + parent: "/html/body/div[8]/main/div[4]/div/div/div[2]/table/tbody//tr" + child-name: "//td[1]/div/a/text()" + child-link: "//td[1]/div/a/@href" + javascript-link: "" + child-info: "//td[3]/div/text()" + child-period: "//td[5]/text()" + child-sponsor: "//td[2]/div/text()" entry: general: uniform: 'TRUE' diff --git a/spiders/fdb_spider.py b/spiders/fdb_spider.py index 023adf7..9569fcf 100644 --- a/spiders/fdb_spider.py +++ b/spiders/fdb_spider.py @@ -17,6 +17,9 @@ from trafilatura import extract from pdfminer.high_level import extract_pages from pdfminer.layout import LTTextContainer +import time + + class fdb_spider(object): def __init__(self, config_file): with open(config_file, "r") as stream: @@ -81,10 +84,17 @@ class fdb_spider(object): e, ) try: - entry_jsdomain = eval(entry_list.get("jsdomain")) + entry_jsiteration_var_list = eval(entry_list.get("jsiteration-var-list")) except Exception as e: print( - "No iteration-var-list defined in config.yaml - the original error message is:", + "No jsiteration-var-list defined in config.yaml - the original error message is:", + e, + ) + try: + entry_jsdomain = entry_list.get("jsdomain") + except Exception as e: + print( + "No jsdomain defined in config.yaml - the original error message is:", e, ) entry_jsdomain = 'NONE' @@ -134,14 +144,16 @@ class fdb_spider(object): else: from selenium import webdriver from selenium.webdriver.chrome.service import Service + from pyvirtualdisplay import Display display = Display(visible=0, size=(800, 800)) display.start() - #outputdir = '.' - #service_log_path = "{}/chromedriver.log".format(outputdir) - #service_args = ['--verbose'] - #driver = webdriver.Chrome('/usr/bin/chromium') + ##outputdir = '.' + ##service_log_path = "{}/chromedriver.log".format(outputdir) + ##service_args = ['--verbose'] + ##driver = webdriver.Chrome('/usr/bin/chromium') + options = webdriver.ChromeOptions() options.add_argument('headless') options.add_argument("--remote-debugging-port=9222") @@ -149,7 +161,34 @@ class fdb_spider(object): options.add_argument('--disable-dev-shm-usage') service = Service(executable_path='/usr/bin/chromedriver') driver = webdriver.Chrome(options=options, service=service) - + # driver = webdriver.Chrome() + driver.get(entry_jsdomain) + for i in range(len(entry_jsiteration_var_list)): + time.sleep(2) + print('trying to get element') + try: + element = driver.find_element( + "xpath", + entry_list_jslink1 + + str(entry_jsiteration_var_list[i]) + + entry_list_jslink2 + ) + print(entry_iteration_var_list[i]) + time.sleep(2) + print('clicking..') + element.click() + time.sleep(2) + #window_after = driver.window_handles[1] + print('length of the window handles', len(driver.window_handles)) + #driver.switch_to.window(window_after) + web_content = driver.page_source + + f = open("spiders/pages/" + key + str(entry_iteration_var_list[i]) + "entryList.html", "w+") + f.write(web_content) + f.close + except Exception as e: + print('the iteration var element for clicking the pages was not found.. the original message is:',e ) + def find_config_parameter(self, list_of_fdbs): for fdb in list_of_fdbs: