From 57a7111959d6ab214490184c974e197fb8b5de4f Mon Sep 17 00:00:00 2001 From: corsaronero Date: Sat, 25 Feb 2023 15:04:56 +0000 Subject: [PATCH] added memberData function with first step loading json, renamed memberListData function --- .../.MembersParliamentCrawler.py.kate-swp | Bin 0 -> 86 bytes crawlers/MembersParliamentCrawler.py | 56 +++++++++++++----- .../MembersParliamentCrawler.cpython-310.pyc | Bin 2520 -> 3181 bytes crawlers/config.yaml | 4 +- crawlers/useMembersParliamentCrawler.py | 2 + 5 files changed, 46 insertions(+), 16 deletions(-) create mode 100644 crawlers/.MembersParliamentCrawler.py.kate-swp diff --git a/crawlers/.MembersParliamentCrawler.py.kate-swp b/crawlers/.MembersParliamentCrawler.py.kate-swp new file mode 100644 index 0000000000000000000000000000000000000000..e8cbb683337def692c355b6dba67c59d0efe22de GIT binary patch literal 86 zcmZQzU=Z?7EJ;-eE>A2_aLdd|RWQ;sU|?VnvFP5s!q`4hNGo{9LCzT!*HU 0: @@ -88,7 +94,27 @@ class membersParliamentCrawler(object): except Exception as e: print('parsing the html did not work. Possibly you first have to downloadMemberListPagesOfCountries(). The original error message is:', e) - + + # save interim results to files + f = open('output/' + country +'MemberList.txt', 'w+') f.write(str(dictionaryMemberList)) f.close + + def parseMemberData2dictionary(self, listOfCountries): + + for country in listOfCountries: + + f = open('output/' + country +'MemberList.txt') + text = f.read() + + # replace quotes with double quotes because of JSON specification - RFC7159 which would result in error for json.loads function + text = text.replace("\'", "\"") + + dictionaryMemberList = json.loads(text) + + for member in dictionaryMemberList: + + print('oi') + print(dictionaryMemberList[member]['link']) + diff --git a/crawlers/__pycache__/MembersParliamentCrawler.cpython-310.pyc b/crawlers/__pycache__/MembersParliamentCrawler.cpython-310.pyc index ce03e24d4da7c52e195d13b262379671695fe158..df11283147cda8e27e704ecea9c54cc24fa132a7 100644 GIT binary patch literal 3181 zcmb7GTaP106|SnTzPNpvWtq%olQ48vB*BEWSuP^L3T-smAkfB)ltjrk3Z}ufe}foyY76d(!xv@mv3yzj~E%TIVx7WhDeF?!B?i5ktTs9Vv@IHfDFH5oVMB73Y*EQ7D>rja z0R>rQSf%9HK4t|DDxKUdZ|Omp>iHi&7)T|(F!cg&tdfIBj=X882c8~C5A3LB-eqqu zj0=5wfEuVEWG@j?nb_H!s!+?^>_tf`o35&$aw~$xe%s$mCU8NaOmn+e z6lb|{Bxh<1`%vjOE^iJG>^n-PV_<;v-Kjj-OJWTdoYQ{6pHkFu=q-^NTID4xaJC z!TpAwQD+6tE!AX5hiqjCnlXSQB!H+9k_86}@s){=Z((NU$}EYHSy18ysIgQ4y3ire zE85a_W~`jdKBY_eftL3cZmB`I>q_P>DZ>5+$soa)>J_b(qO}I|RhLv$j<}kHB;#3C zRBn>Hq|DFjH;HgTSHrdqUG*+G{>d42-Ps?3y7hwiRZ+|88^o_asu#qE9Eq=MqF&ra z19rLv1Z`xERUP)um*7f(q>W(%NLo_#T1TOyQ`b=xU;?Zwz{=e14Z=v=jv?-xmyum- z`nxs#M1d7Z&aqKlgW~)Vj7r1!uwW=s!xN$K;zWDXL>;xf{UlApgJ|Z>l8M(3RjR!~ z@Tv54;(aw5zT0eh;5)plnZLG^i5^dMd)>GMa!a3R^-b7aa|=Af2?QaIJM-N%nT&D2 zQu$_?BZJj8p2<`}d?<4xlCf`|i~~K$4TwPM1?+*GF1V_0V(BFi&8>o`>Sa_QReX-` zfP7noMGOkm>~cqb^^Ckg$iDx=MrEVV%gHrs{w(zN{VnLQZ4#9o#7z zP3@xiHVTYP8(v|zxRuL2y%Xr*hnEL>3wozN2H`Nqb~txy%%$bO!*dThdngp)^m_}Y;iY%(3zpyu zfS>df{Q+Mwd~Thx1=q$3>ZNA@jyD#T248@WAb1R!nFV}=g!SWZo$^*V?( zePc~;`78;GYiwF2ByK=c4vu~t1wEPbVVcCbiLNy*aMQG(0ZMW6IoA?U`X-F^t@2`? zHO5OD#(2gM0~nSfY#nxuIkaZD%z^(pW&h>>n*Zqkx#(wR>@;nB0wJ-x-Q4YVM~Rq3 zSgv)u$CDr`TT}xURUV4lC=d)A`!3*w@kJRR(5PmkL~6rVSqvB!4emTfkhr3;HhS8QlmBLPwmg@ORl!?F@oXoaDNq~mszI)8RD zp{eDNs^P`~t^5ISz5o)455T8D$hY>!fg7hv2;L;6EF8#a-f!NY%)Iw|Gybjd%ei1s zt$GB1FE$?E{Hb~pY+bvCnj{hu60Hy-y40;F$d*VG-m@bcRxwV*KOoYQ>?G4x~9l>f`mR(?%0ZdTgrvX2U$ zaSXIGyRa9eaHP>3Eu@`U8$>!d#hgw>CJHxm7WlVJbUh%)&N0$~HMs|+*Nc1YSnYR4 zI=!0zXk8U0OFI2K>2AKocJr9+_`Y(mw$i#C>%3s^fSW&J%{<_rhjml2iP)G|=&ls_WBE1xbN4l4UVS6C&h2gD% z>V=_hwUZb@;gqM;^p*dMe*M4UpQUGStafN2#1p;SOwasddHYDkxAM93^(=J;V?7+} zFbzI@dS~|ZM5}X{W3w97l1VSwGl}kv2IJwdrAD!em@!1&DGOhEO0HV{+i__naX;dd zVM`yC)<~<;eSdN@9_r4Zugv38+=vF*LAhxT@|Aga>RBVsqrm%$=SEDcIF9l{tgn~Q zSKG#Y*0>>7&mrceck%KYxENhRAtw!I?UU`}EH>7T~33 zKc}ymhrgM><(~&TkNlVI=Wz%1QvQv<6`EW|CQpHT3cs?gEL57FvJs+StdgYD-c#}M kIL6}E^N0QmwF~G{)cD<}l;syIuLkBbCZK`22&mD&07k+bw*UYD diff --git a/crawlers/config.yaml b/crawlers/config.yaml index 82233a0..792a3bc 100644 --- a/crawlers/config.yaml +++ b/crawlers/config.yaml @@ -2,11 +2,13 @@ # Follow the syntax and dont use tbody as it gets added by the browser (when researching xpath through inspector) # xpath syntax: https://www.w3schools.com/xml/xpath_syntax.asp +# lxml xpath syntax: https://www.geeksforgeeks.org/web-scraping-using-lxml-and-xpath-in-python/ nicaragua: + domain: 'http://legislacion.asamblea.gob.ni' memberList: link: 'http://legislacion.asamblea.gob.ni/Tablas%20Generales.nsf/Main.xsp' - parent: 'html//body//form//table//tr//td//table' + parent: '//html//body//form//table//tr//td//table//tr' child-name: '//td//a/text()' child-link: '//td//a/@href' member: diff --git a/crawlers/useMembersParliamentCrawler.py b/crawlers/useMembersParliamentCrawler.py index ec88a18..4300d24 100644 --- a/crawlers/useMembersParliamentCrawler.py +++ b/crawlers/useMembersParliamentCrawler.py @@ -11,4 +11,6 @@ Crawler = membersParliamentCrawler(config) #Crawler.downloadMemberListPagesOfCountries(listOfCountries) +#Crawler.parseMemberListData2dictionary(listOfCountries) + Crawler.parseMemberData2dictionary(listOfCountries)