From a05b5bc33fcf7b6a7e0cae9fc8a46f269f245616 Mon Sep 17 00:00:00 2001 From: corsaronero Date: Sat, 25 Feb 2023 16:19:12 +0000 Subject: [PATCH] added function parseMemberData2dictionary() for first property political party --- .../.MembersParliamentCrawler.py.kate-swp | Bin 86 -> 0 bytes crawlers/MembersParliamentCrawler.py | 85 +++++++++++++++--- .../MembersParliamentCrawler.cpython-310.pyc | Bin 3181 -> 4399 bytes crawlers/config.yaml | 10 +-- crawlers/output/nicaraguaMemberList.txt | 2 +- crawlers/useMembersParliamentCrawler.py | 2 + 6 files changed, 82 insertions(+), 17 deletions(-) delete mode 100644 crawlers/.MembersParliamentCrawler.py.kate-swp diff --git a/crawlers/.MembersParliamentCrawler.py.kate-swp b/crawlers/.MembersParliamentCrawler.py.kate-swp deleted file mode 100644 index e8cbb683337def692c355b6dba67c59d0efe22de..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 86 zcmZQzU=Z?7EJ;-eE>A2_aLdd|RWQ;sU|?VnvFP5s!q`4hNGo{9LCzT!*HU 0: - dictionaryMemberList[name[0]] = {} - dictionaryMemberList[name[0]]['name'] = name[0] - dictionaryMemberList[name[0]]['link'] = link[0] + dictionaryMemberList[n] = {} + dictionaryMemberList[n]['name'] = name[0] - + if countryDomain in link[0]: + + dictionaryMemberList[n]['link'] = link[0] + + if countryDomain not in link[0]: + + dictionaryMemberList[n]['link'] = countryDomain + link[0] + except Exception as e: print('parsing the html did not work. Possibly you first have to downloadMemberListPagesOfCountries(). The original error message is:', e) @@ -101,20 +110,74 @@ class membersParliamentCrawler(object): f.write(str(dictionaryMemberList)) f.close - def parseMemberData2dictionary(self, listOfCountries): + def downloadMemberDataHtmls(self, listOfCountries): for country in listOfCountries: f = open('output/' + country +'MemberList.txt') text = f.read() - # replace quotes with double quotes because of JSON specification - RFC7159 which would result in error for json.loads function - text = text.replace("\'", "\"") + dictionaryMemberList = eval(text) - dictionaryMemberList = json.loads(text) + + for memberid in dictionaryMemberList: + + + memberLink = dictionaryMemberList[memberid]['link'] + + # download the html page of the Member + + response = urllib.request.urlopen(memberLink) + webContent = response.read().decode('UTF-8') - for member in dictionaryMemberList: + # save interim results to files + + filename = 'pages/' + country + '/' + str(memberid) +'.html' + + os.makedirs(os.path.dirname(filename), exist_ok=True) + f = open( filename, 'w+') + f.write(webContent) + f.close - print('oi') - print(dictionaryMemberList[member]['link']) + def parseMemberData2dictionary(self, listOfCountries): + + for country in listOfCountries: + + print('started to parse data of member of ' + country + ' ..') + + f = open('output/' + country +'MemberList.txt') + text = f.read() + + dictionaryMemberList = eval(text) + + + countryConf = self.config.get(country) + countryDomain = countryConf.get('domain') + countryConfMember = countryConf.get('member') + countryConfMemberInfo1 = countryConfMember.get('info-1') + countryConfMemberInfo1Parent = countryConfMemberInfo1.get('parent') + countryConfMemberInfo1ChildPoliticalParty = countryConfMemberInfo1.get('child-politicalParty') + + for memberid in dictionaryMemberList: + + print('started to parse data of member with name ' + dictionaryMemberList[memberid]['name'] + ' ..') + + filename = 'pages/' + country + '/' + str(memberid) +'.html' + + tree = lxml.html.soupparser.parse(filename) + + politicalParty = tree.xpath(countryConfMemberInfo1Parent + countryConfMemberInfo1ChildPoliticalParty) + + print('oi', politicalParty) + + if len(politicalParty) > 0: + + dictionaryMemberList[memberid]['political party'] = politicalParty[0] + + + + f = open('output/' + country +'MemberList.txt', 'w+') + f.write(str(dictionaryMemberList)) + f.close + diff --git a/crawlers/__pycache__/MembersParliamentCrawler.cpython-310.pyc b/crawlers/__pycache__/MembersParliamentCrawler.cpython-310.pyc index df11283147cda8e27e704ecea9c54cc24fa132a7..75d8bd4a3914d9ae53a6bd3556d4a026b859c900 100644 GIT binary patch delta 2303 zcmai0&2Jk;6rY*>@OpQ>cG3_hO$eqfptL0YghHDJp`^4Rh$0oKswD(&X47n9dz0Bo zlUTdB!abl$sEYOIuR6VQb8dlY!M9t-^In~?M z+%>{AuHPqI_nnSbb+wB( zW@kC!udVx$%+l}mJ5FW^KUxi&k)NgG#+!63T`}&Bl;Elkv>9lTq=dwDmVEGC!djoE zrg^q)>=<0%rAaQ%foFhcDbEBiue=<1wlrlf)^@De+NB+MAjW*+bVs;VCNdv89a<(l zUnVg$*yBEB)>g9xi05`_o7*e8co2|SkDVzJyWAdD{xISAZpH;1xl2T=>)|?>a}Ui? z@nC-)9DXprcIhT*mUSZStY0H|PeDF85*v z-%*5C>ae637k3Kqb$W;HQcB|DG6Y%!v;msFInfVk6-U5iCJ!66pqbvt{oEdaf$?Id z*EbvWOka{JdU9E}87n#xRbGY{z+-rzh^Nycsb-$Szo2xaslxdL^Vh~qYXbd!? zXr2W#UJ`lq$Iv_nCex+x{cL~-3o;1PFD-90a~8t&rW9KDWm;BM%ES<#7 zNDuKmniJ_e`E%DVfg4SMu^3~6x^BD7ru)wz8)IYCWfm)eEiso3Fm_M>)9W8();;!5 zzv)qn>OZwEB4dZsX~#}~%~#sUKNFhb;1}R14$dV^Ld~dbL&iHa(SR{@mn6DmnD4}h z4dB`9i5VNQ37~7kB*{r_S>G{YWF@c>*YB8$!Fg_|SSGMB7n_QWxiU%YwzQV>(Ci&2 zws$GFc>XhBfF!ZaZGae$b>qB(S6eOsyn{V}ojGv|t}9+fqf*s}9)i9{PwyF%2<`jh zG_(B8Ad(-3D?&loVPM->ua}88;508AdaL<^q_Eg;8cTYAZeuM3m_Y{2chr zTEjntxbqm|;skEirN1e~tLRz{zj50a0k7I!@7^MMQbn_A;kc1W>Je-nyb0GM~C!5#R`kEKlHy^XAte7^asOh zAM9OVuy6l|nLEILY+r*OJdQs=Lb;|e)DQT;6d$x6ABe)90rqBW_Vfy{1`qYhY6Bq* zj?~D2l%%?QEcwiyAXbpwJclK%QN7cmW7xqBy~f2LuACS_TH~ z*l_`(c(I2(iXA0mMTi9wfr=vCxLxc}UIIbejElSkLe@Q`Z;Gs4Is9$Bzel?Q9AxyA zMLVP{N9J0`BUuyD=M@=NlwMRg08GFYh9=OOr3>*W6(vJuFo3X-NGlwK&ky9@yZibR!#fdD*7tEd9E zF*4p(ZgmTxcu2jl8pJf31L0E@ASQ5Z63uBeXV9ESa}LeZMNgH0Kv7W@ATE+NxUa!` zweOJjby-~<{Lj$ake;*E(I<~Xe*C7ajz4*J29r7eNG`=JAtOXw#p}F+ckmv~b(-0y zrXIpXgdd0{)h#Ml+m}}xN^A%o4;~xDwFKhY59J1u`b+ElzNC&&E3G;wOE2I-DA(dG zG}qFH&geACQ-tMh!3maQjanDoF`>c0ctN~@>#z>1`f?ODtKPb(*Mr-Wy?+s+oc`ui Y=tRogTLW0K5CO`52yOz!PX~;D0cGJ7CIA2c delta 1109 zcmZ`&&5jdC5bo;fnd$kpcX909WwR_OfV>+BghWXsA`uY@DH2>b04)*8>`YjbvB&Od zhsBK9m7G%!kyfLeanBW zN|*+LOYr;e}wLS z*qs`}JN)Jyc>D0jbM{TrcFUDsahwni=Upbl_ivGYixK5M`WDC}05bNJ7x{TGr9f_xJIL-N zcoOu@g?3VhKqyVw>EDk_BZ>RM+#7V%ZfRsn$~W{5 zuEEr>s$mVGU6YQsk84;*5KPH9E*nvIM|IQUcOC}9+et@#B_CZ)`(G5toW1&6dUsvJ zhK3Wx_s+XlK14Hn4}nukn~b?Wbz!lZbc1eyPkq{e2G9fcSLin>KcN3ETOn{@zm7l9 zleSu%ceh&lHoD4cY(7V$8k?;mb=UTDHluoA=XAo9F=M%&A2T;`I>L#iaEhkLsX|X~ zVIxuW|M1Oc{Mt92j{drD%Q2&)(O?!~eg*0wiy(UIxSa9XDve8Z(2*D}z5*q`G|&oAv3 zf7h14E`s{j-A&vr&uP$PO8;dD^sj-SA6KTIKgYAV$JhM}mA~yvOYgaeAi|MMlJ33p e&mOC67W4WhoGpH;-&qcIF4nYyZQ