diff --git a/crawlers/.MembersParliamentCrawler.py.kate-swp b/crawlers/.MembersParliamentCrawler.py.kate-swp
new file mode 100644
index 0000000..e8cbb68
Binary files /dev/null and b/crawlers/.MembersParliamentCrawler.py.kate-swp differ
diff --git a/crawlers/MembersParliamentCrawler.py b/crawlers/MembersParliamentCrawler.py
index 0f0b25d..388bb3a 100644
--- a/crawlers/MembersParliamentCrawler.py
+++ b/crawlers/MembersParliamentCrawler.py
@@ -1,6 +1,7 @@
 import yaml
+import json
 
 
 import urllib.request, urllib.error, urllib.parse
 
 
@@ -33,21 +34,18 @@ class membersParliamentCrawler(object):
         if key in listOfCountries:
 
             try:
                 memberList = self.config.get(key).get('memberList')
-            except:
-                print("There is a problem with the entry memberList in the config.yaml")
+            except Exception as e:
+                print("There is a problem with the entry memberList in the config.yaml - the original error message is:", e)
 
             try:
                 memberListLink = memberList.get('link')
-            except:
-                print("No memberListLink defined in config.yaml")
-                print(memberListLink)
-
+            except Exception as e:
+                print("No memberListLink defined in config.yaml - the original error message is:", e)
 
             # download the html page of the List of Members
             response = urllib.request.urlopen(memberListLink)
             webContent = response.read().decode('UTF-8')
 
-
             # save interim results to files
             f = open('pages/' + key +'MemberList.html', 'w+')
@@ -55,7 +53,7 @@ class membersParliamentCrawler(object):
             f.close
 
 
-    def parseMemberData2dictionary(self, listOfCountries):
+    def parseMemberListData2dictionary(self, listOfCountries):
 
         for country in listOfCountries:
 
@@ -64,19 +62,27 @@ class membersParliamentCrawler(object):
             #use soupparser to handle broken html
             tree = lxml.html.soupparser.parse('pages/' + country + 'MemberList.html')
 
-            #for e in tree.iter():
-            #    print(e.tag)
-
+
+#            for e in tree.iter():
+#
+#                print(e.tag)
+#
             # for e in tree.xpath('//html//body//form//table//tr//td//table//tr'):
             #
             #     #print(etree.tostring(e).decode())
 
             dictionaryMemberList = {}
 
-            for n in range(len(tree.xpath('//html//body//form//table//tr//td//table//tr'))):
+            countryConf = self.config.get(country)
+            countryConfMemberList = countryConf.get('memberList')
+            countryConfMemberListParent = countryConfMemberList.get('parent')
+            countryConfMemberListChildName = countryConfMemberList.get('child-name')
+            countryConfMemberListChildLink = countryConfMemberList.get('child-link')
+
+            for n in range(len(tree.xpath(countryConfMemberListParent))):
 
-                name = tree.xpath('//html//body//form//table//tr//td//table//tr[' + str(n) + ']//td//a//text()')
-                link = tree.xpath('//html//body//form//table//tr//td//table//tr[' + str(n) + ']//td//a//@href')
+                name = tree.xpath(countryConfMemberListParent + '[' + str(n) + ']' + countryConfMemberListChildName)
+                link = tree.xpath(countryConfMemberListParent + '[' + str(n) + ']' + countryConfMemberListChildLink)
 
                 if len(name) > 0:
 
@@ -88,7 +94,27 @@ class membersParliamentCrawler(object):
             except Exception as e:
                 print('parsing the html did not work. Possibly you first have to downloadMemberListPagesOfCountries(). The original error message is:', e)
 
-
+
+            # save interim results to files
+
             f = open('output/' + country +'MemberList.txt', 'w+')
             f.write(str(dictionaryMemberList))
             f.close
+
+    def parseMemberData2dictionary(self, listOfCountries):
+
+        for country in listOfCountries:
+
+            f = open('output/' + country +'MemberList.txt')
+            text = f.read()
+
+            # replace quotes with double quotes because of JSON specification - RFC7159 which would result in error for json.loads function
+            text = text.replace("\'", "\"")
+
+            dictionaryMemberList = json.loads(text)
+
+            for member in dictionaryMemberList:
+
+                print('oi')
+                print(dictionaryMemberList[member]['link'])
+
diff --git a/crawlers/__pycache__/MembersParliamentCrawler.cpython-310.pyc b/crawlers/__pycache__/MembersParliamentCrawler.cpython-310.pyc
index ce03e24..df11283 100644
Binary files a/crawlers/__pycache__/MembersParliamentCrawler.cpython-310.pyc and b/crawlers/__pycache__/MembersParliamentCrawler.cpython-310.pyc differ
diff --git a/crawlers/config.yaml b/crawlers/config.yaml
index 82233a0..792a3bc 100644
--- a/crawlers/config.yaml
+++ b/crawlers/config.yaml
@@ -2,11 +2,13 @@
 # Follow the syntax and dont use tbody as it gets added by the browser (when researching xpath through inspector)
 # xpath syntax: https://www.w3schools.com/xml/xpath_syntax.asp
+# lxml xpath syntax: https://www.geeksforgeeks.org/web-scraping-using-lxml-and-xpath-in-python/
 
 nicaragua:
+  domain: 'http://legislacion.asamblea.gob.ni'
   memberList:
     link: 'http://legislacion.asamblea.gob.ni/Tablas%20Generales.nsf/Main.xsp'
-    parent: 'html//body//form//table//tr//td//table'
+    parent: '//html//body//form//table//tr//td//table//tr'
     child-name: '//td//a/text()'
     child-link: '//td//a/@href'
 
   member:
diff --git a/crawlers/useMembersParliamentCrawler.py b/crawlers/useMembersParliamentCrawler.py
index ec88a18..4300d24 100644
--- a/crawlers/useMembersParliamentCrawler.py
+++ b/crawlers/useMembersParliamentCrawler.py
@@ -11,4 +11,6 @@ Crawler = membersParliamentCrawler(config)
 
 #Crawler.downloadMemberListPagesOfCountries(listOfCountries)
 
+#Crawler.parseMemberListData2dictionary(listOfCountries)
+
 Crawler.parseMemberData2dictionary(listOfCountries)
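
The rewritten parseMemberListData2dictionary builds its XPath queries from the per-country config instead of hard-coding them. A minimal sketch of that lookup, not part of the patch, assuming the nicaragua entry from config.yaml above and that downloadMemberListPagesOfCountries() has already written pages/nicaraguaMemberList.html:

import lxml.html.soupparser

# values as defined under nicaragua -> memberList in config.yaml
parent = '//html//body//form//table//tr//td//table//tr'
childName = '//td//a/text()'
childLink = '//td//a/@href'

tree = lxml.html.soupparser.parse('pages/nicaraguaMemberList.html')

# the per-row query is built by string concatenation, as in the patched method;
# note that XPath positions start at 1, so a [0] predicate selects nothing
for n in range(1, len(tree.xpath(parent)) + 1):
    name = tree.xpath(parent + '[' + str(n) + ']' + childName)
    link = tree.xpath(parent + '[' + str(n) + ']' + childLink)
    if len(name) > 0:
        print(name[0], link[0] if len(link) > 0 else None)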
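
The new parseMemberData2dictionary reads back the dictionary that parseMemberListData2dictionary wrote with str(), replacing single quotes with double quotes so that json.loads accepts the text (RFC 7159). A minimal sketch of that round trip with made-up sample data, assuming the output/ directory exists and that no name or link contains a quote character:

import json

# hypothetical entry in the shape the crawler produces
dictionaryMemberList = {'1': {'name': 'Some Member', 'link': '/SomeMember.xsp'}}

# write step, as at the end of parseMemberListData2dictionary
with open('output/nicaraguaMemberList.txt', 'w+') as f:
    f.write(str(dictionaryMemberList))

# read step, as in parseMemberData2dictionary: str() emits single quotes,
# which are not valid JSON, so they are swapped before parsing
with open('output/nicaraguaMemberList.txt') as f:
    text = f.read().replace("\'", "\"")

dictionaryMemberList = json.loads(text)
print(dictionaryMemberList['1']['link'])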