# alpcentaur / wd_importPEP



import json
import os
import urllib.error
import urllib.parse
import urllib.request

import lxml.html
import lxml.html.soupparser
import yaml
from lxml import etree


class membersParliamentCrawler(object):
    """Download and parse parliament member-list pages described by a YAML config.

    The config maps a country key to a ``memberList`` entry that provides
    ``link`` (URL of the member-list page) and the XPath fragments
    ``parent``, ``child-name`` and ``child-link`` used to extract members.

    Interim results are stored relative to the current working directory:
    downloaded pages in ``pages/<country>MemberList.html`` and extracted
    member dictionaries in ``output/<country>MemberList.txt``.
    """

    def __init__(self, configFile):
        """Load the crawler configuration from the YAML file *configFile*.

        A YAML syntax error is printed instead of raised; the crawler then
        holds an empty configuration.
        """
        # Always define the attribute so later methods never hit an
        # AttributeError after a failed load.
        self.config = {}
        with open(configFile, "r") as stream:
            try:
                # An empty YAML document loads as None; normalise it to {}.
                self.config = yaml.safe_load(stream) or {}
            except yaml.YAMLError as exc:
                print(exc)

    def downloadMemberListPagesOfCountries(self, listOfCountries):
        """Download the member-list HTML page of every requested country.

        Args:
            listOfCountries: country keys in the form
                ``['nicaragua', 'honduras', .. , 'mexico']``. Only keys that
                are also present in the config are downloaded.

        Each page is decoded as UTF-8 and saved to
        ``pages/<country>MemberList.html``. Countries whose config lacks a
        usable ``memberList``/``link`` entry are reported and skipped.
        """
        # download only html pages of the countries specified in input
        for key in self.config:
            if key not in listOfCountries:
                continue

            try:
                memberList = self.config[key]['memberList']
            except (KeyError, TypeError) as e:
                print("There is a problem with the entry memberList in the config.yaml - the original error message is:", e)
                # Skip: continuing would reuse the previous country's link.
                continue

            try:
                memberListLink = memberList['link']
                if not memberListLink:
                    raise ValueError('empty link')
            except (KeyError, TypeError, ValueError) as e:
                print("No memberListLink defined in config.yaml - the original error message is:", e)
                continue

            # download the html page of the List of Members
            with urllib.request.urlopen(memberListLink) as response:
                webContent = response.read().decode('UTF-8')

            # save interim results to files
            os.makedirs('pages', exist_ok=True)
            with open('pages/' + key + 'MemberList.html', 'w', encoding='utf-8') as f:
                f.write(webContent)

    def parseMemberListData2dictionary(self, listOfCountries):
        """Extract member names and links from the downloaded pages.

        Args:
            listOfCountries: country keys whose ``pages/<country>MemberList.html``
                file should be parsed.

        For every country a dictionary ``{name: {'name': ..., 'link': ...}}``
        is written as JSON to ``output/<country>MemberList.txt``. If parsing
        fails, the error is printed and whatever was collected for that
        country so far (possibly nothing) is still written.
        """
        for country in listOfCountries:
            # Start fresh for each country so a parse failure can never leak
            # the previous country's members into this country's file.
            dictionaryMemberList = {}

            try:
                # use soupparser to handle broken html
                tree = lxml.html.soupparser.parse('pages/' + country + 'MemberList.html')

                countryConf = self.config.get(country)
                countryConfMemberList = countryConf.get('memberList')
                countryConfMemberListParent = countryConfMemberList.get('parent')
                countryConfMemberListChildName = countryConfMemberList.get('child-name')
                countryConfMemberListChildLink = countryConfMemberList.get('child-link')

                parentCount = len(tree.xpath(countryConfMemberListParent))

                # XPath positional predicates are 1-based: [0] never matches
                # and the last row lives at [parentCount].
                for n in range(1, parentCount + 1):
                    parentAtN = countryConfMemberListParent + '[' + str(n) + ']'
                    name = tree.xpath(parentAtN + countryConfMemberListChildName)
                    link = tree.xpath(parentAtN + countryConfMemberListChildLink)

                    if len(name) > 0:
                        # Plain str keeps the result JSON-serialisable.
                        memberName = str(name[0])
                        dictionaryMemberList[memberName] = {
                            'name': memberName,
                            # A row without a link must not abort the country.
                            'link': str(link[0]) if len(link) > 0 else None,
                        }

            except Exception as e:
                print('parsing the html did not work. Possibly you first have to downloadMemberListPagesOfCountries(). The original error message is:', e)

            # save interim results to files
            os.makedirs('output', exist_ok=True)
            with open('output/' + country + 'MemberList.txt', 'w', encoding='utf-8') as f:
                # Real JSON (not str(dict)) so names containing quotes or
                # apostrophes survive the round trip.
                f.write(json.dumps(dictionaryMemberList, ensure_ascii=False))

    def parseMemberData2dictionary(self, listOfCountries):
        """Read the stored member lists back and print every member's link.

        Args:
            listOfCountries: country keys whose ``output/<country>MemberList.txt``
                file should be read.

        Both JSON files and older files holding a ``str(dict)`` dump with
        single quotes are accepted.
        """
        for country in listOfCountries:
            with open('output/' + country + 'MemberList.txt', encoding='utf-8') as f:
                text = f.read()

            try:
                dictionaryMemberList = json.loads(text)
            except json.JSONDecodeError:
                # Legacy file written with str(dict): replace quotes with
                # double quotes because of the JSON specification (RFC 7159).
                dictionaryMemberList = json.loads(text.replace("\'", "\""))

            for member in dictionaryMemberList:
                print('oi')
                print(dictionaryMemberList[member]['link'])