# alpcentaur / wd_importPEP



import os
import urllib.error
import urllib.parse
import urllib.request

import lxml.html
import lxml.html.soupparser
import yaml
from lxml import etree


class membersParliamentCrawler(object):
    """Download parliament member-list pages and extract name/link pairs.

    The crawler is driven by a YAML config file. The methods read, for each
    country key, the nested entry ``memberList -> link`` from that config.
    Downloaded pages are stored under ``pages/`` and the parsed results under
    ``output/`` (both relative to the current working directory).
    """

    def __init__(self, configFile):
        """Load the YAML configuration from the path ``configFile``.

        A YAML syntax error is printed rather than raised. In that case, and
        when the file is empty, ``self.config`` is an empty dict so the other
        methods simply find nothing to do instead of failing on a missing or
        ``None`` attribute.
        """
        self.config = {}
        with open(configFile, "r") as stream:
            try:
                # safe_load returns None for an empty document.
                self.config = yaml.safe_load(stream) or {}
            except yaml.YAMLError as exc:
                print(exc)

    def downloadMemberListPagesOfCountries(self, listOfCountries):
        """Download the member-list HTML page of every requested country.

        ``listOfCountries`` is a list of config keys in the form
        ``['nicaragua', 'honduras', .. , 'mexico']``. Only config keys that
        appear in this list are processed, each of them exactly once.

        For every such key the URL at ``config[key]['memberList']['link']`` is
        fetched, decoded as UTF-8 and saved to ``pages/<key>MemberList.html``.
        Entries without a usable ``memberList`` or ``link`` are reported on
        stdout and skipped. Errors raised while fetching or decoding are not
        caught here.
        """
        for key in self.config:
            if key not in listOfCountries:
                continue

            entry = self.config.get(key)
            memberList = entry.get('memberList') if isinstance(entry, dict) else None
            if not isinstance(memberList, dict):
                print("There is a problem with the entry memberList in the config.yaml")
                continue

            memberListLink = memberList.get('link')
            if not memberListLink:
                print("No memberListLink defined in config.yaml")
                continue
            print(memberListLink)

            # Download the html page of the list of members.
            with urllib.request.urlopen(memberListLink) as response:
                webContent = response.read().decode('UTF-8')

            # Save interim results to files; the encoding is fixed to UTF-8
            # so the parse step can read the page back independent of locale.
            os.makedirs('pages', exist_ok=True)
            with open('pages/' + key + 'MemberList.html', 'w', encoding='utf-8') as f:
                f.write(webContent)

    def parseMemberData2dictionary(self, listOfCountries):
        """Parse the saved member-list pages into per-country dictionaries.

        For each country in ``listOfCountries`` the file
        ``pages/<country>MemberList.html`` is parsed. Every table row matched
        by the row XPath below that contains an anchor with text yields one
        entry ``{name: {'name': name, 'link': href}}``, where ``href`` is
        ``None`` if the row has no ``href`` attribute. The ``str()`` of the
        resulting dict is written to ``output/<country>MemberList.txt``.

        If a page cannot be read or parsed, a message is printed and no output
        file is written for that country.
        """
        rowXPath = '//html//body//form//table//tr//td//table//tr'

        for country in listOfCountries:
            try:
                # Use soupparser to handle broken html.
                with open('pages/' + country + 'MemberList.html', encoding='utf-8') as page:
                    tree = lxml.html.soupparser.parse(page)

                dictionaryMemberList = {}
                # Iterate the row elements themselves: XPath positions are
                # 1-based, so addressing rows as tr[0..len-1] would skip the
                # last row and waste a query on the non-existent tr[0].
                for row in tree.xpath(rowXPath):
                    name = row.xpath('.//td//a//text()')
                    link = row.xpath('.//td//a//@href')
                    if name:
                        dictionaryMemberList[name[0]] = {
                            'name': name[0],
                            'link': link[0] if link else None,
                        }
            except Exception as e:
                print('parsing the html did not work. Possibly you first have to downloadMemberListPagesOfCountries(). The original error message is:', e)
                # Do not write a stale or undefined dictionary for this country.
                continue

            os.makedirs('output', exist_ok=True)
            with open('output/' + country + 'MemberList.txt', 'w', encoding='utf-8') as f:
                f.write(str(dictionaryMemberList))