Automated pipeline for parsing profiles of politically exposed persons (PEPs) into Wikidata

import ast
import os
import urllib.request

import yaml
import lxml.html.soupparser
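
# The crawler is driven by a config.yaml with one entry per country. A minimal
# sketch of the structure this class reads (the keys mirror the .get() lookups
# in the methods below; all values are illustrative assumptions, not a real
# parliament's configuration):
#
# nicaragua:
#   domain: 'https://www.example-parliament.org'
#   memberList:
#     link: 'https://www.example-parliament.org/members'  # list-of-members page
#     parent: '//html//body//table//tr'                   # xpath matching one member row
#     child-name: '//td[1]//text()'                       # member name, appended to parent
#     child-link: '//td[2]//a/@href'                      # profile link, appended to parent
#   member:
#     info-1:
#       parent: '//html//body//div'
#       child-politicalParty: '//span//text()'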
class membersParliamentCrawler(object):

    def __init__(self, configFile):
        with open(configFile, 'r') as stream:
            try:
                self.config = yaml.safe_load(stream)
            except yaml.YAMLError as exc:
                print(exc)
    # input: list of countries in the form ['nicaragua', 'honduras', ..., 'mexico']
    def downloadMemberListPagesOfCountries(self, listOfCountries):
        # download only the html pages of the countries specified in the input
        for country in listOfCountries:
            if country not in self.config:
                continue
            try:
                memberList = self.config.get(country).get('memberList')
            except Exception as e:
                print('There is a problem with the entry memberList in the config.yaml - the original error message is:', e)
                continue
            try:
                memberListLink = memberList.get('link')
            except Exception as e:
                print('No memberList link defined in the config.yaml - the original error message is:', e)
                continue
            # download the html page of the list of members
            response = urllib.request.urlopen(memberListLink)
            webContent = response.read().decode('UTF-8')
            # save interim results to files
            filename = 'crawlers/pages/' + country + 'MemberList.html'
            os.makedirs(os.path.dirname(filename), exist_ok=True)
            with open(filename, 'w+') as f:
                f.write(webContent)
    def parseMemberListData2dictionary(self, listOfCountries):
        for country in listOfCountries:
            dictionaryMemberList = {}
            try:
                # use the soupparser to handle broken html
                tree = lxml.html.soupparser.parse('crawlers/pages/' + country + 'MemberList.html')
                countryConf = self.config.get(country)
                countryDomain = countryConf.get('domain')
                countryConfMemberList = countryConf.get('memberList')
                countryConfMemberListParent = countryConfMemberList.get('parent')
                countryConfMemberListChildName = countryConfMemberList.get('child-name')
                countryConfMemberListChildLink = countryConfMemberList.get('child-link')
                # xpath positions are 1-indexed, so count from 1
                for n in range(1, len(tree.xpath(countryConfMemberListParent)) + 1):
                    name = tree.xpath(countryConfMemberListParent + '[' + str(n) + ']' + countryConfMemberListChildName)
                    link = tree.xpath(countryConfMemberListParent + '[' + str(n) + ']' + countryConfMemberListChildLink)
                    if len(name) > 0 and len(link) > 0:
                        dictionaryMemberList[n] = {}
                        dictionaryMemberList[n]['name'] = name[0]
                        # prepend the domain to relative profile links
                        if countryDomain in link[0]:
                            dictionaryMemberList[n]['link'] = link[0]
                        else:
                            dictionaryMemberList[n]['link'] = countryDomain + link[0]
            except Exception as e:
                print('Parsing the html did not work. Possibly you first have to run downloadMemberListPagesOfCountries(). The original error message is:', e)
            # save interim results to files
            os.makedirs('crawlers/output', exist_ok=True)
            with open('crawlers/output/' + country + 'MemberList.txt', 'w+') as f:
                f.write(str(dictionaryMemberList))
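    # After parseMemberListData2dictionary() the interim file contains the str()
    # of a dict keyed by member id; an illustrative sketch (name and link are
    # made up): {1: {'name': 'Jane Doe', 'link': 'https://www.example-parliament.org/members/jane-doe'}}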
    def downloadMemberDataHtmls(self, listOfCountries):
        for country in listOfCountries:
            with open('crawlers/output/' + country + 'MemberList.txt') as f:
                text = f.read()
            # the interim file holds the str() of a python dict; parse it back safely
            dictionaryMemberList = ast.literal_eval(text)
            for memberid in dictionaryMemberList:
                memberLink = dictionaryMemberList[memberid]['link']
                # download the html page of the member
                response = urllib.request.urlopen(memberLink)
                webContent = response.read().decode('UTF-8')
                # save interim results to files
                filename = 'crawlers/pages/' + country + '/' + str(memberid) + '.html'
                os.makedirs(os.path.dirname(filename), exist_ok=True)
                with open(filename, 'w+') as f:
                    f.write(webContent)
    def parseMemberData2dictionary(self, listOfCountries):
        for country in listOfCountries:
            print('started to parse data of members of ' + country + ' ..')
            with open('crawlers/output/' + country + 'MemberList.txt') as f:
                text = f.read()
            dictionaryMemberList = ast.literal_eval(text)
            countryConf = self.config.get(country)
            countryConfMember = countryConf.get('member')
            countryConfMemberInfo1 = countryConfMember.get('info-1')
            countryConfMemberInfo1Parent = countryConfMemberInfo1.get('parent')
            countryConfMemberInfo1ChildPoliticalParty = countryConfMemberInfo1.get('child-politicalParty')
            for memberid in dictionaryMemberList:
                print('started to parse data of the member with name ' + dictionaryMemberList[memberid]['name'] + ' ..')
                filename = 'crawlers/pages/' + country + '/' + str(memberid) + '.html'
                tree = lxml.html.soupparser.parse(filename)
                politicalParty = tree.xpath(countryConfMemberInfo1Parent + countryConfMemberInfo1ChildPoliticalParty)
                if len(politicalParty) > 0:
                    dictionaryMemberList[memberid]['political party'] = politicalParty[0]
            # write the enriched member data back to the interim file
            with open('crawlers/output/' + country + 'MemberList.txt', 'w+') as f:
                f.write(str(dictionaryMemberList))
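
if __name__ == '__main__':
    # minimal usage sketch of the full pipeline; 'config.yaml' and the country
    # key 'nicaragua' are assumptions for illustration, taken from the example
    # comments above rather than from a shipped configuration
    crawler = membersParliamentCrawler('config.yaml')
    countries = ['nicaragua']
    crawler.downloadMemberListPagesOfCountries(countries)
    crawler.parseMemberListData2dictionary(countries)
    crawler.downloadMemberDataHtmls(countries)
    crawler.parseMemberData2dictionary(countries)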