|
|
import os
import urllib.error
import urllib.parse
import urllib.request

import yaml
|
|
|
|
class membersParliamentCrawler(object):
    """Download the member-list pages of parliaments listed in a YAML config.

    The code reads the config as a mapping from a country key to a mapping
    with a ``memberList`` entry, which in turn holds the ``link`` (URL) of the
    page listing the members, e.g.::

        nicaragua:
          memberList:
            link: "https://..."
    """

    def __init__(self, configFile):
        """Load the crawler configuration from *configFile*.

        A YAML syntax error is printed rather than raised; the configuration
        is then left empty so that later method calls do nothing instead of
        failing on a missing attribute.

        Args:
            configFile: Path of the YAML configuration file.

        Raises:
            OSError: If *configFile* cannot be opened.
        """
        # Define the attribute up front so it exists even if parsing fails.
        self.config = {}
        with open(configFile, "r") as stream:
            try:
                # An empty YAML document loads as None; normalise it to {}.
                self.config = yaml.safe_load(stream) or {}
            except yaml.YAMLError as exc:
                print(exc)

    def downloadMemberListPagesOfCountries(self, listOfCountries):
        """Download the member-list HTML page of every requested country.

        Only countries that appear both in *listOfCountries* and in the
        config are processed, each exactly once. The page is fetched from the
        country's ``memberList.link`` URL, decoded as UTF-8 and written to
        ``pages/<key>MemberList.html`` relative to the current working
        directory (the ``pages`` directory is created when missing).
        Countries whose config entry has no usable ``memberList`` or ``link``
        are reported on stdout and skipped.

        Args:
            listOfCountries: Country keys to download, in the form
                ``['nicaragua', 'honduras', ..., 'mexico']``.

        Raises:
            urllib.error.URLError: If a page cannot be fetched.
            UnicodeDecodeError: If a fetched page is not valid UTF-8.
        """
        for key in self.config:
            # Download only the html pages of the countries given as input.
            if key not in listOfCountries:
                continue

            entry = self.config.get(key)
            memberList = entry.get('memberList') if isinstance(entry, dict) else None
            if not isinstance(memberList, dict):
                print("There is a problem with the entry memberList in the config.yaml")
                # Skip: continuing would reuse a link from another country.
                continue

            memberListLink = memberList.get('link')
            if not memberListLink:
                print("No memberListLink defined in config.yaml")
                continue

            print(memberListLink)

            # Download the html page of the list of members.
            with urllib.request.urlopen(memberListLink) as response:
                webContent = response.read().decode('UTF-8')

            os.makedirs('pages', exist_ok=True)
            targetPath = os.path.join('pages', key + 'MemberList.html')
            # Write with the same encoding the content was decoded with, so
            # the result does not depend on the platform default.
            with open(targetPath, 'w', encoding='utf-8') as f:
                f.write(webContent)
|
|
|
|
|
|
|