Automated pipeline for parsing profiles of politically exposed persons (PEPs) into Wikidata.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

49 lines
1.6 KiB

  1. import yaml
  2. import urllib.request, urllib.error, urllib.parse
  3. class membersParliamentCrawler(object):
  4. def __init__(self, configFile):
  5. with open(configFile, "r") as stream:
  6. try:
  7. self.config = yaml.safe_load(stream)
  8. except yaml.YAMLError as exc:
  9. print(exc)
  10. # input list of countries in form of ['nicaragua', 'honduras', .. , 'mexico']
  11. def downloadMemberListPagesOfCountries(self, listOfCountries):
  12. # download only html pages of the countries specified in input
  13. for country in listOfCountries:
  14. for key in self.config:
  15. if key in listOfCountries:
  16. try:
  17. memberList = self.config.get(key).get('memberList')
  18. except:
  19. print("There is a problem with the entry memberList in the config.yaml")
  20. try:
  21. memberListLink = memberList.get('link')
  22. except:
  23. print("No memberListLink defined in config.yaml")
  24. print(memberListLink)
  25. # download the html page of the List of Members
  26. response = urllib.request.urlopen(memberListLink)
  27. webContent = response.read().decode('UTF-8')
  28. f = open('pages/' + key +'MemberList.html', 'w+')
  29. f.write(webContent)
  30. f.close