Automated pipeline for parsing profiles of politically exposed persons (PEPs) into Wikidata.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

94 lines
3.3 KiB

  1. import yaml
  2. import urllib.request, urllib.error, urllib.parse
  3. from lxml import etree
  4. import lxml.html
  5. import lxml.html.soupparser
  6. class membersParliamentCrawler(object):
  7. def __init__(self, configFile):
  8. with open(configFile, "r") as stream:
  9. try:
  10. self.config = yaml.safe_load(stream)
  11. except yaml.YAMLError as exc:
  12. print(exc)
  13. # input list of countries in form of ['nicaragua', 'honduras', .. , 'mexico']
  14. def downloadMemberListPagesOfCountries(self, listOfCountries):
  15. # download only html pages of the countries specified in input
  16. for country in listOfCountries:
  17. for key in self.config:
  18. if key in listOfCountries:
  19. try:
  20. memberList = self.config.get(key).get('memberList')
  21. except:
  22. print("There is a problem with the entry memberList in the config.yaml")
  23. try:
  24. memberListLink = memberList.get('link')
  25. except:
  26. print("No memberListLink defined in config.yaml")
  27. print(memberListLink)
  28. # download the html page of the List of Members
  29. response = urllib.request.urlopen(memberListLink)
  30. webContent = response.read().decode('UTF-8')
  31. # save interim results to files
  32. f = open('pages/' + key +'MemberList.html', 'w+')
  33. f.write(webContent)
  34. f.close
  35. def parseMemberData2dictionary(self, listOfCountries):
  36. for country in listOfCountries:
  37. try:
  38. #use soupparser to handle broken html
  39. tree = lxml.html.soupparser.parse('pages/' + country + 'MemberList.html')
  40. #for e in tree.iter():
  41. # print(e.tag)
  42. # for e in tree.xpath('//html//body//form//table//tr//td//table//tr'):
  43. #
  44. # #print(etree.tostring(e).decode())
  45. dictionaryMemberList = {}
  46. for n in range(len(tree.xpath('//html//body//form//table//tr//td//table//tr'))):
  47. name = tree.xpath('//html//body//form//table//tr//td//table//tr[' + str(n) + ']//td//a//text()')
  48. link = tree.xpath('//html//body//form//table//tr//td//table//tr[' + str(n) + ']//td//a//@href')
  49. if len(name) > 0:
  50. dictionaryMemberList[name[0]] = {}
  51. dictionaryMemberList[name[0]]['name'] = name[0]
  52. dictionaryMemberList[name[0]]['link'] = link[0]
  53. except Exception as e:
  54. print('parsing the html did not work. Possibly you first have to downloadMemberListPagesOfCountries(). The original error message is:', e)
  55. f = open('output/' + country +'MemberList.txt', 'w+')
  56. f.write(str(dictionaryMemberList))
  57. f.close