"""Automated pipeline for parsing profiles of politically exposed persons (PEP) into Wikidata."""
import ast
import json
import os
import urllib.error
import urllib.parse
import urllib.request

import lxml.html
import lxml.html.soupparser
import yaml
from lxml import etree
class membersParliamentCrawler(object):
    """Crawl parliament websites and collect data about their members.

    The crawler works in four stages that hand their results to each other
    through files below ``crawlers/`` (relative to the working directory):

    1. ``downloadMemberListPagesOfCountries`` stores the member-list page of
       a country in ``crawlers/pages/<country>MemberList.html``.
    2. ``parseMemberListData2dictionary`` extracts member names and links
       into ``crawlers/output/<country>MemberList.txt``.
    3. ``downloadMemberDataHtmls`` stores every member page in
       ``crawlers/pages/<country>/<memberid>.html``.
    4. ``parseMemberData2dictionary`` adds the political party of every
       member to ``crawlers/output/<country>MemberList.txt``.

    The configuration is a mapping keyed by country name. The entries read
    by this class are ``domain``, ``memberList`` (``link``, ``parent``,
    ``child-name``, ``child-link``) and ``member`` -> ``info-1``
    (``parent``, ``child-politicalParty``); the ``parent``/``child-*``
    values are XPath fragments that get concatenated.
    """

    def __init__(self, configFile):
        """Load the crawler configuration from a YAML file.

        Args:
            configFile: Path of the YAML configuration file.

        A YAML syntax error is printed instead of raised; the crawler then
        starts with an empty configuration.
        """
        # NOTE(review): the body of the YAMLError handler was not visible in
        # the reviewed source; printing and falling back to an empty config
        # is an assumption - confirm it should not re-raise instead.
        self.config = {}
        with open(configFile, "r") as stream:
            try:
                # An empty YAML document loads as None; normalise to a dict.
                self.config = yaml.safe_load(stream) or {}
            except yaml.YAMLError as exc:
                print(exc)

    @staticmethod
    def _memberListFile(country):
        """Return the path of the interim member-list file of *country*."""
        return 'crawlers/output/' + country + 'MemberList.txt'

    @staticmethod
    def _fetchPage(link):
        """Download *link* and return its body decoded as UTF-8."""
        with urllib.request.urlopen(link) as response:
            return response.read().decode('UTF-8')

    @staticmethod
    def _saveText(filename, text):
        """Write *text* to *filename* (UTF-8), creating missing directories."""
        os.makedirs(os.path.dirname(filename), exist_ok=True)
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(text)

    @staticmethod
    def _parseHtmlFile(filename):
        """Parse the HTML file *filename* and return its element tree."""
        # use soupparser to handle broken html; the file is opened here so
        # that the encoding matches _saveText and the handle gets closed
        with open(filename, encoding='utf-8') as page:
            return lxml.html.soupparser.parse(page)

    def _loadMemberList(self, country):
        """Read the interim member dictionary of *country* from disk."""
        with open(self._memberListFile(country), encoding='utf-8') as f:
            text = f.read()
        # The file holds text scraped from remote websites, so it is parsed
        # with ast.literal_eval (literals only) rather than eval, which
        # would execute arbitrary expressions found in the file.
        return ast.literal_eval(text)

    def _saveMemberList(self, country, dictionaryMemberList):
        """Write the interim member dictionary of *country* to disk."""
        # NOTE(review): the original write call was not visible; the str()
        # form is assumed because the readers parse a Python literal.
        self._saveText(self._memberListFile(country), str(dictionaryMemberList))

    def downloadMemberListPagesOfCountries(self, listOfCountries):
        """Download the member-list page of every requested country.

        Args:
            listOfCountries: Country names in the form
                ``['nicaragua', 'honduras', .. , 'mexico']``. Only countries
                that are also keys of the configuration are downloaded.

        A country whose ``memberList``/``link`` entry cannot be read is
        reported on stdout and skipped.
        """
        # Iterate the configuration once; looping over listOfCountries as
        # well would download each page once per requested country.
        for key in self.config:
            if key not in listOfCountries:
                continue

            try:
                memberList = self.config.get(key).get('memberList')
            except AttributeError as e:
                print("There is a problem with the entry memberList in the config.yaml - the original error message is:", e)
                continue

            try:
                memberListLink = memberList['link']
            except (KeyError, TypeError) as e:
                print("No memberListLink defined in config.yaml - the original error message is:", e)
                continue

            # download the html page of the List of Members
            webContent = self._fetchPage(memberListLink)

            # save interim results to files
            self._saveText('crawlers/pages/' + key + 'MemberList.html', webContent)

    def parseMemberListData2dictionary(self, listOfCountries):
        """Extract member names and links from the downloaded list pages.

        Args:
            listOfCountries: Country names whose
                ``crawlers/pages/<country>MemberList.html`` should be parsed.

        For every country a dictionary ``{n: {'name': ..., 'link': ...}}``
        is written to ``crawlers/output/<country>MemberList.txt``, where
        ``n`` is the XPath position of the row. Links that do not contain
        the configured domain are prefixed with it. Rows without a name or
        without a link are skipped. Whatever was parsed before a failure is
        still written.
        """
        for country in listOfCountries:
            # Start from a fresh dictionary so that a failure can never
            # write the previous country's data into this country's file.
            dictionaryMemberList = {}

            try:
                tree = self._parseHtmlFile('crawlers/pages/' + country + 'MemberList.html')

                countryConf = self.config.get(country)
                countryDomain = countryConf.get('domain')
                countryConfMemberList = countryConf.get('memberList')
                countryConfMemberListParent = countryConfMemberList.get('parent')
                countryConfMemberListChildName = countryConfMemberList.get('child-name')
                countryConfMemberListChildLink = countryConfMemberList.get('child-link')

                numberOfRows = len(tree.xpath(countryConfMemberListParent))

                # XPath positions are 1-based: [0] never matches, and
                # stopping at numberOfRows - 1 would drop the last row.
                for n in range(1, numberOfRows + 1):
                    row = countryConfMemberListParent + '[' + str(n) + ']'
                    name = tree.xpath(row + countryConfMemberListChildName)
                    link = tree.xpath(row + countryConfMemberListChildLink)

                    # Without a link the member page cannot be downloaded
                    # later, so such rows are left out.
                    if not name or not link:
                        continue

                    memberLink = link[0]
                    if countryDomain not in memberLink:
                        memberLink = countryDomain + memberLink

                    dictionaryMemberList[n] = {'name': name[0], 'link': memberLink}

            # Broad on purpose: a missing file, a missing config entry and
            # parser errors are all reported with the same hint.
            except Exception as e:
                print('parsing the html did not work. Possibly you first have to downloadMemberListPagesOfCountries(). The original error message is:', e)

            # save interim results to files
            self._saveMemberList(country, dictionaryMemberList)

    def downloadMemberDataHtmls(self, listOfCountries):
        """Download the page of every member listed for the given countries.

        Args:
            listOfCountries: Country names whose
                ``crawlers/output/<country>MemberList.txt`` should be read.

        Each page is stored as ``crawlers/pages/<country>/<memberid>.html``.

        Raises:
            ValueError, SyntaxError: If a member-list file does not contain
                a plain Python literal.
        """
        for country in listOfCountries:
            dictionaryMemberList = self._loadMemberList(country)

            for memberid in dictionaryMemberList:
                memberLink = dictionaryMemberList[memberid]['link']

                # download the html page of the Member
                webContent = self._fetchPage(memberLink)

                # save interim results to files
                filename = 'crawlers/pages/' + country + '/' + str(memberid) + '.html'
                self._saveText(filename, webContent)

    def parseMemberData2dictionary(self, listOfCountries):
        """Add the political party of every member to the member list.

        Args:
            listOfCountries: Country names whose member pages
                (``crawlers/pages/<country>/<memberid>.html``) are parsed.

        The first XPath match, if any, is stored under the key
        ``'political party'`` and the updated dictionary is written back to
        ``crawlers/output/<country>MemberList.txt``.

        Raises:
            ValueError, SyntaxError: If a member-list file does not contain
                a plain Python literal.
        """
        for country in listOfCountries:
            print('started to parse data of member of ' + country + ' ..')

            dictionaryMemberList = self._loadMemberList(country)

            countryConf = self.config.get(country)
            countryConfMember = countryConf.get('member')
            countryConfMemberInfo1 = countryConfMember.get('info-1')
            countryConfMemberInfo1Parent = countryConfMemberInfo1.get('parent')
            countryConfMemberInfo1ChildPoliticalParty = countryConfMemberInfo1.get('child-politicalParty')

            # The expression is the same for every member of the country.
            politicalPartyPath = countryConfMemberInfo1Parent + countryConfMemberInfo1ChildPoliticalParty

            for memberid in dictionaryMemberList:
                print('started to parse data of member with name ' + dictionaryMemberList[memberid]['name'] + ' ..')

                filename = 'crawlers/pages/' + country + '/' + str(memberid) + '.html'
                tree = self._parseHtmlFile(filename)

                politicalParty = tree.xpath(politicalPartyPath)

                # raw XPath result, printed for inspection
                print('oi', politicalParty)

                if politicalParty:
                    dictionaryMemberList[memberid]['political party'] = politicalParty[0]

            # save interim results to files
            self._saveMemberList(country, dictionaryMemberList)