Automated pipeline for parsing profiles of politically exposed persons (PEP) into Wikidata
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 

186 lines
7.0 KiB

import ast
import json
import os
import urllib.request, urllib.error, urllib.parse

import lxml.html
import lxml.html.soupparser
import yaml
from lxml import etree
class members_parliament_crawler(object):
    """Crawl parliament websites for member lists and per-member pages.

    The crawler is driven by a YAML configuration keyed by country name.
    The entries read by this class are::

        <country>:
          domain: ...
          memberList:
            link: ...
            parent: ...        # XPath selecting one node per member
            child-name: ...    # XPath suffix yielding the member name
            child-link: ...    # XPath suffix yielding the member link
          member:
            info-1:
              parent: ...
              child-politicalParty: ...

    Interim results are stored relative to the current working directory:
    downloaded HTML below ``crawlers/pages/`` and the parsed member
    dictionaries below ``crawlers/output/``.
    """

    def __init__(self, config_file):
        """Load the crawler configuration from ``config_file``.

        Args:
            config_file: Path of the YAML configuration file.

        A YAML syntax error is printed instead of raised; in that case (and
        for an empty file) ``self.config`` is an empty dict so the other
        methods never hit a missing attribute.
        """
        self.config = {}
        with open(config_file, "r") as stream:
            try:
                # safe_load returns None for an empty document.
                self.config = yaml.safe_load(stream) or {}
            except yaml.YAMLError as exc:
                print(exc)

    @staticmethod
    def _member_list_path(country):
        """Return the path of the interim member-list file of ``country``."""
        return "crawlers/output/" + country + "MemberList.txt"

    @staticmethod
    def _write_text(file_name, text):
        """Write ``text`` to ``file_name`` as UTF-8, creating parent directories."""
        directory = os.path.dirname(file_name)
        if directory:
            os.makedirs(directory, exist_ok=True)
        with open(file_name, "w", encoding="utf-8") as f:
            f.write(text)

    @staticmethod
    def _download(url):
        """Fetch ``url`` and return its body decoded as UTF-8."""
        with urllib.request.urlopen(url) as response:
            return response.read().decode("UTF-8")

    def _load_member_list(self, country):
        """Read back the member dictionary previously saved for ``country``."""
        with open(self._member_list_path(country), encoding="utf-8") as f:
            # The file holds the str() of a dict of plain literals. Its content
            # derives from crawled pages, so parse it as a literal rather than
            # executing it with eval().
            return ast.literal_eval(f.read())

    def _save_member_list(self, country, dictionary_member_list):
        """Persist the member dictionary of ``country`` as its str() form."""
        self._write_text(self._member_list_path(country), str(dictionary_member_list))

    # input list of countries in form of ['nicaragua', 'honduras', .. , 'mexico']
    def download_member_list_pages_of_countries(self, list_of_countries):
        """Download the member-list page of every requested country.

        Args:
            list_of_countries: Country names, e.g.
                ``['nicaragua', 'honduras', 'mexico']``. Names without an
                entry in the configuration are skipped.

        Each page is saved to ``crawlers/pages/<country>MemberList.html``.
        Configuration problems are printed and the country is skipped;
        download errors raised by ``urllib`` propagate to the caller.
        """
        for country in list_of_countries:
            # download only html pages of the countries specified in input
            if country not in self.config:
                continue
            try:
                member_list = self.config[country].get("memberList")
            except AttributeError as e:
                print(
                    "There is a problem with the entry memberList in the config.yaml - the original error message is:",
                    e,
                )
                continue
            try:
                member_list_link = member_list.get("link")
            except AttributeError as e:
                print(
                    "No memberListLink defined in config.yaml - the original error message is:",
                    e,
                )
                continue
            if not member_list_link:
                print(
                    "No memberListLink defined in config.yaml - the original error message is:",
                    "the entry link is missing or empty",
                )
                continue

            # download the html page of the List of Members
            web_content = self._download(member_list_link)

            # save interim results to files
            self._write_text("crawlers/pages/" + country + "MemberList.html", web_content)

    def parse_member_list_data2dictionary(self, list_of_countries):
        """Extract name and link of every member from the saved list pages.

        Args:
            list_of_countries: Country names whose
                ``crawlers/pages/<country>MemberList.html`` should be parsed.

        For each country a dict ``{n: {"name": ..., "link": ...}}`` is written
        to ``crawlers/output/<country>MemberList.txt``, where ``n`` is the
        1-based XPath position used to select the member node. Links that do
        not contain the configured domain are prefixed with it. Parsing
        errors are printed; whatever was collected up to that point (possibly
        nothing) is still written.
        """
        for country in list_of_countries:
            # Bound before the try block so a failure cannot leave it undefined
            # or carry over the previous country's data.
            dictionary_member_list = {}
            try:
                # use soupparser to handle broken html
                tree = lxml.html.soupparser.parse(
                    "crawlers/pages/" + country + "MemberList.html"
                )

                country_conf = self.config.get(country)
                country_domain = country_conf.get("domain")
                country_conf_member_list = country_conf.get("memberList")
                parent_xpath = country_conf_member_list.get("parent")
                child_name_xpath = country_conf_member_list.get("child-name")
                child_link_xpath = country_conf_member_list.get("child-link")

                member_count = len(tree.xpath(parent_xpath))
                # XPath positional predicates are 1-based: [0] never matches
                # and the last node is only reached at [member_count].
                for n in range(1, member_count + 1):
                    node_xpath = parent_xpath + "[" + str(n) + "]"
                    name = tree.xpath(node_xpath + child_name_xpath)
                    link = tree.xpath(node_xpath + child_link_xpath)

                    # An entry without a link is useless for the download step,
                    # so skip incomplete nodes instead of aborting the country.
                    if not name or not link:
                        continue

                    if country_domain in link[0]:
                        member_link = link[0]
                    else:
                        member_link = country_domain + link[0]
                    dictionary_member_list[n] = {"name": name[0], "link": member_link}
            except Exception as e:
                print(
                    "parsing the html did not work. Possibly you first have to call download_member_list_pages_of_countries(). The original error message is:",
                    e,
                )

            # save interim results to files
            self._save_member_list(country, dictionary_member_list)

    def download_member_data_htmls(self, list_of_countries):
        """Download the personal page of every member listed for each country.

        Args:
            list_of_countries: Country names whose member dictionary has
                already been written by ``parse_member_list_data2dictionary``.

        Pages are saved to ``crawlers/pages/<country>/<member_id>.html``.
        Errors from reading the member list or from ``urllib`` propagate.
        """
        for country in list_of_countries:
            dictionary_member_list = self._load_member_list(country)

            for member_id in dictionary_member_list:
                member_link = dictionary_member_list[member_id]["link"]

                # download the html page of the Member
                web_content = self._download(member_link)

                # save interim results to files
                file_name = "crawlers/pages/" + country + "/" + str(member_id) + ".html"
                self._write_text(file_name, web_content)

    def parse_member_data2dictionary(self, list_of_countries):
        """Add the political party of every member to the saved dictionaries.

        Args:
            list_of_countries: Country names whose member pages have already
                been downloaded by ``download_member_data_htmls``.

        The party is taken from the first match of the configured
        ``member -> info-1`` XPath (``parent`` + ``child-politicalParty``) and
        stored under the key ``"political party"``. The updated dictionary
        replaces ``crawlers/output/<country>MemberList.txt``.
        """
        for country in list_of_countries:
            print("started to parse data of member of " + country + " ..")

            dictionary_member_list = self._load_member_list(country)

            country_conf = self.config.get(country)
            country_conf_member_info1 = country_conf.get("member").get("info-1")
            # The XPath does not depend on the member, so build it once.
            political_party_xpath = country_conf_member_info1.get(
                "parent"
            ) + country_conf_member_info1.get("child-politicalParty")

            for member_id in dictionary_member_list:
                print(
                    "started to parse data of member with name "
                    + dictionary_member_list[member_id]["name"]
                    + " .."
                )

                file_name = "crawlers/pages/" + country + "/" + str(member_id) + ".html"
                tree = lxml.html.soupparser.parse(file_name)

                political_party = tree.xpath(political_party_xpath)
                print("oi", political_party)

                if political_party:
                    dictionary_member_list[member_id]["political party"] = political_party[0]

            self._save_member_list(country, dictionary_member_list)