@@ -1,186 +0,0 @@
-import os
-import yaml
-import json
-import urllib.request, urllib.error, urllib.parse
-
-from lxml import etree
-import lxml.html
-import lxml.html.soupparser
-
-
-class membersParliamentCrawler(object):
-    def __init__(self, configFile):
-        with open(configFile, "r") as stream:
-            try:
-                self.config = yaml.safe_load(stream)
-            except yaml.YAMLError as exc:
-                print(exc)
-
-    # input list of countries in form of ['nicaragua', 'honduras', .. , 'mexico']
-    def downloadMemberListPagesOfCountries(self, listOfCountries):
-        # download only html pages of the countries specified in input
-        for country in listOfCountries:
-            for key in self.config:
-                if key in listOfCountries:
-                    try:
-                        memberList = self.config.get(key).get("memberList")
-                    except Exception as e:
-                        print(
-                            "There is a problem with the entry memberList in the config.yaml - the original error message is:",
-                            e,
-                        )
-                    try:
-                        memberListLink = memberList.get("link")
-                    except Exception as e:
-                        print(
-                            "No memberListLink defined in config.yaml - the original error message is:",
-                            e,
-                        )
-
-                    # download the html page of the List of Members
-                    response = urllib.request.urlopen(memberListLink)
-                    webContent = response.read().decode("UTF-8")
-
-                    # save interim results to files
-                    f = open("crawlers/pages/" + key + "MemberList.html", "w+")
-                    f.write(webContent)
-                    f.close
-
-    def parseMemberListData2dictionary(self, listOfCountries):
-        for country in listOfCountries:
-            try:
-                # use soupparser to handle broken html
-                tree = lxml.html.soupparser.parse(
-                    "crawlers/pages/" + country + "MemberList.html"
-                )
-
-                # for e in tree.iter():
-                #     print(e.tag)
-                # for e in tree.xpath('//html//body//form//table//tr//td//table//tr'):
-                #     print(etree.tostring(e).decode())
-
-                dictionaryMemberList = {}
-
-                countryConf = self.config.get(country)
-                countryDomain = countryConf.get("domain")
-                countryConfMemberList = countryConf.get("memberList")
-                countryConfMemberListParent = countryConfMemberList.get("parent")
-                countryConfMemberListChildName = countryConfMemberList.get("child-name")
-                countryConfMemberListChildLink = countryConfMemberList.get("child-link")
-
-                for n in range(len(tree.xpath(countryConfMemberListParent))):
-                    name = tree.xpath(
-                        countryConfMemberListParent
-                        + "["
-                        + str(n)
-                        + "]"
-                        + countryConfMemberListChildName
-                    )
-                    link = tree.xpath(
-                        countryConfMemberListParent
-                        + "["
-                        + str(n)
-                        + "]"
-                        + countryConfMemberListChildLink
-                    )
-
-                    if len(name) > 0:
-                        dictionaryMemberList[n] = {}
-                        dictionaryMemberList[n]["name"] = name[0]
-
-                        if countryDomain in link[0]:
-                            dictionaryMemberList[n]["link"] = link[0]
-
-                        if countryDomain not in link[0]:
-                            dictionaryMemberList[n]["link"] = countryDomain + link[0]
-
-            except Exception as e:
-                print(
-                    "parsing the html did not work. Possibly you first have to downloadMemberListPagesOfCountries(). The original error message is:",
-                    e,
-                )
-
-            # save interim results to files
-            f = open("crawlers/output/" + country + "MemberList.txt", "w+")
-            f.write(str(dictionaryMemberList))
-            f.close
-
-    def downloadMemberDataHtmls(self, listOfCountries):
-        for country in listOfCountries:
-            f = open("crawlers/output/" + country + "MemberList.txt")
-            text = f.read()
-            dictionaryMemberList = eval(text)
-
-            for memberid in dictionaryMemberList:
-                memberLink = dictionaryMemberList[memberid]["link"]
-
-                # download the html page of the Member
-                response = urllib.request.urlopen(memberLink)
-                webContent = response.read().decode("UTF-8")
-
-                # save interim results to files
-                filename = "crawlers/pages/" + country + "/" + str(memberid) + ".html"
-                os.makedirs(os.path.dirname(filename), exist_ok=True)
-                f = open(filename, "w+")
-                f.write(webContent)
-                f.close
-
-    def parseMemberData2dictionary(self, listOfCountries):
-        for country in listOfCountries:
-            print("started to parse data of member of " + country + " ..")
-
-            f = open("crawlers/output/" + country + "MemberList.txt")
-            text = f.read()
-            dictionaryMemberList = eval(text)
-
-            countryConf = self.config.get(country)
-            countryDomain = countryConf.get("domain")
-            countryConfMember = countryConf.get("member")
-            countryConfMemberInfo1 = countryConfMember.get("info-1")
-            countryConfMemberInfo1Parent = countryConfMemberInfo1.get("parent")
-            countryConfMemberInfo1ChildPoliticalParty = countryConfMemberInfo1.get(
-                "child-politicalParty"
-            )
-
-            for memberid in dictionaryMemberList:
-                print(
-                    "started to parse data of member with name "
-                    + dictionaryMemberList[memberid]["name"]
-                    + " .."
-                )
-
-                filename = "crawlers/pages/" + country + "/" + str(memberid) + ".html"
-                tree = lxml.html.soupparser.parse(filename)
-
-                politicalParty = tree.xpath(
-                    countryConfMemberInfo1Parent
-                    + countryConfMemberInfo1ChildPoliticalParty
-                )
-                print("oi", politicalParty)
-
-                if len(politicalParty) > 0:
-                    dictionaryMemberList[memberid]["political party"] = politicalParty[0]
-
-            f = open("crawlers/output/" + country + "MemberList.txt", "w+")
-            f.write(str(dictionaryMemberList))
-            f.close
@@ -0,0 +1,186 @@
+import os
+import ast
+import yaml
+import json
+import urllib.request, urllib.error, urllib.parse
+
+from lxml import etree
+import lxml.html
+import lxml.html.soupparser
+
+
+class members_parliament_crawler(object):
+    def __init__(self, config_file):
+        with open(config_file, "r") as stream:
+            try:
+                self.config = yaml.safe_load(stream)
+            except yaml.YAMLError as exc:
+                print(exc)
+
+    # input: list of countries in the form ['nicaragua', 'honduras', .., 'mexico']
+    def download_member_list_pages_of_countries(self, list_of_countries):
+        # download only the html pages of the countries specified in the input
+        for country in list_of_countries:
+            for key in self.config:
+                if key in list_of_countries:
+                    try:
+                        member_list = self.config.get(key).get("memberList")
+                    except Exception as e:
+                        print(
+                            "There is a problem with the entry memberList in config.yaml - the original error message is:",
+                            e,
+                        )
+                        continue
+                    try:
+                        member_list_link = member_list.get("link")
+                    except Exception as e:
+                        print(
+                            "No memberList link defined in config.yaml - the original error message is:",
+                            e,
+                        )
+                        continue
+
+                    # download the html page of the list of members
+                    response = urllib.request.urlopen(member_list_link)
+                    web_content = response.read().decode("UTF-8")
+
+                    # save interim results to files
+                    with open("crawlers/pages/" + key + "MemberList.html", "w+") as f:
+                        f.write(web_content)
+
+    def parse_member_list_data2dictionary(self, list_of_countries):
+        for country in list_of_countries:
+            # initialize before the try block so the write below cannot fail
+            dictionary_member_list = {}
+
+            try:
+                # use soupparser to handle broken html
+                tree = lxml.html.soupparser.parse(
+                    "crawlers/pages/" + country + "MemberList.html"
+                )
+
+                # for e in tree.iter():
+                #     print(e.tag)
+                # for e in tree.xpath('//html//body//form//table//tr//td//table//tr'):
+                #     print(etree.tostring(e).decode())
+
+                country_conf = self.config.get(country)
+                country_domain = country_conf.get("domain")
+                country_conf_member_list = country_conf.get("memberList")
+                country_conf_member_list_parent = country_conf_member_list.get("parent")
+                country_conf_member_list_child_name = country_conf_member_list.get(
+                    "child-name"
+                )
+                country_conf_member_list_child_link = country_conf_member_list.get(
+                    "child-link"
+                )
+
+                # xpath indices are 1-based, so count from 1 up to and including
+                # the number of matched parent elements
+                for n in range(1, len(tree.xpath(country_conf_member_list_parent)) + 1):
+                    name = tree.xpath(
+                        country_conf_member_list_parent
+                        + "["
+                        + str(n)
+                        + "]"
+                        + country_conf_member_list_child_name
+                    )
+                    link = tree.xpath(
+                        country_conf_member_list_parent
+                        + "["
+                        + str(n)
+                        + "]"
+                        + country_conf_member_list_child_link
+                    )
+
+                    if len(name) > 0 and len(link) > 0:
+                        dictionary_member_list[n] = {}
+                        dictionary_member_list[n]["name"] = name[0]
+
+                        # prepend the domain to relative links
+                        if country_domain in link[0]:
+                            dictionary_member_list[n]["link"] = link[0]
+                        else:
+                            dictionary_member_list[n]["link"] = country_domain + link[0]
+
+            except Exception as e:
+                print(
+                    "parsing the html did not work. Possibly you first have to run download_member_list_pages_of_countries(). The original error message is:",
+                    e,
+                )
+
+            # save interim results to files
+            with open("crawlers/output/" + country + "MemberList.txt", "w+") as f:
+                f.write(str(dictionary_member_list))
+
+    def download_member_data_htmls(self, list_of_countries):
+        for country in list_of_countries:
+            with open("crawlers/output/" + country + "MemberList.txt") as f:
+                text = f.read()
+
+            # the interim file contains a python dict literal, so parse it
+            # with ast.literal_eval instead of the unsafe eval
+            dictionary_member_list = ast.literal_eval(text)
+
+            for member_id in dictionary_member_list:
+                member_link = dictionary_member_list[member_id]["link"]
+
+                # download the html page of the member
+                response = urllib.request.urlopen(member_link)
+                web_content = response.read().decode("UTF-8")
+
+                # save interim results to files
+                file_name = "crawlers/pages/" + country + "/" + str(member_id) + ".html"
+                os.makedirs(os.path.dirname(file_name), exist_ok=True)
+                with open(file_name, "w+") as f:
+                    f.write(web_content)
+
+    def parse_member_data2dictionary(self, list_of_countries):
+        for country in list_of_countries:
+            print("started to parse data of members of " + country + " ..")
+
+            with open("crawlers/output/" + country + "MemberList.txt") as f:
+                text = f.read()
+
+            dictionary_member_list = ast.literal_eval(text)
+
+            country_conf = self.config.get(country)
+            country_conf_member = country_conf.get("member")
+            country_conf_member_info1 = country_conf_member.get("info-1")
+            country_conf_member_info1_parent = country_conf_member_info1.get("parent")
+            country_conf_member_info1_child_political_party = country_conf_member_info1.get(
+                "child-politicalParty"
+            )
+
+            for member_id in dictionary_member_list:
+                print(
+                    "started to parse data of the member with name "
+                    + dictionary_member_list[member_id]["name"]
+                    + " .."
+                )
+
+                file_name = "crawlers/pages/" + country + "/" + str(member_id) + ".html"
+                tree = lxml.html.soupparser.parse(file_name)
+
+                political_party = tree.xpath(
+                    country_conf_member_info1_parent
+                    + country_conf_member_info1_child_political_party
+                )
+                print("political party:", political_party)
+
+                if len(political_party) > 0:
+                    dictionary_member_list[member_id]["political party"] = political_party[0]
+
+            # write the enriched dictionary back to the interim file
+            with open("crawlers/output/" + country + "MemberList.txt", "w+") as f:
+                f.write(str(dictionary_member_list))
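
For orientation, the class above only reads a handful of keys from config.yaml. Parsed with yaml.safe_load, the expected structure would look roughly like the sketch below; the country key, URLs, and xpath values are hypothetical placeholders, only the key names come from the lookups in the code.

# hypothetical parsed form of config.yaml; every value is a placeholder
example_config = {
    "nicaragua": {                                      # one top-level key per country
        "domain": "https://www.example-parliament.ni",  # base url prepended to relative links
        "memberList": {
            "link": "https://www.example-parliament.ni/members",  # list-of-members page
            "parent": "//table//tr",           # xpath yielding one node per member
            "child-name": "//td[1]//text()",   # appended to parent[n] to get the name
            "child-link": "//td[1]//a/@href",  # appended to parent[n] to get the link
        },
        "member": {
            "info-1": {
                "parent": "//div[@class='member']",        # detail-page block
                "child-politicalParty": "//span//text()",  # party name inside it
            }
        },
    }
}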
@@ -1,16 +0,0 @@
-from crawlers.MembersParliamentCrawler import *
-
-config = "config.yaml"
-
-listOfCountries = ["nicaragua"]
-
-Crawler = membersParliamentCrawler(config)
-
-# Crawler.downloadMemberListPagesOfCountries(listOfCountries)
-# Crawler.parseMemberListData2dictionary(listOfCountries)
-# Crawler.downloadMemberDataHtmls(listOfCountries)
-Crawler.parseMemberData2dictionary(listOfCountries)
@@ -0,0 +1,16 @@
+from crawlers.members_parliament_crawler import *
+
+config = "config.yaml"
+
+list_of_countries = ["nicaragua"]
+
+crawler = members_parliament_crawler(config)
+
+# crawler.download_member_list_pages_of_countries(list_of_countries)
+# crawler.parse_member_list_data2dictionary(list_of_countries)
+# crawler.download_member_data_htmls(list_of_countries)
+crawler.parse_member_data2dictionary(list_of_countries)
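
Since each step reads the interim files written by the one before it, a first full run would uncomment all four calls in order. A sketch of the same script with every stage enabled:

from crawlers.members_parliament_crawler import *

config = "config.yaml"
list_of_countries = ["nicaragua"]

crawler = members_parliament_crawler(config)

# each step writes files that the next step reads
crawler.download_member_list_pages_of_countries(list_of_countries)  # fetch the list pages
crawler.parse_member_list_data2dictionary(list_of_countries)        # extract names and links
crawler.download_member_data_htmls(list_of_countries)               # fetch one page per member
crawler.parse_member_data2dictionary(list_of_countries)             # add e.g. the political party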
@@ -1,31 +1,31 @@
-from crawlers.MembersParliamentCrawler import *
+from crawlers.members_parliament_crawler import *

-from wikidata.wdPEP import *
+from wikidata.wd_PEP import *

 config = "crawlers/config.yaml"

-listOfCountries = ["nicaragua"]
+list_of_countries = ["nicaragua"]

 # doing the crawling of government websites

-# Crawler = membersParliamentCrawler(config)
+# crawler = members_parliament_crawler(config)

-# Crawler.downloadMemberListPagesOfCountries(listOfCountries)
+# crawler.download_member_list_pages_of_countries(list_of_countries)

-# Crawler.parseMemberListData2dictionary(listOfCountries)
+# crawler.parse_member_list_data2dictionary(list_of_countries)

-# Crawler.downloadMemberDataHtmls(listOfCountries)
+# crawler.download_member_data_htmls(list_of_countries)

-# Crawler.parseMemberData2dictionary(listOfCountries)
+# crawler.parse_member_data2dictionary(list_of_countries)

 # processing the resulting dictionary and creating wikidata queries

-wikidataPEP = WikidataPEP(config)
+wikidata_PEP = Wikidata_PEP(config)

-# wikidataPEP.importMembersOfParliamentDict(listOfCountries)
+# wikidata_PEP.import_members_of_parliament_dict(list_of_countries)

-# wikidataPEP.checkForEntityIds(listOfCountries)
+# wikidata_PEP.check_for_entity_ids(list_of_countries)

-# wikidataPEP.createMemberOnWikidata()
+# wikidata_PEP.create_member_on_wikidata()

-wikidataPEP.editMemberOnWikidata("Q116918332")
+wikidata_PEP.edit_member_on_wikidata("Q116918332")
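
The wikidata.wd_PEP module itself is not part of this diff. Judging only from the call sites above, its interface would look roughly like the following stub; the method bodies, and whether the constructor takes the config path, are assumptions.

# hypothetical stub inferred from the call sites above; not the real implementation
class Wikidata_PEP(object):
    def __init__(self, config_file):
        self.config_file = config_file  # assumed: path to the shared config.yaml

    def import_members_of_parliament_dict(self, list_of_countries):
        ...  # assumed: read crawlers/output/<country>MemberList.txt

    def check_for_entity_ids(self, list_of_countries):
        ...  # assumed: look up existing wikidata items for each member

    def create_member_on_wikidata(self):
        ...

    def edit_member_on_wikidata(self, entity_id):
        ...  # called above with entity_id="Q116918332"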