diff --git a/crawlers/MembersParliamentCrawler.py b/crawlers/MembersParliamentCrawler.py
deleted file mode 100644
index 916fafc..0000000
--- a/crawlers/MembersParliamentCrawler.py
+++ /dev/null
@@ -1,186 +0,0 @@
-import os
-
-import yaml
-import json
-
-import urllib.request, urllib.error, urllib.parse
-
-from lxml import etree
-import lxml.html
-import lxml.html.soupparser
-
-
-class membersParliamentCrawler(object):
-    def __init__(self, configFile):
-        with open(configFile, "r") as stream:
-            try:
-                self.config = yaml.safe_load(stream)
-            except yaml.YAMLError as exc:
-                print(exc)
-
-    # input list of countries in form of ['nicaragua', 'honduras', .. , 'mexico']
-
-    def downloadMemberListPagesOfCountries(self, listOfCountries):
-        # download only html pages of the countries specified in input
-
-        for country in listOfCountries:
-            for key in self.config:
-                if key in listOfCountries:
-                    try:
-                        memberList = self.config.get(key).get("memberList")
-                    except Exception as e:
-                        print(
-                            "There is a problem with the entry memberList in the config.yaml - the original error message is:",
-                            e,
-                        )
-                    try:
-                        memberListLink = memberList.get("link")
-                    except Exception as e:
-                        print(
-                            "No memberListLink defined in config.yaml - the original error message is:",
-                            e,
-                        )
-
-                    # download the html page of the List of Members
-
-                    response = urllib.request.urlopen(memberListLink)
-                    webContent = response.read().decode("UTF-8")
-
-                    # save interim results to files
-
-                    f = open("crawlers/pages/" + key + "MemberList.html", "w+")
-                    f.write(webContent)
-                    f.close
-
-    def parseMemberListData2dictionary(self, listOfCountries):
-        for country in listOfCountries:
-            try:
-                # use soupparser to handle broken html
-
-                tree = lxml.html.soupparser.parse(
-                    "crawlers/pages/" + country + "MemberList.html"
-                )
-
-                # for e in tree.iter():
-                #
-                #     print(e.tag)
-                #
-                # for e in tree.xpath('//html//body//form//table//tr//td//table//tr'):
-                #
-                #     #print(etree.tostring(e).decode())
-
-                dictionaryMemberList = {}
-
-                countryConf = self.config.get(country)
-                countryDomain = countryConf.get("domain")
-                countryConfMemberList = countryConf.get("memberList")
-                countryConfMemberListParent = countryConfMemberList.get("parent")
-                countryConfMemberListChildName = countryConfMemberList.get("child-name")
-                countryConfMemberListChildLink = countryConfMemberList.get("child-link")
-
-                for n in range(len(tree.xpath(countryConfMemberListParent))):
-                    name = tree.xpath(
-                        countryConfMemberListParent
-                        + "["
-                        + str(n)
-                        + "]"
-                        + countryConfMemberListChildName
-                    )
-                    link = tree.xpath(
-                        countryConfMemberListParent
-                        + "["
-                        + str(n)
-                        + "]"
-                        + countryConfMemberListChildLink
-                    )
-
-                    if len(name) > 0:
-                        dictionaryMemberList[n] = {}
-                        dictionaryMemberList[n]["name"] = name[0]
-
-                        if countryDomain in link[0]:
-                            dictionaryMemberList[n]["link"] = link[0]
-
-                        if countryDomain not in link[0]:
-                            dictionaryMemberList[n]["link"] = countryDomain + link[0]
-
-            except Exception as e:
-                print(
-                    "parsing the html did not work. Possibly you first have to downloadMemberListPagesOfCountries(). The original error message is:",
The original error message is:", - e, - ) - - # save interim results to files - - f = open("crawlers/output/" + country + "MemberList.txt", "w+") - f.write(str(dictionaryMemberList)) - f.close - - def downloadMemberDataHtmls(self, listOfCountries): - for country in listOfCountries: - f = open("crawlers/output/" + country + "MemberList.txt") - text = f.read() - - dictionaryMemberList = eval(text) - - for memberid in dictionaryMemberList: - memberLink = dictionaryMemberList[memberid]["link"] - - # download the html page of the Member - - response = urllib.request.urlopen(memberLink) - webContent = response.read().decode("UTF-8") - - # save interim results to files - - filename = "crawlers/pages/" + country + "/" + str(memberid) + ".html" - - os.makedirs(os.path.dirname(filename), exist_ok=True) - f = open(filename, "w+") - f.write(webContent) - f.close - - def parseMemberData2dictionary(self, listOfCountries): - for country in listOfCountries: - print("started to parse data of member of " + country + " ..") - - f = open("crawlers/output/" + country + "MemberList.txt") - text = f.read() - - dictionaryMemberList = eval(text) - - countryConf = self.config.get(country) - countryDomain = countryConf.get("domain") - countryConfMember = countryConf.get("member") - countryConfMemberInfo1 = countryConfMember.get("info-1") - countryConfMemberInfo1Parent = countryConfMemberInfo1.get("parent") - countryConfMemberInfo1ChildPoliticalParty = countryConfMemberInfo1.get( - "child-politicalParty" - ) - - for memberid in dictionaryMemberList: - print( - "started to parse data of member with name " - + dictionaryMemberList[memberid]["name"] - + " .." - ) - - filename = "crawlers/pages/" + country + "/" + str(memberid) + ".html" - - tree = lxml.html.soupparser.parse(filename) - - politicalParty = tree.xpath( - countryConfMemberInfo1Parent - + countryConfMemberInfo1ChildPoliticalParty - ) - - print("oi", politicalParty) - - if len(politicalParty) > 0: - dictionaryMemberList[memberid]["political party"] = politicalParty[ - 0 - ] - - f = open("crawlers/output/" + country + "MemberList.txt", "w+") - f.write(str(dictionaryMemberList)) - f.close diff --git a/crawlers/members_parliament_crawler.py b/crawlers/members_parliament_crawler.py new file mode 100644 index 0000000..645a937 --- /dev/null +++ b/crawlers/members_parliament_crawler.py @@ -0,0 +1,186 @@ +import os + +import yaml +import json + +import urllib.request, urllib.error, urllib.parse + +from lxml import etree +import lxml.html +import lxml.html.soupparser + + +class members_parliament_crawler(object): + def __init__(self, config_file): + with open(config_file, "r") as stream: + try: + self.config = yaml.safe_load(stream) + except yaml.YAMLError as exc: + print(exc) + + # input list of countries in form of ['nicaragua', 'honduras', .. 
+
+    def download_member_list_pages_of_countries(self, list_of_countries):
+        # download only html pages of the countries specified in input
+
+        for country in list_of_countries:
+            for key in self.config:
+                if key in list_of_countries:
+                    try:
+                        member_list = self.config.get(key).get("memberList")
+                    except Exception as e:
+                        print(
+                            "There is a problem with the entry memberList in the config.yaml - the original error message is:",
+                            e,
+                        )
+                    try:
+                        member_list_link = member_list.get("link")
+                    except Exception as e:
+                        print(
+                            "No memberListLink defined in config.yaml - the original error message is:",
+                            e,
+                        )
+
+                    # download the html page of the List of Members
+
+                    response = urllib.request.urlopen(member_list_link)
+                    web_content = response.read().decode("UTF-8")
+
+                    # save interim results to files
+
+                    f = open("crawlers/pages/" + key + "MemberList.html", "w+")
+                    f.write(web_content)
+                    f.close()
+
+    def parse_member_list_data2dictionary(self, list_of_countries):
+        for country in list_of_countries:
+            try:
+                # use soupparser to handle broken html
+
+                tree = lxml.html.soupparser.parse(
+                    "crawlers/pages/" + country + "MemberList.html"
+                )
+
+                # for e in tree.iter():
+                #
+                #     print(e.tag)
+                #
+                # for e in tree.xpath('//html//body//form//table//tr//td//table//tr'):
+                #
+                #     #print(etree.tostring(e).decode())
+
+                dictionary_member_list = {}
+
+                country_conf = self.config.get(country)
+                country_domain = country_conf.get("domain")
+                country_conf_member_list = country_conf.get("memberList")
+                country_conf_member_list_parent = country_conf_member_list.get("parent")
+                country_conf_member_list_child_name = country_conf_member_list.get("child-name")
+                country_conf_member_list_child_link = country_conf_member_list.get("child-link")
+
+                for n in range(len(tree.xpath(country_conf_member_list_parent))):
+                    name = tree.xpath(
+                        country_conf_member_list_parent
+                        + "["
+                        + str(n)
+                        + "]"
+                        + country_conf_member_list_child_name
+                    )
+                    link = tree.xpath(
+                        country_conf_member_list_parent
+                        + "["
+                        + str(n)
+                        + "]"
+                        + country_conf_member_list_child_link
+                    )
+
+                    if len(name) > 0:
+                        dictionary_member_list[n] = {}
+                        dictionary_member_list[n]["name"] = name[0]
+
+                        if country_domain in link[0]:
+                            dictionary_member_list[n]["link"] = link[0]
+
+                        if country_domain not in link[0]:
+                            dictionary_member_list[n]["link"] = country_domain + link[0]
+
+            except Exception as e:
+                print(
+                    "parsing the html did not work. Possibly you first have to download_member_list_pages_of_countries(). The original error message is:",
The original error message is:", + e, + ) + + # save interim results to files + + f = open("crawlers/output/" + country + "MemberList.txt", "w+") + f.write(str(dictionary_member_list)) + f.close + + def download_member_data_htmls(self, list_of_countries): + for country in list_of_countries: + f = open("crawlers/output/" + country + "MemberList.txt") + text = f.read() + + dictionary_member_list = eval(text) + + for member_id in dictionary_member_list: + member_link = dictionary_member_list[member_id]["link"] + + # download the html page of the Member + + response = urllib.request.urlopen(member_link) + web_content = response.read().decode("UTF-8") + + # save interim results to files + + file_name = "crawlers/pages/" + country + "/" + str(member_id) + ".html" + + os.makedirs(os.path.dirname(file_name), exist_ok=True) + f = open(file_name, "w+") + f.write(web_content) + f.close + + def parse_member_data2dictionary(self, list_of_countries): + for country in list_of_countries: + print("started to parse data of member of " + country + " ..") + + f = open("crawlers/output/" + country + "MemberList.txt") + text = f.read() + + dictionary_member_list = eval(text) + + country_conf = self.config.get(country) + country_domain = country_conf.get("domain") + country_conf_member = country_conf.get("member") + country_conf_member_info1 = country_conf_member.get("info-1") + country_conf_member_info1_parent = country_conf_member_info1.get("parent") + country_conf_member_info1_child_political_party = country_conf_member_info1.get( + "child-politicalParty" + ) + + for member_id in dictionary_member_list: + print( + "started to parse data of member with name " + + dictionary_member_list[member_id]["name"] + + " .." + ) + + file_name = "crawlers/pages/" + country + "/" + str(member_id) + ".html" + + tree = lxml.html.soupparser.parse(file_name) + + political_party = tree.xpath( + country_conf_member_info1_parent + + country_conf_member_info1_child_political_party + ) + + print("oi", political_party) + + if len(political_party) > 0: + dictionary_member_list[member_id]["political party"] = political_party[ + 0 + ] + + f = open("crawlers/output/" + country + "MemberList.txt", "w+") + f.write(str(dictionary_member_list)) + f.close diff --git a/crawlers/useMembersParliamentCrawler.py b/crawlers/useMembersParliamentCrawler.py deleted file mode 100644 index ec90b62..0000000 --- a/crawlers/useMembersParliamentCrawler.py +++ /dev/null @@ -1,16 +0,0 @@ -from crawlers.MembersParliamentCrawler import * - - -config = "config.yaml" -listOfCountries = ["nicaragua"] - - -Crawler = membersParliamentCrawler(config) - -# Crawler.downloadMemberListPagesOfCountries(listOfCountries) - -# Crawler.parseMemberListData2dictionary(listOfCountries) - -# Crawler.downloadMemberDataHtmls(listOfCountries) - -Crawler.parseMemberData2dictionary(listOfCountries) diff --git a/crawlers/use_members_parliament_crawler.py b/crawlers/use_members_parliament_crawler.py new file mode 100644 index 0000000..b912a7d --- /dev/null +++ b/crawlers/use_members_parliament_crawler.py @@ -0,0 +1,16 @@ +from crawlers.members_parliament_crawler import * + + +config = "config.yaml" +list_of_countries = ["nicaragua"] + + +crawler = members_parliament_crawler(config) + +# crawler.download_member_list_pages_of_countries(list_of_countries) + +# crawler.parse_member_list_data2dictionary(list_of_countries) + +# crawler.download_member_data_htmls(list_of_countries) + +crawler.parse_member_data2dictionary(list_of_countries) diff --git a/main.py b/main.py index 61926a4..35275a0 
--- a/main.py
+++ b/main.py
@@ -1,31 +1,31 @@
-from crawlers.MembersParliamentCrawler import *
+from crawlers.members_parliament_crawler import *
 
-from wikidata.wdPEP import *
+from wikidata.wd_PEP import *
 
 config = "crawlers/config.yaml"
-listOfCountries = ["nicaragua"]
+list_of_countries = ["nicaragua"]
 
 # doing the crawling of government websites
 
-# Crawler = membersParliamentCrawler(config)
+# crawler = members_parliament_crawler(config)
 
-# Crawler.downloadMemberListPagesOfCountries(listOfCountries)
+# crawler.download_member_list_pages_of_countries(list_of_countries)
 
-# Crawler.parseMemberListData2dictionary(listOfCountries)
+# crawler.parse_member_list_data2dictionary(list_of_countries)
 
-# Crawler.downloadMemberDataHtmls(listOfCountries)
+# crawler.download_member_data_htmls(list_of_countries)
 
-# Crawler.parseMemberData2dictionary(listOfCountries)
+# crawler.parse_member_data2dictionary(list_of_countries)
 
 # processing the resulted dictionary and create wikidata queries
 
-wikidataPEP = WikidataPEP(config)
+wikidata_pep = wikidata_PEP(config)
 
-# wikidataPEP.importMembersOfParliamentDict(listOfCountries)
+# wikidata_pep.import_members_of_parliament_dict(list_of_countries)
 
-# wikidataPEP.checkForEntityIds(listOfCountries)
+# wikidata_pep.check_for_entity_ids(list_of_countries)
 
-# wikidataPEP.createMemberOnWikidata()
+# wikidata_pep.create_member_on_wikidata()
 
-wikidataPEP.editMemberOnWikidata("Q116918332")
+wikidata_pep.edit_member_on_wikidata("Q116918332")
diff --git a/wikidata/wdPEP.py b/wikidata/wd_PEP.py
similarity index 72%
rename from wikidata/wdPEP.py
rename to wikidata/wd_PEP.py
index 8815873..99e1bf2 100644
--- a/wikidata/wdPEP.py
+++ b/wikidata/wd_PEP.py
@@ -4,57 +4,57 @@
 import yaml
 import json
 
-class WikidataPEP(object):
-    def __init__(self, configFile):
-        with open(configFile, "r") as stream:
+class wikidata_PEP(object):
+    def __init__(self, config_file):
+        with open(config_file, "r") as stream:
             try:
                 self.config = yaml.safe_load(stream)
             except yaml.YAMLError as exc:
                 print(exc)
 
-    def importMembersOfParliamentDict(self, listOfCountries):
-        self.fullDictionaryMemberLists = {}
+    def import_members_of_parliament_dict(self, list_of_countries):
+        self.full_dictionary_member_lists = {}
 
-        for country in listOfCountries:
+        for country in list_of_countries:
             print("started to parse data of members of " + country + " ..")
 
             f = open("crawlers/output/" + country + "MemberList.txt")
             text = f.read()
 
-            self.fullDictionaryMemberLists[country] = eval(text)
+            self.full_dictionary_member_lists[country] = eval(text)
 
-        # print(self.fullDictionaryMemberLists)
+        # print(self.full_dictionary_member_lists)
 
-    def checkForEntityIds(self, listOfCountries):
+    def check_for_entity_ids(self, list_of_countries):
         from wikibaseintegrator import WikibaseIntegrator
         from wikibaseintegrator import wbi_helpers
 
-        fullDictionaryMemberLists = self.fullDictionaryMemberLists
+        full_dictionary_member_lists = self.full_dictionary_member_lists
 
-        for country in listOfCountries:
-            for memberId in fullDictionaryMemberLists[country].keys():
-                name = fullDictionaryMemberLists[country][memberId]["name"]
+        for country in list_of_countries:
+            for member_id in full_dictionary_member_lists[country].keys():
+                name = full_dictionary_member_lists[country][member_id]["name"]
 
                 results = wbi_helpers.search_entities(search_string=name)
 
-                for entityId in results:
+                for entity_id in results:
                     wbi = WikibaseIntegrator()
-                    wikidata_item = wbi.item.get(entity_id=entityId)
+                    wikidata_item = wbi.item.get(entity_id=entity_id)
 
-                    for claimkey in wikidata_item.get_json()["claims"].keys():
wikidata_item.get_json()["claims"].keys(): - if claimkey == "P31": + for claim_key in wikidata_item.get_json()["claims"].keys(): + if claim_key == "P31": if ( - wikidata_item.get_json()["claims"][claimkey][0][ + wikidata_item.get_json()["claims"][claim_key][0][ "mainsnak" ]["datavalue"]["value"]["id"] == "Q5" ): - print(entityId) + print(entity_id) print("---------") print(name) print("is a human") - def createMemberOnWikidata(self): + def create_member_on_wikidata(self): from wikibaseintegrator import wbi_login, WikibaseIntegrator from wikibaseintegrator.datatypes import ExternalID, Item from wikibaseintegrator.wbi_config import config as wbi_config @@ -67,18 +67,18 @@ class WikidataPEP(object): wbi = WikibaseIntegrator(login=login_instance) # data type object, e.g. for a NCBI gene entrez ID - isHuman = Item(value="Q5", prop_nr="P31") - occupationPolitician = ExternalID(value="Q82955", prop_nr="P106") - occupationDeputy = ExternalID(value="Q1055894", prop_nr="P106") + is_human = Item(value="Q5", prop_nr="P31") + occupation_politician = ExternalID(value="Q82955", prop_nr="P106") + occupation_deputy = ExternalID(value="Q1055894", prop_nr="P106") # referenceURL = URL(value='http://legislacion.asamblea.gob.ni/Tablas%20Generales.nsf/InfoDiputado.xsp?documentId=3D4CFDC4B3006D70062587C5007C29E1&action=openDocument&SessionID=1868803A06AB73D50B7F89BD0AB', prop_nr='P106') - # print(isHuman) + # print(is_human) # print(referenceURL) # data goes into a list, because many data objects can be provided to - data1 = [isHuman] - data2 = [occupationDeputy] - data3 = [occupationPolitician] + data1 = [is_human] + data2 = [occupation_deputy] + data3 = [occupation_politician] # Create a new item item = wbi.item.new() @@ -115,12 +115,12 @@ class WikidataPEP(object): wbi = WikibaseIntegrator(login=login_instance) # data type object, e.g. for a NCBI gene entrez ID - # isHuman = Item(value='Q5', prop_nr='P31') - # occupationPolitician = Item(value='Q82955', prop_nr='P106') - # occupationDeputy = Item(value='Q1055894', prop_nr='P106') + # is_human = Item(value='Q5', prop_nr='P31') + # occupation_politician = Item(value='Q82955', prop_nr='P106') + # occupation_deputy = Item(value='Q1055894', prop_nr='P106') # referenceURL = ExternalID(value='http://legislacion.asamblea.gob.ni/Tablas%20Generales.nsf/InfoDiputado.xsp?documentId=3D4CFDC4B3006D70062587C5007C29E1&action=openDocument&SessionID=1868803A06AB73D50B7F89BD0AB', prop_nr='P854') - # print(isHuman) + # print(is_human) # print(referenceURL) references = [ @@ -137,12 +137,12 @@ class WikidataPEP(object): ] ] - occupationDeputy = Item(value="Q1055894", prop_nr="P106", references=references) + occupation_deputy = Item(value="Q1055894", prop_nr="P106", references=references) ## data goes into a list, because many data objects can be provided to - # data1 = [isHuman] - data2 = [occupationDeputy] - # data3 = [occupationPolitician] + # data1 = [is_human] + data2 = [occupation_deputy] + # data3 = [occupation_politician] # data4 = [referenceURL] ## get item for Qid item = wbi.item.get(entity_id=Qid)