diff --git a/crawlers/MembersParliamentCrawler.py b/crawlers/MembersParliamentCrawler.py
index b057d87..916fafc 100644
--- a/crawlers/MembersParliamentCrawler.py
+++ b/crawlers/MembersParliamentCrawler.py
@@ -1,5 +1,3 @@
-
-
 import os
 import yaml
 
@@ -12,172 +10,177 @@ import lxml.html
 import lxml.html.soupparser
 
-
-
 class membersParliamentCrawler(object):
-
     def __init__(self, configFile):
-
         with open(configFile, "r") as stream:
             try:
                 self.config = yaml.safe_load(stream)
             except yaml.YAMLError as exc:
                 print(exc)
 
-
-
+    # input list of countries in form of ['nicaragua', 'honduras', .. , 'mexico']
     def downloadMemberListPagesOfCountries(self, listOfCountries):
-
         # download only html pages of the countries specified in input
-
+
         for country in listOfCountries:
             for key in self.config:
                 if key in listOfCountries:
                     try:
-                        memberList = self.config.get(key).get('memberList')
+                        memberList = self.config.get(key).get("memberList")
                     except Exception as e:
-                        print("There is a problem with the entry memberList in the config.yaml - the original error message is:", e)
+                        print(
+                            "There is a problem with the entry memberList in the config.yaml - the original error message is:",
+                            e,
+                        )
                     try:
-                        memberListLink = memberList.get('link')
+                        memberListLink = memberList.get("link")
                     except Exception as e:
-                        print("No memberListLink defined in config.yaml - the original error message is:", e)
-
-                    # download the html page of the List of Members
+                        print(
+                            "No memberListLink defined in config.yaml - the original error message is:",
+                            e,
+                        )
+
+                    # download the html page of the List of Members
                     response = urllib.request.urlopen(memberListLink)
-                    webContent = response.read().decode('UTF-8')
+                    webContent = response.read().decode("UTF-8")
 
                     # save interim results to files
-
-                    f = open('crawlers/pages/' + key +'MemberList.html', 'w+')
+
+                    f = open("crawlers/pages/" + key + "MemberList.html", "w+")
                     f.write(webContent)
                     f.close
 
     def parseMemberListData2dictionary(self, listOfCountries):
-
         for country in listOfCountries:
-
             try:
-
-                #use soupparser to handle broken html
-
-                tree = lxml.html.soupparser.parse('crawlers/pages/' + country + 'MemberList.html')
-
-#                for e in tree.iter():
-#
-#                    print(e.tag)
-#
-#                for e in tree.xpath('//html//body//form//table//tr//td//table//tr'):
-#
-#                    #print(etree.tostring(e).decode())
-
+                # use soupparser to handle broken html
+
+                tree = lxml.html.soupparser.parse(
+                    "crawlers/pages/" + country + "MemberList.html"
+                )
+
+                # for e in tree.iter():
+                #
+                #     print(e.tag)
+                #
+                # for e in tree.xpath('//html//body//form//table//tr//td//table//tr'):
+                #
+                #     #print(etree.tostring(e).decode())
+
                 dictionaryMemberList = {}
-
+
                 countryConf = self.config.get(country)
-                countryDomain = countryConf.get('domain')
-                countryConfMemberList = countryConf.get('memberList')
-                countryConfMemberListParent = countryConfMemberList.get('parent')
-                countryConfMemberListChildName = countryConfMemberList.get('child-name')
-                countryConfMemberListChildLink = countryConfMemberList.get('child-link')
-
+                countryDomain = countryConf.get("domain")
+                countryConfMemberList = countryConf.get("memberList")
+                countryConfMemberListParent = countryConfMemberList.get("parent")
+                countryConfMemberListChildName = countryConfMemberList.get("child-name")
+                countryConfMemberListChildLink = countryConfMemberList.get("child-link")
+
                 for n in range(len(tree.xpath(countryConfMemberListParent))):
-
-                    name = tree.xpath(countryConfMemberListParent + '[' + str(n) + ']' + countryConfMemberListChildName)
-                    link = tree.xpath(countryConfMemberListParent + '[' + str(n) + ']' + countryConfMemberListChildLink)
-
+                    name = tree.xpath(
+                        countryConfMemberListParent
+                        + "["
+                        + str(n)
+                        + "]"
+                        + countryConfMemberListChildName
+                    )
+                    link = tree.xpath(
+                        countryConfMemberListParent
+                        + "["
+                        + str(n)
+                        + "]"
+                        + countryConfMemberListChildLink
+                    )
+
                     if len(name) > 0:
-
                         dictionaryMemberList[n] = {}
-                        dictionaryMemberList[n]['name'] = name[0]
-
+                        dictionaryMemberList[n]["name"] = name[0]
+
                         if countryDomain in link[0]:
-
-                            dictionaryMemberList[n]['link'] = link[0]
-
+                            dictionaryMemberList[n]["link"] = link[0]
+
                         if countryDomain not in link[0]:
-
-                            dictionaryMemberList[n]['link'] = countryDomain + link[0]
-
+                            dictionaryMemberList[n]["link"] = countryDomain + link[0]
+
             except Exception as e:
-
-                print('parsing the html did not work. Possibly you first have to downloadMemberListPagesOfCountries(). The original error message is:', e)
-
+                print(
+                    "parsing the html did not work. Possibly you first have to downloadMemberListPagesOfCountries(). The original error message is:",
+                    e,
+                )
+
             # save interim results to files
-
-            f = open('crawlers/output/' + country +'MemberList.txt', 'w+')
+
+            f = open("crawlers/output/" + country + "MemberList.txt", "w+")
             f.write(str(dictionaryMemberList))
             f.close
 
     def downloadMemberDataHtmls(self, listOfCountries):
-
         for country in listOfCountries:
-
-            f = open('crawlers/output/' + country +'MemberList.txt')
+            f = open("crawlers/output/" + country + "MemberList.txt")
             text = f.read()
-
+
             dictionaryMemberList = eval(text)
-
-
+
             for memberid in dictionaryMemberList:
-
-
-                memberLink = dictionaryMemberList[memberid]['link']
-
-                # download the html page of the Member
+                memberLink = dictionaryMemberList[memberid]["link"]
+
+                # download the html page of the Member
                 response = urllib.request.urlopen(memberLink)
-                webContent = response.read().decode('UTF-8')
+                webContent = response.read().decode("UTF-8")
 
                 # save interim results to files
-
-                filename = 'crawlers/pages/' + country + '/' + str(memberid) +'.html'
-
+
+                filename = "crawlers/pages/" + country + "/" + str(memberid) + ".html"
+
                 os.makedirs(os.path.dirname(filename), exist_ok=True)
-                f = open( filename, 'w+')
+                f = open(filename, "w+")
                 f.write(webContent)
                 f.close
-
-
+
     def parseMemberData2dictionary(self, listOfCountries):
-
         for country in listOfCountries:
-
-            print('started to parse data of member of ' + country + ' ..')
-
-            f = open('crawlers/output/' + country +'MemberList.txt')
+            print("started to parse data of member of " + country + " ..")
+
+            f = open("crawlers/output/" + country + "MemberList.txt")
             text = f.read()
-
+
             dictionaryMemberList = eval(text)
-
-
+
             countryConf = self.config.get(country)
-            countryDomain = countryConf.get('domain')
-            countryConfMember = countryConf.get('member')
-            countryConfMemberInfo1 = countryConfMember.get('info-1')
-            countryConfMemberInfo1Parent = countryConfMemberInfo1.get('parent')
-            countryConfMemberInfo1ChildPoliticalParty = countryConfMemberInfo1.get('child-politicalParty')
-
+            countryDomain = countryConf.get("domain")
+            countryConfMember = countryConf.get("member")
+            countryConfMemberInfo1 = countryConfMember.get("info-1")
+            countryConfMemberInfo1Parent = countryConfMemberInfo1.get("parent")
+            countryConfMemberInfo1ChildPoliticalParty = countryConfMemberInfo1.get(
+                "child-politicalParty"
+            )
+
             for memberid in dictionaryMemberList:
-
-                print('started to parse data of member with name ' + dictionaryMemberList[memberid]['name'] + ' ..')
-
-                filename = 'crawlers/pages/' + country + '/' + str(memberid) +'.html'
-
+                print(
+                    "started to parse data of member with name "
+                    + dictionaryMemberList[memberid]["name"]
+                    + " .."
+                )
+
+                filename = "crawlers/pages/" + country + "/" + str(memberid) + ".html"
+
                 tree = lxml.html.soupparser.parse(filename)
-
-                politicalParty = tree.xpath(countryConfMemberInfo1Parent + countryConfMemberInfo1ChildPoliticalParty)
-
-                print('oi', politicalParty)
-
+
+                politicalParty = tree.xpath(
+                    countryConfMemberInfo1Parent
+                    + countryConfMemberInfo1ChildPoliticalParty
+                )
+
+                print("oi", politicalParty)
+
                 if len(politicalParty) > 0:
-
-                    dictionaryMemberList[memberid]['political party'] = politicalParty[0]
-
-
-
-            f = open('crawlers/output/' + country +'MemberList.txt', 'w+')
+                    dictionaryMemberList[memberid]["political party"] = politicalParty[
+                        0
+                    ]
+
+            f = open("crawlers/output/" + country + "MemberList.txt", "w+")
             f.write(str(dictionaryMemberList))
             f.close
-
diff --git a/crawlers/useMembersParliamentCrawler.py b/crawlers/useMembersParliamentCrawler.py
index d80509e..ec90b62 100644
--- a/crawlers/useMembersParliamentCrawler.py
+++ b/crawlers/useMembersParliamentCrawler.py
@@ -1,18 +1,16 @@
-
 from crawlers.MembersParliamentCrawler import *
 
-
-config = 'config.yaml'
-listOfCountries = ['nicaragua']
+config = "config.yaml"
+listOfCountries = ["nicaragua"]
 
 Crawler = membersParliamentCrawler(config)
 
-#Crawler.downloadMemberListPagesOfCountries(listOfCountries)
+# Crawler.downloadMemberListPagesOfCountries(listOfCountries)
 
-#Crawler.parseMemberListData2dictionary(listOfCountries)
+# Crawler.parseMemberListData2dictionary(listOfCountries)
 
-#Crawler.downloadMemberDataHtmls(listOfCountries)
+# Crawler.downloadMemberDataHtmls(listOfCountries)
 
 Crawler.parseMemberData2dictionary(listOfCountries)
diff --git a/main.py b/main.py
index 26f8f77..61926a4 100644
--- a/main.py
+++ b/main.py
@@ -1,33 +1,31 @@
-
 from crawlers.MembersParliamentCrawler import *
 from wikidata.wdPEP import *
 
-config = 'crawlers/config.yaml'
-listOfCountries = ['nicaragua']
-
+config = "crawlers/config.yaml"
+listOfCountries = ["nicaragua"]
 
 # doing the crawling of government websites
 
-#Crawler = membersParliamentCrawler(config)
+# Crawler = membersParliamentCrawler(config)
 
-#Crawler.downloadMemberListPagesOfCountries(listOfCountries)
+# Crawler.downloadMemberListPagesOfCountries(listOfCountries)
 
-#Crawler.parseMemberListData2dictionary(listOfCountries)
+# Crawler.parseMemberListData2dictionary(listOfCountries)
 
-#Crawler.downloadMemberDataHtmls(listOfCountries)
+# Crawler.downloadMemberDataHtmls(listOfCountries)
 
-#Crawler.parseMemberData2dictionary(listOfCountries)
+# Crawler.parseMemberData2dictionary(listOfCountries)
 
 # processing the resulted dictionary and create wikidata queries
 
 wikidataPEP = WikidataPEP(config)
 
-#wikidataPEP.importMembersOfParliamentDict(listOfCountries)
+# wikidataPEP.importMembersOfParliamentDict(listOfCountries)
 
-#wikidataPEP.checkForEntityIds(listOfCountries)
+# wikidataPEP.checkForEntityIds(listOfCountries)
 
-#wikidataPEP.createMemberOnWikidata()
+# wikidataPEP.createMemberOnWikidata()
 
-wikidataPEP.editMemberOnWikidata('Q116918332')
+wikidataPEP.editMemberOnWikidata("Q116918332")
diff --git a/wikidata/wdPEP.py b/wikidata/wdPEP.py
index f943bd9..8815873 100644
--- a/wikidata/wdPEP.py
+++ b/wikidata/wdPEP.py
@@ -1,133 +1,117 @@
-
-
 import os
 import yaml
 import json
 
-
 class WikidataPEP(object):
-
     def __init__(self, configFile):
-
         with open(configFile, "r") as stream:
             try:
                 self.config = yaml.safe_load(stream)
             except yaml.YAMLError as exc:
                 print(exc)
-
-
+
     def importMembersOfParliamentDict(self, listOfCountries):
-
         self.fullDictionaryMemberLists = {}
-
+
         for country in listOfCountries:
-
-            print('started to parse data of members of ' + country + ' ..')
-
-            f = open('crawlers/output/' + country +'MemberList.txt')
+            print("started to parse data of members of " + country + " ..")
+
+            f = open("crawlers/output/" + country + "MemberList.txt")
             text = f.read()
-
+
             self.fullDictionaryMemberLists[country] = eval(text)
-
-        #print(self.fullDictionaryMemberLists)
-
-
+
+        # print(self.fullDictionaryMemberLists)
+
     def checkForEntityIds(self, listOfCountries):
-
         from wikibaseintegrator import WikibaseIntegrator
         from wikibaseintegrator import wbi_helpers
-
+
         fullDictionaryMemberLists = self.fullDictionaryMemberLists
-
+
         for country in listOfCountries:
-
             for memberId in fullDictionaryMemberLists[country].keys():
-
-                name = fullDictionaryMemberLists[country][memberId]['name']
-
+                name = fullDictionaryMemberLists[country][memberId]["name"]
+
                 results = wbi_helpers.search_entities(search_string=name)
-
-
+
                 for entityId in results:
-
                     wbi = WikibaseIntegrator()
                     wikidata_item = wbi.item.get(entity_id=entityId)
-
-                    for claimkey in wikidata_item.get_json()['claims'].keys():
-
-                        if claimkey == 'P31':
-
-                            if wikidata_item.get_json()['claims'][claimkey][0]['mainsnak']['datavalue']['value']['id'] == 'Q5':
-
+
+                    for claimkey in wikidata_item.get_json()["claims"].keys():
+                        if claimkey == "P31":
+                            if (
+                                wikidata_item.get_json()["claims"][claimkey][0][
+                                    "mainsnak"
+                                ]["datavalue"]["value"]["id"]
+                                == "Q5"
+                            ):
                                 print(entityId)
-                                print('---------')
+                                print("---------")
                                 print(name)
-                                print('is a human')
-
-
-
+                                print("is a human")
 
     def createMemberOnWikidata(self):
-
         from wikibaseintegrator import wbi_login, WikibaseIntegrator
         from wikibaseintegrator.datatypes import ExternalID, Item
         from wikibaseintegrator.wbi_config import config as wbi_config
 
-        wbi_config['USER_AGENT'] = 'PEPimport/1.0 (https://www.wikidata.org/wiki/User:)'
-
+        wbi_config["USER_AGENT"] = "PEPimport/1.0 (https://www.wikidata.org/wiki/User:)"
+
         # login object
-        login_instance = wbi_login.OAuth2(consumer_token='', consumer_secret='')
-
+        login_instance = wbi_login.OAuth2(consumer_token="", consumer_secret="")
+
         wbi = WikibaseIntegrator(login=login_instance)
 
         # data type object, e.g. for a NCBI gene entrez ID
-        isHuman = Item(value='Q5', prop_nr='P31')
-        occupationPolitician = ExternalID(value='Q82955', prop_nr='P106')
-        occupationDeputy = ExternalID(value='Q1055894', prop_nr='P106')
-        #referenceURL = URL(value='http://legislacion.asamblea.gob.ni/Tablas%20Generales.nsf/InfoDiputado.xsp?documentId=3D4CFDC4B3006D70062587C5007C29E1&action=openDocument&SessionID=1868803A06AB73D50B7F89BD0AB', prop_nr='P106')
-
+        isHuman = Item(value="Q5", prop_nr="P31")
+        occupationPolitician = ExternalID(value="Q82955", prop_nr="P106")
+        occupationDeputy = ExternalID(value="Q1055894", prop_nr="P106")
+        # referenceURL = URL(value='http://legislacion.asamblea.gob.ni/Tablas%20Generales.nsf/InfoDiputado.xsp?documentId=3D4CFDC4B3006D70062587C5007C29E1&action=openDocument&SessionID=1868803A06AB73D50B7F89BD0AB', prop_nr='P106')
+
         # print(isHuman)
         # print(referenceURL)
-
+
         # data goes into a list, because many data objects can be provided to
         data1 = [isHuman]
         data2 = [occupationDeputy]
         data3 = [occupationPolitician]
-
+
         # Create a new item
         item = wbi.item.new()
-
+
         # Set an english label
-        item.labels.set(language='en', value='Carlos Humberto Ruíz')
-
+        item.labels.set(language="en", value="Carlos Humberto Ruíz")
+
         # Carlos Humberto Ruiz has the Qid Q116918332
-
+
         # Set a French description
-        item.descriptions.set(language='en', value='Nicaraguan National Assembly Deputy')
-
+        item.descriptions.set(
+            language="en", value="Nicaraguan National Assembly Deputy"
+        )
+
         item.claims.add(data1)
-        #item.claims.add(data2)
-        #item.claims.add(data3)
+        # item.claims.add(data2)
+        # item.claims.add(data3)
 
         print(item.write())
-
-
     def editMemberOnWikidata(self, Qid):
-
         from wikibaseintegrator import wbi_login, WikibaseIntegrator
         from wikibaseintegrator.datatypes import ExternalID, Item, Time, String
         from wikibaseintegrator.wbi_config import config as wbi_config
         from wikibaseintegrator.wbi_enums import ActionIfExists
         from wikibaseintegrator.wbi_enums import WikibaseDatePrecision
-
-
-        wbi_config['USER_AGENT'] = 'PEPimport/1.0 (https://www.wikidata.org/wiki/User:Alpcentaur)'
-
+
+        wbi_config[
+            "USER_AGENT"
+        ] = "PEPimport/1.0 (https://www.wikidata.org/wiki/User:Alpcentaur)"
+
         # login object
-        login_instance = wbi_login.OAuth2(consumer_token='', consumer_secret='')
-
+        login_instance = wbi_login.OAuth2(consumer_token="", consumer_secret="")
+
         wbi = WikibaseIntegrator(login=login_instance)
 
         # data type object, e.g. for a NCBI gene entrez ID
@@ -135,19 +119,26 @@ class WikidataPEP(object):
         # occupationPolitician = Item(value='Q82955', prop_nr='P106')
         # occupationDeputy = Item(value='Q1055894', prop_nr='P106')
         # referenceURL = ExternalID(value='http://legislacion.asamblea.gob.ni/Tablas%20Generales.nsf/InfoDiputado.xsp?documentId=3D4CFDC4B3006D70062587C5007C29E1&action=openDocument&SessionID=1868803A06AB73D50B7F89BD0AB', prop_nr='P854')
-
+
         # print(isHuman)
         # print(referenceURL)
-
+
         references = [
             [
-                ExternalID(value='http://legislacion.asamblea.gob.ni/Tablas%20Generales.nsf/InfoDiputado.xsp?documentId=3D4CFDC4B3006D70062587C5007C29E1&action=openDocument&SessionID=1868803A06AB73D50B7F89BD0AB', prop_nr='P854'),
-                Time(time='+2023-02-27T00:00:00Z', prop_nr='P813', precision=WikibaseDatePrecision.DAY)
+                ExternalID(
+                    value="http://legislacion.asamblea.gob.ni/Tablas%20Generales.nsf/InfoDiputado.xsp?documentId=3D4CFDC4B3006D70062587C5007C29E1&action=openDocument&SessionID=1868803A06AB73D50B7F89BD0AB",
+                    prop_nr="P854",
+                ),
+                Time(
+                    time="+2023-02-27T00:00:00Z",
+                    prop_nr="P813",
+                    precision=WikibaseDatePrecision.DAY,
+                ),
             ]
         ]
-
-        occupationDeputy = Item(value='Q1055894', prop_nr='P106', references=references)
-
+
+        occupationDeputy = Item(value="Q1055894", prop_nr="P106", references=references)
+
         ## data goes into a list, because many data objects can be provided to
         # data1 = [isHuman]
         data2 = [occupationDeputy]
@@ -155,19 +146,16 @@ class WikidataPEP(object):
         # data4 = [referenceURL]
 
         ## get item for Qid
         item = wbi.item.get(entity_id=Qid)
-
+
         # print(item.claims)
-
+
         # Set an english label
-        #item.labels.set(language='en', value='Carlos Humberto Ruíz', action_if_exists=ActionIfExists.KEEP)
-
+        # item.labels.set(language='en', value='Carlos Humberto Ruíz', action_if_exists=ActionIfExists.KEEP)
+
         # Set a French description
-        #item.descriptions.set(language='en', value='Nicaraguan National Assembly Deputy', action_if_exists=ActionIfExists.KEEP)
-
-        #item.claims.add(data4)
+        # item.descriptions.set(language='en', value='Nicaraguan National Assembly Deputy', action_if_exists=ActionIfExists.KEEP)
+
+        # item.claims.add(data4)
         item.claims.add(data2)
-        #item.claims.add(data3)
+        # item.claims.add(data3)
 
         print(item.write())
-
-
-