Automated pipeline for parsing profiles of politically exposed persons (PEP) into Wikidata
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 

161 lines
6.1 KiB

import os
import yaml
import json
class WikidataPEP(object):
    """Load crawled parliament member lists and create/edit matching Wikidata items.

    The Wikidata identifiers used throughout are P31 (instance of), Q5 (human),
    P106 (occupation), Q1055894 (deputy), P854 (reference URL) and
    P813 (retrieved).
    """

    # One user agent for every request so that all edits are attributed the
    # same way (the create path previously sent one with an empty user name).
    USER_AGENT = "PEPimport/1.0 (https://www.wikidata.org/wiki/User:Alpcentaur)"

    # Source page used as the default reference for the occupation statement.
    _DEFAULT_REFERENCE_URL = (
        "http://legislacion.asamblea.gob.ni/Tablas%20Generales.nsf/InfoDiputado.xsp"
        "?documentId=3D4CFDC4B3006D70062587C5007C29E1"
        "&action=openDocument&SessionID=1868803A06AB73D50B7F89BD0AB"
    )

    def __init__(self, configFile):
        """Read the YAML configuration into ``self.config``.

        Args:
            configFile: Path of the YAML configuration file.

        A YAML syntax error is printed rather than raised (best effort); in
        that case ``self.config`` is ``None`` instead of being left undefined.
        """
        self.config = None
        with open(configFile, "r") as stream:
            try:
                self.config = yaml.safe_load(stream)
            except yaml.YAMLError as exc:
                print(exc)

    def importMembersOfParliamentDict(self, listOfCountries):
        """Load the crawler output of every country into memory.

        Reads ``crawlers/output/<country>MemberList.txt`` (relative to the
        current working directory) for each country and stores the parsed
        dictionaries in ``self.fullDictionaryMemberLists``, keyed by country.

        Args:
            listOfCountries: Iterable of country names (file name prefixes).

        Raises:
            OSError: If a member list file cannot be opened.
            ValueError, SyntaxError: If a file does not contain a plain
                Python literal.
        """
        # Function-scope import, matching the style of the other methods.
        import ast

        self.fullDictionaryMemberLists = {}
        for country in listOfCountries:
            print("started to parse data of members of " + country + " ..")
            # The context manager closes the handle even if reading fails.
            with open("crawlers/output/" + country + "MemberList.txt") as f:
                text = f.read()
            # SECURITY: this used to be eval(text), which executes arbitrary
            # code found in the crawler output. literal_eval accepts only
            # plain literals (dicts, lists, strings, numbers, ...).
            self.fullDictionaryMemberLists[country] = ast.literal_eval(text)

    def checkForEntityIds(self, listOfCountries):
        """Print the Wikidata search hits for each member that are humans.

        Every member name is searched on Wikidata; for each returned entity
        whose "instance of" (P31) statements include "human" (Q5) the entity
        id and the member name are printed.

        Args:
            listOfCountries: Countries to check; each must already be present
                in ``self.fullDictionaryMemberLists``.
        """
        from wikibaseintegrator import WikibaseIntegrator
        from wikibaseintegrator import wbi_helpers

        # One client is enough; it does not depend on the loop variables.
        wbi = WikibaseIntegrator()
        for country in listOfCountries:
            for member in self.fullDictionaryMemberLists[country].values():
                name = member["name"]
                for entityId in wbi_helpers.search_entities(search_string=name):
                    # Serialise the item once instead of once per claim key.
                    itemJson = wbi.item.get(entity_id=entityId).get_json()
                    if self._isHuman(itemJson.get("claims", {})):
                        print(entityId)
                        print("---------")
                        print(name)
                        print("is a human")

    @staticmethod
    def _isHuman(claims):
        """Return True if any P31 statement in ``claims`` points to Q5."""
        for statement in claims.get("P31", []):
            # "somevalue"/"novalue" snaks carry no datavalue, so look it up
            # defensively instead of indexing.
            value = statement.get("mainsnak", {}).get("datavalue", {}).get("value")
            if isinstance(value, dict) and value.get("id") == "Q5":
                return True
        return False

    def _connect(self):
        """Set the user agent and return a logged-in WikibaseIntegrator."""
        from wikibaseintegrator import wbi_login, WikibaseIntegrator
        from wikibaseintegrator.wbi_config import config as wbi_config

        wbi_config["USER_AGENT"] = self.USER_AGENT
        # NOTE(review): the OAuth2 credentials are empty here; presumably they
        # have to be supplied (e.g. from self.config) before this can log in
        # -- confirm.
        login_instance = wbi_login.OAuth2(consumer_token="", consumer_secret="")
        return WikibaseIntegrator(login=login_instance)

    def createMemberOnWikidata(
        self,
        label="Carlos Humberto Ruíz",
        description="Nicaraguan National Assembly Deputy",
    ):
        """Create a new Wikidata item for a person and print the write result.

        The item gets an English label, an English description and the
        statement "instance of (P31): human (Q5)". Occupation statements are
        not added here; ``editMemberOnWikidata`` adds one with references.

        Args:
            label: English label of the new item. The default is the deputy
                this method was originally hard-coded for (that item already
                exists as Q116918332).
            description: English description of the new item.
        """
        from wikibaseintegrator.datatypes import Item

        wbi = self._connect()

        isHuman = Item(value="Q5", prop_nr="P31")

        item = wbi.item.new()
        item.labels.set(language="en", value=label)
        item.descriptions.set(language="en", value=description)
        # Claims are passed as a list because several can be added at once.
        item.claims.add([isHuman])
        print(item.write())

    def editMemberOnWikidata(
        self,
        Qid,
        referenceUrl=_DEFAULT_REFERENCE_URL,
        retrievedOn="+2023-02-27T00:00:00Z",
    ):
        """Add a referenced "occupation: deputy" statement to an existing item.

        Args:
            Qid: Entity id of the item to edit, e.g. ``"Q116918332"``.
            referenceUrl: Source page stored as reference URL (P854).
            retrievedOn: Retrieval date of the source (P813) in Wikibase time
                format with day precision.

        The result of the write is printed.
        """
        from wikibaseintegrator.datatypes import URL, Item, Time
        from wikibaseintegrator.wbi_enums import WikibaseDatePrecision

        wbi = self._connect()

        # One reference block holding the source URL and the retrieval date.
        # P854 is a URL-typed property, hence URL rather than ExternalID.
        references = [
            [
                URL(value=referenceUrl, prop_nr="P854"),
                Time(
                    time=retrievedOn,
                    prop_nr="P813",
                    precision=WikibaseDatePrecision.DAY,
                ),
            ]
        ]
        occupationDeputy = Item(
            value="Q1055894", prop_nr="P106", references=references
        )

        item = wbi.item.get(entity_id=Qid)
        # NOTE(review): add() is called with the library's default
        # action-if-exists behaviour, which may replace existing P106
        # statements -- confirm this is intended.
        item.claims.add([occupationDeputy])
        print(item.write())