Automated pipeline for parsing profiles of politically exposed persons (PEP) into Wikidata
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 

70 lines
2.2 KiB

import os
import yaml
import json
class WikidataPEP(object):
    """Load crawled member-of-parliament lists and look the members up on Wikidata.

    Instance state:
      * ``config`` -- the parsed YAML configuration (``None`` if it could
        not be parsed).
      * ``fullDictionaryMemberLists`` -- ``{country: parsed member list}``,
        populated by :meth:`importMembersOfParliamentDict`.
    """

    def __init__(self, configFile):
        """Read the YAML configuration stored at ``configFile``.

        A YAML syntax error is printed rather than raised; in that case
        ``self.config`` is left as ``None`` so the attribute always exists.
        Errors from opening the file itself still propagate.
        """
        self.config = None
        with open(configFile, "r") as stream:
            try:
                self.config = yaml.safe_load(stream)
            except yaml.YAMLError as exc:
                print(exc)

    def importMembersOfParliamentDict(self, listOfCountries):
        """Load ``crawlers/output/<country>MemberList.txt`` for each country.

        The result replaces ``self.fullDictionaryMemberLists`` and maps each
        country name to the value obtained by evaluating its file's content.

        Args:
            listOfCountries: iterable of country names used to build the
                file names.
        """
        self.fullDictionaryMemberLists = {}
        for country in listOfCountries:
            print('started to parse data of members of ' + country + ' ..')
            # The context manager closes the file even if reading fails.
            with open('crawlers/output/' + country + 'MemberList.txt') as memberFile:
                text = memberFile.read()
            # NOTE(review): eval() executes whatever the file contains. It is
            # kept so existing crawler output keeps loading unchanged; if the
            # files are always plain literals, ast.literal_eval (or JSON)
            # would be the safer choice -- confirm the file format first.
            self.fullDictionaryMemberLists[country] = eval(text)

    @staticmethod
    def _isHuman(claims):
        """Return True if any P31 ("instance of") claim has the value Q5 (human).

        ``claims`` is the ``'claims'`` mapping of an entity's JSON. Snaks
        without a ``datavalue`` (unknown / no value) are skipped instead of
        raising ``KeyError``.
        """
        for claim in claims.get('P31', []):
            value = claim.get('mainsnak', {}).get('datavalue', {}).get('value')
            if isinstance(value, dict) and value.get('id') == 'Q5':
                return True
        return False

    def checkForEntityIds(self, listOfCountries):
        """Print every search hit for each member's name that is a human.

        For each member of each requested country, the member's ``'name'`` is
        passed to ``wbi_helpers.search_entities``; every returned entity is
        fetched and reported on stdout when one of its P31 claims is Q5.
        Requires :meth:`importMembersOfParliamentDict` to have been called
        for the same countries.

        Args:
            listOfCountries: iterable of country names, each a key of
                ``self.fullDictionaryMemberLists``.
        """
        # Imported here, as in the original, so the class can be used for
        # loading data without wikibaseintegrator being importable.
        from wikibaseintegrator import WikibaseIntegrator
        from wikibaseintegrator import wbi_helpers

        # Built once and reused for every lookup instead of once per hit.
        wbi = WikibaseIntegrator()
        for country in listOfCountries:
            for member in self.fullDictionaryMemberLists[country].values():
                name = member['name']
                for entityId in wbi_helpers.search_entities(search_string=name):
                    entityJson = wbi.item.get(entity_id=entityId).get_json()
                    if self._isHuman(entityJson.get('claims', {})):
                        print(entityId)
                        print('---------')
                        print(name)
                        print('is a human')