Automated pipeline for parsing profiles of politically exposed persons (PEPs) into Wikidata.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

112 lines
3.8 KiB

  1. import os
  2. import yaml
  3. import json
  4. class WikidataPEP(object):
  5. def __init__(self, configFile):
  6. with open(configFile, "r") as stream:
  7. try:
  8. self.config = yaml.safe_load(stream)
  9. except yaml.YAMLError as exc:
  10. print(exc)
  11. def importMembersOfParliamentDict(self, listOfCountries):
  12. self.fullDictionaryMemberLists = {}
  13. for country in listOfCountries:
  14. print('started to parse data of members of ' + country + ' ..')
  15. f = open('crawlers/output/' + country +'MemberList.txt')
  16. text = f.read()
  17. self.fullDictionaryMemberLists[country] = eval(text)
  18. #print(self.fullDictionaryMemberLists)
  19. def checkForEntityIds(self, listOfCountries):
  20. from wikibaseintegrator import WikibaseIntegrator
  21. from wikibaseintegrator import wbi_helpers
  22. fullDictionaryMemberLists = self.fullDictionaryMemberLists
  23. for country in listOfCountries:
  24. for memberId in fullDictionaryMemberLists[country].keys():
  25. name = fullDictionaryMemberLists[country][memberId]['name']
  26. results = wbi_helpers.search_entities(search_string=name)
  27. for entityId in results:
  28. wbi = WikibaseIntegrator()
  29. wikidata_item = wbi.item.get(entity_id=entityId)
  30. for claimkey in wikidata_item.get_json()['claims'].keys():
  31. if claimkey == 'P31':
  32. if wikidata_item.get_json()['claims'][claimkey][0]['mainsnak']['datavalue']['value']['id'] == 'Q5':
  33. print(entityId)
  34. print('---------')
  35. print(name)
  36. print('is a human')
  37. def createMemberOnWikidata(self):
  38. from wikibaseintegrator import wbi_login, WikibaseIntegrator
  39. from wikibaseintegrator.datatypes import ExternalID, Item
  40. from wikibaseintegrator.wbi_config import config as wbi_config
  41. wbi_config['USER_AGENT'] = 'PEPimport/1.0 (https://www.wikidata.org/wiki/User:Username)'
  42. # login object
  43. login_instance = wbi_login.OAuth2(consumer_token='', consumer_secret='')
  44. wbi = WikibaseIntegrator(login=login_instance)
  45. # data type object, e.g. for a NCBI gene entrez ID
  46. isHuman = Item(value='Q5', prop_nr='P31')
  47. occupationPolitician = ExternalID(value='Q82955', prop_nr='P106')
  48. occupationDeputy = ExternalID(value='Q1055894', prop_nr='P106')
  49. #referenceURL = URL(value='http://legislacion.asamblea.gob.ni/Tablas%20Generales.nsf/InfoDiputado.xsp?documentId=3D4CFDC4B3006D70062587C5007C29E1&action=openDocument&SessionID=1868803A06AB73D50B7F89BD0AB', prop_nr='P106')
  50. # print(isHuman)
  51. # print(referenceURL)
  52. # data goes into a list, because many data objects can be provided to
  53. data1 = [isHuman]
  54. data2 = [occupationDeputy]
  55. data3 = [occupationPolitician]
  56. # Create a new item
  57. item = wbi.item.new()
  58. # Set an english label
  59. item.labels.set(language='en', value='Carlos Humberto Ruíz')
  60. # Set a French description
  61. item.descriptions.set(language='en', value='Nicaraguan National Assembly Deputy')
  62. item.claims.add(data1)
  63. #item.claims.add(data2)
  64. #item.claims.add(data3)
  65. print(item.write())