Automated pipeline for parsing profiles of politically exposed persons (PEP) into Wikidata
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

156 lines
5.7 KiB

  1. import os
  2. import yaml
  3. import json
  4. class WikidataPEP(object):
  5. def __init__(self, configFile):
  6. with open(configFile, "r") as stream:
  7. try:
  8. self.config = yaml.safe_load(stream)
  9. except yaml.YAMLError as exc:
  10. print(exc)
  11. def importMembersOfParliamentDict(self, listOfCountries):
  12. self.fullDictionaryMemberLists = {}
  13. for country in listOfCountries:
  14. print('started to parse data of members of ' + country + ' ..')
  15. f = open('crawlers/output/' + country +'MemberList.txt')
  16. text = f.read()
  17. self.fullDictionaryMemberLists[country] = eval(text)
  18. #print(self.fullDictionaryMemberLists)
  19. def checkForEntityIds(self, listOfCountries):
  20. from wikibaseintegrator import WikibaseIntegrator
  21. from wikibaseintegrator import wbi_helpers
  22. fullDictionaryMemberLists = self.fullDictionaryMemberLists
  23. for country in listOfCountries:
  24. for memberId in fullDictionaryMemberLists[country].keys():
  25. name = fullDictionaryMemberLists[country][memberId]['name']
  26. results = wbi_helpers.search_entities(search_string=name)
  27. for entityId in results:
  28. wbi = WikibaseIntegrator()
  29. wikidata_item = wbi.item.get(entity_id=entityId)
  30. for claimkey in wikidata_item.get_json()['claims'].keys():
  31. if claimkey == 'P31':
  32. if wikidata_item.get_json()['claims'][claimkey][0]['mainsnak']['datavalue']['value']['id'] == 'Q5':
  33. print(entityId)
  34. print('---------')
  35. print(name)
  36. print('is a human')
  37. def createMemberOnWikidata(self):
  38. from wikibaseintegrator import wbi_login, WikibaseIntegrator
  39. from wikibaseintegrator.datatypes import ExternalID, Item
  40. from wikibaseintegrator.wbi_config import config as wbi_config
  41. wbi_config['USER_AGENT'] = 'PEPimport/1.0 (https://www.wikidata.org/wiki/User:)'
  42. # login object
  43. login_instance = wbi_login.OAuth2(consumer_token='', consumer_secret='')
  44. wbi = WikibaseIntegrator(login=login_instance)
  45. # data type object, e.g. for a NCBI gene entrez ID
  46. isHuman = Item(value='Q5', prop_nr='P31')
  47. occupationPolitician = ExternalID(value='Q82955', prop_nr='P106')
  48. occupationDeputy = ExternalID(value='Q1055894', prop_nr='P106')
  49. #referenceURL = URL(value='http://legislacion.asamblea.gob.ni/Tablas%20Generales.nsf/InfoDiputado.xsp?documentId=3D4CFDC4B3006D70062587C5007C29E1&action=openDocument&SessionID=1868803A06AB73D50B7F89BD0AB', prop_nr='P106')
  50. # print(isHuman)
  51. # print(referenceURL)
  52. # data goes into a list, because many data objects can be provided to
  53. data1 = [isHuman]
  54. data2 = [occupationDeputy]
  55. data3 = [occupationPolitician]
  56. # Create a new item
  57. item = wbi.item.new()
  58. # Set an english label
  59. item.labels.set(language='en', value='Carlos Humberto Ruíz')
  60. # Set a French description
  61. item.descriptions.set(language='en', value='Nicaraguan National Assembly Deputy')
  62. item.claims.add(data1)
  63. #item.claims.add(data2)
  64. #item.claims.add(data3)
  65. print(item.write())
  66. def editMemberOnWikidata(self):
  67. from wikibaseintegrator import wbi_login, WikibaseIntegrator
  68. from wikibaseintegrator.datatypes import ExternalID, Item
  69. from wikibaseintegrator.wbi_config import config as wbi_config
  70. from wikibaseintegrator.wbi_enums import ActionIfExists
  71. wbi_config['USER_AGENT'] = 'PEPimport/1.0 (https://www.wikidata.org/wiki/User:Username)'
  72. # login object
  73. login_instance = wbi_login.OAuth2(consumer_token='', consumer_secret='')
  74. wbi = WikibaseIntegrator(login=login_instance)
  75. # data type object, e.g. for a NCBI gene entrez ID
  76. isHuman = Item(value='Q5', prop_nr='P31')
  77. occupationPolitician = Item(value='Q82955', prop_nr='P106')
  78. occupationDeputy = Item(value='Q1055894', prop_nr='P106')
  79. referenceURL = ExternalID(value='http://legislacion.asamblea.gob.ni/Tablas%20Generales.nsf/InfoDiputado.xsp?documentId=3D4CFDC4B3006D70062587C5007C29E1&action=openDocument&SessionID=1868803A06AB73D50B7F89BD0AB', prop_nr='P854')
  80. # print(isHuman)
  81. # print(referenceURL)
  82. # data goes into a list, because many data objects can be provided to
  83. data1 = [isHuman]
  84. data2 = [occupationDeputy]
  85. data3 = [occupationPolitician]
  86. data4 = [referenceURL]
  87. # get item for Qid
  88. item = wbi.item.get(entity_id='Q116918332')
  89. # Set an english label
  90. item.labels.set(language='en', value='Carlos Humberto Ruíz', action_if_exists=ActionIfExists.KEEP)
  91. # Set a French description
  92. item.descriptions.set(language='en', value='Nicaraguan National Assembly Deputy', action_if_exists=ActionIfExists.KEEP)
  93. item.claims.add(data4)
  94. #item.claims.add(data2)
  95. #item.claims.add(data3)
  96. print(item.write())