Automated pipeline for parsing profiles of politically exposed persons (PEPs) into Wikidata
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

173 lines
6.4 KiB

  1. import os
  2. import yaml
  3. import json
  4. class WikidataPEP(object):
  def __init__(self, configFile):
      """Load the pipeline configuration from a YAML file.

      Args:
          configFile: Path of the YAML configuration file to read.

      The parsed document is stored on ``self.config``. A YAML syntax error
      is printed rather than raised; in that case ``self.config`` is
      ``None``, so later code can test for a missing configuration instead
      of failing with ``AttributeError``.

      Raises:
          OSError: If the file cannot be opened.
      """
      # Define the attribute up front so it exists even when parsing fails.
      self.config = None
      with open(configFile, "r") as stream:
          try:
              self.config = yaml.safe_load(stream)
          except yaml.YAMLError as exc:
              # Best effort: report the problem and continue without config.
              print(exc)
  11. def importMembersOfParliamentDict(self, listOfCountries):
  12. self.fullDictionaryMemberLists = {}
  13. for country in listOfCountries:
  14. print('started to parse data of members of ' + country + ' ..')
  15. f = open('crawlers/output/' + country +'MemberList.txt')
  16. text = f.read()
  17. self.fullDictionaryMemberLists[country] = eval(text)
  18. #print(self.fullDictionaryMemberLists)
  def checkForEntityIds(self, listOfCountries):
      """Search Wikidata for every imported member name and report human matches.

      Args:
          listOfCountries: Iterable of country names; each must be a key of
              ``self.fullDictionaryMemberLists`` (filled by
              ``importMembersOfParliamentDict``).

      For every member, the ``'name'`` entry is sent to the entity search.
      Each returned entity is fetched, and when the first value of its P31
      claim is Q5 the entity id, a separator line, the name and
      ``'is a human'`` are printed. Nothing is returned.

      NOTE(review): this method was recovered from a listing without
      indentation; the four prints are assumed to all sit inside the Q5
      check — confirm against the repository.
      NOTE(review): only the first P31 statement is inspected, as in the
      original; an item whose first "instance of" value is not Q5 is not
      reported.

      Raises:
          KeyError: If a country is missing from the imported data or a
              member record has no ``'name'``.
      """
      from wikibaseintegrator import WikibaseIntegrator
      from wikibaseintegrator import wbi_helpers

      memberLists = self.fullDictionaryMemberLists
      # One client serves every lookup; it used to be rebuilt per search hit.
      wbi = WikibaseIntegrator()

      for country in listOfCountries:
          for member in memberLists[country].values():
              name = member['name']
              results = wbi_helpers.search_entities(search_string=name)
              for entityId in results:
                  # Serialise the item once instead of once per claim key.
                  claims = wbi.item.get(entity_id=entityId).get_json()['claims']
                  instanceOfClaims = claims.get('P31')
                  if not instanceOfClaims:
                      continue
                  # Snaks of type "somevalue"/"novalue" carry no datavalue;
                  # skip them instead of aborting the scan with a KeyError.
                  datavalue = instanceOfClaims[0]['mainsnak'].get('datavalue')
                  if datavalue is None:
                      continue
                  if datavalue['value']['id'] == 'Q5':
                      print(entityId)
                      print('---------')
                      print(name)
                      print('is a human')
  def createMemberOnWikidata(self):
      """Create a new Wikidata item for one hard-coded member of parliament.

      Logs in via OAuth2, creates an item with the English label
      'Carlos Humberto Ruíz' and the English description
      'Nicaraguan National Assembly Deputy', attaches the statement
      P31 = Q5, writes the item and prints what ``item.write()`` returns.

      NOTE(review): the consumer token and secret are empty strings here and
      the user-agent URL ends in a bare ``User:``; both presumably have to
      be filled in before the login can succeed — confirm.
      NOTE(review): every call creates a fresh item; the original comment
      records that this person already exists as Q116918332.
      """
      from wikibaseintegrator import wbi_login, WikibaseIntegrator
      from wikibaseintegrator.datatypes import Item
      from wikibaseintegrator.wbi_config import config as wbi_config

      # The user agent has to be configured before any request is made.
      wbi_config['USER_AGENT'] = 'PEPimport/1.0 (https://www.wikidata.org/wiki/User:)'

      # Authenticated client used for the write below.
      login_instance = wbi_login.OAuth2(consumer_token='', consumer_secret='')
      wbi = WikibaseIntegrator(login=login_instance)

      # Statement objects. The P106 values are Q-ids, so they are built with
      # Item (as editMemberOnWikidata does) rather than ExternalID.
      isHuman = Item(value='Q5', prop_nr='P31')
      occupationPolitician = Item(value='Q82955', prop_nr='P106')
      occupationDeputy = Item(value='Q1055894', prop_nr='P106')

      # Create a new item and set its English label and description.
      item = wbi.item.new()
      item.labels.set(language='en', value='Carlos Humberto Ruíz')
      item.descriptions.set(language='en', value='Nicaraguan National Assembly Deputy')

      # claims.add takes a list because several statements can be passed.
      item.claims.add([isHuman])
      # The occupation statements are prepared but still disabled:
      # item.claims.add([occupationDeputy])
      # item.claims.add([occupationPolitician])

      print(item.write())
  def editMemberOnWikidata(self, Qid):
      """Add a referenced occupation statement to an existing Wikidata item.

      Args:
          Qid: Entity id of the item to edit; passed to ``wbi.item.get``.

      Logs in via OAuth2, fetches the item, adds the statement
      P106 = Q1055894 with one reference block (P854 URL of the assembly
      profile page, P813 date 2023-02-27 at day precision), writes the item
      and prints what ``item.write()`` returns.

      NOTE(review): the consumer token and secret are empty strings here and
      presumably have to be supplied before the login can succeed.
      NOTE(review): ``claims.add`` is called without ``action_if_exists``;
      check the library default (believed to be REPLACE_ALL in
      wikibaseintegrator 0.12), which would drop occupation statements
      already present on the item.
      """
      from wikibaseintegrator import wbi_login, WikibaseIntegrator
      from wikibaseintegrator.datatypes import Item, Time, URL
      from wikibaseintegrator.wbi_config import config as wbi_config
      from wikibaseintegrator.wbi_enums import WikibaseDatePrecision

      # The user agent has to be configured before any request is made.
      wbi_config['USER_AGENT'] = 'PEPimport/1.0 (https://www.wikidata.org/wiki/User:Alpcentaur)'

      # Authenticated client used for the write below.
      login_instance = wbi_login.OAuth2(consumer_token='', consumer_secret='')
      wbi = WikibaseIntegrator(login=login_instance)

      # One reference block: source URL plus the date attached as P813.
      # The P854 value is a web address, so it is built with the URL
      # datatype instead of ExternalID.
      references = [
          [
              URL(value='http://legislacion.asamblea.gob.ni/Tablas%20Generales.nsf/InfoDiputado.xsp?documentId=3D4CFDC4B3006D70062587C5007C29E1&action=openDocument&SessionID=1868803A06AB73D50B7F89BD0AB', prop_nr='P854'),
              Time(time='+2023-02-27T00:00:00Z', prop_nr='P813', precision=WikibaseDatePrecision.DAY),
          ]
      ]
      occupationDeputy = Item(value='Q1055894', prop_nr='P106', references=references)

      # Fetch the existing item and attach the statement; claims.add takes a
      # list because several statements can be passed at once.
      item = wbi.item.get(entity_id=Qid)
      item.claims.add([occupationDeputy])

      print(item.write())