Automated pipeline for parsing profiles of politically exposed persons (PEPs) into Wikidata
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

161 lines
6.1 KiB

  1. import os
  2. import yaml
  3. import json
  4. class WikidataPEP(object):
  5. def __init__(self, configFile):
  6. with open(configFile, "r") as stream:
  7. try:
  8. self.config = yaml.safe_load(stream)
  9. except yaml.YAMLError as exc:
  10. print(exc)
  11. def importMembersOfParliamentDict(self, listOfCountries):
  12. self.fullDictionaryMemberLists = {}
  13. for country in listOfCountries:
  14. print("started to parse data of members of " + country + " ..")
  15. f = open("crawlers/output/" + country + "MemberList.txt")
  16. text = f.read()
  17. self.fullDictionaryMemberLists[country] = eval(text)
  18. # print(self.fullDictionaryMemberLists)
  19. def checkForEntityIds(self, listOfCountries):
  20. from wikibaseintegrator import WikibaseIntegrator
  21. from wikibaseintegrator import wbi_helpers
  22. fullDictionaryMemberLists = self.fullDictionaryMemberLists
  23. for country in listOfCountries:
  24. for memberId in fullDictionaryMemberLists[country].keys():
  25. name = fullDictionaryMemberLists[country][memberId]["name"]
  26. results = wbi_helpers.search_entities(search_string=name)
  27. for entityId in results:
  28. wbi = WikibaseIntegrator()
  29. wikidata_item = wbi.item.get(entity_id=entityId)
  30. for claimkey in wikidata_item.get_json()["claims"].keys():
  31. if claimkey == "P31":
  32. if (
  33. wikidata_item.get_json()["claims"][claimkey][0][
  34. "mainsnak"
  35. ]["datavalue"]["value"]["id"]
  36. == "Q5"
  37. ):
  38. print(entityId)
  39. print("---------")
  40. print(name)
  41. print("is a human")
  42. def createMemberOnWikidata(self):
  43. from wikibaseintegrator import wbi_login, WikibaseIntegrator
  44. from wikibaseintegrator.datatypes import ExternalID, Item
  45. from wikibaseintegrator.wbi_config import config as wbi_config
  46. wbi_config["USER_AGENT"] = "PEPimport/1.0 (https://www.wikidata.org/wiki/User:)"
  47. # login object
  48. login_instance = wbi_login.OAuth2(consumer_token="", consumer_secret="")
  49. wbi = WikibaseIntegrator(login=login_instance)
  50. # data type object, e.g. for a NCBI gene entrez ID
  51. isHuman = Item(value="Q5", prop_nr="P31")
  52. occupationPolitician = ExternalID(value="Q82955", prop_nr="P106")
  53. occupationDeputy = ExternalID(value="Q1055894", prop_nr="P106")
  54. # referenceURL = URL(value='http://legislacion.asamblea.gob.ni/Tablas%20Generales.nsf/InfoDiputado.xsp?documentId=3D4CFDC4B3006D70062587C5007C29E1&action=openDocument&SessionID=1868803A06AB73D50B7F89BD0AB', prop_nr='P106')
  55. # print(isHuman)
  56. # print(referenceURL)
  57. # data goes into a list, because many data objects can be provided to
  58. data1 = [isHuman]
  59. data2 = [occupationDeputy]
  60. data3 = [occupationPolitician]
  61. # Create a new item
  62. item = wbi.item.new()
  63. # Set an english label
  64. item.labels.set(language="en", value="Carlos Humberto Ruíz")
  65. # Carlos Humberto Ruiz has the Qid Q116918332
  66. # Set a French description
  67. item.descriptions.set(
  68. language="en", value="Nicaraguan National Assembly Deputy"
  69. )
  70. item.claims.add(data1)
  71. # item.claims.add(data2)
  72. # item.claims.add(data3)
  73. print(item.write())
  74. def editMemberOnWikidata(self, Qid):
  75. from wikibaseintegrator import wbi_login, WikibaseIntegrator
  76. from wikibaseintegrator.datatypes import ExternalID, Item, Time, String
  77. from wikibaseintegrator.wbi_config import config as wbi_config
  78. from wikibaseintegrator.wbi_enums import ActionIfExists
  79. from wikibaseintegrator.wbi_enums import WikibaseDatePrecision
  80. wbi_config[
  81. "USER_AGENT"
  82. ] = "PEPimport/1.0 (https://www.wikidata.org/wiki/User:Alpcentaur)"
  83. # login object
  84. login_instance = wbi_login.OAuth2(consumer_token="", consumer_secret="")
  85. wbi = WikibaseIntegrator(login=login_instance)
  86. # data type object, e.g. for a NCBI gene entrez ID
  87. # isHuman = Item(value='Q5', prop_nr='P31')
  88. # occupationPolitician = Item(value='Q82955', prop_nr='P106')
  89. # occupationDeputy = Item(value='Q1055894', prop_nr='P106')
  90. # referenceURL = ExternalID(value='http://legislacion.asamblea.gob.ni/Tablas%20Generales.nsf/InfoDiputado.xsp?documentId=3D4CFDC4B3006D70062587C5007C29E1&action=openDocument&SessionID=1868803A06AB73D50B7F89BD0AB', prop_nr='P854')
  91. # print(isHuman)
  92. # print(referenceURL)
  93. references = [
  94. [
  95. ExternalID(
  96. value="http://legislacion.asamblea.gob.ni/Tablas%20Generales.nsf/InfoDiputado.xsp?documentId=3D4CFDC4B3006D70062587C5007C29E1&action=openDocument&SessionID=1868803A06AB73D50B7F89BD0AB",
  97. prop_nr="P854",
  98. ),
  99. Time(
  100. time="+2023-02-27T00:00:00Z",
  101. prop_nr="P813",
  102. precision=WikibaseDatePrecision.DAY,
  103. ),
  104. ]
  105. ]
  106. occupationDeputy = Item(value="Q1055894", prop_nr="P106", references=references)
  107. ## data goes into a list, because many data objects can be provided to
  108. # data1 = [isHuman]
  109. data2 = [occupationDeputy]
  110. # data3 = [occupationPolitician]
  111. # data4 = [referenceURL]
  112. ## get item for Qid
  113. item = wbi.item.get(entity_id=Qid)
  114. # print(item.claims)
  115. # Set an english label
  116. # item.labels.set(language='en', value='Carlos Humberto Ruíz', action_if_exists=ActionIfExists.KEEP)
  117. # Set a French description
  118. # item.descriptions.set(language='en', value='Nicaraguan National Assembly Deputy', action_if_exists=ActionIfExists.KEEP)
  119. # item.claims.add(data4)
  120. item.claims.add(data2)
  121. # item.claims.add(data3)
  122. print(item.write())