formatted with black

This commit is contained in:
alpcentaur 2023-03-09 16:07:47 +00:00
parent f395b87ab6
commit 69480ecc26
4 changed files with 207 additions and 220 deletions

View file

@ -1,5 +1,3 @@
import os import os
import yaml import yaml
@ -12,58 +10,56 @@ import lxml.html
import lxml.html.soupparser import lxml.html.soupparser
class membersParliamentCrawler(object): class membersParliamentCrawler(object):
def __init__(self, configFile): def __init__(self, configFile):
with open(configFile, "r") as stream: with open(configFile, "r") as stream:
try: try:
self.config = yaml.safe_load(stream) self.config = yaml.safe_load(stream)
except yaml.YAMLError as exc: except yaml.YAMLError as exc:
print(exc) print(exc)
# input list of countries in form of ['nicaragua', 'honduras', .. , 'mexico'] # input list of countries in form of ['nicaragua', 'honduras', .. , 'mexico']
def downloadMemberListPagesOfCountries(self, listOfCountries): def downloadMemberListPagesOfCountries(self, listOfCountries):
# download only html pages of the countries specified in input # download only html pages of the countries specified in input
for country in listOfCountries: for country in listOfCountries:
for key in self.config: for key in self.config:
if key in listOfCountries: if key in listOfCountries:
try: try:
memberList = self.config.get(key).get('memberList') memberList = self.config.get(key).get("memberList")
except Exception as e: except Exception as e:
print("There is a problem with the entry memberList in the config.yaml - the original error message is:", e) print(
"There is a problem with the entry memberList in the config.yaml - the original error message is:",
e,
)
try: try:
memberListLink = memberList.get('link') memberListLink = memberList.get("link")
except Exception as e: except Exception as e:
print("No memberListLink defined in config.yaml - the original error message is:", e) print(
"No memberListLink defined in config.yaml - the original error message is:",
e,
)
# download the html page of the List of Members # download the html page of the List of Members
response = urllib.request.urlopen(memberListLink) response = urllib.request.urlopen(memberListLink)
webContent = response.read().decode('UTF-8') webContent = response.read().decode("UTF-8")
# save interim results to files # save interim results to files
f = open('crawlers/pages/' + key +'MemberList.html', 'w+') f = open("crawlers/pages/" + key + "MemberList.html", "w+")
f.write(webContent) f.write(webContent)
f.close f.close
def parseMemberListData2dictionary(self, listOfCountries): def parseMemberListData2dictionary(self, listOfCountries):
for country in listOfCountries: for country in listOfCountries:
try: try:
# use soupparser to handle broken html # use soupparser to handle broken html
tree = lxml.html.soupparser.parse('crawlers/pages/' + country + 'MemberList.html') tree = lxml.html.soupparser.parse(
"crawlers/pages/" + country + "MemberList.html"
)
# for e in tree.iter(): # for e in tree.iter():
# #
@ -76,108 +72,115 @@ class membersParliamentCrawler(object):
dictionaryMemberList = {} dictionaryMemberList = {}
countryConf = self.config.get(country) countryConf = self.config.get(country)
countryDomain = countryConf.get('domain') countryDomain = countryConf.get("domain")
countryConfMemberList = countryConf.get('memberList') countryConfMemberList = countryConf.get("memberList")
countryConfMemberListParent = countryConfMemberList.get('parent') countryConfMemberListParent = countryConfMemberList.get("parent")
countryConfMemberListChildName = countryConfMemberList.get('child-name') countryConfMemberListChildName = countryConfMemberList.get("child-name")
countryConfMemberListChildLink = countryConfMemberList.get('child-link') countryConfMemberListChildLink = countryConfMemberList.get("child-link")
for n in range(len(tree.xpath(countryConfMemberListParent))): for n in range(len(tree.xpath(countryConfMemberListParent))):
name = tree.xpath(
name = tree.xpath(countryConfMemberListParent + '[' + str(n) + ']' + countryConfMemberListChildName) countryConfMemberListParent
link = tree.xpath(countryConfMemberListParent + '[' + str(n) + ']' + countryConfMemberListChildLink) + "["
+ str(n)
+ "]"
+ countryConfMemberListChildName
)
link = tree.xpath(
countryConfMemberListParent
+ "["
+ str(n)
+ "]"
+ countryConfMemberListChildLink
)
if len(name) > 0: if len(name) > 0:
dictionaryMemberList[n] = {} dictionaryMemberList[n] = {}
dictionaryMemberList[n]['name'] = name[0] dictionaryMemberList[n]["name"] = name[0]
if countryDomain in link[0]: if countryDomain in link[0]:
dictionaryMemberList[n]["link"] = link[0]
dictionaryMemberList[n]['link'] = link[0]
if countryDomain not in link[0]: if countryDomain not in link[0]:
dictionaryMemberList[n]["link"] = countryDomain + link[0]
dictionaryMemberList[n]['link'] = countryDomain + link[0]
except Exception as e: except Exception as e:
print(
print('parsing the html did not work. Possibly you first have to downloadMemberListPagesOfCountries(). The original error message is:', e) "parsing the html did not work. Possibly you first have to downloadMemberListPagesOfCountries(). The original error message is:",
e,
)
# save interim results to files # save interim results to files
f = open('crawlers/output/' + country +'MemberList.txt', 'w+') f = open("crawlers/output/" + country + "MemberList.txt", "w+")
f.write(str(dictionaryMemberList)) f.write(str(dictionaryMemberList))
f.close f.close
def downloadMemberDataHtmls(self, listOfCountries): def downloadMemberDataHtmls(self, listOfCountries):
for country in listOfCountries: for country in listOfCountries:
f = open("crawlers/output/" + country + "MemberList.txt")
f = open('crawlers/output/' + country +'MemberList.txt')
text = f.read() text = f.read()
dictionaryMemberList = eval(text) dictionaryMemberList = eval(text)
for memberid in dictionaryMemberList: for memberid in dictionaryMemberList:
memberLink = dictionaryMemberList[memberid]["link"]
memberLink = dictionaryMemberList[memberid]['link']
# download the html page of the Member # download the html page of the Member
response = urllib.request.urlopen(memberLink) response = urllib.request.urlopen(memberLink)
webContent = response.read().decode('UTF-8') webContent = response.read().decode("UTF-8")
# save interim results to files # save interim results to files
filename = 'crawlers/pages/' + country + '/' + str(memberid) +'.html' filename = "crawlers/pages/" + country + "/" + str(memberid) + ".html"
os.makedirs(os.path.dirname(filename), exist_ok=True) os.makedirs(os.path.dirname(filename), exist_ok=True)
f = open( filename, 'w+') f = open(filename, "w+")
f.write(webContent) f.write(webContent)
f.close f.close
def parseMemberData2dictionary(self, listOfCountries): def parseMemberData2dictionary(self, listOfCountries):
for country in listOfCountries: for country in listOfCountries:
print("started to parse data of member of " + country + " ..")
print('started to parse data of member of ' + country + ' ..') f = open("crawlers/output/" + country + "MemberList.txt")
f = open('crawlers/output/' + country +'MemberList.txt')
text = f.read() text = f.read()
dictionaryMemberList = eval(text) dictionaryMemberList = eval(text)
countryConf = self.config.get(country) countryConf = self.config.get(country)
countryDomain = countryConf.get('domain') countryDomain = countryConf.get("domain")
countryConfMember = countryConf.get('member') countryConfMember = countryConf.get("member")
countryConfMemberInfo1 = countryConfMember.get('info-1') countryConfMemberInfo1 = countryConfMember.get("info-1")
countryConfMemberInfo1Parent = countryConfMemberInfo1.get('parent') countryConfMemberInfo1Parent = countryConfMemberInfo1.get("parent")
countryConfMemberInfo1ChildPoliticalParty = countryConfMemberInfo1.get('child-politicalParty') countryConfMemberInfo1ChildPoliticalParty = countryConfMemberInfo1.get(
"child-politicalParty"
)
for memberid in dictionaryMemberList: for memberid in dictionaryMemberList:
print(
"started to parse data of member with name "
+ dictionaryMemberList[memberid]["name"]
+ " .."
)
print('started to parse data of member with name ' + dictionaryMemberList[memberid]['name'] + ' ..') filename = "crawlers/pages/" + country + "/" + str(memberid) + ".html"
filename = 'crawlers/pages/' + country + '/' + str(memberid) +'.html'
tree = lxml.html.soupparser.parse(filename) tree = lxml.html.soupparser.parse(filename)
politicalParty = tree.xpath(countryConfMemberInfo1Parent + countryConfMemberInfo1ChildPoliticalParty) politicalParty = tree.xpath(
countryConfMemberInfo1Parent
+ countryConfMemberInfo1ChildPoliticalParty
)
print('oi', politicalParty) print("oi", politicalParty)
if len(politicalParty) > 0: if len(politicalParty) > 0:
dictionaryMemberList[memberid]["political party"] = politicalParty[
0
]
dictionaryMemberList[memberid]['political party'] = politicalParty[0] f = open("crawlers/output/" + country + "MemberList.txt", "w+")
f = open('crawlers/output/' + country +'MemberList.txt', 'w+')
f.write(str(dictionaryMemberList)) f.write(str(dictionaryMemberList))
f.close f.close

View file

@ -1,10 +1,8 @@
from crawlers.MembersParliamentCrawler import * from crawlers.MembersParliamentCrawler import *
config = "config.yaml"
config = 'config.yaml' listOfCountries = ["nicaragua"]
listOfCountries = ['nicaragua']
Crawler = membersParliamentCrawler(config) Crawler = membersParliamentCrawler(config)

View file

@ -1,11 +1,9 @@
from crawlers.MembersParliamentCrawler import * from crawlers.MembersParliamentCrawler import *
from wikidata.wdPEP import * from wikidata.wdPEP import *
config = 'crawlers/config.yaml' config = "crawlers/config.yaml"
listOfCountries = ['nicaragua'] listOfCountries = ["nicaragua"]
# doing the crawling of government websites # doing the crawling of government websites
@ -30,4 +28,4 @@ wikidataPEP = WikidataPEP(config)
# wikidataPEP.createMemberOnWikidata() # wikidataPEP.createMemberOnWikidata()
wikidataPEP.editMemberOnWikidata('Q116918332') wikidataPEP.editMemberOnWikidata("Q116918332")

View file

@ -1,91 +1,75 @@
import os import os
import yaml import yaml
import json import json
class WikidataPEP(object): class WikidataPEP(object):
def __init__(self, configFile): def __init__(self, configFile):
with open(configFile, "r") as stream: with open(configFile, "r") as stream:
try: try:
self.config = yaml.safe_load(stream) self.config = yaml.safe_load(stream)
except yaml.YAMLError as exc: except yaml.YAMLError as exc:
print(exc) print(exc)
def importMembersOfParliamentDict(self, listOfCountries): def importMembersOfParliamentDict(self, listOfCountries):
self.fullDictionaryMemberLists = {} self.fullDictionaryMemberLists = {}
for country in listOfCountries: for country in listOfCountries:
print("started to parse data of members of " + country + " ..")
print('started to parse data of members of ' + country + ' ..') f = open("crawlers/output/" + country + "MemberList.txt")
f = open('crawlers/output/' + country +'MemberList.txt')
text = f.read() text = f.read()
self.fullDictionaryMemberLists[country] = eval(text) self.fullDictionaryMemberLists[country] = eval(text)
# print(self.fullDictionaryMemberLists) # print(self.fullDictionaryMemberLists)
def checkForEntityIds(self, listOfCountries): def checkForEntityIds(self, listOfCountries):
from wikibaseintegrator import WikibaseIntegrator from wikibaseintegrator import WikibaseIntegrator
from wikibaseintegrator import wbi_helpers from wikibaseintegrator import wbi_helpers
fullDictionaryMemberLists = self.fullDictionaryMemberLists fullDictionaryMemberLists = self.fullDictionaryMemberLists
for country in listOfCountries: for country in listOfCountries:
for memberId in fullDictionaryMemberLists[country].keys(): for memberId in fullDictionaryMemberLists[country].keys():
name = fullDictionaryMemberLists[country][memberId]["name"]
name = fullDictionaryMemberLists[country][memberId]['name']
results = wbi_helpers.search_entities(search_string=name) results = wbi_helpers.search_entities(search_string=name)
for entityId in results: for entityId in results:
wbi = WikibaseIntegrator() wbi = WikibaseIntegrator()
wikidata_item = wbi.item.get(entity_id=entityId) wikidata_item = wbi.item.get(entity_id=entityId)
for claimkey in wikidata_item.get_json()['claims'].keys(): for claimkey in wikidata_item.get_json()["claims"].keys():
if claimkey == "P31":
if claimkey == 'P31': if (
wikidata_item.get_json()["claims"][claimkey][0][
if wikidata_item.get_json()['claims'][claimkey][0]['mainsnak']['datavalue']['value']['id'] == 'Q5': "mainsnak"
]["datavalue"]["value"]["id"]
== "Q5"
):
print(entityId) print(entityId)
print('---------') print("---------")
print(name) print(name)
print('is a human') print("is a human")
def createMemberOnWikidata(self): def createMemberOnWikidata(self):
from wikibaseintegrator import wbi_login, WikibaseIntegrator from wikibaseintegrator import wbi_login, WikibaseIntegrator
from wikibaseintegrator.datatypes import ExternalID, Item from wikibaseintegrator.datatypes import ExternalID, Item
from wikibaseintegrator.wbi_config import config as wbi_config from wikibaseintegrator.wbi_config import config as wbi_config
wbi_config['USER_AGENT'] = 'PEPimport/1.0 (https://www.wikidata.org/wiki/User:)' wbi_config["USER_AGENT"] = "PEPimport/1.0 (https://www.wikidata.org/wiki/User:)"
# login object # login object
login_instance = wbi_login.OAuth2(consumer_token='', consumer_secret='') login_instance = wbi_login.OAuth2(consumer_token="", consumer_secret="")
wbi = WikibaseIntegrator(login=login_instance) wbi = WikibaseIntegrator(login=login_instance)
# data type object, e.g. for a NCBI gene entrez ID # data type object, e.g. for a NCBI gene entrez ID
isHuman = Item(value='Q5', prop_nr='P31') isHuman = Item(value="Q5", prop_nr="P31")
occupationPolitician = ExternalID(value='Q82955', prop_nr='P106') occupationPolitician = ExternalID(value="Q82955", prop_nr="P106")
occupationDeputy = ExternalID(value='Q1055894', prop_nr='P106') occupationDeputy = ExternalID(value="Q1055894", prop_nr="P106")
# referenceURL = URL(value='http://legislacion.asamblea.gob.ni/Tablas%20Generales.nsf/InfoDiputado.xsp?documentId=3D4CFDC4B3006D70062587C5007C29E1&action=openDocument&SessionID=1868803A06AB73D50B7F89BD0AB', prop_nr='P106') # referenceURL = URL(value='http://legislacion.asamblea.gob.ni/Tablas%20Generales.nsf/InfoDiputado.xsp?documentId=3D4CFDC4B3006D70062587C5007C29E1&action=openDocument&SessionID=1868803A06AB73D50B7F89BD0AB', prop_nr='P106')
# print(isHuman) # print(isHuman)
@ -100,33 +84,33 @@ class WikidataPEP(object):
item = wbi.item.new() item = wbi.item.new()
# Set an english label # Set an english label
item.labels.set(language='en', value='Carlos Humberto Ruíz') item.labels.set(language="en", value="Carlos Humberto Ruíz")
# Carlos Humberto Ruiz has the Qid Q116918332 # Carlos Humberto Ruiz has the Qid Q116918332
# Set a French description # Set a French description
item.descriptions.set(language='en', value='Nicaraguan National Assembly Deputy') item.descriptions.set(
language="en", value="Nicaraguan National Assembly Deputy"
)
item.claims.add(data1) item.claims.add(data1)
# item.claims.add(data2) # item.claims.add(data2)
# item.claims.add(data3) # item.claims.add(data3)
print(item.write()) print(item.write())
def editMemberOnWikidata(self, Qid): def editMemberOnWikidata(self, Qid):
from wikibaseintegrator import wbi_login, WikibaseIntegrator from wikibaseintegrator import wbi_login, WikibaseIntegrator
from wikibaseintegrator.datatypes import ExternalID, Item, Time, String from wikibaseintegrator.datatypes import ExternalID, Item, Time, String
from wikibaseintegrator.wbi_config import config as wbi_config from wikibaseintegrator.wbi_config import config as wbi_config
from wikibaseintegrator.wbi_enums import ActionIfExists from wikibaseintegrator.wbi_enums import ActionIfExists
from wikibaseintegrator.wbi_enums import WikibaseDatePrecision from wikibaseintegrator.wbi_enums import WikibaseDatePrecision
wbi_config[
wbi_config['USER_AGENT'] = 'PEPimport/1.0 (https://www.wikidata.org/wiki/User:Alpcentaur)' "USER_AGENT"
] = "PEPimport/1.0 (https://www.wikidata.org/wiki/User:Alpcentaur)"
# login object # login object
login_instance = wbi_login.OAuth2(consumer_token='', consumer_secret='') login_instance = wbi_login.OAuth2(consumer_token="", consumer_secret="")
wbi = WikibaseIntegrator(login=login_instance) wbi = WikibaseIntegrator(login=login_instance)
@ -141,12 +125,19 @@ class WikidataPEP(object):
references = [ references = [
[ [
ExternalID(value='http://legislacion.asamblea.gob.ni/Tablas%20Generales.nsf/InfoDiputado.xsp?documentId=3D4CFDC4B3006D70062587C5007C29E1&action=openDocument&SessionID=1868803A06AB73D50B7F89BD0AB', prop_nr='P854'), ExternalID(
Time(time='+2023-02-27T00:00:00Z', prop_nr='P813', precision=WikibaseDatePrecision.DAY) value="http://legislacion.asamblea.gob.ni/Tablas%20Generales.nsf/InfoDiputado.xsp?documentId=3D4CFDC4B3006D70062587C5007C29E1&action=openDocument&SessionID=1868803A06AB73D50B7F89BD0AB",
prop_nr="P854",
),
Time(
time="+2023-02-27T00:00:00Z",
prop_nr="P813",
precision=WikibaseDatePrecision.DAY,
),
] ]
] ]
occupationDeputy = Item(value='Q1055894', prop_nr='P106', references=references) occupationDeputy = Item(value="Q1055894", prop_nr="P106", references=references)
## data goes into a list, because many data objects can be provided to ## data goes into a list, because many data objects can be provided to
# data1 = [isHuman] # data1 = [isHuman]
@ -168,6 +159,3 @@ class WikidataPEP(object):
item.claims.add(data2) item.claims.add(data2)
# item.claims.add(data3) # item.claims.add(data3)
print(item.write()) print(item.write())