formatted with black
parent f395b87ab6
commit 69480ecc26
4 changed files with 207 additions and 220 deletions
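Every hunk below is mechanical reformatting by black: single quotes become double quotes, statements longer than the default 88-character limit are wrapped (with a trailing comma on the last element), and surplus blank lines are collapsed, which is all that the pure-deletion hunks contain. A minimal sketch of the same transformation through black's Python API, assuming black is installed (`pip install black`):

    import black

    src = "memberList = self.config.get(key).get('memberList')\n"
    # black normalizes quotes and wraps anything past 88 characters
    print(black.format_str(src, mode=black.Mode()), end="")
    # -> memberList = self.config.get(key).get("memberList")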
@@ -1,5 +1,3 @@
 import os
 import yaml
@@ -12,58 +10,56 @@ import lxml.html
 import lxml.html.soupparser


 class membersParliamentCrawler(object):

     def __init__(self, configFile):

         with open(configFile, "r") as stream:
             try:
                 self.config = yaml.safe_load(stream)
             except yaml.YAMLError as exc:
                 print(exc)

     # input list of countries in form of ['nicaragua', 'honduras', .. , 'mexico']

     def downloadMemberListPagesOfCountries(self, listOfCountries):

         # download only html pages of the countries specified in input

         for country in listOfCountries:
             for key in self.config:
                 if key in listOfCountries:
                     try:
-                        memberList = self.config.get(key).get('memberList')
+                        memberList = self.config.get(key).get("memberList")
                     except Exception as e:
-                        print("There is a problem with the entry memberList in the config.yaml - the original error message is:", e)
+                        print(
+                            "There is a problem with the entry memberList in the config.yaml - the original error message is:",
+                            e,
+                        )
                     try:
-                        memberListLink = memberList.get('link')
+                        memberListLink = memberList.get("link")
                     except Exception as e:
-                        print("No memberListLink defined in config.yaml - the original error message is:", e)
+                        print(
+                            "No memberListLink defined in config.yaml - the original error message is:",
+                            e,
+                        )

                     # download the html page of the List of Members

                     response = urllib.request.urlopen(memberListLink)
-                    webContent = response.read().decode('UTF-8')
+                    webContent = response.read().decode("UTF-8")

                     # save interim results to files

-                    f = open('crawlers/pages/' + key +'MemberList.html', 'w+')
+                    f = open("crawlers/pages/" + key + "MemberList.html", "w+")
                     f.write(webContent)
                     f.close

     def parseMemberListData2dictionary(self, listOfCountries):

         for country in listOfCountries:

             try:

                 # use soupparser to handle broken html

-                tree = lxml.html.soupparser.parse('crawlers/pages/' + country + 'MemberList.html')
+                tree = lxml.html.soupparser.parse(
+                    "crawlers/pages/" + country + "MemberList.html"
+                )

                 # for e in tree.iter():
                 #
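The crawler reads everything it needs from config.yaml through chained .get() calls, here and in the following hunk. The file itself is not part of this commit; a hypothetical structure consistent with the keys being read, where every value is an invented placeholder:

    # what yaml.safe_load(stream) would return for one country entry;
    # all values below are made-up illustrations, not the real config
    config_example = {
        "nicaragua": {
            "domain": "https://www.asamblea.gob.ni",  # hypothetical
            "memberList": {
                "link": "https://www.asamblea.gob.ni/members",  # hypothetical
                "parent": "//div[@class='member']",  # hypothetical XPath
                "child-name": "/a/text()",  # hypothetical XPath
                "child-link": "/a/@href",  # hypothetical XPath
            },
            "member": {
                "info-1": {
                    "parent": "//div[@class='info']",  # hypothetical XPath
                    "child-politicalParty": "/span/text()",  # hypothetical XPath
                },
            },
        },
    }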
@@ -76,108 +72,115 @@ class membersParliamentCrawler(object):
                 dictionaryMemberList = {}

                 countryConf = self.config.get(country)
-                countryDomain = countryConf.get('domain')
-                countryConfMemberList = countryConf.get('memberList')
-                countryConfMemberListParent = countryConfMemberList.get('parent')
-                countryConfMemberListChildName = countryConfMemberList.get('child-name')
-                countryConfMemberListChildLink = countryConfMemberList.get('child-link')
+                countryDomain = countryConf.get("domain")
+                countryConfMemberList = countryConf.get("memberList")
+                countryConfMemberListParent = countryConfMemberList.get("parent")
+                countryConfMemberListChildName = countryConfMemberList.get("child-name")
+                countryConfMemberListChildLink = countryConfMemberList.get("child-link")

                 for n in range(len(tree.xpath(countryConfMemberListParent))):

-                    name = tree.xpath(countryConfMemberListParent + '[' + str(n) + ']' + countryConfMemberListChildName)
-                    link = tree.xpath(countryConfMemberListParent + '[' + str(n) + ']' + countryConfMemberListChildLink)
+                    name = tree.xpath(
+                        countryConfMemberListParent
+                        + "["
+                        + str(n)
+                        + "]"
+                        + countryConfMemberListChildName
+                    )
+                    link = tree.xpath(
+                        countryConfMemberListParent
+                        + "["
+                        + str(n)
+                        + "]"
+                        + countryConfMemberListChildLink
+                    )

                     if len(name) > 0:

                         dictionaryMemberList[n] = {}
-                        dictionaryMemberList[n]['name'] = name[0]
+                        dictionaryMemberList[n]["name"] = name[0]

                         if countryDomain in link[0]:

-                            dictionaryMemberList[n]['link'] = link[0]
+                            dictionaryMemberList[n]["link"] = link[0]

                         if countryDomain not in link[0]:

-                            dictionaryMemberList[n]['link'] = countryDomain + link[0]
+                            dictionaryMemberList[n]["link"] = countryDomain + link[0]

             except Exception as e:

-                print('parsing the html did not work. Possibly you first have to downloadMemberListPagesOfCountries(). The original error message is:', e)
+                print(
+                    "parsing the html did not work. Possibly you first have to downloadMemberListPagesOfCountries(). The original error message is:",
+                    e,
+                )

             # save interim results to files

-            f = open('crawlers/output/' + country +'MemberList.txt', 'w+')
+            f = open("crawlers/output/" + country + "MemberList.txt", "w+")
             f.write(str(dictionaryMemberList))
             f.close

     def downloadMemberDataHtmls(self, listOfCountries):

         for country in listOfCountries:

-            f = open('crawlers/output/' + country +'MemberList.txt')
+            f = open("crawlers/output/" + country + "MemberList.txt")
             text = f.read()

             dictionaryMemberList = eval(text)

             for memberid in dictionaryMemberList:

-                memberLink = dictionaryMemberList[memberid]['link']
+                memberLink = dictionaryMemberList[memberid]["link"]

                 # download the html page of the Member

                 response = urllib.request.urlopen(memberLink)
-                webContent = response.read().decode('UTF-8')
+                webContent = response.read().decode("UTF-8")

                 # save interim results to files

-                filename = 'crawlers/pages/' + country + '/' + str(memberid) +'.html'
+                filename = "crawlers/pages/" + country + "/" + str(memberid) + ".html"

                 os.makedirs(os.path.dirname(filename), exist_ok=True)
-                f = open( filename, 'w+')
+                f = open(filename, "w+")
                 f.write(webContent)
                 f.close

     def parseMemberData2dictionary(self, listOfCountries):

         for country in listOfCountries:

-            print('started to parse data of member of ' + country + ' ..')
+            print("started to parse data of member of " + country + " ..")

-            f = open('crawlers/output/' + country +'MemberList.txt')
+            f = open("crawlers/output/" + country + "MemberList.txt")
             text = f.read()

             dictionaryMemberList = eval(text)

             countryConf = self.config.get(country)
-            countryDomain = countryConf.get('domain')
-            countryConfMember = countryConf.get('member')
-            countryConfMemberInfo1 = countryConfMember.get('info-1')
-            countryConfMemberInfo1Parent = countryConfMemberInfo1.get('parent')
-            countryConfMemberInfo1ChildPoliticalParty = countryConfMemberInfo1.get('child-politicalParty')
+            countryDomain = countryConf.get("domain")
+            countryConfMember = countryConf.get("member")
+            countryConfMemberInfo1 = countryConfMember.get("info-1")
+            countryConfMemberInfo1Parent = countryConfMemberInfo1.get("parent")
+            countryConfMemberInfo1ChildPoliticalParty = countryConfMemberInfo1.get(
+                "child-politicalParty"
+            )

             for memberid in dictionaryMemberList:

-                print('started to parse data of member with name ' + dictionaryMemberList[memberid]['name'] + ' ..')
+                print(
+                    "started to parse data of member with name "
+                    + dictionaryMemberList[memberid]["name"]
+                    + " .."
+                )

-                filename = 'crawlers/pages/' + country + '/' + str(memberid) +'.html'
+                filename = "crawlers/pages/" + country + "/" + str(memberid) + ".html"

                 tree = lxml.html.soupparser.parse(filename)

-                politicalParty = tree.xpath(countryConfMemberInfo1Parent + countryConfMemberInfo1ChildPoliticalParty)
+                politicalParty = tree.xpath(
+                    countryConfMemberInfo1Parent
+                    + countryConfMemberInfo1ChildPoliticalParty
+                )

-                print('oi', politicalParty)
+                print("oi", politicalParty)

                 if len(politicalParty) > 0:
-                    dictionaryMemberList[memberid]['political party'] = politicalParty[0]
+                    dictionaryMemberList[memberid]["political party"] = politicalParty[
+                        0
+                    ]

-            f = open('crawlers/output/' + country +'MemberList.txt', 'w+')
+            f = open("crawlers/output/" + country + "MemberList.txt", "w+")
             f.write(str(dictionaryMemberList))
             f.close
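Two smells survive the reformatting untouched, since black never changes semantics: every `f.close` lacks parentheses (an attribute lookup that never closes the file), and the interim dictionaries are re-read with eval(), which executes whatever the file happens to contain. XPath positions are also 1-based, so the `"[" + str(n) + "]"` predicate built from `range(len(...))` matches nothing at n = 0 and never reads the last position. A safer sketch of the same save/load round-trip, assuming the same file layout:

    import ast

    def save_interim(path, data):
        # the context manager closes the file even if write() raises
        with open(path, "w+", encoding="utf-8") as f:
            f.write(str(data))

    def load_interim(path):
        with open(path, encoding="utf-8") as f:
            # literal_eval parses Python literals only, unlike eval()
            return ast.literal_eval(f.read())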
@@ -1,10 +1,8 @@
 from crawlers.MembersParliamentCrawler import *


-config = 'config.yaml'
-listOfCountries = ['nicaragua']
+config = "config.yaml"
+listOfCountries = ["nicaragua"]


 Crawler = membersParliamentCrawler(config)
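Only the setup lines of this runner appear in the hunk; whatever follows the constructor call is outside the diff. A hypothetical continuation, using only the method names defined in the class above:

    # crawl list pages, extract members, then fetch each member's page
    Crawler = membersParliamentCrawler(config)
    Crawler.downloadMemberListPagesOfCountries(listOfCountries)
    Crawler.parseMemberListData2dictionary(listOfCountries)
    Crawler.downloadMemberDataHtmls(listOfCountries)
    Crawler.parseMemberData2dictionary(listOfCountries)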
main.py
@@ -1,11 +1,9 @@
 from crawlers.MembersParliamentCrawler import *
 from wikidata.wdPEP import *

-config = 'crawlers/config.yaml'
-listOfCountries = ['nicaragua']
+config = "crawlers/config.yaml"
+listOfCountries = ["nicaragua"]


 # doing the crawling of government websites

@@ -30,4 +28,4 @@ wikidataPEP = WikidataPEP(config)

 # wikidataPEP.createMemberOnWikidata()

-wikidataPEP.editMemberOnWikidata('Q116918332')
+wikidataPEP.editMemberOnWikidata("Q116918332")
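The middle of main.py, between the crawling comment and the `wikidataPEP = WikidataPEP(config)` context line, is elided. A hypothetical reading of the flow, built only from calls visible elsewhere in this commit:

    # hand the parsed member lists to the Wikidata side
    wikidataPEP = WikidataPEP(config)
    wikidataPEP.importMembersOfParliamentDict(listOfCountries)  # reads crawler output
    wikidataPEP.checkForEntityIds(listOfCountries)  # searches the names on Wikidata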
@@ -1,91 +1,75 @@
 import os
 import yaml
 import json


 class WikidataPEP(object):

     def __init__(self, configFile):

         with open(configFile, "r") as stream:
             try:
                 self.config = yaml.safe_load(stream)
             except yaml.YAMLError as exc:
                 print(exc)

     def importMembersOfParliamentDict(self, listOfCountries):

         self.fullDictionaryMemberLists = {}

         for country in listOfCountries:

-            print('started to parse data of members of ' + country + ' ..')
+            print("started to parse data of members of " + country + " ..")

-            f = open('crawlers/output/' + country +'MemberList.txt')
+            f = open("crawlers/output/" + country + "MemberList.txt")
             text = f.read()

             self.fullDictionaryMemberLists[country] = eval(text)

             # print(self.fullDictionaryMemberLists)

     def checkForEntityIds(self, listOfCountries):

         from wikibaseintegrator import WikibaseIntegrator
         from wikibaseintegrator import wbi_helpers

         fullDictionaryMemberLists = self.fullDictionaryMemberLists

         for country in listOfCountries:

             for memberId in fullDictionaryMemberLists[country].keys():

-                name = fullDictionaryMemberLists[country][memberId]['name']
+                name = fullDictionaryMemberLists[country][memberId]["name"]

                 results = wbi_helpers.search_entities(search_string=name)

                 for entityId in results:

                     wbi = WikibaseIntegrator()
                     wikidata_item = wbi.item.get(entity_id=entityId)

-                    for claimkey in wikidata_item.get_json()['claims'].keys():
-
-                        if claimkey == 'P31':
-
-                            if wikidata_item.get_json()['claims'][claimkey][0]['mainsnak']['datavalue']['value']['id'] == 'Q5':
-
+                    for claimkey in wikidata_item.get_json()["claims"].keys():
+                        if claimkey == "P31":
+                            if (
+                                wikidata_item.get_json()["claims"][claimkey][0][
+                                    "mainsnak"
+                                ]["datavalue"]["value"]["id"]
+                                == "Q5"
+                            ):
                                 print(entityId)
-                                print('---------')
+                                print("---------")
                                 print(name)
-                                print('is a human')
+                                print("is a human")

     def createMemberOnWikidata(self):

         from wikibaseintegrator import wbi_login, WikibaseIntegrator
         from wikibaseintegrator.datatypes import ExternalID, Item
         from wikibaseintegrator.wbi_config import config as wbi_config

-        wbi_config['USER_AGENT'] = 'PEPimport/1.0 (https://www.wikidata.org/wiki/User:)'
+        wbi_config["USER_AGENT"] = "PEPimport/1.0 (https://www.wikidata.org/wiki/User:)"

         # login object
-        login_instance = wbi_login.OAuth2(consumer_token='', consumer_secret='')
+        login_instance = wbi_login.OAuth2(consumer_token="", consumer_secret="")

         wbi = WikibaseIntegrator(login=login_instance)

         # data type object, e.g. for a NCBI gene entrez ID
-        isHuman = Item(value='Q5', prop_nr='P31')
-        occupationPolitician = ExternalID(value='Q82955', prop_nr='P106')
-        occupationDeputy = ExternalID(value='Q1055894', prop_nr='P106')
+        isHuman = Item(value="Q5", prop_nr="P31")
+        occupationPolitician = ExternalID(value="Q82955", prop_nr="P106")
+        occupationDeputy = ExternalID(value="Q1055894", prop_nr="P106")
         # referenceURL = URL(value='http://legislacion.asamblea.gob.ni/Tablas%20Generales.nsf/InfoDiputado.xsp?documentId=3D4CFDC4B3006D70062587C5007C29E1&action=openDocument&SessionID=1868803A06AB73D50B7F89BD0AB', prop_nr='P106')

         # print(isHuman)
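black's wrapping makes the instance-of test in checkForEntityIds above hard to scan. The nested condition collapses to something like this (a sketch over the same JSON shape):

    # P31 is "instance of"; Q5 is the Wikidata item for "human"
    claims = wikidata_item.get_json()["claims"]
    is_human = any(
        claim["mainsnak"]["datavalue"]["value"]["id"] == "Q5"
        for claim in claims.get("P31", [])
    )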
@@ -100,33 +84,33 @@ class WikidataPEP(object):
         item = wbi.item.new()

         # Set an english label
-        item.labels.set(language='en', value='Carlos Humberto Ruíz')
+        item.labels.set(language="en", value="Carlos Humberto Ruíz")

         # Carlos Humberto Ruiz has the Qid Q116918332

         # Set a French description
-        item.descriptions.set(language='en', value='Nicaraguan National Assembly Deputy')
+        item.descriptions.set(
+            language="en", value="Nicaraguan National Assembly Deputy"
+        )

         item.claims.add(data1)
         # item.claims.add(data2)
         # item.claims.add(data3)
         print(item.write())

     def editMemberOnWikidata(self, Qid):

         from wikibaseintegrator import wbi_login, WikibaseIntegrator
         from wikibaseintegrator.datatypes import ExternalID, Item, Time, String
         from wikibaseintegrator.wbi_config import config as wbi_config
         from wikibaseintegrator.wbi_enums import ActionIfExists
         from wikibaseintegrator.wbi_enums import WikibaseDatePrecision

-        wbi_config['USER_AGENT'] = 'PEPimport/1.0 (https://www.wikidata.org/wiki/User:Alpcentaur)'
+        wbi_config[
+            "USER_AGENT"
+        ] = "PEPimport/1.0 (https://www.wikidata.org/wiki/User:Alpcentaur)"

         # login object
-        login_instance = wbi_login.OAuth2(consumer_token='', consumer_secret='')
+        login_instance = wbi_login.OAuth2(consumer_token="", consumer_secret="")

         wbi = WikibaseIntegrator(login=login_instance)
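The unchanged lines between this hunk and the next are elided, but the final hunk calls `item.claims.add(data2)`, so `item` must be bound somewhere in between; presumably along these lines, mirroring the `wbi.item.get` call already used in checkForEntityIds:

    # hypothetical elided step: load the existing item by its Qid before editing
    item = wbi.item.get(entity_id=Qid)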
@@ -141,12 +125,19 @@ class WikidataPEP(object):
         references = [
             [
-                ExternalID(value='http://legislacion.asamblea.gob.ni/Tablas%20Generales.nsf/InfoDiputado.xsp?documentId=3D4CFDC4B3006D70062587C5007C29E1&action=openDocument&SessionID=1868803A06AB73D50B7F89BD0AB', prop_nr='P854'),
-                Time(time='+2023-02-27T00:00:00Z', prop_nr='P813', precision=WikibaseDatePrecision.DAY)
+                ExternalID(
+                    value="http://legislacion.asamblea.gob.ni/Tablas%20Generales.nsf/InfoDiputado.xsp?documentId=3D4CFDC4B3006D70062587C5007C29E1&action=openDocument&SessionID=1868803A06AB73D50B7F89BD0AB",
+                    prop_nr="P854",
+                ),
+                Time(
+                    time="+2023-02-27T00:00:00Z",
+                    prop_nr="P813",
+                    precision=WikibaseDatePrecision.DAY,
+                ),
             ]
         ]

-        occupationDeputy = Item(value='Q1055894', prop_nr='P106', references=references)
+        occupationDeputy = Item(value="Q1055894", prop_nr="P106", references=references)

         ## data goes into a list, because many data objects can be provided to
         # data1 = [isHuman]
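One datatype choice worth flagging, untouched by black: P854 is Wikidata's reference-URL property, and wikibaseintegrator ships a URL datatype for it, while ExternalID is meant for identifier strings; likewise the occupation snaks in createMemberOnWikidata pass item values (Q82955, Q1055894) to ExternalID, although this very hunk uses Item for the same property. A sketch of the arguably intended datatypes:

    from wikibaseintegrator.datatypes import Item, Time, URL
    from wikibaseintegrator.wbi_enums import WikibaseDatePrecision

    references = [
        [
            # P854 "reference URL" is a url-datatype property
            URL(
                value="http://legislacion.asamblea.gob.ni/Tablas%20Generales.nsf/InfoDiputado.xsp?documentId=3D4CFDC4B3006D70062587C5007C29E1&action=openDocument&SessionID=1868803A06AB73D50B7F89BD0AB",
                prop_nr="P854",
            ),
            # P813 "retrieved", with day precision
            Time(
                time="+2023-02-27T00:00:00Z",
                prop_nr="P813",
                precision=WikibaseDatePrecision.DAY,
            ),
        ]
    ]
    # P106 "occupation" holds items, so Item rather than ExternalID
    occupationDeputy = Item(value="Q1055894", prop_nr="P106", references=references)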
@@ -168,6 +159,3 @@ class WikidataPEP(object):
         item.claims.add(data2)
         # item.claims.add(data3)
         print(item.write())
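Both Wikidata methods construct their login with empty credentials, so `item.write()` cannot succeed as committed; real tokens have to be supplied first (the placeholder names below are hypothetical):

    # fill in real OAuth2 credentials before calling item.write()
    login_instance = wbi_login.OAuth2(
        consumer_token="YOUR_CONSUMER_TOKEN",
        consumer_secret="YOUR_CONSUMER_SECRET",
    )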