Browse Source

converted camel case to snake case because of empirical evidence of better readability

master
alpcentaur 1 year ago
parent
commit
dba482c477
6 changed files with 250 additions and 250 deletions
  1. +0
    -186
      crawlers/MembersParliamentCrawler.py
  2. +186
    -0
      crawlers/members_parliament_crawler.py
  3. +0
    -16
      crawlers/useMembersParliamentCrawler.py
  4. +16
    -0
      crawlers/use_members_parliament_crawler.py
  5. +13
    -13
      main.py
  6. +35
    -35
      wikidata/wd_PEP.py

+ 0
- 186
crawlers/MembersParliamentCrawler.py View File

@ -1,186 +0,0 @@
import os
import yaml
import json
import urllib.request, urllib.error, urllib.parse
from lxml import etree
import lxml.html
import lxml.html.soupparser
class membersParliamentCrawler(object):
    """Crawl parliament websites for member lists and member detail pages.

    The crawler is driven by a YAML config keyed by country name. The
    methods are meant to be run in this order, each stage reading the files
    written by the previous one:

    1. ``downloadMemberListPagesOfCountries`` writes
       ``crawlers/pages/<country>MemberList.html``
    2. ``parseMemberListData2dictionary`` writes
       ``crawlers/output/<country>MemberList.txt``
    3. ``downloadMemberDataHtmls`` writes
       ``crawlers/pages/<country>/<memberid>.html``
    4. ``parseMemberData2dictionary`` updates the ``.txt`` file of stage 2

    All paths are relative to the current working directory.
    """

    def __init__(self, configFile):
        """Load the crawler configuration from the YAML file *configFile*.

        A YAML syntax error is printed rather than raised; in that case (and
        for an empty file) ``self.config`` is an empty dict.
        """
        # Default first, so a failed load never leaves the attribute unset.
        self.config = {}
        with open(configFile, "r") as stream:
            try:
                self.config = yaml.safe_load(stream) or {}
            except yaml.YAMLError as exc:
                print(exc)

    @staticmethod
    def _fetchText(url):
        """Return the body of *url* decoded as UTF-8."""
        with urllib.request.urlopen(url) as response:
            return response.read().decode("UTF-8")

    @staticmethod
    def _loadMemberList(country):
        """Read back the member dictionary stored for *country*."""
        with open("crawlers/output/" + country + "MemberList.txt") as f:
            text = f.read()
        # SECURITY NOTE: eval() executes whatever is in the file, and the file
        # contains strings scraped from the web. Kept for compatibility with
        # the existing interim files; consider ast.literal_eval or JSON.
        return eval(text)

    @staticmethod
    def _saveMemberList(country, memberList):
        """Write *memberList* as the interim result file of *country*."""
        os.makedirs("crawlers/output", exist_ok=True)
        with open("crawlers/output/" + country + "MemberList.txt", "w+") as f:
            f.write(str(memberList))

    # input list of countries in form of ['nicaragua', 'honduras', .. , 'mexico']
    def downloadMemberListPagesOfCountries(self, listOfCountries):
        """Download the member-list page of every requested country.

        Only countries that are present in the config are handled. Each page
        is stored as ``crawlers/pages/<country>MemberList.html``. A country
        whose config entry is incomplete is reported and skipped.
        """
        for key in self.config:
            # download only html pages of the countries specified in input
            if key not in listOfCountries:
                continue
            try:
                memberList = self.config.get(key).get("memberList")
            except AttributeError as e:
                print(
                    "There is a problem with the entry memberList in the config.yaml - the original error message is:",
                    e,
                )
                continue
            try:
                memberListLink = memberList.get("link")
            except AttributeError as e:
                print(
                    "No memberListLink defined in config.yaml - the original error message is:",
                    e,
                )
                continue
            if not memberListLink:
                print(
                    "No memberListLink defined in config.yaml - the original error message is:",
                    "entry 'link' is empty",
                )
                continue
            # download the html page of the List of Members
            webContent = self._fetchText(memberListLink)
            # save interim results to files
            os.makedirs("crawlers/pages", exist_ok=True)
            with open("crawlers/pages/" + key + "MemberList.html", "w+") as f:
                f.write(webContent)

    def parseMemberListData2dictionary(self, listOfCountries):
        """Extract name and link of every member from the downloaded lists.

        For each country the XPath expressions ``parent``, ``child-name`` and
        ``child-link`` of the ``memberList`` config entry are evaluated on the
        saved HTML page. The result, a dict ``{n: {"name": ..., "link": ...}}``
        keyed by the row position, is written to
        ``crawlers/output/<country>MemberList.txt``. Links that do not contain
        the configured ``domain`` are prefixed with it.
        """
        for country in listOfCountries:
            # Bound before the try block so a failure writes what was
            # collected so far instead of a previous country's data.
            dictionaryMemberList = {}
            try:
                # use soupparser to handle broken html
                tree = lxml.html.soupparser.parse(
                    "crawlers/pages/" + country + "MemberList.html"
                )
                countryConf = self.config.get(country)
                countryDomain = countryConf.get("domain")
                countryConfMemberList = countryConf.get("memberList")
                parentPath = countryConfMemberList.get("parent")
                childNamePath = countryConfMemberList.get("child-name")
                childLinkPath = countryConfMemberList.get("child-link")
                rowCount = len(tree.xpath(parentPath))
                # XPath positions are 1-based: [0] never matches and the last
                # row is [rowCount].
                for n in range(1, rowCount + 1):
                    rowPath = parentPath + "[" + str(n) + "]"
                    name = tree.xpath(rowPath + childNamePath)
                    link = tree.xpath(rowPath + childLinkPath)
                    # Later stages need both fields, so skip partial rows
                    # instead of aborting the whole country.
                    if not name or not link:
                        continue
                    memberLink = link[0]
                    if countryDomain not in memberLink:
                        memberLink = countryDomain + memberLink
                    dictionaryMemberList[n] = {"name": name[0], "link": memberLink}
            except Exception as e:
                print(
                    "parsing the html did not work. Possibly you first have to downloadMemberListPagesOfCountries(). The original error message is:",
                    e,
                )
            # save interim results to files
            self._saveMemberList(country, dictionaryMemberList)

    def downloadMemberDataHtmls(self, listOfCountries):
        """Download the detail page of every member found by the list parser.

        Reads ``crawlers/output/<country>MemberList.txt`` and stores each
        member page as ``crawlers/pages/<country>/<memberid>.html``. Entries
        without a ``link`` are skipped.
        """
        for country in listOfCountries:
            dictionaryMemberList = self._loadMemberList(country)
            pageDir = "crawlers/pages/" + country
            os.makedirs(pageDir, exist_ok=True)
            for memberid in dictionaryMemberList:
                memberLink = dictionaryMemberList[memberid].get("link")
                if not memberLink:
                    continue
                # download the html page of the Member
                webContent = self._fetchText(memberLink)
                # save interim results to files
                with open(pageDir + "/" + str(memberid) + ".html", "w+") as f:
                    f.write(webContent)

    def parseMemberData2dictionary(self, listOfCountries):
        """Add the political party of each member to the stored dictionary.

        The XPath built from ``parent`` + ``child-politicalParty`` of the
        ``member`` / ``info-1`` config entry is evaluated on every downloaded
        member page. The first match, if any, is stored under the key
        ``"political party"`` and the dictionary file is rewritten.
        """
        for country in listOfCountries:
            print("started to parse data of member of " + country + " ..")
            dictionaryMemberList = self._loadMemberList(country)
            countryConfMemberInfo1 = (
                self.config.get(country).get("member").get("info-1")
            )
            politicalPartyPath = countryConfMemberInfo1.get(
                "parent"
            ) + countryConfMemberInfo1.get("child-politicalParty")
            for memberid in dictionaryMemberList:
                print(
                    "started to parse data of member with name "
                    + dictionaryMemberList[memberid]["name"]
                    + " .."
                )
                filename = "crawlers/pages/" + country + "/" + str(memberid) + ".html"
                tree = lxml.html.soupparser.parse(filename)
                politicalParty = tree.xpath(politicalPartyPath)
                if politicalParty:
                    dictionaryMemberList[memberid]["political party"] = politicalParty[0]
            self._saveMemberList(country, dictionaryMemberList)

+ 186
- 0
crawlers/members_parliament_crawler.py View File

@ -0,0 +1,186 @@
import os
import yaml
import json
import urllib.request, urllib.error, urllib.parse
from lxml import etree
import lxml.html
import lxml.html.soupparser
class members_parliament_crawler(object):
    """Crawl parliament websites for member lists and member detail pages.

    The crawler is driven by a YAML config keyed by country name. The
    methods are meant to be run in this order, each stage reading the files
    written by the previous one:

    1. ``download_member_list_pages_of_countries`` writes
       ``crawlers/pages/<country>MemberList.html``
    2. ``parse_member_list_data2dictionary`` writes
       ``crawlers/output/<country>MemberList.txt``
    3. ``download_member_data_htmls`` writes
       ``crawlers/pages/<country>/<member_id>.html``
    4. ``parse_member_data2dictionary`` updates the ``.txt`` file of stage 2

    All paths are relative to the current working directory.
    """

    def __init__(self, config_file):
        """Load the crawler configuration from the YAML file *config_file*.

        A YAML syntax error is printed rather than raised; in that case (and
        for an empty file) ``self.config`` is an empty dict.
        """
        # Default first, so a failed load never leaves the attribute unset.
        self.config = {}
        with open(config_file, "r") as stream:
            try:
                self.config = yaml.safe_load(stream) or {}
            except yaml.YAMLError as exc:
                print(exc)

    @staticmethod
    def _fetch_text(url):
        """Return the body of *url* decoded as UTF-8."""
        with urllib.request.urlopen(url) as response:
            return response.read().decode("UTF-8")

    @staticmethod
    def _load_member_list(country):
        """Read back the member dictionary stored for *country*."""
        with open("crawlers/output/" + country + "MemberList.txt") as f:
            text = f.read()
        # SECURITY NOTE: eval() executes whatever is in the file, and the file
        # contains strings scraped from the web. Kept for compatibility with
        # the existing interim files; consider ast.literal_eval or JSON.
        return eval(text)

    @staticmethod
    def _save_member_list(country, member_list):
        """Write *member_list* as the interim result file of *country*."""
        os.makedirs("crawlers/output", exist_ok=True)
        with open("crawlers/output/" + country + "MemberList.txt", "w+") as f:
            f.write(str(member_list))

    # input list of countries in form of ['nicaragua', 'honduras', .. , 'mexico']
    def download_member_list_pages_of_countries(self, list_of_countries):
        """Download the member-list page of every requested country.

        Only countries that are present in the config are handled. Each page
        is stored as ``crawlers/pages/<country>MemberList.html``. A country
        whose config entry is incomplete is reported and skipped.
        """
        for key in self.config:
            # download only html pages of the countries specified in input
            if key not in list_of_countries:
                continue
            try:
                member_list = self.config.get(key).get("memberList")
            except AttributeError as e:
                print(
                    "There is a problem with the entry memberList in the config.yaml - the original error message is:",
                    e,
                )
                continue
            try:
                member_list_link = member_list.get("link")
            except AttributeError as e:
                print(
                    "No memberListLink defined in config.yaml - the original error message is:",
                    e,
                )
                continue
            if not member_list_link:
                print(
                    "No memberListLink defined in config.yaml - the original error message is:",
                    "entry 'link' is empty",
                )
                continue
            # download the html page of the List of Members
            web_content = self._fetch_text(member_list_link)
            # save interim results to files
            os.makedirs("crawlers/pages", exist_ok=True)
            with open("crawlers/pages/" + key + "MemberList.html", "w+") as f:
                # The variable was renamed to snake_case; the old spelling
                # ``webContent`` raised NameError here.
                f.write(web_content)

    def parse_member_list_data2dictionary(self, list_of_countries):
        """Extract name and link of every member from the downloaded lists.

        For each country the XPath expressions ``parent``, ``child-name`` and
        ``child-link`` of the ``memberList`` config entry are evaluated on the
        saved HTML page. The result, a dict ``{n: {"name": ..., "link": ...}}``
        keyed by the row position, is written to
        ``crawlers/output/<country>MemberList.txt``. Links that do not contain
        the configured ``domain`` are prefixed with it.
        """
        for country in list_of_countries:
            # Bound before the try block so a failure writes what was
            # collected so far instead of a previous country's data.
            dictionary_member_list = {}
            try:
                # use soupparser to handle broken html
                tree = lxml.html.soupparser.parse(
                    "crawlers/pages/" + country + "MemberList.html"
                )
                country_conf = self.config.get(country)
                country_domain = country_conf.get("domain")
                country_conf_member_list = country_conf.get("memberList")
                parent_path = country_conf_member_list.get("parent")
                child_name_path = country_conf_member_list.get("child-name")
                child_link_path = country_conf_member_list.get("child-link")
                row_count = len(tree.xpath(parent_path))
                # XPath positions are 1-based: [0] never matches and the last
                # row is [row_count].
                for n in range(1, row_count + 1):
                    row_path = parent_path + "[" + str(n) + "]"
                    name = tree.xpath(row_path + child_name_path)
                    link = tree.xpath(row_path + child_link_path)
                    # Later stages need both fields, so skip partial rows
                    # instead of aborting the whole country.
                    if not name or not link:
                        continue
                    member_link = link[0]
                    if country_domain not in member_link:
                        member_link = country_domain + member_link
                    dictionary_member_list[n] = {"name": name[0], "link": member_link}
            except Exception as e:
                print(
                    "parsing the html did not work. Possibly you first have to download_member_list_pages_of_countries(). The original error message is:",
                    e,
                )
            # save interim results to files
            self._save_member_list(country, dictionary_member_list)

    def download_member_data_htmls(self, list_of_countries):
        """Download the detail page of every member found by the list parser.

        Reads ``crawlers/output/<country>MemberList.txt`` and stores each
        member page as ``crawlers/pages/<country>/<member_id>.html``. Entries
        without a ``link`` are skipped.
        """
        for country in list_of_countries:
            dictionary_member_list = self._load_member_list(country)
            page_dir = "crawlers/pages/" + country
            os.makedirs(page_dir, exist_ok=True)
            for member_id in dictionary_member_list:
                member_link = dictionary_member_list[member_id].get("link")
                if not member_link:
                    continue
                # download the html page of the Member
                web_content = self._fetch_text(member_link)
                # save interim results to files
                with open(page_dir + "/" + str(member_id) + ".html", "w+") as f:
                    f.write(web_content)

    def parse_member_data2dictionary(self, list_of_countries):
        """Add the political party of each member to the stored dictionary.

        The XPath built from ``parent`` + ``child-politicalParty`` of the
        ``member`` / ``info-1`` config entry is evaluated on every downloaded
        member page. The first match, if any, is stored under the key
        ``"political party"`` and the dictionary file is rewritten.
        """
        for country in list_of_countries:
            print("started to parse data of member of " + country + " ..")
            dictionary_member_list = self._load_member_list(country)
            country_conf_member_info1 = (
                self.config.get(country).get("member").get("info-1")
            )
            political_party_path = country_conf_member_info1.get(
                "parent"
            ) + country_conf_member_info1.get("child-politicalParty")
            for member_id in dictionary_member_list:
                print(
                    "started to parse data of member with name "
                    + dictionary_member_list[member_id]["name"]
                    + " .."
                )
                file_name = "crawlers/pages/" + country + "/" + str(member_id) + ".html"
                tree = lxml.html.soupparser.parse(file_name)
                political_party = tree.xpath(political_party_path)
                if political_party:
                    dictionary_member_list[member_id]["political party"] = political_party[0]
            self._save_member_list(country, dictionary_member_list)

+ 0
- 16
crawlers/useMembersParliamentCrawler.py View File

@ -1,16 +0,0 @@
# Example driver for the parliament crawler. The four stages build on each
# other's output files; uncomment the earlier ones on a first run.
from crawlers.MembersParliamentCrawler import membersParliamentCrawler

config = "config.yaml"
listOfCountries = ["nicaragua"]
Crawler = membersParliamentCrawler(config)
# Stage 1: fetch the member-list page of each country.
# Crawler.downloadMemberListPagesOfCountries(listOfCountries)
# Stage 2: extract member names and links from those pages.
# Crawler.parseMemberListData2dictionary(listOfCountries)
# Stage 3: fetch the detail page of each member.
# Crawler.downloadMemberDataHtmls(listOfCountries)
# Stage 4: enrich the stored member dictionary from the detail pages.
Crawler.parseMemberData2dictionary(listOfCountries)

+ 16
- 0
crawlers/use_members_parliament_crawler.py View File

@ -0,0 +1,16 @@
# Example driver for the parliament crawler. The four stages build on each
# other's output files; uncomment the earlier ones on a first run.
from crawlers.members_parliament_crawler import members_parliament_crawler

config = "config.yaml"
list_of_countries = ["nicaragua"]
crawler = members_parliament_crawler(config)
# Stage 1: fetch the member-list page of each country.
# crawler.download_member_list_pages_of_countries(list_of_countries)
# Stage 2: extract member names and links from those pages.
# crawler.parse_member_list_data2dictionary(list_of_countries)
# Stage 3: fetch the detail page of each member.
# crawler.download_member_data_htmls(list_of_countries)
# Stage 4: enrich the stored member dictionary from the detail pages.
crawler.parse_member_data2dictionary(list_of_countries)

+ 13
- 13
main.py View File

@ -1,31 +1,31 @@
from crawlers.MembersParliamentCrawler import *
from crawlers.members_parliament_crawler import *
from wikidata.wdPEP import *
from wikidata.wd_PEP import *
config = "crawlers/config.yaml"
listOfCountries = ["nicaragua"]
list_of_countries = ["nicaragua"]
# doing the crawling of government websites
# Crawler = membersParliamentCrawler(config)
# crawler = members_parliament_crawler(config)
# Crawler.downloadMemberListPagesOfCountries(listOfCountries)
# crawler.download_member_list_pages_of_countries(list_of_countries)
# Crawler.parseMemberListData2dictionary(listOfCountries)
# crawler.parse_member_list_data2dictionary(list_of_countries)
# Crawler.downloadMemberDataHtmls(listOfCountries)
# crawler.download_member_data_htmls(list_of_countries)
# Crawler.parseMemberData2dictionary(listOfCountries)
# crawler.parse_member_data2dictionary(list_of_countries)
# processing the resulted dictionary and create wikidata queries
wikidataPEP = WikidataPEP(config)
# The class in wikidata/wd_PEP.py is spelled ``wikidata_PEP``; the capitalised
# ``Wikidata_PEP`` does not exist and raised NameError. Note that this rebinds
# the name to the instance, so the class is shadowed from here on.
wikidata_PEP = wikidata_PEP(config)
# wikidataPEP.importMembersOfParliamentDict(listOfCountries)
# wikidata_PEP.import_members_of_parliament_dict(list_of_countries)
# wikidataPEP.checkForEntityIds(listOfCountries)
# wikidata_PEP.check_for_entity_ids(list_of_countries)
# wikidataPEP.createMemberOnWikidata()
# wikidata_PEP.create_member_on_wikidata()
wikidataPEP.editMemberOnWikidata("Q116918332")
wikidata_PEP.edit_member_on_wikidata("Q116918332")

wikidata/wdPEP.py → wikidata/wd_PEP.py View File

@ -4,57 +4,57 @@ import yaml
import json
class WikidataPEP(object):
def __init__(self, configFile):
with open(configFile, "r") as stream:
class wikidata_PEP(object):
def __init__(self, config_file):
with open(config_file, "r") as stream:
try:
self.config = yaml.safe_load(stream)
except yaml.YAMLError as exc:
print(exc)
def importMembersOfParliamentDict(self, listOfCountries):
self.fullDictionaryMemberLists = {}
def import_members_of_parliament_dict(self, list_of_countries):
self.full_dictionary_member_lists = {}
for country in listOfCountries:
for country in list_of_countries:
print("started to parse data of members of " + country + " ..")
f = open("crawlers/output/" + country + "MemberList.txt")
text = f.read()
self.fullDictionaryMemberLists[country] = eval(text)
self.full_dictionary_member_lists[country] = eval(text)
# print(self.fullDictionaryMemberLists)
# print(self.full_dictionary_member_lists)
def checkForEntityIds(self, listOfCountries):
def check_for_entity_ids(self, list_of_countries):
from wikibaseintegrator import WikibaseIntegrator
from wikibaseintegrator import wbi_helpers
fullDictionaryMemberLists = self.fullDictionaryMemberLists
full_dictionary_member_lists = self.full_dictionary_member_lists
for country in listOfCountries:
for memberId in fullDictionaryMemberLists[country].keys():
name = fullDictionaryMemberLists[country][memberId]["name"]
for country in list_of_countries:
for member_id in full_dictionary_member_lists[country].keys():
name = full_dictionary_member_lists[country][member_id]["name"]
results = wbi_helpers.search_entities(search_string=name)
for entityId in results:
for entity_id in results:
wbi = WikibaseIntegrator()
wikidata_item = wbi.item.get(entity_id=entityId)
wikidata_item = wbi.item.get(entity_id=entity_id)
for claimkey in wikidata_item.get_json()["claims"].keys():
if claimkey == "P31":
for claim_key in wikidata_item.get_json()["claims"].keys():
if claim_key == "P31":
if (
wikidata_item.get_json()["claims"][claimkey][0][
wikidata_item.get_json()["claims"][claim_key][0][
"mainsnak"
]["datavalue"]["value"]["id"]
== "Q5"
):
print(entityId)
print(entity_id)
print("---------")
print(name)
print("is a human")
def createMemberOnWikidata(self):
def create_member_on_wikidata(self):
from wikibaseintegrator import wbi_login, WikibaseIntegrator
from wikibaseintegrator.datatypes import ExternalID, Item
from wikibaseintegrator.wbi_config import config as wbi_config
@ -67,18 +67,18 @@ class WikidataPEP(object):
wbi = WikibaseIntegrator(login=login_instance)
# data type object, e.g. for a NCBI gene entrez ID
isHuman = Item(value="Q5", prop_nr="P31")
occupationPolitician = ExternalID(value="Q82955", prop_nr="P106")
occupationDeputy = ExternalID(value="Q1055894", prop_nr="P106")
is_human = Item(value="Q5", prop_nr="P31")
occupation_politician = ExternalID(value="Q82955", prop_nr="P106")
occupation_deputy = ExternalID(value="Q1055894", prop_nr="P106")
# referenceURL = URL(value='http://legislacion.asamblea.gob.ni/Tablas%20Generales.nsf/InfoDiputado.xsp?documentId=3D4CFDC4B3006D70062587C5007C29E1&action=openDocument&SessionID=1868803A06AB73D50B7F89BD0AB', prop_nr='P106')
# print(isHuman)
# print(is_human)
# print(referenceURL)
# data goes into a list, because many data objects can be provided to
data1 = [isHuman]
data2 = [occupationDeputy]
data3 = [occupationPolitician]
data1 = [is_human]
data2 = [occupation_deputy]
data3 = [occupation_politician]
# Create a new item
item = wbi.item.new()
@ -115,12 +115,12 @@ class WikidataPEP(object):
wbi = WikibaseIntegrator(login=login_instance)
# data type object, e.g. for a NCBI gene entrez ID
# isHuman = Item(value='Q5', prop_nr='P31')
# occupationPolitician = Item(value='Q82955', prop_nr='P106')
# occupationDeputy = Item(value='Q1055894', prop_nr='P106')
# is_human = Item(value='Q5', prop_nr='P31')
# occupation_politician = Item(value='Q82955', prop_nr='P106')
# occupation_deputy = Item(value='Q1055894', prop_nr='P106')
# referenceURL = ExternalID(value='http://legislacion.asamblea.gob.ni/Tablas%20Generales.nsf/InfoDiputado.xsp?documentId=3D4CFDC4B3006D70062587C5007C29E1&action=openDocument&SessionID=1868803A06AB73D50B7F89BD0AB', prop_nr='P854')
# print(isHuman)
# print(is_human)
# print(referenceURL)
references = [
@ -137,12 +137,12 @@ class WikidataPEP(object):
]
]
occupationDeputy = Item(value="Q1055894", prop_nr="P106", references=references)
occupation_deputy = Item(value="Q1055894", prop_nr="P106", references=references)
## data goes into a list, because many data objects can be provided to
# data1 = [isHuman]
data2 = [occupationDeputy]
# data3 = [occupationPolitician]
# data1 = [is_human]
data2 = [occupation_deputy]
# data3 = [occupation_politician]
# data4 = [referenceURL]
## get item for Qid
item = wbi.item.get(entity_id=Qid)

Loading…
Cancel
Save