converted camel case to snake case because of empirical evidence of better readability

1 year ago · dba482c477
--- a/crawlers/MembersParliamentCrawler.py
+++ b/crawlers/MembersParliamentCrawler.py
@ -1,186 +0,0 @@
 import os
 import yaml
 import json
 import urllib.request, urllib.error, urllib.parse
 from lxml import etree
 import lxml.html
 import lxml.html.soupparser
 class membersParliamentCrawler(object):
    """Download and parse parliament member pages described in a YAML config.

    The config maps a country key to its settings. The methods read the keys
    ``domain``, ``memberList`` (``link``, ``parent``, ``child-name``,
    ``child-link``) and ``member`` -> ``info-1`` (``parent``,
    ``child-politicalParty``). Interim results are stored below
    ``crawlers/pages/`` and ``crawlers/output/``, relative to the current
    working directory.
    """

    def __init__(self, configFile):
        """Load the YAML configuration from ``configFile`` into ``self.config``.

        A YAML syntax error is printed and leaves ``self.config`` as an empty
        dict, so the attribute always exists for the other methods.
        """
        self.config = {}
        with open(configFile, "r") as stream:
            try:
                # safe_load returns None for an empty document
                self.config = yaml.safe_load(stream) or {}
            except yaml.YAMLError as exc:
                print(exc)

    # input list of countries in form of ['nicaragua', 'honduras', .. , 'mexico']
    def downloadMemberListPagesOfCountries(self, listOfCountries):
        """Download the member list page of every configured, requested country.

        Each page is fetched from ``memberList.link`` of the country's config
        entry and saved as ``crawlers/pages/<country>MemberList.html``.
        Countries whose config entry is incomplete are reported and skipped.
        """
        # One pass over the config is enough: every requested key is
        # downloaded exactly once instead of once per list entry.
        for key in self.config:
            if key not in listOfCountries:
                continue
            countryConf = self.config.get(key)
            memberList = (
                countryConf.get("memberList") if isinstance(countryConf, dict) else None
            )
            if not isinstance(memberList, dict):
                print(
                    "There is a problem with the entry memberList in the config.yaml - the original error message is:",
                    "memberList is missing or not a mapping for " + str(key),
                )
                continue
            memberListLink = memberList.get("link")
            if not memberListLink:
                print(
                    "No memberListLink defined in config.yaml - the original error message is:",
                    "link is missing or empty for " + str(key),
                )
                continue
            # download the html page of the List of Members
            with urllib.request.urlopen(memberListLink) as response:
                webContent = response.read().decode("UTF-8")
            # save interim results to files
            with open("crawlers/pages/" + key + "MemberList.html", "w+") as f:
                f.write(webContent)

    def parseMemberListData2dictionary(self, listOfCountries):
        """Extract member names and links from the saved member list pages.

        For every country the result ``{n: {"name": ..., "link": ...}}`` is
        written as text to ``crawlers/output/<country>MemberList.txt``, where
        ``n`` is the XPath position of the matching parent element. Links that
        do not contain the configured ``domain`` get it prepended.
        """
        for country in listOfCountries:
            # Start from an empty result so a failure can neither leave the
            # name undefined nor write the previous country's data.
            dictionaryMemberList = {}
            try:
                # use soupparser to handle broken html
                tree = lxml.html.soupparser.parse(
                    "crawlers/pages/" + country + "MemberList.html"
                )
                countryConf = self.config.get(country)
                countryDomain = countryConf.get("domain")
                countryConfMemberList = countryConf.get("memberList")
                countryConfMemberListParent = countryConfMemberList.get("parent")
                countryConfMemberListChildName = countryConfMemberList.get("child-name")
                countryConfMemberListChildLink = countryConfMemberList.get("child-link")
                numberOfParents = len(tree.xpath(countryConfMemberListParent))
                # XPath positions are 1-based: [0] never matches and the last
                # position is len, so iterate 1..len inclusive.
                for n in range(1, numberOfParents + 1):
                    parentAtN = countryConfMemberListParent + "[" + str(n) + "]"
                    name = tree.xpath(parentAtN + countryConfMemberListChildName)
                    link = tree.xpath(parentAtN + countryConfMemberListChildLink)
                    # A row without a link cannot be downloaded later, so it is
                    # skipped instead of aborting the whole country.
                    if len(name) > 0 and len(link) > 0:
                        memberLink = link[0]
                        if countryDomain not in memberLink:
                            memberLink = countryDomain + memberLink
                        dictionaryMemberList[n] = {"name": name[0], "link": memberLink}
            except Exception as e:
                print(
                    "parsing the html did not work. Possibly you first have to downloadMemberListPagesOfCountries(). The original error message is:",
                    e,
                )
            # save interim results to files
            with open("crawlers/output/" + country + "MemberList.txt", "w+") as f:
                f.write(str(dictionaryMemberList))

    def downloadMemberDataHtmls(self, listOfCountries):
        """Download the detail page of every member listed for each country.

        Reads ``crawlers/output/<country>MemberList.txt`` and stores each page
        as ``crawlers/pages/<country>/<memberid>.html``.
        """
        for country in listOfCountries:
            with open("crawlers/output/" + country + "MemberList.txt") as f:
                text = f.read()
            # NOTE(review): eval executes whatever the file contains; only use
            # files written by this crawler (consider ast.literal_eval).
            dictionaryMemberList = eval(text)
            for memberid in dictionaryMemberList:
                memberLink = dictionaryMemberList[memberid]["link"]
                # download the html page of the Member
                with urllib.request.urlopen(memberLink) as response:
                    webContent = response.read().decode("UTF-8")
                # save interim results to files
                filename = "crawlers/pages/" + country + "/" + str(memberid) + ".html"
                os.makedirs(os.path.dirname(filename), exist_ok=True)
                with open(filename, "w+") as f:
                    f.write(webContent)

    def parseMemberData2dictionary(self, listOfCountries):
        """Add the political party found on each member page to the member list.

        Reads ``crawlers/output/<country>MemberList.txt``, evaluates the XPath
        ``info-1.parent + info-1.child-politicalParty`` on every saved member
        page and writes the enriched dictionary back to the same file.
        """
        for country in listOfCountries:
            print("started to parse data of member of " + country + " ..")
            with open("crawlers/output/" + country + "MemberList.txt") as f:
                text = f.read()
            # NOTE(review): eval executes whatever the file contains; only use
            # files written by this crawler (consider ast.literal_eval).
            dictionaryMemberList = eval(text)
            countryConf = self.config.get(country)
            countryConfMember = countryConf.get("member")
            countryConfMemberInfo1 = countryConfMember.get("info-1")
            countryConfMemberInfo1Parent = countryConfMemberInfo1.get("parent")
            countryConfMemberInfo1ChildPoliticalParty = countryConfMemberInfo1.get(
                "child-politicalParty"
            )
            for memberid in dictionaryMemberList:
                print(
                    "started to parse data of member with name "
                    + dictionaryMemberList[memberid]["name"]
                    + " .."
                )
                filename = "crawlers/pages/" + country + "/" + str(memberid) + ".html"
                tree = lxml.html.soupparser.parse(filename)
                politicalParty = tree.xpath(
                    countryConfMemberInfo1Parent
                    + countryConfMemberInfo1ChildPoliticalParty
                )
                print("oi", politicalParty)
                if len(politicalParty) > 0:
                    dictionaryMemberList[memberid]["political party"] = politicalParty[
                        0
                    ]
            with open("crawlers/output/" + country + "MemberList.txt", "w+") as f:
                f.write(str(dictionaryMemberList))
--- a/crawlers/members_parliament_crawler.py
+++ b/crawlers/members_parliament_crawler.py
@ -0,0 +1,186 @@
 import os
 import yaml
 import json
 import urllib.request, urllib.error, urllib.parse
 from lxml import etree
 import lxml.html
 import lxml.html.soupparser
 class members_parliament_crawler(object):
    """Download and parse parliament member pages described in a YAML config.

    The config maps a country key to its settings. The methods read the keys
    ``domain``, ``memberList`` (``link``, ``parent``, ``child-name``,
    ``child-link``) and ``member`` -> ``info-1`` (``parent``,
    ``child-politicalParty``). Interim results are stored below
    ``crawlers/pages/`` and ``crawlers/output/``, relative to the current
    working directory.
    """

    def __init__(self, config_file):
        """Load the YAML configuration from ``config_file`` into ``self.config``.

        A YAML syntax error is printed and leaves ``self.config`` as an empty
        dict, so the attribute always exists for the other methods.
        """
        self.config = {}
        with open(config_file, "r") as stream:
            try:
                # safe_load returns None for an empty document
                self.config = yaml.safe_load(stream) or {}
            except yaml.YAMLError as exc:
                print(exc)

    # input list of countries in form of ['nicaragua', 'honduras', .. , 'mexico']
    def download_member_list_pages_of_countries(self, list_of_countries):
        """Download the member list page of every configured, requested country.

        Each page is fetched from ``memberList.link`` of the country's config
        entry and saved as ``crawlers/pages/<country>MemberList.html``.
        Countries whose config entry is incomplete are reported and skipped.
        """
        # One pass over the config is enough: every requested key is
        # downloaded exactly once instead of once per list entry.
        for key in self.config:
            if key not in list_of_countries:
                continue
            country_conf = self.config.get(key)
            member_list = (
                country_conf.get("memberList") if isinstance(country_conf, dict) else None
            )
            if not isinstance(member_list, dict):
                print(
                    "There is a problem with the entry memberList in the config.yaml - the original error message is:",
                    "memberList is missing or not a mapping for " + str(key),
                )
                continue
            member_list_link = member_list.get("link")
            if not member_list_link:
                print(
                    "No memberListLink defined in config.yaml - the original error message is:",
                    "link is missing or empty for " + str(key),
                )
                continue
            # download the html page of the List of Members
            with urllib.request.urlopen(member_list_link) as response:
                web_content = response.read().decode("UTF-8")
            # save interim results to files (the rename left a stale
            # `webContent` reference here, which raised NameError)
            with open("crawlers/pages/" + key + "MemberList.html", "w+") as f:
                f.write(web_content)

    def parse_member_list_data2dictionary(self, list_of_countries):
        """Extract member names and links from the saved member list pages.

        For every country the result ``{n: {"name": ..., "link": ...}}`` is
        written as text to ``crawlers/output/<country>MemberList.txt``, where
        ``n`` is the XPath position of the matching parent element. Links that
        do not contain the configured ``domain`` get it prepended.
        """
        for country in list_of_countries:
            # Start from an empty result so a failure can neither leave the
            # name undefined nor write the previous country's data.
            dictionary_member_list = {}
            try:
                # use soupparser to handle broken html
                tree = lxml.html.soupparser.parse(
                    "crawlers/pages/" + country + "MemberList.html"
                )
                country_conf = self.config.get(country)
                country_domain = country_conf.get("domain")
                country_conf_member_list = country_conf.get("memberList")
                country_conf_member_list_parent = country_conf_member_list.get("parent")
                country_conf_member_list_child_name = country_conf_member_list.get("child-name")
                country_conf_member_list_child_link = country_conf_member_list.get("child-link")
                number_of_parents = len(tree.xpath(country_conf_member_list_parent))
                # XPath positions are 1-based: [0] never matches and the last
                # position is len, so iterate 1..len inclusive.
                for n in range(1, number_of_parents + 1):
                    parent_at_n = country_conf_member_list_parent + "[" + str(n) + "]"
                    name = tree.xpath(parent_at_n + country_conf_member_list_child_name)
                    link = tree.xpath(parent_at_n + country_conf_member_list_child_link)
                    # A row without a link cannot be downloaded later, so it is
                    # skipped instead of aborting the whole country.
                    if len(name) > 0 and len(link) > 0:
                        member_link = link[0]
                        if country_domain not in member_link:
                            member_link = country_domain + member_link
                        dictionary_member_list[n] = {"name": name[0], "link": member_link}
            except Exception as e:
                print(
                    "parsing the html did not work. Possibly you first have to download_member_list_pages_of_countries(). The original error message is:",
                    e,
                )
            # save interim results to files
            with open("crawlers/output/" + country + "MemberList.txt", "w+") as f:
                f.write(str(dictionary_member_list))

    def download_member_data_htmls(self, list_of_countries):
        """Download the detail page of every member listed for each country.

        Reads ``crawlers/output/<country>MemberList.txt`` and stores each page
        as ``crawlers/pages/<country>/<member_id>.html``.
        """
        for country in list_of_countries:
            with open("crawlers/output/" + country + "MemberList.txt") as f:
                text = f.read()
            # NOTE(review): eval executes whatever the file contains; only use
            # files written by this crawler (consider ast.literal_eval).
            dictionary_member_list = eval(text)
            for member_id in dictionary_member_list:
                member_link = dictionary_member_list[member_id]["link"]
                # download the html page of the Member
                with urllib.request.urlopen(member_link) as response:
                    web_content = response.read().decode("UTF-8")
                # save interim results to files
                file_name = "crawlers/pages/" + country + "/" + str(member_id) + ".html"
                os.makedirs(os.path.dirname(file_name), exist_ok=True)
                with open(file_name, "w+") as f:
                    f.write(web_content)

    def parse_member_data2dictionary(self, list_of_countries):
        """Add the political party found on each member page to the member list.

        Reads ``crawlers/output/<country>MemberList.txt``, evaluates the XPath
        ``info-1.parent + info-1.child-politicalParty`` on every saved member
        page and writes the enriched dictionary back to the same file.
        """
        for country in list_of_countries:
            print("started to parse data of member of " + country + " ..")
            with open("crawlers/output/" + country + "MemberList.txt") as f:
                text = f.read()
            # NOTE(review): eval executes whatever the file contains; only use
            # files written by this crawler (consider ast.literal_eval).
            dictionary_member_list = eval(text)
            country_conf = self.config.get(country)
            country_conf_member = country_conf.get("member")
            country_conf_member_info1 = country_conf_member.get("info-1")
            country_conf_member_info1_parent = country_conf_member_info1.get("parent")
            country_conf_member_info1_child_political_party = country_conf_member_info1.get(
                "child-politicalParty"
            )
            for member_id in dictionary_member_list:
                print(
                    "started to parse data of member with name "
                    + dictionary_member_list[member_id]["name"]
                    + " .."
                )
                file_name = "crawlers/pages/" + country + "/" + str(member_id) + ".html"
                tree = lxml.html.soupparser.parse(file_name)
                political_party = tree.xpath(
                    country_conf_member_info1_parent
                    + country_conf_member_info1_child_political_party
                )
                print("oi", political_party)
                if len(political_party) > 0:
                    dictionary_member_list[member_id]["political party"] = political_party[
                        0
                    ]
            with open("crawlers/output/" + country + "MemberList.txt", "w+") as f:
                f.write(str(dictionary_member_list))
--- a/crawlers/useMembersParliamentCrawler.py
+++ b/crawlers/useMembersParliamentCrawler.py
@ -1,16 +0,0 @@
 # Example driver: runs one stage of the crawler pipeline for the selected countries.
 # Import the crawler class explicitly instead of pulling in every public name.
 from crawlers.MembersParliamentCrawler import membersParliamentCrawler

 config = "config.yaml"
 listOfCountries = ["nicaragua"]
 Crawler = membersParliamentCrawler(config)
 # The stages build on each other's files (list page -> member list ->
 # member pages -> party data); uncomment the earlier ones for a fresh run.
 # Crawler.downloadMemberListPagesOfCountries(listOfCountries)
 # Crawler.parseMemberListData2dictionary(listOfCountries)
 # Crawler.downloadMemberDataHtmls(listOfCountries)
 Crawler.parseMemberData2dictionary(listOfCountries)
--- a/crawlers/use_members_parliament_crawler.py
+++ b/crawlers/use_members_parliament_crawler.py
@ -0,0 +1,16 @@
 # Example driver: runs one stage of the crawler pipeline for the selected countries.
 # Import the crawler class explicitly instead of pulling in every public name.
 from crawlers.members_parliament_crawler import members_parliament_crawler

 config = "config.yaml"
 list_of_countries = ["nicaragua"]
 crawler = members_parliament_crawler(config)
 # The stages build on each other's files (list page -> member list ->
 # member pages -> party data); uncomment the earlier ones for a fresh run.
 # crawler.download_member_list_pages_of_countries(list_of_countries)
 # crawler.parse_member_list_data2dictionary(list_of_countries)
 # crawler.download_member_data_htmls(list_of_countries)
 crawler.parse_member_data2dictionary(list_of_countries)
--- a/main.py
+++ b/main.py
@ -1,31 +1,31 @@
 # Entry script: optionally crawls the government websites, then works on Wikidata.
 # NOTE(review): in this hunk each old camelCase line appears to be followed
 # directly by its snake_case replacement.
 from crawlers.MembersParliamentCrawler import *
 from crawlers.members_parliament_crawler import *
 from wikidata.wdPEP import *
 from wikidata.wd_PEP import *
 config = "crawlers/config.yaml"
 listOfCountries = ["nicaragua"]
 list_of_countries = ["nicaragua"]
 # doing the crawling of government websites
 # Crawler = membersParliamentCrawler(config)
 # crawler = members_parliament_crawler(config)
 # Crawler.downloadMemberListPagesOfCountries(listOfCountries)
 # crawler.download_member_list_pages_of_countries(list_of_countries)
 # Crawler.parseMemberListData2dictionary(listOfCountries)
 # crawler.parse_member_list_data2dictionary(list_of_countries)
 # Crawler.downloadMemberDataHtmls(listOfCountries)
 # crawler.download_member_data_htmls(list_of_countries)
 # Crawler.parseMemberData2dictionary(listOfCountries)
 # crawler.parse_member_data2dictionary(list_of_countries)
 # processing the resulted dictionary and create wikidata queries
 # NOTE(review): wikidata/wd_PEP.py declares the class as `wikidata_PEP`
 # (lower-case w), so `Wikidata_PEP(config)` presumably raises NameError - confirm.
 # Binding the instance to the name `wikidata_PEP` would also shadow that class.
 wikidataPEP = WikidataPEP(config)
 wikidata_PEP = Wikidata_PEP(config)
 # NOTE(review): the renamed method is defined as
 # import_members_of_parliament_dict; the commented call below spells it
 # importMembers_of_parliament_dict.
 # wikidataPEP.importMembersOfParliamentDict(listOfCountries)
 # wikidata_PEP.importMembers_of_parliament_dict(list_of_countries)
 # wikidataPEP.checkForEntityIds(listOfCountries)
 # wikidata_PEP.check_for_entity_ids(list_of_countries)
 # wikidataPEP.createMemberOnWikidata()
 # wikidata_PEP.create_member_on_wikidata()
 # Edits the existing Wikidata item with the given entity id.
 wikidataPEP.editMemberOnWikidata("Q116918332")
 wikidata_PEP.edit_member_on_wikidata("Q116918332")
--- a/wikidata/wd_PEP.py
+++ b/wikidata/wd_PEP.py
@ -4,57 +4,57 @@ import yaml
 import json
 class WikidataPEP(object):
    def __init__(self, configFile):
        with open(configFile, "r") as stream:
 class wikidata_PEP(object):
    def __init__(self, config_file):
        with open(config_file, "r") as stream:
            try:
                self.config = yaml.safe_load(stream)
            except yaml.YAMLError as exc:
                print(exc)
    # Loads the saved member dictionary of every requested country from
    # crawlers/output/<country>MemberList.txt into an attribute keyed by country.
    def importMembersOfParliamentDict(self, listOfCountries):
        self.fullDictionaryMemberLists = {}
    def import_members_of_parliament_dict(self, list_of_countries):
        self.full_dictionary_member_lists = {}
        for country in listOfCountries:
        for country in list_of_countries:
            print("started to parse data of members of " + country + " ..")
            # NOTE(review): the file handle is never closed explicitly.
            f = open("crawlers/output/" + country + "MemberList.txt")
            text = f.read()
            # NOTE(review): eval executes whatever the file contains; only use
            # files written by the crawler (ast.literal_eval may be a safer fit).
            self.fullDictionaryMemberLists[country] = eval(text)
            self.full_dictionary_member_lists[country] = eval(text)
        # print(self.fullDictionaryMemberLists)
        # print(self.full_dictionary_member_lists)
    # Searches Wikidata for every member name of the requested countries and
    # prints the entity ids whose first P31 claim has the value Q5.
    # NOTE(review): reads the dictionary attribute that, in the visible code,
    # is only set by the import method above - call that one first.
    def checkForEntityIds(self, listOfCountries):
    def check_for_entity_ids(self, list_of_countries):
        from wikibaseintegrator import WikibaseIntegrator
        from wikibaseintegrator import wbi_helpers
        fullDictionaryMemberLists = self.fullDictionaryMemberLists
        full_dictionary_member_lists = self.full_dictionary_member_lists
        for country in listOfCountries:
            for memberId in fullDictionaryMemberLists[country].keys():
                name = fullDictionaryMemberLists[country][memberId]["name"]
        for country in list_of_countries:
            for member_id in full_dictionary_member_lists[country].keys():
                name = full_dictionary_member_lists[country][member_id]["name"]
                # look the member name up as a free-text entity search
                results = wbi_helpers.search_entities(search_string=name)
                for entityId in results:
                for entity_id in results:
                    # a new WikibaseIntegrator client is built for every search result
                    wbi = WikibaseIntegrator()
                    wikidata_item = wbi.item.get(entity_id=entityId)
                    wikidata_item = wbi.item.get(entity_id=entity_id)
                    for claimkey in wikidata_item.get_json()["claims"].keys():
                        if claimkey == "P31":
                    for claim_key in wikidata_item.get_json()["claims"].keys():
                        if claim_key == "P31":
                            # only the first P31 statement of the item is inspected
                            if (
                                wikidata_item.get_json()["claims"][claimkey][0][
                                wikidata_item.get_json()["claims"][claim_key][0][
                                    "mainsnak"
                                ]["datavalue"]["value"]["id"]
                                == "Q5"
                            ):
                                print(entityId)
                                print(entity_id)
                                print("---------")
                                print(name)
                                print("is a human")
    def createMemberOnWikidata(self):
    def create_member_on_wikidata(self):
        from wikibaseintegrator import wbi_login, WikibaseIntegrator
        from wikibaseintegrator.datatypes import ExternalID, Item
        from wikibaseintegrator.wbi_config import config as wbi_config
@ -67,18 +67,18 @@ class WikidataPEP(object):
        wbi = WikibaseIntegrator(login=login_instance)
        # data type object, e.g. for a NCBI gene entrez ID
        isHuman = Item(value="Q5", prop_nr="P31")
        occupationPolitician = ExternalID(value="Q82955", prop_nr="P106")
        occupationDeputy = ExternalID(value="Q1055894", prop_nr="P106")
        is_human = Item(value="Q5", prop_nr="P31")
        occupation_politician = ExternalID(value="Q82955", prop_nr="P106")
        occupation_deputy = ExternalID(value="Q1055894", prop_nr="P106")
        # referenceURL = URL(value='http://legislacion.asamblea.gob.ni/Tablas%20Generales.nsf/InfoDiputado.xsp?documentId=3D4CFDC4B3006D70062587C5007C29E1&action=openDocument&SessionID=1868803A06AB73D50B7F89BD0AB', prop_nr='P106')
        # print(isHuman)
        # print(is_human)
        # print(referenceURL)
        # data goes into a list, because many data objects can be provided to
        data1 = [isHuman]
        data2 = [occupationDeputy]
        data3 = [occupationPolitician]
        data1 = [is_human]
        data2 = [occupation_deputy]
        data3 = [occupation_politician]
        # Create a new item
        item = wbi.item.new()
@ -115,12 +115,12 @@ class WikidataPEP(object):
        wbi = WikibaseIntegrator(login=login_instance)
        # data type object, e.g. for a NCBI gene entrez ID
        # isHuman = Item(value='Q5', prop_nr='P31')
        # occupationPolitician = Item(value='Q82955', prop_nr='P106')
        # occupationDeputy = Item(value='Q1055894', prop_nr='P106')
        # is_human = Item(value='Q5', prop_nr='P31')
        # occupation_politician = Item(value='Q82955', prop_nr='P106')
        # occupation_deputy = Item(value='Q1055894', prop_nr='P106')
        # referenceURL = ExternalID(value='http://legislacion.asamblea.gob.ni/Tablas%20Generales.nsf/InfoDiputado.xsp?documentId=3D4CFDC4B3006D70062587C5007C29E1&action=openDocument&SessionID=1868803A06AB73D50B7F89BD0AB', prop_nr='P854')
        # print(isHuman)
        # print(is_human)
        # print(referenceURL)
        references = [
@ -137,12 +137,12 @@ class WikidataPEP(object):
            ]
        ]
        occupationDeputy = Item(value="Q1055894", prop_nr="P106", references=references)
        occupation_deputy = Item(value="Q1055894", prop_nr="P106", references=references)
        ## data goes into a list, because many data objects can be provided to
        # data1 = [isHuman]
        data2 = [occupationDeputy]
        # data3 = [occupationPolitician]
        # data1 = [is_human]
        data2 = [occupation_deputy]
        # data3 = [occupation_politician]
        # data4 = [referenceURL]
        ## get item for Qid
        item = wbi.item.get(entity_id=Qid)