@@ -1,186 +0,0 @@
-import os
-import yaml
-import json
-import urllib.request, urllib.error, urllib.parse
-
-from lxml import etree
-import lxml.html
-import lxml.html.soupparser
-
-
-class membersParliamentCrawler(object):
-    def __init__(self, configFile):
-        with open(configFile, "r") as stream:
-            try:
-                self.config = yaml.safe_load(stream)
-            except yaml.YAMLError as exc:
-                print(exc)
-
-    # input list of countries in form of ['nicaragua', 'honduras', .. , 'mexico']
-    def downloadMemberListPagesOfCountries(self, listOfCountries):
-        # download only html pages of the countries specified in input
-        for country in listOfCountries:
-            for key in self.config:
-                if key in listOfCountries:
-                    try:
-                        memberList = self.config.get(key).get("memberList")
-                    except Exception as e:
-                        print(
-                            "There is a problem with the entry memberList in the config.yaml - the original error message is:",
-                            e,
-                        )
-                    try:
-                        memberListLink = memberList.get("link")
-                    except Exception as e:
-                        print(
-                            "No memberListLink defined in config.yaml - the original error message is:",
-                            e,
-                        )
-
-                    # download the html page of the List of Members
-                    response = urllib.request.urlopen(memberListLink)
-                    webContent = response.read().decode("UTF-8")
-
-                    # save interim results to files
-                    f = open("crawlers/pages/" + key + "MemberList.html", "w+")
-                    f.write(webContent)
-                    f.close
-
-    def parseMemberListData2dictionary(self, listOfCountries):
-        for country in listOfCountries:
-            try:
-                # use soupparser to handle broken html
-                tree = lxml.html.soupparser.parse(
-                    "crawlers/pages/" + country + "MemberList.html"
-                )
-
-                # for e in tree.iter():
-                #     print(e.tag)
-                # for e in tree.xpath('//html//body//form//table//tr//td//table//tr'):
-                #     print(etree.tostring(e).decode())
-
-                dictionaryMemberList = {}
-
-                countryConf = self.config.get(country)
-                countryDomain = countryConf.get("domain")
-                countryConfMemberList = countryConf.get("memberList")
-                countryConfMemberListParent = countryConfMemberList.get("parent")
-                countryConfMemberListChildName = countryConfMemberList.get("child-name")
-                countryConfMemberListChildLink = countryConfMemberList.get("child-link")
-
-                for n in range(len(tree.xpath(countryConfMemberListParent))):
-                    name = tree.xpath(
-                        countryConfMemberListParent
-                        + "["
-                        + str(n)
-                        + "]"
-                        + countryConfMemberListChildName
-                    )
-                    link = tree.xpath(
-                        countryConfMemberListParent
-                        + "["
-                        + str(n)
-                        + "]"
-                        + countryConfMemberListChildLink
-                    )
-
-                    if len(name) > 0:
-                        dictionaryMemberList[n] = {}
-                        dictionaryMemberList[n]["name"] = name[0]
-
-                        if countryDomain in link[0]:
-                            dictionaryMemberList[n]["link"] = link[0]
-
-                        if countryDomain not in link[0]:
-                            dictionaryMemberList[n]["link"] = countryDomain + link[0]
-
-            except Exception as e:
-                print(
-                    "parsing the html did not work. Possibly you first have to downloadMemberListPagesOfCountries(). The original error message is:",
-                    e,
-                )
-
-            # save interim results to files
-            f = open("crawlers/output/" + country + "MemberList.txt", "w+")
-            f.write(str(dictionaryMemberList))
-            f.close
-
-    def downloadMemberDataHtmls(self, listOfCountries):
-        for country in listOfCountries:
-            f = open("crawlers/output/" + country + "MemberList.txt")
-            text = f.read()
-            dictionaryMemberList = eval(text)
-
-            for memberid in dictionaryMemberList:
-                memberLink = dictionaryMemberList[memberid]["link"]
-
-                # download the html page of the Member
-                response = urllib.request.urlopen(memberLink)
-                webContent = response.read().decode("UTF-8")
-
-                # save interim results to files
-                filename = "crawlers/pages/" + country + "/" + str(memberid) + ".html"
-                os.makedirs(os.path.dirname(filename), exist_ok=True)
-                f = open(filename, "w+")
-                f.write(webContent)
-                f.close
-
-    def parseMemberData2dictionary(self, listOfCountries):
-        for country in listOfCountries:
-            print("started to parse data of member of " + country + " ..")
-
-            f = open("crawlers/output/" + country + "MemberList.txt")
-            text = f.read()
-            dictionaryMemberList = eval(text)
-
-            countryConf = self.config.get(country)
-            countryDomain = countryConf.get("domain")
-            countryConfMember = countryConf.get("member")
-            countryConfMemberInfo1 = countryConfMember.get("info-1")
-            countryConfMemberInfo1Parent = countryConfMemberInfo1.get("parent")
-            countryConfMemberInfo1ChildPoliticalParty = countryConfMemberInfo1.get(
-                "child-politicalParty"
-            )
-
-            for memberid in dictionaryMemberList:
-                print(
-                    "started to parse data of member with name "
-                    + dictionaryMemberList[memberid]["name"]
-                    + " .."
-                )
-
-                filename = "crawlers/pages/" + country + "/" + str(memberid) + ".html"
-                tree = lxml.html.soupparser.parse(filename)
-
-                politicalParty = tree.xpath(
-                    countryConfMemberInfo1Parent
-                    + countryConfMemberInfo1ChildPoliticalParty
-                )
-                print("oi", politicalParty)
-
-                if len(politicalParty) > 0:
-                    dictionaryMemberList[memberid]["political party"] = politicalParty[0]
-
-            f = open("crawlers/output/" + country + "MemberList.txt", "w+")
-            f.write(str(dictionaryMemberList))
-            f.close
@@ -0,0 +1,186 @@
+import os
+import ast
+import yaml
+import json
+import urllib.request, urllib.error, urllib.parse
+
+from lxml import etree
+import lxml.html
+import lxml.html.soupparser
+
+
+class members_parliament_crawler(object):
+    def __init__(self, config_file):
+        with open(config_file, "r") as stream:
+            try:
+                self.config = yaml.safe_load(stream)
+            except yaml.YAMLError as exc:
+                print(exc)
+
+    # input: list of countries in the form ['nicaragua', 'honduras', .., 'mexico']
+    def download_member_list_pages_of_countries(self, list_of_countries):
+        # download only the html pages of the countries specified in the input
+        for country in list_of_countries:
+            for key in self.config:
+                if key in list_of_countries:
+                    try:
+                        member_list = self.config.get(key).get("memberList")
+                    except Exception as e:
+                        print(
+                            "There is a problem with the entry memberList in config.yaml - the original error message is:",
+                            e,
+                        )
+                        continue
+                    try:
+                        member_list_link = member_list.get("link")
+                    except Exception as e:
+                        print(
+                            "No memberList link defined in config.yaml - the original error message is:",
+                            e,
+                        )
+                        continue
+
+                    # download the html page of the list of members
+                    response = urllib.request.urlopen(member_list_link)
+                    web_content = response.read().decode("UTF-8")
+
+                    # save interim results to files
+                    with open("crawlers/pages/" + key + "MemberList.html", "w+") as f:
+                        f.write(web_content)
+
+    def parse_member_list_data2dictionary(self, list_of_countries):
+        for country in list_of_countries:
+            # initialize before the try block so the write below cannot fail
+            dictionary_member_list = {}
+
+            try:
+                # use soupparser to handle broken html
+                tree = lxml.html.soupparser.parse(
+                    "crawlers/pages/" + country + "MemberList.html"
+                )
+
+                # for e in tree.iter():
+                #     print(e.tag)
+                # for e in tree.xpath('//html//body//form//table//tr//td//table//tr'):
+                #     print(etree.tostring(e).decode())
+
+                country_conf = self.config.get(country)
+                country_domain = country_conf.get("domain")
+                country_conf_member_list = country_conf.get("memberList")
+                country_conf_member_list_parent = country_conf_member_list.get("parent")
+                country_conf_member_list_child_name = country_conf_member_list.get(
+                    "child-name"
+                )
+                country_conf_member_list_child_link = country_conf_member_list.get(
+                    "child-link"
+                )
+
+                # xpath indices are 1-based, so count from 1 up to and including
+                # the number of matched parent elements
+                for n in range(1, len(tree.xpath(country_conf_member_list_parent)) + 1):
+                    name = tree.xpath(
+                        country_conf_member_list_parent
+                        + "["
+                        + str(n)
+                        + "]"
+                        + country_conf_member_list_child_name
+                    )
+                    link = tree.xpath(
+                        country_conf_member_list_parent
+                        + "["
+                        + str(n)
+                        + "]"
+                        + country_conf_member_list_child_link
+                    )
+
+                    if len(name) > 0 and len(link) > 0:
+                        dictionary_member_list[n] = {}
+                        dictionary_member_list[n]["name"] = name[0]
+
+                        # prepend the domain to relative links
+                        if country_domain in link[0]:
+                            dictionary_member_list[n]["link"] = link[0]
+                        else:
+                            dictionary_member_list[n]["link"] = country_domain + link[0]
+
+            except Exception as e:
+                print(
+                    "parsing the html did not work. Possibly you first have to run download_member_list_pages_of_countries(). The original error message is:",
+                    e,
+                )
+
+            # save interim results to files
+            with open("crawlers/output/" + country + "MemberList.txt", "w+") as f:
+                f.write(str(dictionary_member_list))
+
+    def download_member_data_htmls(self, list_of_countries):
+        for country in list_of_countries:
+            with open("crawlers/output/" + country + "MemberList.txt") as f:
+                text = f.read()
+
+            # the interim file contains a python dict literal, so parse it
+            # with ast.literal_eval instead of the unsafe eval
+            dictionary_member_list = ast.literal_eval(text)
+
+            for member_id in dictionary_member_list:
+                member_link = dictionary_member_list[member_id]["link"]
+
+                # download the html page of the member
+                response = urllib.request.urlopen(member_link)
+                web_content = response.read().decode("UTF-8")
+
+                # save interim results to files
+                file_name = "crawlers/pages/" + country + "/" + str(member_id) + ".html"
+                os.makedirs(os.path.dirname(file_name), exist_ok=True)
+                with open(file_name, "w+") as f:
+                    f.write(web_content)
+
+    def parse_member_data2dictionary(self, list_of_countries):
+        for country in list_of_countries:
+            print("started to parse data of members of " + country + " ..")
+
+            with open("crawlers/output/" + country + "MemberList.txt") as f:
+                text = f.read()
+
+            dictionary_member_list = ast.literal_eval(text)
+
+            country_conf = self.config.get(country)
+            country_conf_member = country_conf.get("member")
+            country_conf_member_info1 = country_conf_member.get("info-1")
+            country_conf_member_info1_parent = country_conf_member_info1.get("parent")
+            country_conf_member_info1_child_political_party = country_conf_member_info1.get(
+                "child-politicalParty"
+            )
+
+            for member_id in dictionary_member_list:
+                print(
+                    "started to parse data of the member with name "
+                    + dictionary_member_list[member_id]["name"]
+                    + " .."
+                )
+
+                file_name = "crawlers/pages/" + country + "/" + str(member_id) + ".html"
+                tree = lxml.html.soupparser.parse(file_name)
+
+                political_party = tree.xpath(
+                    country_conf_member_info1_parent
+                    + country_conf_member_info1_child_political_party
+                )
+                print("political party:", political_party)
+
+                if len(political_party) > 0:
+                    dictionary_member_list[member_id]["political party"] = political_party[0]
+
+            # write the enriched dictionary back to the interim file
+            with open("crawlers/output/" + country + "MemberList.txt", "w+") as f:
+                f.write(str(dictionary_member_list))
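
For orientation, the class above only reads a handful of keys from config.yaml. Parsed with yaml.safe_load, the expected structure would look roughly like the sketch below; the country key, URLs, and xpath values are hypothetical placeholders, only the key names come from the lookups in the code.

# hypothetical parsed form of config.yaml; every value is a placeholder
example_config = {
    "nicaragua": {                                      # one top-level key per country
        "domain": "https://www.example-parliament.ni",  # base url prepended to relative links
        "memberList": {
            "link": "https://www.example-parliament.ni/members",  # list-of-members page
            "parent": "//table//tr",           # xpath yielding one node per member
            "child-name": "//td[1]//text()",   # appended to parent[n] to get the name
            "child-link": "//td[1]//a/@href",  # appended to parent[n] to get the link
        },
        "member": {
            "info-1": {
                "parent": "//div[@class='member']",        # detail-page block
                "child-politicalParty": "//span//text()",  # party name inside it
            }
        },
    }
}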
@@ -1,16 +0,0 @@
-from crawlers.MembersParliamentCrawler import *
-
-config = "config.yaml"
-
-listOfCountries = ["nicaragua"]
-
-Crawler = membersParliamentCrawler(config)
-
-# Crawler.downloadMemberListPagesOfCountries(listOfCountries)
-# Crawler.parseMemberListData2dictionary(listOfCountries)
-# Crawler.downloadMemberDataHtmls(listOfCountries)
-Crawler.parseMemberData2dictionary(listOfCountries)
@@ -0,0 +1,16 @@
+from crawlers.members_parliament_crawler import *
+
+config = "config.yaml"
+
+list_of_countries = ["nicaragua"]
+
+crawler = members_parliament_crawler(config)
+
+# crawler.download_member_list_pages_of_countries(list_of_countries)
+# crawler.parse_member_list_data2dictionary(list_of_countries)
+# crawler.download_member_data_htmls(list_of_countries)
+crawler.parse_member_data2dictionary(list_of_countries)
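
Since each step reads the interim files written by the one before it, a first full run would uncomment all four calls in order. A sketch of the same script with every stage enabled:

from crawlers.members_parliament_crawler import *

config = "config.yaml"
list_of_countries = ["nicaragua"]

crawler = members_parliament_crawler(config)

# each step writes files that the next step reads
crawler.download_member_list_pages_of_countries(list_of_countries)  # fetch the list pages
crawler.parse_member_list_data2dictionary(list_of_countries)        # extract names and links
crawler.download_member_data_htmls(list_of_countries)               # fetch one page per member
crawler.parse_member_data2dictionary(list_of_countries)             # add e.g. the political party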
@@ -1,31 +1,31 @@
-from crawlers.MembersParliamentCrawler import *
+from crawlers.members_parliament_crawler import *

-from wikidata.wdPEP import *
+from wikidata.wd_PEP import *

 config = "crawlers/config.yaml"

-listOfCountries = ["nicaragua"]
+list_of_countries = ["nicaragua"]

 # doing the crawling of government websites

-# Crawler = membersParliamentCrawler(config)
+# crawler = members_parliament_crawler(config)

-# Crawler.downloadMemberListPagesOfCountries(listOfCountries)
+# crawler.download_member_list_pages_of_countries(list_of_countries)

-# Crawler.parseMemberListData2dictionary(listOfCountries)
+# crawler.parse_member_list_data2dictionary(list_of_countries)

-# Crawler.downloadMemberDataHtmls(listOfCountries)
+# crawler.download_member_data_htmls(list_of_countries)

-# Crawler.parseMemberData2dictionary(listOfCountries)
+# crawler.parse_member_data2dictionary(list_of_countries)

 # processing the resulting dictionary and creating wikidata queries

-wikidataPEP = WikidataPEP(config)
+wikidata_PEP = Wikidata_PEP(config)

-# wikidataPEP.importMembersOfParliamentDict(listOfCountries)
+# wikidata_PEP.import_members_of_parliament_dict(list_of_countries)

-# wikidataPEP.checkForEntityIds(listOfCountries)
+# wikidata_PEP.check_for_entity_ids(list_of_countries)

-# wikidataPEP.createMemberOnWikidata()
+# wikidata_PEP.create_member_on_wikidata()

-wikidataPEP.editMemberOnWikidata("Q116918332")
+wikidata_PEP.edit_member_on_wikidata("Q116918332")
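
The wikidata.wd_PEP module itself is not part of this diff. Judging only from the call sites above, its interface would look roughly like the following stub; the method bodies, and whether the constructor takes the config path, are assumptions.

# hypothetical stub inferred from the call sites above; not the real implementation
class Wikidata_PEP(object):
    def __init__(self, config_file):
        self.config_file = config_file  # assumed: path to the shared config.yaml

    def import_members_of_parliament_dict(self, list_of_countries):
        ...  # assumed: read crawlers/output/<country>MemberList.txt

    def check_for_entity_ids(self, list_of_countries):
        ...  # assumed: look up existing wikidata items for each member

    def create_member_on_wikidata(self):
        ...

    def edit_member_on_wikidata(self, entity_id):
        ...  # called above with entity_id="Q116918332"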