formatted with black

2023-03-09 16:07:47 +00:00 · 2023-03-09 16:07:47 +00:00 · 69480ecc26
commit 69480ecc26
parent f395b87ab6
4 changed files with 207 additions and 220 deletions
--- a/crawlers/MembersParliamentCrawler.py
+++ b/crawlers/MembersParliamentCrawler.py
@ -1,5 +1,3 @@
 import os
 import yaml
@ -12,172 +10,177 @@ import lxml.html
 import lxml.html.soupparser
 class membersParliamentCrawler(object):
    def __init__(self, configFile):
        with open(configFile, "r") as stream:
            try:
                self.config = yaml.safe_load(stream)
            except yaml.YAMLError as exc:
                print(exc)
    # input list of countries in form of ['nicaragua', 'honduras', .. , 'mexico']
    def downloadMemberListPagesOfCountries(self, listOfCountries):
        # download only html pages of the countries specified in input
        for country in listOfCountries:
            for key in self.config:
                if key in listOfCountries:
                    try:
-                        memberList = self.config.get(key).get('memberList')
+                        memberList = self.config.get(key).get("memberList")
                    except Exception as e:
-                        print("There is a problem with the entry memberList in the config.yaml - the original error message is:", e)
+                        print(
                            "There is a problem with the entry memberList in the config.yaml - the original error message is:",
                            e,
                        )
                    try:
-                        memberListLink = memberList.get('link')
+                        memberListLink = memberList.get("link")
                    except Exception as e:
-                        print("No memberListLink defined in config.yaml - the original error message is:", e)
+                        print(
                            "No memberListLink defined in config.yaml - the original error message is:",
                            e,
                        )
                    # download the html page of the List of Members
                    response = urllib.request.urlopen(memberListLink)
-                    webContent = response.read().decode('UTF-8')
+                    webContent = response.read().decode("UTF-8")
                    # save interim results to files
-                    f = open('crawlers/pages/' + key +'MemberList.html', 'w+')
+                    f = open("crawlers/pages/" + key + "MemberList.html", "w+")
                    f.write(webContent)
                    f.close
    def parseMemberListData2dictionary(self, listOfCountries):
        for country in listOfCountries:
            try:
                # use soupparser to handle broken html
-                #use soupparser to handle broken html
+                tree = lxml.html.soupparser.parse(
                    "crawlers/pages/" + country + "MemberList.html"
                )
-                tree = lxml.html.soupparser.parse('crawlers/pages/' + country + 'MemberList.html')
+                #                for e in tree.iter():
-
+                #
-#                for e in tree.iter():
+                #                    print(e.tag)
-#
+                #
-#                    print(e.tag)
+                #                 for e in tree.xpath('//html//body//form//table//tr//td//table//tr'):
-#                
+                #
-#                 for e in tree.xpath('//html//body//form//table//tr//td//table//tr'):
+                #                     #print(etree.tostring(e).decode())
 #                     
 #                     #print(etree.tostring(e).decode())
                dictionaryMemberList = {}
                countryConf = self.config.get(country)
-                countryDomain = countryConf.get('domain')
+                countryDomain = countryConf.get("domain")
-                countryConfMemberList = countryConf.get('memberList')
+                countryConfMemberList = countryConf.get("memberList")
-                countryConfMemberListParent = countryConfMemberList.get('parent')
+                countryConfMemberListParent = countryConfMemberList.get("parent")
-                countryConfMemberListChildName = countryConfMemberList.get('child-name')
+                countryConfMemberListChildName = countryConfMemberList.get("child-name")
-                countryConfMemberListChildLink = countryConfMemberList.get('child-link')
+                countryConfMemberListChildLink = countryConfMemberList.get("child-link")
                for n in range(len(tree.xpath(countryConfMemberListParent))):
-                    
+                    name = tree.xpath(
-                    name = tree.xpath(countryConfMemberListParent + '[' + str(n) + ']' + countryConfMemberListChildName)
+                        countryConfMemberListParent
-                    link = tree.xpath(countryConfMemberListParent + '[' + str(n) + ']' + countryConfMemberListChildLink)
+                        + "["
                        + str(n)
                        + "]"
                        + countryConfMemberListChildName
                    )
                    link = tree.xpath(
                        countryConfMemberListParent
                        + "["
                        + str(n)
                        + "]"
                        + countryConfMemberListChildLink
                    )
                    if len(name) > 0:
                        dictionaryMemberList[n] = {}
-                        dictionaryMemberList[n]['name'] = name[0]
+                        dictionaryMemberList[n]["name"] = name[0]
                        if countryDomain in link[0]:
-                        
+                            dictionaryMemberList[n]["link"] = link[0]
                            dictionaryMemberList[n]['link'] = link[0]
                        if countryDomain not in link[0]:
-                            
+                            dictionaryMemberList[n]["link"] = countryDomain + link[0]
                            dictionaryMemberList[n]['link'] = countryDomain + link[0]
            except Exception as e:
-                
+                print(
-                print('parsing the html did not work. Possibly you first have to downloadMemberListPagesOfCountries(). The original error message is:', e)
+                    "parsing the html did not work. Possibly you first have to downloadMemberListPagesOfCountries(). The original error message is:",
                    e,
                )
            # save interim results to files
-            f = open('crawlers/output/' + country +'MemberList.txt', 'w+')
+            f = open("crawlers/output/" + country + "MemberList.txt", "w+")
            f.write(str(dictionaryMemberList))
            f.close
    def downloadMemberDataHtmls(self, listOfCountries):
        for country in listOfCountries:
-            
+            f = open("crawlers/output/" + country + "MemberList.txt")
            f = open('crawlers/output/' + country +'MemberList.txt')
            text = f.read()
            dictionaryMemberList = eval(text)
            for memberid in dictionaryMemberList:
-                
+                memberLink = dictionaryMemberList[memberid]["link"]
                memberLink = dictionaryMemberList[memberid]['link']
                # download the html page of the Member
                response = urllib.request.urlopen(memberLink)
-                webContent = response.read().decode('UTF-8')
+                webContent = response.read().decode("UTF-8")
                # save interim results to files
-                filename = 'crawlers/pages/' + country + '/' + str(memberid) +'.html'
+                filename = "crawlers/pages/" + country + "/" + str(memberid) + ".html"
                os.makedirs(os.path.dirname(filename), exist_ok=True)
-                f = open( filename, 'w+')
+                f = open(filename, "w+")
                f.write(webContent)
                f.close
    def parseMemberData2dictionary(self, listOfCountries):
        for country in listOfCountries:
            print("started to parse data of member of " + country + " ..")
-            print('started to parse data of member of ' + country + ' ..')
+            f = open("crawlers/output/" + country + "MemberList.txt")
            f = open('crawlers/output/' + country +'MemberList.txt')
            text = f.read()
            dictionaryMemberList = eval(text)
            countryConf = self.config.get(country)
-            countryDomain = countryConf.get('domain')
+            countryDomain = countryConf.get("domain")
-            countryConfMember = countryConf.get('member')
+            countryConfMember = countryConf.get("member")
-            countryConfMemberInfo1 = countryConfMember.get('info-1')
+            countryConfMemberInfo1 = countryConfMember.get("info-1")
-            countryConfMemberInfo1Parent = countryConfMemberInfo1.get('parent')
+            countryConfMemberInfo1Parent = countryConfMemberInfo1.get("parent")
-            countryConfMemberInfo1ChildPoliticalParty = countryConfMemberInfo1.get('child-politicalParty')
+            countryConfMemberInfo1ChildPoliticalParty = countryConfMemberInfo1.get(
                "child-politicalParty"
            )
            for memberid in dictionaryMemberList:
                print(
                    "started to parse data of member with name "
                    + dictionaryMemberList[memberid]["name"]
                    + " .."
                )
-                print('started to parse data of member with name ' + dictionaryMemberList[memberid]['name'] + ' ..')
+                filename = "crawlers/pages/" + country + "/" + str(memberid) + ".html"
                filename = 'crawlers/pages/' + country + '/' + str(memberid) +'.html'
                tree = lxml.html.soupparser.parse(filename)
-                politicalParty = tree.xpath(countryConfMemberInfo1Parent + countryConfMemberInfo1ChildPoliticalParty)
+                politicalParty = tree.xpath(
                    countryConfMemberInfo1Parent
                    + countryConfMemberInfo1ChildPoliticalParty
                )
-                print('oi', politicalParty)
+                print("oi", politicalParty)
                if len(politicalParty) > 0:
                    dictionaryMemberList[memberid]["political party"] = politicalParty[
                        0
                    ]
-                    dictionaryMemberList[memberid]['political party'] = politicalParty[0]
+            f = open("crawlers/output/" + country + "MemberList.txt", "w+")
            f = open('crawlers/output/' + country +'MemberList.txt', 'w+')
            f.write(str(dictionaryMemberList))
            f.close
--- a/crawlers/useMembersParliamentCrawler.py
+++ b/crawlers/useMembersParliamentCrawler.py
@ -1,18 +1,16 @@
 from crawlers.MembersParliamentCrawler import *
-
+config = "config.yaml"
-config = 'config.yaml'
+listOfCountries = ["nicaragua"]
 listOfCountries = ['nicaragua']
 Crawler = membersParliamentCrawler(config)
-#Crawler.downloadMemberListPagesOfCountries(listOfCountries)
+# Crawler.downloadMemberListPagesOfCountries(listOfCountries)
-#Crawler.parseMemberListData2dictionary(listOfCountries)
+# Crawler.parseMemberListData2dictionary(listOfCountries)
-#Crawler.downloadMemberDataHtmls(listOfCountries)
+# Crawler.downloadMemberDataHtmls(listOfCountries)
 Crawler.parseMemberData2dictionary(listOfCountries)
--- a/main.py
+++ b/main.py
@ -1,33 +1,31 @@
 from crawlers.MembersParliamentCrawler import *
 from wikidata.wdPEP import *
-config = 'crawlers/config.yaml'
+config = "crawlers/config.yaml"
-listOfCountries = ['nicaragua']
+listOfCountries = ["nicaragua"]
 # doing the crawling of government websites
-#Crawler = membersParliamentCrawler(config)
+# Crawler = membersParliamentCrawler(config)
-#Crawler.downloadMemberListPagesOfCountries(listOfCountries)
+# Crawler.downloadMemberListPagesOfCountries(listOfCountries)
-#Crawler.parseMemberListData2dictionary(listOfCountries)
+# Crawler.parseMemberListData2dictionary(listOfCountries)
-#Crawler.downloadMemberDataHtmls(listOfCountries)
+# Crawler.downloadMemberDataHtmls(listOfCountries)
-#Crawler.parseMemberData2dictionary(listOfCountries)
+# Crawler.parseMemberData2dictionary(listOfCountries)
 # processing the resulted dictionary and create wikidata queries
 wikidataPEP = WikidataPEP(config)
-#wikidataPEP.importMembersOfParliamentDict(listOfCountries)
+# wikidataPEP.importMembersOfParliamentDict(listOfCountries)
-#wikidataPEP.checkForEntityIds(listOfCountries)
+# wikidataPEP.checkForEntityIds(listOfCountries)
-#wikidataPEP.createMemberOnWikidata()
+# wikidataPEP.createMemberOnWikidata()
-wikidataPEP.editMemberOnWikidata('Q116918332')
+wikidataPEP.editMemberOnWikidata("Q116918332")
--- a/wikidata/wdPEP.py
+++ b/wikidata/wdPEP.py
@ -1,92 +1,76 @@
 import os
 import yaml
 import json
 class WikidataPEP(object):
    def __init__(self, configFile):
        with open(configFile, "r") as stream:
            try:
                self.config = yaml.safe_load(stream)
            except yaml.YAMLError as exc:
                print(exc)
    def importMembersOfParliamentDict(self, listOfCountries):
        self.fullDictionaryMemberLists = {}
        for country in listOfCountries:
            print("started to parse data of members of " + country + " ..")
-            print('started to parse data of members of ' + country + ' ..')
+            f = open("crawlers/output/" + country + "MemberList.txt")
            f = open('crawlers/output/' + country +'MemberList.txt')
            text = f.read()
            self.fullDictionaryMemberLists[country] = eval(text)
-        #print(self.fullDictionaryMemberLists)
+        # print(self.fullDictionaryMemberLists)
    def checkForEntityIds(self, listOfCountries):
        from wikibaseintegrator import WikibaseIntegrator
        from wikibaseintegrator import wbi_helpers
        fullDictionaryMemberLists = self.fullDictionaryMemberLists
        for country in listOfCountries:
            for memberId in fullDictionaryMemberLists[country].keys():
-                
+                name = fullDictionaryMemberLists[country][memberId]["name"]
                name = fullDictionaryMemberLists[country][memberId]['name']
                results = wbi_helpers.search_entities(search_string=name)
                for entityId in results:
                    wbi = WikibaseIntegrator()
                    wikidata_item = wbi.item.get(entity_id=entityId)
-                    for claimkey in wikidata_item.get_json()['claims'].keys():
+                    for claimkey in wikidata_item.get_json()["claims"].keys():
-                        
+                        if claimkey == "P31":
-                        if claimkey == 'P31':
+                            if (
-                            
+                                wikidata_item.get_json()["claims"][claimkey][0][
-                            if wikidata_item.get_json()['claims'][claimkey][0]['mainsnak']['datavalue']['value']['id'] == 'Q5':
+                                    "mainsnak"
-                                
+                                ]["datavalue"]["value"]["id"]
                                == "Q5"
                            ):
                                print(entityId)
-                                print('---------')
+                                print("---------")
                                print(name)
-                                print('is a human')
+                                print("is a human")
    def createMemberOnWikidata(self):
        from wikibaseintegrator import wbi_login, WikibaseIntegrator
        from wikibaseintegrator.datatypes import ExternalID, Item
        from wikibaseintegrator.wbi_config import config as wbi_config
-        wbi_config['USER_AGENT'] = 'PEPimport/1.0 (https://www.wikidata.org/wiki/User:)'
+        wbi_config["USER_AGENT"] = "PEPimport/1.0 (https://www.wikidata.org/wiki/User:)"
        # login object
-        login_instance = wbi_login.OAuth2(consumer_token='', consumer_secret='')
+        login_instance = wbi_login.OAuth2(consumer_token="", consumer_secret="")
        wbi = WikibaseIntegrator(login=login_instance)
        # data type object, e.g. for a NCBI gene entrez ID
-        isHuman = Item(value='Q5', prop_nr='P31')
+        isHuman = Item(value="Q5", prop_nr="P31")
-        occupationPolitician = ExternalID(value='Q82955', prop_nr='P106')
+        occupationPolitician = ExternalID(value="Q82955", prop_nr="P106")
-        occupationDeputy = ExternalID(value='Q1055894', prop_nr='P106')
+        occupationDeputy = ExternalID(value="Q1055894", prop_nr="P106")
-        #referenceURL = URL(value='http://legislacion.asamblea.gob.ni/Tablas%20Generales.nsf/InfoDiputado.xsp?documentId=3D4CFDC4B3006D70062587C5007C29E1&action=openDocument&SessionID=1868803A06AB73D50B7F89BD0AB', prop_nr='P106')
+        # referenceURL = URL(value='http://legislacion.asamblea.gob.ni/Tablas%20Generales.nsf/InfoDiputado.xsp?documentId=3D4CFDC4B3006D70062587C5007C29E1&action=openDocument&SessionID=1868803A06AB73D50B7F89BD0AB', prop_nr='P106')
        # print(isHuman)
        # print(referenceURL)
@ -100,33 +84,33 @@ class WikidataPEP(object):
        item = wbi.item.new()
        # Set an english label
-        item.labels.set(language='en', value='Carlos Humberto Ruíz')
+        item.labels.set(language="en", value="Carlos Humberto Ruíz")
        # Carlos Humberto Ruiz has the Qid Q116918332
        # Set a French description
-        item.descriptions.set(language='en', value='Nicaraguan National Assembly Deputy')
+        item.descriptions.set(
            language="en", value="Nicaraguan National Assembly Deputy"
        )
        item.claims.add(data1)
-        #item.claims.add(data2)
+        # item.claims.add(data2)
-        #item.claims.add(data3)
+        # item.claims.add(data3)
        print(item.write())
    def editMemberOnWikidata(self, Qid):
        from wikibaseintegrator import wbi_login, WikibaseIntegrator
        from wikibaseintegrator.datatypes import ExternalID, Item, Time, String
        from wikibaseintegrator.wbi_config import config as wbi_config
        from wikibaseintegrator.wbi_enums import ActionIfExists
        from wikibaseintegrator.wbi_enums import WikibaseDatePrecision
-        
+        wbi_config[
-        wbi_config['USER_AGENT'] = 'PEPimport/1.0 (https://www.wikidata.org/wiki/User:Alpcentaur)'
+            "USER_AGENT"
        ] = "PEPimport/1.0 (https://www.wikidata.org/wiki/User:Alpcentaur)"
        # login object
-        login_instance = wbi_login.OAuth2(consumer_token='', consumer_secret='')
+        login_instance = wbi_login.OAuth2(consumer_token="", consumer_secret="")
        wbi = WikibaseIntegrator(login=login_instance)
@ -141,12 +125,19 @@ class WikidataPEP(object):
        references = [
            [
-                ExternalID(value='http://legislacion.asamblea.gob.ni/Tablas%20Generales.nsf/InfoDiputado.xsp?documentId=3D4CFDC4B3006D70062587C5007C29E1&action=openDocument&SessionID=1868803A06AB73D50B7F89BD0AB', prop_nr='P854'),
+                ExternalID(
-                Time(time='+2023-02-27T00:00:00Z', prop_nr='P813', precision=WikibaseDatePrecision.DAY)
+                    value="http://legislacion.asamblea.gob.ni/Tablas%20Generales.nsf/InfoDiputado.xsp?documentId=3D4CFDC4B3006D70062587C5007C29E1&action=openDocument&SessionID=1868803A06AB73D50B7F89BD0AB",
                    prop_nr="P854",
                ),
                Time(
                    time="+2023-02-27T00:00:00Z",
                    prop_nr="P813",
                    precision=WikibaseDatePrecision.DAY,
                ),
            ]
        ]
-        occupationDeputy = Item(value='Q1055894', prop_nr='P106', references=references)
+        occupationDeputy = Item(value="Q1055894", prop_nr="P106", references=references)
        ## data goes into a list, because many data objects can be provided to
        # data1 = [isHuman]
@ -159,15 +150,12 @@ class WikidataPEP(object):
        # print(item.claims)
        # Set an english label
-        #item.labels.set(language='en', value='Carlos Humberto Ruíz', action_if_exists=ActionIfExists.KEEP)
+        # item.labels.set(language='en', value='Carlos Humberto Ruíz', action_if_exists=ActionIfExists.KEEP)
        # Set a French description
-        #item.descriptions.set(language='en', value='Nicaraguan National Assembly Deputy', action_if_exists=ActionIfExists.KEEP)
+        # item.descriptions.set(language='en', value='Nicaraguan National Assembly Deputy', action_if_exists=ActionIfExists.KEEP)
-        #item.claims.add(data4)
+        # item.claims.add(data4)
        item.claims.add(data2)
-        #item.claims.add(data3)
+        # item.claims.add(data3)
        print(item.write())