Formatted with black.

This commit is contained in:
alpcentaur 2023-03-09 16:07:47 +00:00
parent f395b87ab6
commit 69480ecc26
4 changed files with 207 additions and 220 deletions

View file

@ -1,5 +1,3 @@
import os
import yaml
@ -12,172 +10,177 @@ import lxml.html
import lxml.html.soupparser
class membersParliamentCrawler(object):
    """Crawls parliament member-list pages per country and parses them.

    Configuration is read from a YAML file that maps each country to the
    URLs and XPath fragments needed to locate member names and links.
    Interim results are written under crawlers/pages/ and crawlers/output/.
    """

    def __init__(self, configFile):
        # Load the per-country crawling configuration from YAML.
        with open(configFile, "r") as stream:
            try:
                self.config = yaml.safe_load(stream)
            except yaml.YAMLError as exc:
                print(exc)

    # input list of countries in form of ['nicaragua', 'honduras', .. , 'mexico']
    def downloadMemberListPagesOfCountries(self, listOfCountries):
        """Download the member-list HTML page of every requested country."""
        # download only html pages of the countries specified in input
        for country in listOfCountries:
            for key in self.config:
                if key in listOfCountries:
                    try:
                        memberList = self.config.get(key).get("memberList")
                    except Exception as e:
                        print(
                            "There is a problem with the entry memberList in the config.yaml - the original error message is:",
                            e,
                        )
                    try:
                        memberListLink = memberList.get("link")
                    except Exception as e:
                        print(
                            "No memberListLink defined in config.yaml - the original error message is:",
                            e,
                        )

                    # download the html page of the List of Members
                    response = urllib.request.urlopen(memberListLink)
                    webContent = response.read().decode("UTF-8")

                    # save interim results to files; the context manager closes
                    # the handle (the previous code used "f.close" without
                    # parentheses, so the file was never explicitly closed)
                    with open("crawlers/pages/" + key + "MemberList.html", "w+") as f:
                        f.write(webContent)

    def parseMemberListData2dictionary(self, listOfCountries):
        """Parse downloaded member-list pages into {index: {name, link}} dicts
        and write them as text to crawlers/output/<country>MemberList.txt."""
        for country in listOfCountries:
            # initialized before the try block so the file write below cannot
            # fail with NameError when parsing raises
            dictionaryMemberList = {}
            try:
                # use soupparser to handle broken html
                tree = lxml.html.soupparser.parse(
                    "crawlers/pages/" + country + "MemberList.html"
                )

                countryConf = self.config.get(country)
                countryDomain = countryConf.get("domain")
                countryConfMemberList = countryConf.get("memberList")
                countryConfMemberListParent = countryConfMemberList.get("parent")
                countryConfMemberListChildName = countryConfMemberList.get("child-name")
                countryConfMemberListChildLink = countryConfMemberList.get("child-link")

                for n in range(len(tree.xpath(countryConfMemberListParent))):
                    name = tree.xpath(
                        countryConfMemberListParent
                        + "["
                        + str(n)
                        + "]"
                        + countryConfMemberListChildName
                    )
                    link = tree.xpath(
                        countryConfMemberListParent
                        + "["
                        + str(n)
                        + "]"
                        + countryConfMemberListChildLink
                    )

                    if len(name) > 0:
                        dictionaryMemberList[n] = {}
                        dictionaryMemberList[n]["name"] = name[0]

                        # prefix relative links with the country domain
                        if countryDomain in link[0]:
                            dictionaryMemberList[n]["link"] = link[0]
                        if countryDomain not in link[0]:
                            dictionaryMemberList[n]["link"] = countryDomain + link[0]

            except Exception as e:
                print(
                    "parsing the html did not work. Possibly you first have to downloadMemberListPagesOfCountries(). The original error message is:",
                    e,
                )

            # save interim results to files
            with open("crawlers/output/" + country + "MemberList.txt", "w+") as f:
                f.write(str(dictionaryMemberList))

    def downloadMemberDataHtmls(self, listOfCountries):
        """Download every member's detail page listed in the country dictionary."""
        for country in listOfCountries:
            with open("crawlers/output/" + country + "MemberList.txt") as f:
                text = f.read()
            # NOTE(review): eval() on the interim file is only acceptable
            # because the file is produced by this crawler itself; never feed
            # it untrusted data (ast.literal_eval would be the safer choice)
            dictionaryMemberList = eval(text)

            for memberid in dictionaryMemberList:
                memberLink = dictionaryMemberList[memberid]["link"]

                # download the html page of the Member
                response = urllib.request.urlopen(memberLink)
                webContent = response.read().decode("UTF-8")

                # save interim results to files
                filename = "crawlers/pages/" + country + "/" + str(memberid) + ".html"
                os.makedirs(os.path.dirname(filename), exist_ok=True)
                with open(filename, "w+") as f:
                    f.write(webContent)

    def parseMemberData2dictionary(self, listOfCountries):
        """Enrich each member entry with data (currently the political party)
        parsed from the member's downloaded detail page, then rewrite the
        country's MemberList.txt."""
        for country in listOfCountries:
            print("started to parse data of member of " + country + " ..")

            with open("crawlers/output/" + country + "MemberList.txt") as f:
                text = f.read()
            # NOTE(review): eval of a self-produced file, see downloadMemberDataHtmls
            dictionaryMemberList = eval(text)

            countryConf = self.config.get(country)
            countryDomain = countryConf.get("domain")
            countryConfMember = countryConf.get("member")
            countryConfMemberInfo1 = countryConfMember.get("info-1")
            countryConfMemberInfo1Parent = countryConfMemberInfo1.get("parent")
            countryConfMemberInfo1ChildPoliticalParty = countryConfMemberInfo1.get(
                "child-politicalParty"
            )

            for memberid in dictionaryMemberList:
                print(
                    "started to parse data of member with name "
                    + dictionaryMemberList[memberid]["name"]
                    + " .."
                )

                filename = "crawlers/pages/" + country + "/" + str(memberid) + ".html"
                tree = lxml.html.soupparser.parse(filename)

                politicalParty = tree.xpath(
                    countryConfMemberInfo1Parent
                    + countryConfMemberInfo1ChildPoliticalParty
                )
                print("oi", politicalParty)

                if len(politicalParty) > 0:
                    dictionaryMemberList[memberid]["political party"] = politicalParty[
                        0
                    ]

            with open("crawlers/output/" + country + "MemberList.txt", "w+") as f:
                f.write(str(dictionaryMemberList))

View file

@ -1,18 +1,16 @@
# Runner script: executes the crawling pipeline for the configured countries.
# The earlier stages are commented out; uncomment them to run the pipeline
# from scratch (download -> parse list -> download members -> parse members).
from crawlers.MembersParliamentCrawler import *

config = "config.yaml"
listOfCountries = ["nicaragua"]

Crawler = membersParliamentCrawler(config)

# Crawler.downloadMemberListPagesOfCountries(listOfCountries)
# Crawler.parseMemberListData2dictionary(listOfCountries)
# Crawler.downloadMemberDataHtmls(listOfCountries)
Crawler.parseMemberData2dictionary(listOfCountries)

24
main.py
View file

@ -1,33 +1,31 @@
# main.py: end-to-end entry point. The crawling stages and the earlier
# Wikidata stages are commented out; only the Wikidata edit step runs.
from crawlers.MembersParliamentCrawler import *
from wikidata.wdPEP import *

config = "crawlers/config.yaml"
listOfCountries = ["nicaragua"]

# doing the crawling of government websites
# Crawler = membersParliamentCrawler(config)
# Crawler.downloadMemberListPagesOfCountries(listOfCountries)
# Crawler.parseMemberListData2dictionary(listOfCountries)
# Crawler.downloadMemberDataHtmls(listOfCountries)
# Crawler.parseMemberData2dictionary(listOfCountries)

# processing the resulted dictionary and create wikidata queries
wikidataPEP = WikidataPEP(config)
# wikidataPEP.importMembersOfParliamentDict(listOfCountries)
# wikidataPEP.checkForEntityIds(listOfCountries)
# wikidataPEP.createMemberOnWikidata()
wikidataPEP.editMemberOnWikidata("Q116918332")

View file

@ -1,133 +1,117 @@
import os
import yaml
import json
class WikidataPEP(object):
    """Imports crawled parliament-member data and mirrors it to Wikidata."""

    def __init__(self, configFile):
        # Load the YAML configuration shared with the crawler.
        with open(configFile, "r") as stream:
            try:
                self.config = yaml.safe_load(stream)
            except yaml.YAMLError as exc:
                print(exc)
def importMembersOfParliamentDict(self, listOfCountries):
    """Read each country's crawled member-list file into
    self.fullDictionaryMemberLists[country].

    Expects crawlers/output/<country>MemberList.txt to contain the repr of a
    dict as written by membersParliamentCrawler.
    """
    self.fullDictionaryMemberLists = {}

    for country in listOfCountries:
        print("started to parse data of members of " + country + " ..")

        # "with" guarantees the handle is closed (the old code never closed it)
        with open("crawlers/output/" + country + "MemberList.txt") as f:
            text = f.read()

        # NOTE(review): eval() is only acceptable because the file is written
        # by the crawler itself; never point this at untrusted input
        self.fullDictionaryMemberLists[country] = eval(text)

    # print(self.fullDictionaryMemberLists)
def checkForEntityIds(self, listOfCountries):
    """For every crawled member name, search Wikidata and print each matching
    entity id whose P31 ("instance of") claim is Q5 ("human").

    Requires importMembersOfParliamentDict() to have been called first.
    """
    from wikibaseintegrator import WikibaseIntegrator
    from wikibaseintegrator import wbi_helpers

    fullDictionaryMemberLists = self.fullDictionaryMemberLists

    for country in listOfCountries:
        for memberId in fullDictionaryMemberLists[country].keys():
            name = fullDictionaryMemberLists[country][memberId]["name"]

            results = wbi_helpers.search_entities(search_string=name)

            for entityId in results:
                wbi = WikibaseIntegrator()
                wikidata_item = wbi.item.get(entity_id=entityId)

                for claimkey in wikidata_item.get_json()["claims"].keys():
                    # P31 = "instance of"; Q5 = "human"
                    if claimkey == "P31":
                        if (
                            wikidata_item.get_json()["claims"][claimkey][0][
                                "mainsnak"
                            ]["datavalue"]["value"]["id"]
                            == "Q5"
                        ):
                            print(entityId)
                            print("---------")
                            print(name)
                            print("is a human")
def createMemberOnWikidata(self):
    """Create a new Wikidata item for a parliament member (currently
    hard-coded to Carlos Humberto Ruíz) and print the write result.

    Performs a live write to Wikidata; the OAuth2 credentials must be filled
    in before this can succeed.
    """
    from wikibaseintegrator import wbi_login, WikibaseIntegrator
    from wikibaseintegrator.datatypes import ExternalID, Item
    from wikibaseintegrator.wbi_config import config as wbi_config

    wbi_config["USER_AGENT"] = "PEPimport/1.0 (https://www.wikidata.org/wiki/User:)"

    # login object
    login_instance = wbi_login.OAuth2(consumer_token="", consumer_secret="")

    wbi = WikibaseIntegrator(login=login_instance)

    # data type object, e.g. for a NCBI gene entrez ID
    # NOTE(review): Q82955/Q1055894 are Wikidata items, so ExternalID looks
    # like the wrong datatype for P106 (editMemberOnWikidata uses Item) —
    # confirm before enabling data2/data3 below
    isHuman = Item(value="Q5", prop_nr="P31")
    occupationPolitician = ExternalID(value="Q82955", prop_nr="P106")
    occupationDeputy = ExternalID(value="Q1055894", prop_nr="P106")
    # referenceURL = URL(value='http://legislacion.asamblea.gob.ni/Tablas%20Generales.nsf/InfoDiputado.xsp?documentId=3D4CFDC4B3006D70062587C5007C29E1&action=openDocument&SessionID=1868803A06AB73D50B7F89BD0AB', prop_nr='P106')

    # print(isHuman)
    # print(referenceURL)

    # data goes into a list, because many data objects can be provided to
    data1 = [isHuman]
    data2 = [occupationDeputy]
    data3 = [occupationPolitician]

    # Create a new item
    item = wbi.item.new()

    # Set an english label
    item.labels.set(language="en", value="Carlos Humberto Ruíz")
    # Carlos Humberto Ruiz has the Qid Q116918332

    # Set an English description
    item.descriptions.set(
        language="en", value="Nicaraguan National Assembly Deputy"
    )

    item.claims.add(data1)
    # item.claims.add(data2)
    # item.claims.add(data3)

    print(item.write())
# NOTE(review): the lines below are a rendered diff fragment of
# WikidataPEP.editMemberOnWikidata, not runnable source. Old and new
# (black-formatted) versions of several statements appear back-to-back, and
# the "@ -NNN,NN +NNN,NN @" markers show where unchanged context lines were
# omitted from this view. Recover the complete method from the repository
# before editing; only comments were added here, no code was changed.
def editMemberOnWikidata(self, Qid):
from wikibaseintegrator import wbi_login, WikibaseIntegrator
from wikibaseintegrator.datatypes import ExternalID, Item, Time, String
from wikibaseintegrator.wbi_config import config as wbi_config
from wikibaseintegrator.wbi_enums import ActionIfExists
from wikibaseintegrator.wbi_enums import WikibaseDatePrecision
wbi_config['USER_AGENT'] = 'PEPimport/1.0 (https://www.wikidata.org/wiki/User:Alpcentaur)'
wbi_config[
"USER_AGENT"
] = "PEPimport/1.0 (https://www.wikidata.org/wiki/User:Alpcentaur)"
# login object
login_instance = wbi_login.OAuth2(consumer_token='', consumer_secret='')
login_instance = wbi_login.OAuth2(consumer_token="", consumer_secret="")
wbi = WikibaseIntegrator(login=login_instance)
# data type object, e.g. for a NCBI gene entrez ID
# NOTE(review): diff hunk marker below — the lines between this hunk and the
# previous one are omitted in this view
@ -135,19 +119,26 @@ class WikidataPEP(object):
# occupationPolitician = Item(value='Q82955', prop_nr='P106')
# occupationDeputy = Item(value='Q1055894', prop_nr='P106')
# referenceURL = ExternalID(value='http://legislacion.asamblea.gob.ni/Tablas%20Generales.nsf/InfoDiputado.xsp?documentId=3D4CFDC4B3006D70062587C5007C29E1&action=openDocument&SessionID=1868803A06AB73D50B7F89BD0AB', prop_nr='P854')
# print(isHuman)
# print(referenceURL)
# NOTE(review): references = a P854 (reference URL) + P813 (retrieved date)
# pair attached to the occupation claim below; the next lines show the old
# one-line form followed by the new black-formatted form
references = [
[
ExternalID(value='http://legislacion.asamblea.gob.ni/Tablas%20Generales.nsf/InfoDiputado.xsp?documentId=3D4CFDC4B3006D70062587C5007C29E1&action=openDocument&SessionID=1868803A06AB73D50B7F89BD0AB', prop_nr='P854'),
Time(time='+2023-02-27T00:00:00Z', prop_nr='P813', precision=WikibaseDatePrecision.DAY)
ExternalID(
value="http://legislacion.asamblea.gob.ni/Tablas%20Generales.nsf/InfoDiputado.xsp?documentId=3D4CFDC4B3006D70062587C5007C29E1&action=openDocument&SessionID=1868803A06AB73D50B7F89BD0AB",
prop_nr="P854",
),
Time(
time="+2023-02-27T00:00:00Z",
prop_nr="P813",
precision=WikibaseDatePrecision.DAY,
),
]
]
occupationDeputy = Item(value='Q1055894', prop_nr='P106', references=references)
occupationDeputy = Item(value="Q1055894", prop_nr="P106", references=references)
## data goes into a list, because many data objects can be provided to
# data1 = [isHuman]
data2 = [occupationDeputy]
# NOTE(review): second hunk marker — more unchanged lines omitted here
@ -155,19 +146,16 @@ class WikidataPEP(object):
# data4 = [referenceURL]
## get item for Qid
item = wbi.item.get(entity_id=Qid)
# print(item.claims)
# Set an english label
#item.labels.set(language='en', value='Carlos Humberto Ruíz', action_if_exists=ActionIfExists.KEEP)
# Set a French description
#item.descriptions.set(language='en', value='Nicaraguan National Assembly Deputy', action_if_exists=ActionIfExists.KEEP)
#item.claims.add(data4)
item.claims.add(data2)
#item.claims.add(data3)
print(item.write())
# NOTE(review): the block below is the post-black duplicate of the lines
# just above — only one copy belongs in the real file
# print(item.claims)
# Set an english label
# item.labels.set(language='en', value='Carlos Humberto Ruíz', action_if_exists=ActionIfExists.KEEP)
# Set a French description
# item.descriptions.set(language='en', value='Nicaraguan National Assembly Deputy', action_if_exists=ActionIfExists.KEEP)
# item.claims.add(data4)
item.claims.add(data2)
# item.claims.add(data3)
print(item.write())