formatted with black
parent f395b87ab6
commit 69480ecc26
4 changed files with 207 additions and 220 deletions
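Every hunk below is mechanical reformatting by black: single quotes become double quotes, statements longer than the default 88-character limit are wrapped (with a trailing comma on the last element), and surplus blank lines are collapsed, which is all that the pure-deletion hunks contain. A minimal sketch of the same transformation through black's Python API, assuming black is installed (`pip install black`):

    import black

    src = "memberList = self.config.get(key).get('memberList')\n"
    # black normalizes quotes and wraps anything past 88 characters
    print(black.format_str(src, mode=black.Mode()), end="")
    # -> memberList = self.config.get(key).get("memberList")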
@@ -1,5 +1,3 @@
 import os
 import yaml
@@ -12,58 +10,56 @@ import lxml.html
 import lxml.html.soupparser


 class membersParliamentCrawler(object):

     def __init__(self, configFile):

         with open(configFile, "r") as stream:
             try:
                 self.config = yaml.safe_load(stream)
             except yaml.YAMLError as exc:
                 print(exc)

     # input list of countries in form of ['nicaragua', 'honduras', .. , 'mexico']

     def downloadMemberListPagesOfCountries(self, listOfCountries):

         # download only html pages of the countries specified in input

         for country in listOfCountries:
             for key in self.config:
                 if key in listOfCountries:
                     try:
-                        memberList = self.config.get(key).get('memberList')
+                        memberList = self.config.get(key).get("memberList")
                     except Exception as e:
-                        print("There is a problem with the entry memberList in the config.yaml - the original error message is:", e)
+                        print(
+                            "There is a problem with the entry memberList in the config.yaml - the original error message is:",
+                            e,
+                        )
                     try:
-                        memberListLink = memberList.get('link')
+                        memberListLink = memberList.get("link")
                     except Exception as e:
-                        print("No memberListLink defined in config.yaml - the original error message is:", e)
+                        print(
+                            "No memberListLink defined in config.yaml - the original error message is:",
+                            e,
+                        )

                     # download the html page of the List of Members

                     response = urllib.request.urlopen(memberListLink)
-                    webContent = response.read().decode('UTF-8')
+                    webContent = response.read().decode("UTF-8")

                     # save interim results to files

-                    f = open('crawlers/pages/' + key +'MemberList.html', 'w+')
+                    f = open("crawlers/pages/" + key + "MemberList.html", "w+")
                     f.write(webContent)
                     f.close

     def parseMemberListData2dictionary(self, listOfCountries):

         for country in listOfCountries:

             try:

                 # use soupparser to handle broken html

-                tree = lxml.html.soupparser.parse('crawlers/pages/' + country + 'MemberList.html')
+                tree = lxml.html.soupparser.parse(
+                    "crawlers/pages/" + country + "MemberList.html"
+                )

                 # for e in tree.iter():
                 #
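The crawler reads everything it needs from config.yaml through chained .get() calls, here and in the following hunk. The file itself is not part of this commit; a hypothetical structure consistent with the keys being read, where every value is an invented placeholder:

    # what yaml.safe_load(stream) would return for one country entry;
    # all values below are made-up illustrations, not the real config
    config_example = {
        "nicaragua": {
            "domain": "https://www.asamblea.gob.ni",  # hypothetical
            "memberList": {
                "link": "https://www.asamblea.gob.ni/members",  # hypothetical
                "parent": "//div[@class='member']",  # hypothetical XPath
                "child-name": "/a/text()",  # hypothetical XPath
                "child-link": "/a/@href",  # hypothetical XPath
            },
            "member": {
                "info-1": {
                    "parent": "//div[@class='info']",  # hypothetical XPath
                    "child-politicalParty": "/span/text()",  # hypothetical XPath
                },
            },
        },
    }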
@@ -76,108 +72,115 @@ class membersParliamentCrawler(object):
                 dictionaryMemberList = {}

                 countryConf = self.config.get(country)
-                countryDomain = countryConf.get('domain')
-                countryConfMemberList = countryConf.get('memberList')
-                countryConfMemberListParent = countryConfMemberList.get('parent')
-                countryConfMemberListChildName = countryConfMemberList.get('child-name')
-                countryConfMemberListChildLink = countryConfMemberList.get('child-link')
+                countryDomain = countryConf.get("domain")
+                countryConfMemberList = countryConf.get("memberList")
+                countryConfMemberListParent = countryConfMemberList.get("parent")
+                countryConfMemberListChildName = countryConfMemberList.get("child-name")
+                countryConfMemberListChildLink = countryConfMemberList.get("child-link")

                 for n in range(len(tree.xpath(countryConfMemberListParent))):

-                    name = tree.xpath(countryConfMemberListParent + '[' + str(n) + ']' + countryConfMemberListChildName)
-                    link = tree.xpath(countryConfMemberListParent + '[' + str(n) + ']' + countryConfMemberListChildLink)
+                    name = tree.xpath(
+                        countryConfMemberListParent
+                        + "["
+                        + str(n)
+                        + "]"
+                        + countryConfMemberListChildName
+                    )
+                    link = tree.xpath(
+                        countryConfMemberListParent
+                        + "["
+                        + str(n)
+                        + "]"
+                        + countryConfMemberListChildLink
+                    )

                     if len(name) > 0:

                         dictionaryMemberList[n] = {}
-                        dictionaryMemberList[n]['name'] = name[0]
+                        dictionaryMemberList[n]["name"] = name[0]

                         if countryDomain in link[0]:

-                            dictionaryMemberList[n]['link'] = link[0]
+                            dictionaryMemberList[n]["link"] = link[0]

                         if countryDomain not in link[0]:

-                            dictionaryMemberList[n]['link'] = countryDomain + link[0]
+                            dictionaryMemberList[n]["link"] = countryDomain + link[0]

             except Exception as e:

-                print('parsing the html did not work. Possibly you first have to downloadMemberListPagesOfCountries(). The original error message is:', e)
+                print(
+                    "parsing the html did not work. Possibly you first have to downloadMemberListPagesOfCountries(). The original error message is:",
+                    e,
+                )

             # save interim results to files

-            f = open('crawlers/output/' + country +'MemberList.txt', 'w+')
+            f = open("crawlers/output/" + country + "MemberList.txt", "w+")
             f.write(str(dictionaryMemberList))
             f.close

     def downloadMemberDataHtmls(self, listOfCountries):

         for country in listOfCountries:

-            f = open('crawlers/output/' + country +'MemberList.txt')
+            f = open("crawlers/output/" + country + "MemberList.txt")
             text = f.read()

             dictionaryMemberList = eval(text)

             for memberid in dictionaryMemberList:

-                memberLink = dictionaryMemberList[memberid]['link']
+                memberLink = dictionaryMemberList[memberid]["link"]

                 # download the html page of the Member

                 response = urllib.request.urlopen(memberLink)
-                webContent = response.read().decode('UTF-8')
+                webContent = response.read().decode("UTF-8")

                 # save interim results to files

-                filename = 'crawlers/pages/' + country + '/' + str(memberid) +'.html'
+                filename = "crawlers/pages/" + country + "/" + str(memberid) + ".html"

                 os.makedirs(os.path.dirname(filename), exist_ok=True)
-                f = open( filename, 'w+')
+                f = open(filename, "w+")
                 f.write(webContent)
                 f.close

     def parseMemberData2dictionary(self, listOfCountries):

         for country in listOfCountries:

-            print('started to parse data of member of ' + country + ' ..')
+            print("started to parse data of member of " + country + " ..")

-            f = open('crawlers/output/' + country +'MemberList.txt')
+            f = open("crawlers/output/" + country + "MemberList.txt")
             text = f.read()

             dictionaryMemberList = eval(text)

             countryConf = self.config.get(country)
-            countryDomain = countryConf.get('domain')
-            countryConfMember = countryConf.get('member')
-            countryConfMemberInfo1 = countryConfMember.get('info-1')
-            countryConfMemberInfo1Parent = countryConfMemberInfo1.get('parent')
-            countryConfMemberInfo1ChildPoliticalParty = countryConfMemberInfo1.get('child-politicalParty')
+            countryDomain = countryConf.get("domain")
+            countryConfMember = countryConf.get("member")
+            countryConfMemberInfo1 = countryConfMember.get("info-1")
+            countryConfMemberInfo1Parent = countryConfMemberInfo1.get("parent")
+            countryConfMemberInfo1ChildPoliticalParty = countryConfMemberInfo1.get(
+                "child-politicalParty"
+            )

             for memberid in dictionaryMemberList:

-                print('started to parse data of member with name ' + dictionaryMemberList[memberid]['name'] + ' ..')
+                print(
+                    "started to parse data of member with name "
+                    + dictionaryMemberList[memberid]["name"]
+                    + " .."
+                )

-                filename = 'crawlers/pages/' + country + '/' + str(memberid) +'.html'
+                filename = "crawlers/pages/" + country + "/" + str(memberid) + ".html"

                 tree = lxml.html.soupparser.parse(filename)

-                politicalParty = tree.xpath(countryConfMemberInfo1Parent + countryConfMemberInfo1ChildPoliticalParty)
+                politicalParty = tree.xpath(
+                    countryConfMemberInfo1Parent
+                    + countryConfMemberInfo1ChildPoliticalParty
+                )

-                print('oi', politicalParty)
+                print("oi", politicalParty)

                 if len(politicalParty) > 0:
-                    dictionaryMemberList[memberid]['political party'] = politicalParty[0]
+                    dictionaryMemberList[memberid]["political party"] = politicalParty[
+                        0
+                    ]

-            f = open('crawlers/output/' + country +'MemberList.txt', 'w+')
+            f = open("crawlers/output/" + country + "MemberList.txt", "w+")
             f.write(str(dictionaryMemberList))
             f.close
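Two smells survive the reformatting untouched, since black never changes semantics: every `f.close` lacks parentheses (an attribute lookup that never closes the file), and the interim dictionaries are re-read with eval(), which executes whatever the file happens to contain. XPath positions are also 1-based, so the `"[" + str(n) + "]"` predicate built from `range(len(...))` matches nothing at n = 0 and never reads the last position. A safer sketch of the same save/load round-trip, assuming the same file layout:

    import ast

    def save_interim(path, data):
        # the context manager closes the file even if write() raises
        with open(path, "w+", encoding="utf-8") as f:
            f.write(str(data))

    def load_interim(path):
        with open(path, encoding="utf-8") as f:
            # literal_eval parses Python literals only, unlike eval()
            return ast.literal_eval(f.read())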
@@ -1,10 +1,8 @@
 from crawlers.MembersParliamentCrawler import *


-config = 'config.yaml'
-listOfCountries = ['nicaragua']
+config = "config.yaml"
+listOfCountries = ["nicaragua"]


 Crawler = membersParliamentCrawler(config)
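Only the setup lines of this runner appear in the hunk; whatever follows the constructor call is outside the diff. A hypothetical continuation, using only the method names defined in the class above:

    # crawl list pages, extract members, then fetch each member's page
    Crawler = membersParliamentCrawler(config)
    Crawler.downloadMemberListPagesOfCountries(listOfCountries)
    Crawler.parseMemberListData2dictionary(listOfCountries)
    Crawler.downloadMemberDataHtmls(listOfCountries)
    Crawler.parseMemberData2dictionary(listOfCountries)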
main.py
@@ -1,11 +1,9 @@
 from crawlers.MembersParliamentCrawler import *
 from wikidata.wdPEP import *

-config = 'crawlers/config.yaml'
-listOfCountries = ['nicaragua']
+config = "crawlers/config.yaml"
+listOfCountries = ["nicaragua"]


 # doing the crawling of government websites

@@ -30,4 +28,4 @@ wikidataPEP = WikidataPEP(config)

 # wikidataPEP.createMemberOnWikidata()

-wikidataPEP.editMemberOnWikidata('Q116918332')
+wikidataPEP.editMemberOnWikidata("Q116918332")
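The middle of main.py, between the crawling comment and the `wikidataPEP = WikidataPEP(config)` context line, is elided. A hypothetical reading of the flow, built only from calls visible elsewhere in this commit:

    # hand the parsed member lists to the Wikidata side
    wikidataPEP = WikidataPEP(config)
    wikidataPEP.importMembersOfParliamentDict(listOfCountries)  # reads crawler output
    wikidataPEP.checkForEntityIds(listOfCountries)  # searches the names on Wikidata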
@@ -1,91 +1,75 @@
 import os
 import yaml
 import json


 class WikidataPEP(object):

     def __init__(self, configFile):

         with open(configFile, "r") as stream:
             try:
                 self.config = yaml.safe_load(stream)
             except yaml.YAMLError as exc:
                 print(exc)

     def importMembersOfParliamentDict(self, listOfCountries):

         self.fullDictionaryMemberLists = {}

         for country in listOfCountries:

-            print('started to parse data of members of ' + country + ' ..')
+            print("started to parse data of members of " + country + " ..")

-            f = open('crawlers/output/' + country +'MemberList.txt')
+            f = open("crawlers/output/" + country + "MemberList.txt")
             text = f.read()

             self.fullDictionaryMemberLists[country] = eval(text)

             # print(self.fullDictionaryMemberLists)

     def checkForEntityIds(self, listOfCountries):

         from wikibaseintegrator import WikibaseIntegrator
         from wikibaseintegrator import wbi_helpers

         fullDictionaryMemberLists = self.fullDictionaryMemberLists

         for country in listOfCountries:

             for memberId in fullDictionaryMemberLists[country].keys():

-                name = fullDictionaryMemberLists[country][memberId]['name']
+                name = fullDictionaryMemberLists[country][memberId]["name"]

                 results = wbi_helpers.search_entities(search_string=name)

                 for entityId in results:

                     wbi = WikibaseIntegrator()
                     wikidata_item = wbi.item.get(entity_id=entityId)

-                    for claimkey in wikidata_item.get_json()['claims'].keys():
-
-                        if claimkey == 'P31':
-
-                            if wikidata_item.get_json()['claims'][claimkey][0]['mainsnak']['datavalue']['value']['id'] == 'Q5':
-
+                    for claimkey in wikidata_item.get_json()["claims"].keys():
+                        if claimkey == "P31":
+                            if (
+                                wikidata_item.get_json()["claims"][claimkey][0][
+                                    "mainsnak"
+                                ]["datavalue"]["value"]["id"]
+                                == "Q5"
+                            ):
                                 print(entityId)
-                                print('---------')
+                                print("---------")
                                 print(name)
-                                print('is a human')
+                                print("is a human")

     def createMemberOnWikidata(self):

         from wikibaseintegrator import wbi_login, WikibaseIntegrator
         from wikibaseintegrator.datatypes import ExternalID, Item
         from wikibaseintegrator.wbi_config import config as wbi_config

-        wbi_config['USER_AGENT'] = 'PEPimport/1.0 (https://www.wikidata.org/wiki/User:)'
+        wbi_config["USER_AGENT"] = "PEPimport/1.0 (https://www.wikidata.org/wiki/User:)"

         # login object
-        login_instance = wbi_login.OAuth2(consumer_token='', consumer_secret='')
+        login_instance = wbi_login.OAuth2(consumer_token="", consumer_secret="")

         wbi = WikibaseIntegrator(login=login_instance)

         # data type object, e.g. for a NCBI gene entrez ID
-        isHuman = Item(value='Q5', prop_nr='P31')
-        occupationPolitician = ExternalID(value='Q82955', prop_nr='P106')
-        occupationDeputy = ExternalID(value='Q1055894', prop_nr='P106')
+        isHuman = Item(value="Q5", prop_nr="P31")
+        occupationPolitician = ExternalID(value="Q82955", prop_nr="P106")
+        occupationDeputy = ExternalID(value="Q1055894", prop_nr="P106")
         # referenceURL = URL(value='http://legislacion.asamblea.gob.ni/Tablas%20Generales.nsf/InfoDiputado.xsp?documentId=3D4CFDC4B3006D70062587C5007C29E1&action=openDocument&SessionID=1868803A06AB73D50B7F89BD0AB', prop_nr='P106')

         # print(isHuman)
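black's wrapping makes the instance-of test in checkForEntityIds above hard to scan. The nested condition collapses to something like this (a sketch over the same JSON shape):

    # P31 is "instance of"; Q5 is the Wikidata item for "human"
    claims = wikidata_item.get_json()["claims"]
    is_human = any(
        claim["mainsnak"]["datavalue"]["value"]["id"] == "Q5"
        for claim in claims.get("P31", [])
    )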
@@ -100,33 +84,33 @@ class WikidataPEP(object):
         item = wbi.item.new()

         # Set an english label
-        item.labels.set(language='en', value='Carlos Humberto Ruíz')
+        item.labels.set(language="en", value="Carlos Humberto Ruíz")

         # Carlos Humberto Ruiz has the Qid Q116918332

         # Set a French description
-        item.descriptions.set(language='en', value='Nicaraguan National Assembly Deputy')
+        item.descriptions.set(
+            language="en", value="Nicaraguan National Assembly Deputy"
+        )

         item.claims.add(data1)
         # item.claims.add(data2)
         # item.claims.add(data3)
         print(item.write())

     def editMemberOnWikidata(self, Qid):

         from wikibaseintegrator import wbi_login, WikibaseIntegrator
         from wikibaseintegrator.datatypes import ExternalID, Item, Time, String
         from wikibaseintegrator.wbi_config import config as wbi_config
         from wikibaseintegrator.wbi_enums import ActionIfExists
         from wikibaseintegrator.wbi_enums import WikibaseDatePrecision

-        wbi_config['USER_AGENT'] = 'PEPimport/1.0 (https://www.wikidata.org/wiki/User:Alpcentaur)'
+        wbi_config[
+            "USER_AGENT"
+        ] = "PEPimport/1.0 (https://www.wikidata.org/wiki/User:Alpcentaur)"

         # login object
-        login_instance = wbi_login.OAuth2(consumer_token='', consumer_secret='')
+        login_instance = wbi_login.OAuth2(consumer_token="", consumer_secret="")

         wbi = WikibaseIntegrator(login=login_instance)
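The unchanged lines between this hunk and the next are elided, but the final hunk calls `item.claims.add(data2)`, so `item` must be bound somewhere in between; presumably along these lines, mirroring the `wbi.item.get` call already used in checkForEntityIds:

    # hypothetical elided step: load the existing item by its Qid before editing
    item = wbi.item.get(entity_id=Qid)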
@@ -141,12 +125,19 @@ class WikidataPEP(object):
         references = [
             [
-                ExternalID(value='http://legislacion.asamblea.gob.ni/Tablas%20Generales.nsf/InfoDiputado.xsp?documentId=3D4CFDC4B3006D70062587C5007C29E1&action=openDocument&SessionID=1868803A06AB73D50B7F89BD0AB', prop_nr='P854'),
-                Time(time='+2023-02-27T00:00:00Z', prop_nr='P813', precision=WikibaseDatePrecision.DAY)
+                ExternalID(
+                    value="http://legislacion.asamblea.gob.ni/Tablas%20Generales.nsf/InfoDiputado.xsp?documentId=3D4CFDC4B3006D70062587C5007C29E1&action=openDocument&SessionID=1868803A06AB73D50B7F89BD0AB",
+                    prop_nr="P854",
+                ),
+                Time(
+                    time="+2023-02-27T00:00:00Z",
+                    prop_nr="P813",
+                    precision=WikibaseDatePrecision.DAY,
+                ),
             ]
         ]

-        occupationDeputy = Item(value='Q1055894', prop_nr='P106', references=references)
+        occupationDeputy = Item(value="Q1055894", prop_nr="P106", references=references)

         ## data goes into a list, because many data objects can be provided to
         # data1 = [isHuman]
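One datatype choice worth flagging, untouched by black: P854 is Wikidata's reference-URL property, and wikibaseintegrator ships a URL datatype for it, while ExternalID is meant for identifier strings; likewise the occupation snaks in createMemberOnWikidata pass item values (Q82955, Q1055894) to ExternalID, although this very hunk uses Item for the same property. A sketch of the arguably intended datatypes:

    from wikibaseintegrator.datatypes import Item, Time, URL
    from wikibaseintegrator.wbi_enums import WikibaseDatePrecision

    references = [
        [
            # P854 "reference URL" is a url-datatype property
            URL(
                value="http://legislacion.asamblea.gob.ni/Tablas%20Generales.nsf/InfoDiputado.xsp?documentId=3D4CFDC4B3006D70062587C5007C29E1&action=openDocument&SessionID=1868803A06AB73D50B7F89BD0AB",
                prop_nr="P854",
            ),
            # P813 "retrieved", with day precision
            Time(
                time="+2023-02-27T00:00:00Z",
                prop_nr="P813",
                precision=WikibaseDatePrecision.DAY,
            ),
        ]
    ]
    # P106 "occupation" holds items, so Item rather than ExternalID
    occupationDeputy = Item(value="Q1055894", prop_nr="P106", references=references)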
@@ -168,6 +159,3 @@ class WikidataPEP(object):
         item.claims.add(data2)
         # item.claims.add(data3)
         print(item.write())
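Both Wikidata methods construct their login with empty credentials, so `item.write()` cannot succeed as committed; real tokens have to be supplied first (the placeholder names below are hypothetical):

    # fill in real OAuth2 credentials before calling item.write()
    login_instance = wbi_login.OAuth2(
        consumer_token="YOUR_CONSUMER_TOKEN",
        consumer_secret="YOUR_CONSUMER_SECRET",
    )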