import ast
import os
import urllib.request

import lxml.html
import lxml.html.soupparser
import yaml
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class membersParliamentCrawler(object):
    """Crawler for parliament-member pages of a set of countries.

    Driven by a YAML configuration that maps each country key to the URL of
    its member-list page, the site domain, and the XPath expressions needed to
    extract member names, detail-page links, and per-member attributes
    (currently the political party).

    Interim results are written under ``crawlers/pages/`` (raw HTML) and
    ``crawlers/output/`` (parsed dictionaries serialized with ``str()``).
    """

    def __init__(self, configFile):
        """Load the crawler configuration.

        configFile: path to a YAML file. Parse errors are printed; in that
        case ``self.config`` remains unset.
        """
        with open(configFile, "r") as stream:
            try:
                self.config = yaml.safe_load(stream)
            except yaml.YAMLError as exc:
                print(exc)

    # input list of countries in form of ['nicaragua', 'honduras', .. , 'mexico']
    def downloadMemberListPagesOfCountries(self, listOfCountries):
        """Download the member-list HTML page of every requested country and
        save it to ``crawlers/pages/<country>MemberList.html``."""
        # Download only html pages of the countries specified in input.
        # A single pass over the config suffices; the previous version wrapped
        # this in an extra ``for country in listOfCountries`` loop, which
        # downloaded every page len(listOfCountries) times.
        for key in self.config:
            if key not in listOfCountries:
                continue
            try:
                memberList = self.config.get(key).get("memberList")
            except Exception as e:
                print(
                    "There is a problem with the entry memberList in the config.yaml - the original error message is:",
                    e,
                )
            try:
                memberListLink = memberList.get("link")
            except Exception as e:
                print(
                    "No memberListLink defined in config.yaml - the original error message is:",
                    e,
                )

            # download the html page of the List of Members
            response = urllib.request.urlopen(memberListLink)
            webContent = response.read().decode("UTF-8")

            # save interim results to files; ``with`` closes the handle
            # deterministically (the old ``f.close`` without parentheses was a
            # no-op and leaked the file object)
            with open("crawlers/pages/" + key + "MemberList.html", "w+") as f:
                f.write(webContent)

    def parseMemberListData2dictionary(self, listOfCountries):
        """Parse each downloaded member-list page into a dictionary of the
        form ``{index: {'name': ..., 'link': ...}}`` and persist it to
        ``crawlers/output/<country>MemberList.txt``."""
        for country in listOfCountries:
            # initialized before the try so the save step below cannot hit an
            # undefined name when parsing fails early
            dictionaryMemberList = {}
            try:
                # use soupparser to handle broken html
                tree = lxml.html.soupparser.parse(
                    "crawlers/pages/" + country + "MemberList.html"
                )

                countryConf = self.config.get(country)
                countryDomain = countryConf.get("domain")
                countryConfMemberList = countryConf.get("memberList")
                countryConfMemberListParent = countryConfMemberList.get("parent")
                countryConfMemberListChildName = countryConfMemberList.get("child-name")
                countryConfMemberListChildLink = countryConfMemberList.get("child-link")

                for n in range(len(tree.xpath(countryConfMemberListParent))):
                    # XPath positions are 1-based: the previous code used
                    # ``[n]`` starting at 0, so ``[0]`` never matched and the
                    # last member was silently skipped.
                    position = "[" + str(n + 1) + "]"
                    name = tree.xpath(
                        countryConfMemberListParent
                        + position
                        + countryConfMemberListChildName
                    )
                    link = tree.xpath(
                        countryConfMemberListParent
                        + position
                        + countryConfMemberListChildLink
                    )

                    if len(name) > 0:
                        dictionaryMemberList[n] = {}
                        dictionaryMemberList[n]["name"] = name[0]

                        # make relative links absolute against the configured
                        # country domain
                        if countryDomain in link[0]:
                            dictionaryMemberList[n]["link"] = link[0]
                        else:
                            dictionaryMemberList[n]["link"] = countryDomain + link[0]

            except Exception as e:
                print(
                    "parsing the html did not work. Possibly you first have to downloadMemberListPagesOfCountries(). The original error message is:",
                    e,
                )

            # save interim results to files
            with open("crawlers/output/" + country + "MemberList.txt", "w+") as f:
                f.write(str(dictionaryMemberList))

    def downloadMemberDataHtmls(self, listOfCountries):
        """Fetch the detail page of every member found by
        parseMemberListData2dictionary() and store it under
        ``crawlers/pages/<country>/<memberid>.html``."""
        for country in listOfCountries:
            with open("crawlers/output/" + country + "MemberList.txt") as f:
                text = f.read()

            # The member list was serialized with str() on plain dicts, so
            # literal_eval parses it back safely — unlike the previous eval(),
            # which would execute arbitrary code from the file.
            dictionaryMemberList = ast.literal_eval(text)

            for memberid in dictionaryMemberList:
                memberLink = dictionaryMemberList[memberid]["link"]

                # download the html page of the Member
                response = urllib.request.urlopen(memberLink)
                webContent = response.read().decode("UTF-8")

                # save interim results to files
                filename = "crawlers/pages/" + country + "/" + str(memberid) + ".html"
                os.makedirs(os.path.dirname(filename), exist_ok=True)
                with open(filename, "w+") as f:
                    f.write(webContent)

    def parseMemberData2dictionary(self, listOfCountries):
        """Extract per-member attributes (currently the political party) from
        the downloaded detail pages and merge them back into
        ``crawlers/output/<country>MemberList.txt``."""
        for country in listOfCountries:
            print("started to parse data of member of " + country + " ..")

            with open("crawlers/output/" + country + "MemberList.txt") as f:
                text = f.read()

            # see downloadMemberDataHtmls: literal_eval instead of eval
            dictionaryMemberList = ast.literal_eval(text)

            countryConf = self.config.get(country)
            countryConfMember = countryConf.get("member")
            countryConfMemberInfo1 = countryConfMember.get("info-1")
            countryConfMemberInfo1Parent = countryConfMemberInfo1.get("parent")
            countryConfMemberInfo1ChildPoliticalParty = countryConfMemberInfo1.get(
                "child-politicalParty"
            )

            for memberid in dictionaryMemberList:
                print(
                    "started to parse data of member with name "
                    + dictionaryMemberList[memberid]["name"]
                    + " .."
                )

                filename = "crawlers/pages/" + country + "/" + str(memberid) + ".html"
                tree = lxml.html.soupparser.parse(filename)

                politicalParty = tree.xpath(
                    countryConfMemberInfo1Parent
                    + countryConfMemberInfo1ChildPoliticalParty
                )
                print("oi", politicalParty)

                if len(politicalParty) > 0:
                    dictionaryMemberList[memberid]["political party"] = politicalParty[0]

            # merge the enriched dictionary back into the interim file
            with open("crawlers/output/" + country + "MemberList.txt", "w+") as f:
                f.write(str(dictionaryMemberList))