"""Crawler that downloads the member-list pages of national parliaments."""

import os
import urllib.error
import urllib.parse
import urllib.request


class membersParliamentCrawler:
    """Download parliament member-list HTML pages described in a YAML config.

    The config maps a country name to a mapping that contains a
    ``memberList`` entry; the ``link`` value of that entry is the URL of the
    page listing the members of that country's parliament.
    """

    def __init__(self, configFile):
        """Load the crawler configuration.

        Args:
            configFile: Path of the YAML configuration file.

        If the file is not valid YAML the parser error is printed and the
        crawler falls back to an empty configuration, so later calls simply
        find nothing to download instead of failing on a missing attribute.
        """
        # Imported here so this module can still be imported (and the
        # download logic exercised) when PyYAML is not installed; it is
        # only needed to parse the config file.
        import yaml

        with open(configFile, "r") as stream:
            try:
                # safe_load returns None for an empty document.
                self.config = yaml.safe_load(stream) or {}
            except yaml.YAMLError as exc:
                print(exc)
                self.config = {}

    def downloadMemberListPagesOfCountries(self, listOfCountries,
                                           outputDir="pages"):
        """Download the member-list page of every requested country.

        Each page is stored as ``<outputDir>/<country>MemberList.html``.
        Countries are processed in the order in which they appear in the
        config, and each one is downloaded exactly once. Countries without
        a usable ``memberList``/``link`` entry are reported and skipped.

        Args:
            listOfCountries: Country names as used in the config, e.g.
                ``['nicaragua', 'honduras', 'mexico']``. A single name
                given as a plain string is accepted as well.
            outputDir: Directory the pages are written to; it is created
                when missing. Defaults to ``'pages'``.

        Raises:
            urllib.error.URLError: If a page cannot be retrieved.
            UnicodeDecodeError: If a page is not valid UTF-8.
        """
        if isinstance(listOfCountries, str):
            listOfCountries = [listOfCountries]
        requested = set(listOfCountries)

        for key in self.config:
            if key not in requested:
                continue

            entry = self.config.get(key)
            memberList = (
                entry.get("memberList") if isinstance(entry, dict) else None
            )
            if not isinstance(memberList, dict):
                print("There is a problem with the entry memberList "
                      "in the config.yaml")
                continue

            memberListLink = memberList.get("link")
            if not memberListLink:
                # Skip instead of reusing the link of a previous country.
                print("No memberListLink defined in config.yaml")
                continue
            print(memberListLink)

            # Download the HTML page of the list of members.
            with urllib.request.urlopen(memberListLink) as response:
                webContent = response.read().decode("UTF-8")

            os.makedirs(outputDir, exist_ok=True)
            targetPath = os.path.join(outputDir, key + "MemberList.html")
            # Write with the same encoding the page was decoded with, so
            # the result does not depend on the platform default.
            with open(targetPath, "w", encoding="utf-8") as pageFile:
                pageFile.write(webContent)
@@ -3,13 +3,14 @@ nicaragua: memberList: - link: http://legislacion.asamblea.gob.ni/Tablas%20Generales.nsf/Main.xsp - parent: [html, body, form, table, tbody, tr, td, table, tbody] - child-name: [html, body, form, table, tbody, tr, td, table, tbody, tr, td.null, a.text] - child-link: [html, body, form, table, tbody, tr, td, table, tbody, tr, td.null, a.href] + link: http://legislacion.asamblea.gob.ni/Tablas%20Generales.nsf/Main.xsp + parent: [html, body, form, table, tbody, tr, td, table, tbody] + child-name: [html, body, form, table, tbody, tr, td, table, tbody, tr, td.null, a.text] + child-link: [html, body, form, table, tbody, tr, td, table, tbody, tr, td.null, a.href] member: info-1: - parent: [html, body, form, table, tbody] - child-name: [html, body, form, table, tbody, tr.0, td.1, span] - child-image: [html, body, form, table, tbody, tr.1, td.0, span, img] - child-role: [html, body, form, table, tbody, tr.1, td.2, span + label.1] + parent: [html, body, form, table, tbody] + child-name: [html, body, form, table, tbody, tr.0, td.1, span] + child-image: [html, body, form, table, tbody, tr.1, td.0, span, img] + child-role: [html, body, form, table, tbody, tr.1, td.2, span + label.1] + child-politicalParty: [html, body, form, table, tbody, tr.4, td, span] diff --git a/crawlers/pages/nicaraguaMemberList.html b/crawlers/pages/nicaraguaMemberList.html new file mode 100644 index 0000000..4f42d75 --- /dev/null +++ b/crawlers/pages/nicaraguaMemberList.html @@ -0,0 +1,334 @@ + + +
+