diff --git a/crawlers/MembersParliamentCrawler.py b/crawlers/MembersParliamentCrawler.py index 1cbe372..b057d87 100644 --- a/crawlers/MembersParliamentCrawler.py +++ b/crawlers/MembersParliamentCrawler.py @@ -50,7 +50,7 @@ class membersParliamentCrawler(object): # save interim results to files - f = open('pages/' + key +'MemberList.html', 'w+') + f = open('crawlers/pages/' + key +'MemberList.html', 'w+') f.write(webContent) f.close @@ -63,7 +63,7 @@ class membersParliamentCrawler(object): #use soupparser to handle broken html - tree = lxml.html.soupparser.parse('pages/' + country + 'MemberList.html') + tree = lxml.html.soupparser.parse('crawlers/pages/' + country + 'MemberList.html') # for e in tree.iter(): # @@ -106,7 +106,7 @@ class membersParliamentCrawler(object): # save interim results to files - f = open('output/' + country +'MemberList.txt', 'w+') + f = open('crawlers/output/' + country +'MemberList.txt', 'w+') f.write(str(dictionaryMemberList)) f.close @@ -114,7 +114,7 @@ class membersParliamentCrawler(object): for country in listOfCountries: - f = open('output/' + country +'MemberList.txt') + f = open('crawlers/output/' + country +'MemberList.txt') text = f.read() dictionaryMemberList = eval(text) @@ -132,7 +132,7 @@ class membersParliamentCrawler(object): # save interim results to files - filename = 'pages/' + country + '/' + str(memberid) +'.html' + filename = 'crawlers/pages/' + country + '/' + str(memberid) +'.html' os.makedirs(os.path.dirname(filename), exist_ok=True) f = open( filename, 'w+') @@ -146,7 +146,7 @@ class membersParliamentCrawler(object): print('started to parse data of member of ' + country + ' ..') - f = open('output/' + country +'MemberList.txt') + f = open('crawlers/output/' + country +'MemberList.txt') text = f.read() dictionaryMemberList = eval(text) @@ -163,7 +163,7 @@ class membersParliamentCrawler(object): print('started to parse data of member with name ' + dictionaryMemberList[memberid]['name'] + ' ..') - filename = 'pages/' + country + '/' + str(memberid) +'.html' + filename = 'crawlers/pages/' + country + '/' + str(memberid) +'.html' tree = lxml.html.soupparser.parse(filename) @@ -177,7 +177,7 @@ class membersParliamentCrawler(object): - f = open('output/' + country +'MemberList.txt', 'w+') + f = open('crawlers/output/' + country +'MemberList.txt', 'w+') f.write(str(dictionaryMemberList)) f.close diff --git a/crawlers/__pycache__/MembersParliamentCrawler.cpython-310.pyc b/crawlers/__pycache__/MembersParliamentCrawler.cpython-310.pyc index 75d8bd4..15221c4 100644 Binary files a/crawlers/__pycache__/MembersParliamentCrawler.cpython-310.pyc and b/crawlers/__pycache__/MembersParliamentCrawler.cpython-310.pyc differ diff --git a/crawlers/useMembersParliamentCrawler.py b/crawlers/useMembersParliamentCrawler.py index d5d8077..d80509e 100644 --- a/crawlers/useMembersParliamentCrawler.py +++ b/crawlers/useMembersParliamentCrawler.py @@ -1,5 +1,5 @@ -from MembersParliamentCrawler import * +from crawlers.MembersParliamentCrawler import * diff --git a/main.py b/main.py new file mode 100644 index 0000000..fdf73e1 --- /dev/null +++ b/main.py @@ -0,0 +1,18 @@ + +from crawlers.MembersParliamentCrawler import * + + + +config = 'crawlers/config.yaml' +listOfCountries = ['nicaragua'] + + +Crawler = membersParliamentCrawler(config) + +#Crawler.downloadMemberListPagesOfCountries(listOfCountries) + +#Crawler.parseMemberListData2dictionary(listOfCountries) + +#Crawler.downloadMemberDataHtmls(listOfCountries) + +Crawler.parseMemberData2dictionary(listOfCountries)