|
@ -50,7 +50,7 @@ class membersParliamentCrawler(object): |
|
|
|
|
|
|
|
|
# save interim results to files |
|
|
# save interim results to files |
|
|
|
|
|
|
|
|
f = open('pages/' + key +'MemberList.html', 'w+') |
|
|
|
|
|
|
|
|
f = open('crawlers/pages/' + key +'MemberList.html', 'w+') |
|
|
f.write(webContent) |
|
|
f.write(webContent) |
|
|
f.close |
|
|
f.close |
|
|
|
|
|
|
|
@ -63,7 +63,7 @@ class membersParliamentCrawler(object): |
|
|
|
|
|
|
|
|
#use soupparser to handle broken html |
|
|
#use soupparser to handle broken html |
|
|
|
|
|
|
|
|
tree = lxml.html.soupparser.parse('pages/' + country + 'MemberList.html') |
|
|
|
|
|
|
|
|
tree = lxml.html.soupparser.parse('crawlers/pages/' + country + 'MemberList.html') |
|
|
|
|
|
|
|
|
# for e in tree.iter(): |
|
|
# for e in tree.iter(): |
|
|
# |
|
|
# |
|
@ -106,7 +106,7 @@ class membersParliamentCrawler(object): |
|
|
|
|
|
|
|
|
# save interim results to files |
|
|
# save interim results to files |
|
|
|
|
|
|
|
|
f = open('output/' + country +'MemberList.txt', 'w+') |
|
|
|
|
|
|
|
|
f = open('crawlers/output/' + country +'MemberList.txt', 'w+') |
|
|
f.write(str(dictionaryMemberList)) |
|
|
f.write(str(dictionaryMemberList)) |
|
|
f.close |
|
|
f.close |
|
|
|
|
|
|
|
@ -114,7 +114,7 @@ class membersParliamentCrawler(object): |
|
|
|
|
|
|
|
|
for country in listOfCountries: |
|
|
for country in listOfCountries: |
|
|
|
|
|
|
|
|
f = open('output/' + country +'MemberList.txt') |
|
|
|
|
|
|
|
|
f = open('crawlers/output/' + country +'MemberList.txt') |
|
|
text = f.read() |
|
|
text = f.read() |
|
|
|
|
|
|
|
|
dictionaryMemberList = eval(text) |
|
|
dictionaryMemberList = eval(text) |
|
@ -132,7 +132,7 @@ class membersParliamentCrawler(object): |
|
|
|
|
|
|
|
|
# save interim results to files |
|
|
# save interim results to files |
|
|
|
|
|
|
|
|
filename = 'pages/' + country + '/' + str(memberid) +'.html' |
|
|
|
|
|
|
|
|
filename = 'crawlers/pages/' + country + '/' + str(memberid) +'.html' |
|
|
|
|
|
|
|
|
os.makedirs(os.path.dirname(filename), exist_ok=True) |
|
|
os.makedirs(os.path.dirname(filename), exist_ok=True) |
|
|
f = open( filename, 'w+') |
|
|
f = open( filename, 'w+') |
|
@ -146,7 +146,7 @@ class membersParliamentCrawler(object): |
|
|
|
|
|
|
|
|
print('started to parse data of member of ' + country + ' ..') |
|
|
print('started to parse data of member of ' + country + ' ..') |
|
|
|
|
|
|
|
|
f = open('output/' + country +'MemberList.txt') |
|
|
|
|
|
|
|
|
f = open('crawlers/output/' + country +'MemberList.txt') |
|
|
text = f.read() |
|
|
text = f.read() |
|
|
|
|
|
|
|
|
dictionaryMemberList = eval(text) |
|
|
dictionaryMemberList = eval(text) |
|
@ -163,7 +163,7 @@ class membersParliamentCrawler(object): |
|
|
|
|
|
|
|
|
print('started to parse data of member with name ' + dictionaryMemberList[memberid]['name'] + ' ..') |
|
|
print('started to parse data of member with name ' + dictionaryMemberList[memberid]['name'] + ' ..') |
|
|
|
|
|
|
|
|
filename = 'pages/' + country + '/' + str(memberid) +'.html' |
|
|
|
|
|
|
|
|
filename = 'crawlers/pages/' + country + '/' + str(memberid) +'.html' |
|
|
|
|
|
|
|
|
tree = lxml.html.soupparser.parse(filename) |
|
|
tree = lxml.html.soupparser.parse(filename) |
|
|
|
|
|
|
|
@ -177,7 +177,7 @@ class membersParliamentCrawler(object): |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
f = open('output/' + country +'MemberList.txt', 'w+') |
|
|
|
|
|
|
|
|
f = open('crawlers/output/' + country +'MemberList.txt', 'w+') |
|
|
f.write(str(dictionaryMemberList)) |
|
|
f.write(str(dictionaryMemberList)) |
|
|
f.close |
|
|
f.close |
|
|
|
|
|
|