Changed directory paths and import structure so the crawler can be run from the project root
This commit is contained in:
parent
a05b5bc33f
commit
ee1d5d8919
4 changed files with 27 additions and 9 deletions
|
@@ -50,7 +50,7 @@ class membersParliamentCrawler(object):
|
||||||
|
|
||||||
# save interim results to files
|
# save interim results to files
|
||||||
|
|
||||||
f = open('pages/' + key +'MemberList.html', 'w+')
|
f = open('crawlers/pages/' + key +'MemberList.html', 'w+')
|
||||||
f.write(webContent)
|
f.write(webContent)
|
||||||
f.close
|
f.close
|
||||||
|
|
||||||
|
@@ -63,7 +63,7 @@ class membersParliamentCrawler(object):
|
||||||
|
|
||||||
#use soupparser to handle broken html
|
#use soupparser to handle broken html
|
||||||
|
|
||||||
tree = lxml.html.soupparser.parse('pages/' + country + 'MemberList.html')
|
tree = lxml.html.soupparser.parse('crawlers/pages/' + country + 'MemberList.html')
|
||||||
|
|
||||||
# for e in tree.iter():
|
# for e in tree.iter():
|
||||||
#
|
#
|
||||||
|
@@ -106,7 +106,7 @@ class membersParliamentCrawler(object):
|
||||||
|
|
||||||
# save interim results to files
|
# save interim results to files
|
||||||
|
|
||||||
f = open('output/' + country +'MemberList.txt', 'w+')
|
f = open('crawlers/output/' + country +'MemberList.txt', 'w+')
|
||||||
f.write(str(dictionaryMemberList))
|
f.write(str(dictionaryMemberList))
|
||||||
f.close
|
f.close
|
||||||
|
|
||||||
|
@@ -114,7 +114,7 @@ class membersParliamentCrawler(object):
|
||||||
|
|
||||||
for country in listOfCountries:
|
for country in listOfCountries:
|
||||||
|
|
||||||
f = open('output/' + country +'MemberList.txt')
|
f = open('crawlers/output/' + country +'MemberList.txt')
|
||||||
text = f.read()
|
text = f.read()
|
||||||
|
|
||||||
dictionaryMemberList = eval(text)
|
dictionaryMemberList = eval(text)
|
||||||
|
@@ -132,7 +132,7 @@ class membersParliamentCrawler(object):
|
||||||
|
|
||||||
# save interim results to files
|
# save interim results to files
|
||||||
|
|
||||||
filename = 'pages/' + country + '/' + str(memberid) +'.html'
|
filename = 'crawlers/pages/' + country + '/' + str(memberid) +'.html'
|
||||||
|
|
||||||
os.makedirs(os.path.dirname(filename), exist_ok=True)
|
os.makedirs(os.path.dirname(filename), exist_ok=True)
|
||||||
f = open( filename, 'w+')
|
f = open( filename, 'w+')
|
||||||
|
@@ -146,7 +146,7 @@ class membersParliamentCrawler(object):
|
||||||
|
|
||||||
print('started to parse data of member of ' + country + ' ..')
|
print('started to parse data of member of ' + country + ' ..')
|
||||||
|
|
||||||
f = open('output/' + country +'MemberList.txt')
|
f = open('crawlers/output/' + country +'MemberList.txt')
|
||||||
text = f.read()
|
text = f.read()
|
||||||
|
|
||||||
dictionaryMemberList = eval(text)
|
dictionaryMemberList = eval(text)
|
||||||
|
@@ -163,7 +163,7 @@ class membersParliamentCrawler(object):
|
||||||
|
|
||||||
print('started to parse data of member with name ' + dictionaryMemberList[memberid]['name'] + ' ..')
|
print('started to parse data of member with name ' + dictionaryMemberList[memberid]['name'] + ' ..')
|
||||||
|
|
||||||
filename = 'pages/' + country + '/' + str(memberid) +'.html'
|
filename = 'crawlers/pages/' + country + '/' + str(memberid) +'.html'
|
||||||
|
|
||||||
tree = lxml.html.soupparser.parse(filename)
|
tree = lxml.html.soupparser.parse(filename)
|
||||||
|
|
||||||
|
@@ -177,7 +177,7 @@ class membersParliamentCrawler(object):
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
f = open('output/' + country +'MemberList.txt', 'w+')
|
f = open('crawlers/output/' + country +'MemberList.txt', 'w+')
|
||||||
f.write(str(dictionaryMemberList))
|
f.write(str(dictionaryMemberList))
|
||||||
f.close
|
f.close
|
||||||
|
|
||||||
|
|
Binary file not shown.
|
@@ -1,5 +1,5 @@
|
||||||
|
|
||||||
from MembersParliamentCrawler import *
|
from crawlers.MembersParliamentCrawler import *
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
18
main.py
Normal file
18
main.py
Normal file
|
@@ -0,0 +1,18 @@
|
||||||
|
|
||||||
|
from crawlers.MembersParliamentCrawler import *
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
config = 'crawlers/config.yaml'
|
||||||
|
listOfCountries = ['nicaragua']
|
||||||
|
|
||||||
|
|
||||||
|
Crawler = membersParliamentCrawler(config)
|
||||||
|
|
||||||
|
#Crawler.downloadMemberListPagesOfCountries(listOfCountries)
|
||||||
|
|
||||||
|
#Crawler.parseMemberListData2dictionary(listOfCountries)
|
||||||
|
|
||||||
|
#Crawler.downloadMemberDataHtmls(listOfCountries)
|
||||||
|
|
||||||
|
Crawler.parseMemberData2dictionary(listOfCountries)
|
Loading…
Reference in a new issue