|
@ -61,13 +61,13 @@ class fdb_spider(object): |
|
|
|
|
|
|
|
|
# download the html page of the List of entrys |
|
|
# download the html page of the List of entrys |
|
|
|
|
|
|
|
|
response = urllib.request.urlopen(entry_list_link1 + string(i) + entry_list_link2) |
|
|
|
|
|
|
|
|
response = urllib.request.urlopen(entry_list_link1 + str(i) + entry_list_link2) |
|
|
web_content = response.read().decode("UTF-8") |
|
|
web_content = response.read().decode("UTF-8") |
|
|
|
|
|
|
|
|
# save interim results to files |
|
|
# save interim results to files |
|
|
|
|
|
|
|
|
f = open("spiders/pages/" + key + str(i) + "entryList.html", "w+") |
|
|
f = open("spiders/pages/" + key + str(i) + "entryList.html", "w+") |
|
|
f.write(webContent) |
|
|
|
|
|
|
|
|
f.write(web_content) |
|
|
f.close |
|
|
f.close |
|
|
|
|
|
|
|
|
def parse_entry_list_data2dictionary(self, list_of_fdbs): |
|
|
def parse_entry_list_data2dictionary(self, list_of_fdbs): |
|
@ -82,6 +82,7 @@ class fdb_spider(object): |
|
|
) |
|
|
) |
|
|
|
|
|
|
|
|
for i in iteration_var_list: |
|
|
for i in iteration_var_list: |
|
|
|
|
|
print(i) |
|
|
try: |
|
|
try: |
|
|
# use soupparser to handle broken html |
|
|
# use soupparser to handle broken html |
|
|
|
|
|
|
|
@ -89,13 +90,17 @@ class fdb_spider(object): |
|
|
"spiders/pages/" + fdb + str(i) + "entryList.html" |
|
|
"spiders/pages/" + fdb + str(i) + "entryList.html" |
|
|
) |
|
|
) |
|
|
|
|
|
|
|
|
# for e in tree.iter(): |
|
|
|
|
|
# |
|
|
|
|
|
# print(e.tag) |
|
|
|
|
|
# |
|
|
|
|
|
# for e in tree.xpath('//html//body//form//table//tr//td//table//tr'): |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
print('oioioioioioioioioioioiOIOI') |
|
|
|
|
|
|
|
|
|
|
|
for e in tree.iter(): |
|
|
|
|
|
|
|
|
|
|
|
print(e.tag) |
|
|
# |
|
|
# |
|
|
# #print(etree.tostring(e).decode()) |
|
|
|
|
|
|
|
|
for e in tree.xpath('//html//body//form//table//tr//td//table//tr'): |
|
|
|
|
|
|
|
|
|
|
|
print(etree.tostring(e).decode()) |
|
|
|
|
|
|
|
|
dictionary_entry_list = {} |
|
|
dictionary_entry_list = {} |
|
|
|
|
|
|
|
@ -121,6 +126,8 @@ class fdb_spider(object): |
|
|
+ "]" |
|
|
+ "]" |
|
|
+ fdb_conf_entry_list_child_link |
|
|
+ fdb_conf_entry_list_child_link |
|
|
) |
|
|
) |
|
|
|
|
|
|
|
|
|
|
|
print('oi' + name) |
|
|
|
|
|
|
|
|
if len(name) > 0: |
|
|
if len(name) > 0: |
|
|
dictionary_entry_list[n] = {} |
|
|
dictionary_entry_list[n] = {} |
|
@ -183,7 +190,7 @@ class fdb_spider(object): |
|
|
def parse_entry_data2dictionary(self, list_of_fdbs): |
|
|
def parse_entry_data2dictionary(self, list_of_fdbs): |
|
|
for fdb in list_of_fdbs: |
|
|
for fdb in list_of_fdbs: |
|
|
|
|
|
|
|
|
try: |
|
|
|
|
|
|
|
|
try: |
|
|
iteration_var_list = eval(self.config.get(fdb).get("entry-list").get("iteration-var-list")) |
|
|
iteration_var_list = eval(self.config.get(fdb).get("entry-list").get("iteration-var-list")) |
|
|
except Exception as e: |
|
|
except Exception as e: |
|
|
print( |
|
|
print( |
|
|