Added functions for uniform and non-uniform entry endpoints; non-uniform endpoints are generally parsed as text from any paragraph (p) XML element.
This commit is contained in:
parent
b2cf4b67ce
commit
14ece9bceb
4 changed files with 82 additions and 18 deletions
17
README.md
17
README.md
|
@ -0,0 +1,17 @@
|
|||
|
||||
|
||||
__ _ _ _ _
|
||||
/ _| __| | |__ ___ _ __ (_) __| | ___ _ __
|
||||
| |_ / _` | '_ \ _____/ __| '_ \| |/ _` |/ _ \ '__|
|
||||
| _| (_| | |_) |_____\__ \ |_) | | (_| | __/ |
|
||||
|_| \__,_|_.__/ |___/ .__/|_|\__,_|\___|_|
|
||||
|_|
|
||||
|
||||
Configure fdb-spider in a yaml file.
|
||||
Spider multi-page databases of links.
|
||||
Filter and serialize content to json.
|
||||
|
||||
Filter either with XPath syntax.
|
||||
Or filter with the help of artificial neural networks.
|
||||
|
||||
|
4
main.py
4
main.py
|
@ -15,7 +15,7 @@ spider = fdb_spider(config)
|
|||
|
||||
#spider.parse_entry_list_data2dictionary(list_of_fdbs)
|
||||
|
||||
spider.download_entry_data_htmls(list_of_fdbs)
|
||||
# spider.download_entry_data_htmls(list_of_fdbs)
|
||||
|
||||
# spider.parse_entry_data2dictionary(list_of_fdbs)
|
||||
spider.parse_entry_data2dictionary(list_of_fdbs)
|
||||
|
||||
|
|
Binary file not shown.
|
@ -386,12 +386,11 @@ class fdb_spider(object):
|
|||
fdb_conf = self.config.get(fdb)
|
||||
fdb_domain = fdb_conf.get("domain")
|
||||
fdb_conf_entry = fdb_conf.get("entry")
|
||||
fdb_conf_entry_info1 = fdb_conf_entry.get("info-1")
|
||||
fdb_conf_entry_info1_parent = fdb_conf_entry_info1.get("parent")
|
||||
fdb_conf_entry_info1_child_1 = fdb_conf_entry_info1.get(
|
||||
"child-1"
|
||||
)
|
||||
|
||||
print('balubaluba', fdb_conf_entry)
|
||||
fdb_conf_entry_general = fdb_conf_entry.get("general")
|
||||
print(fdb_conf_entry_general)
|
||||
|
||||
|
||||
for entry_id in dictionary_entry_list:
|
||||
print(
|
||||
"started to parse data of entry with name "
|
||||
|
@ -401,20 +400,68 @@ class fdb_spider(object):
|
|||
|
||||
file_name = "spiders/pages/" + fdb + str(i) + "/" + str(entry_id) + ".html"
|
||||
|
||||
tree = lxml.html.soupparser.parse(file_name)
|
||||
try:
|
||||
|
||||
child_1 = tree.xpath(
|
||||
fdb_conf_entry_info1_parent
|
||||
+ fdb_conf_entry_info1_child_1
|
||||
)
|
||||
tree = lxml.html.soupparser.parse(file_name)
|
||||
|
||||
except Exception as e:
|
||||
tree = html.parse(file_name)
|
||||
print(
|
||||
"parsing the xml files did not work with the soupparser. Broken html will not be fixed as it could have been, thanks to efficient particular html languages. The original error message is:",
|
||||
e,
|
||||
)
|
||||
|
||||
if fdb_conf_entry_general["uniform"] == 'TRUE':
|
||||
fdb_conf_entry_unitrue = fdb_conf_entry.get("unitrue")
|
||||
|
||||
for key in fdb_conf_entry_unitrue:
|
||||
fdb_conf_entry_unitrue_child = fdb_conf_entry_unitrue.get(key)
|
||||
|
||||
|
||||
|
||||
|
||||
print("oi", child_1)
|
||||
child = tree.xpath(
|
||||
fdb_conf_entry_unitrue_entry_child
|
||||
)
|
||||
|
||||
if len(child_1) > 0:
|
||||
dictionary_entry_list[entry_id]["child_1"] = child_1[
|
||||
0
|
||||
]
|
||||
print("oi", child)
|
||||
|
||||
if len(child) > 0:
|
||||
dictionary_entry_list[entry_id][key] = child[
|
||||
0
|
||||
]
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
else:
|
||||
fdb_conf_entry_unifalse = fdb_conf_entry.get("unifalse")
|
||||
fdb_conf_entry_unifalse_wordlist = fdb_conf_entry_unifalse.get("wordlist")
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
text = tree.xpath(
|
||||
"//p//text()"
|
||||
)
|
||||
|
||||
print("oi", text)
|
||||
generaltext = ''
|
||||
for n in range(len(text)):
|
||||
|
||||
if len(text[n]) > 0:
|
||||
generaltext += text[n] + ' '
|
||||
|
||||
dictionary_entry_list[entry_id]["text"] = generaltext
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
f = open("spiders/output/" + fdb + str(i) + "entryList.txt", "w+")
|
||||
f.write(str(dictionary_entry_list))
|
||||
f.close
|
||||
|
|
Loading…
Reference in a new issue