@ -72,6 +72,78 @@ class fdb_spider(object):
f . write ( web_content )
f . close
def find_config_parameter(self, list_of_fdbs):
    """Print what the configured entry-list XPaths match in the saved pages.

    Debugging aid: for every fdb and every value of its configured
    iteration list, the stored entry-list page is parsed and the elements
    matched by the parent, child-name and child-link XPath expressions are
    printed to stdout. Nothing is returned.

    Args:
        list_of_fdbs: iterable of fdb names used as keys into ``self.config``.

    An fdb whose iteration list cannot be read or evaluated is reported and
    skipped, so the remaining fdbs are still processed.

    NOTE(review): the leading/trailing spaces inside the string literals
    (config keys, file path pieces, messages) are reproduced exactly as
    found; they look unintended for keys and paths -- confirm.
    """
    for fdb in list_of_fdbs:
        try:
            # NOTE(review): eval() executes whatever expression is stored in
            # the config; keep config.yaml trusted or move to a safer parser.
            iteration_var_list = eval(
                self.config.get(fdb).get(" entry-list ").get(" iteration-var-list ")
            )
        except Exception as e:
            print(
                " There is a problem with the configuration variable entryList iteration var list in the config.yaml ",
                e,
            )
            # Without a usable iteration list there is nothing to inspect for
            # this fdb; falling through would fail on the missing config or
            # reuse the list of the previously processed fdb.
            continue

        fdb_conf_entry_list = self.config.get(fdb).get(" entry-list ")
        fdb_conf_entry_list_parent = fdb_conf_entry_list.get(" parent ")
        fdb_conf_entry_list_child_name = fdb_conf_entry_list.get(" child-name ")
        fdb_conf_entry_list_child_link = fdb_conf_entry_list.get(" child-link ")

        for i in iteration_var_list:
            print(i)
            page_path = " spiders/pages/ " + fdb + str(i) + " entryList.html "

            try:
                # use soupparser to handle broken html
                tree = lxml.html.soupparser.parse(page_path)
            except Exception as e:
                # Fall back to the plain parser when the soupparser fails.
                tree = html.parse(page_path)
                print(
                    " parsing the xml files did not work with the soupparser. Broken html will not be fixed as it could have been ",
                    e,
                )

            try:
                print(' this is the n looped elements of the parent specified in config.yaml: ')
                # Evaluate the parent expression once per page instead of
                # once per matched element.
                parent_elements = tree.xpath(fdb_conf_entry_list_parent)
                for parent_element in parent_elements:
                    print(' ----------------------------------------------------------------------------------------------------------------------------------------- ')
                    print(etree.tostring(parent_element).decode())

                print(' this is the first actual name element: ')
                name_element = tree.xpath(
                    fdb_conf_entry_list_parent + fdb_conf_entry_list_child_name
                )
                print(name_element)
                for name in name_element:
                    print(name)

                print(' this is the first actual link element: ')
                link_element = tree.xpath(
                    fdb_conf_entry_list_parent + fdb_conf_entry_list_child_link
                )
                print(link_element)
            except Exception as e:
                print(
                    " parsing the html did not work. ",
                    e,
                )
def parse_entry_list_data2dictionary ( self , list_of_fdbs ) :
for fdb in list_of_fdbs :
@ -101,15 +173,18 @@ class fdb_spider(object):
try :
print ( ' oioioioioioioioioioioiOIOI ' )
#print('this is the n looped elements of the parent specified in config.yaml:' )
#for e in tree.iter():
# print(e.tag)
#
for e in tree . xpath ( " //html//body//div//main//div//div[@class= ' row ' ]//section[@class= ' l-search-result-list ' ]//div//div[@class= ' c-search-result__text-wrapper ' ]//span[@class= ' c-search-result__title ' ][text()] " ) :
#for e in tree.xpath("//html//body//div//main//div//div[@class='row']//section[@class='l-search-result-list']//div//div[@class='c-search-result__text-wrapper']//span[@class='c-search-result__title'][text()]"):
#for e in tree.xpath("//html//body//div//main//div//div[@class='row']//section[@class='l-search-result-list']//div//div[@class='c-search-result__text-wrapper']//span[@class='c-search-result__title']"):
# print(etree.tostring(e).decode())
print ( etree . tostring ( e ) . decode ( ) )
dictionary_entry_list = { }
@ -120,35 +195,37 @@ class fdb_spider(object):
fdb_conf_entry_list_child_name = fdb_conf_entry_list . get ( " child-name " )
fdb_conf_entry_list_child_link = fdb_conf_entry_list . get ( " child-link " )
print ( ' blabliblub ' )
print ( ' len ' , len ( tree . xpath ( fdb_conf_entry_list_parent ) ) )
for n in range ( len ( tree . xpath ( fdb_conf_entry_list_parent ) ) ) :
print ( ' oi inside the loop ' )
name = tree . xpath (
fdb_conf_entry_list_parent
+ " [ "
+ str ( n )
+ " ] "
+ fdb_conf_entry_list_child_name
)
print ( ' oi ' + name + ' oi ' )
) [ n ]
print ( ' oi ' , name )
print ( ' blablidubbiduub ' )
link = tree . xpath (
fdb_conf_entry_list_parent
+ " [ "
+ str ( n )
+ " ] "
# + "[ "
# + str(n )
# + "] "
+ fdb_conf_entry_list_child_link
)
) [ n ]
print ( ' oi ' + name )
if len ( name ) > 0 :
dictionary_entry_list [ n ] = { }
dictionary_entry_list [ n ] [ " name " ] = name [ 0 ]
dictionary_entry_list [ n ] [ " name " ] = name
if fdb_domain in link [ 0 ] :
dictionary_entry_list [ n ] [ " link " ] = link [ 0 ]
if fdb_domain in link :
dictionary_entry_list [ n ] [ " link " ] = link
if fdb_domain not in link [ 0 ] :
dictionary_entry_list [ n ] [ " link " ] = fdb_domain + link [ 0 ]
if fdb_domain not in link :
if link [ - 1 ] == ' / ' :
dictionary_entry_list [ n ] [ " link " ] = fdb_domain + link
else :
dictionary_entry_list [ n ] [ " link " ] = fdb_domain + ' / ' + link
except Exception as e :
print (