@ -8,6 +8,8 @@ import urllib.request, urllib.error, urllib.parse
from lxml import etree
from lxml import etree
import lxml.html
import lxml.html
import lxml.html.soupparser
import lxml.html.soupparser
from lxml import html
class fdb_spider ( object ) :
class fdb_spider ( object ) :
@ -90,15 +92,22 @@ class fdb_spider(object):
" spiders/pages/ " + fdb + str ( i ) + " entryList.html "
" spiders/pages/ " + fdb + str ( i ) + " entryList.html "
)
)
except Exception as e :
tree = html . parse ( " spiders/pages/ " + fdb + str ( i ) + " entryList.html " )
print (
" parsing the xml files did not work with the soupparser. Broken html will not be fixed as it could have been, thanks to efficient particular html languages. The original error message is: " ,
e ,
)
try :
print ( ' oioioioioioioioioioioiOIOI ' )
print ( ' oioioioioioioioioioioiOIOI ' )
for e in tree . iter ( ) :
#for e in tree.iter() :
print ( e . tag )
# print(e.tag )
#
#
for e in tree . xpath ( ' //html//body//form//table//tr//td//table//tr ') :
for e in tree . xpath ( " //html//body//div//main//div//div[@class= ' row ' ]//section[@class= ' l-search-result-list ' ]//div//div[@class= ' c-search-result__text-wrapper ' ]//span[@class= ' c-search-result__title ' ][text()] ") :
print ( etree . tostring ( e ) . decode ( ) )
print ( etree . tostring ( e ) . decode ( ) )
@ -110,7 +119,7 @@ class fdb_spider(object):
fdb_conf_entry_list_parent = fdb_conf_entry_list . get ( " parent " )
fdb_conf_entry_list_parent = fdb_conf_entry_list . get ( " parent " )
fdb_conf_entry_list_child_name = fdb_conf_entry_list . get ( " child-name " )
fdb_conf_entry_list_child_name = fdb_conf_entry_list . get ( " child-name " )
fdb_conf_entry_list_child_link = fdb_conf_entry_list . get ( " child-link " )
fdb_conf_entry_list_child_link = fdb_conf_entry_list . get ( " child-link " )
print ( ' blabliblub ' )
for n in range ( len ( tree . xpath ( fdb_conf_entry_list_parent ) ) ) :
for n in range ( len ( tree . xpath ( fdb_conf_entry_list_parent ) ) ) :
name = tree . xpath (
name = tree . xpath (
fdb_conf_entry_list_parent
fdb_conf_entry_list_parent
@ -119,6 +128,8 @@ class fdb_spider(object):
+ " ] "
+ " ] "
+ fdb_conf_entry_list_child_name
+ fdb_conf_entry_list_child_name
)
)
print ( ' oi ' + name + ' oi ' )
print ( ' blablidubbiduub ' )
link = tree . xpath (
link = tree . xpath (
fdb_conf_entry_list_parent
fdb_conf_entry_list_parent
+ " [ "
+ " [ "