
specifying the links, new exception clause if soupparser does not work

onlinkgen
alpcentaur 10 months ago
commit a846ce04cc
6 changed files with 34 additions and 8 deletions
  1. +14  -0   requirements.txt
  2. BIN       spiders/__pycache__/fdb_spider.cpython-311.pyc
  3. BIN       spiders/__pycache__/fdb_spider.cpython-39.pyc
  4. +3   -3   spiders/config.yaml
  5. +16  -5   spiders/fdb_spider.py
  6. +1   -0   spiders/output/foerderinfo.bund.de1entryList.txt

+14 -0  requirements.txt

@@ -0,0 +1,14 @@
+backoff==2.2.1
+beautifulsoup4==4.11.2
+certifi==2022.12.7
+charset-normalizer==3.0.1
+idna==3.4
+lxml==4.9.2
+PyJWT==2.6.0
+PyYAML==6.0
+requests==2.28.2
+requests-oauthlib==1.3.1
+six==1.16.0
+soupsieve==2.4
+ujson==5.7.0
+urllib3==1.26.14

BIN  spiders/__pycache__/fdb_spider.cpython-311.pyc


BIN  spiders/__pycache__/fdb_spider.cpython-39.pyc


+3 -3  spiders/config.yaml

@@ -10,9 +10,9 @@ foerderinfo.bund.de:
   link1: 'https://www.foerderinfo.bund.de/SiteGlobals/Forms/foerderinfo/expertensuche/Servicesuche_Formular.html?gtp=33498_list%253D'
   link2: '#searchResults'
   iteration-var-list: '[1,2,3,4,5,6,7,8]'
-  parent: '//html//body//form//table//tr//td//column//div.row//section.l-search-result-list'
-  child-name: '//div.l-search-result-list_item//a//span.c-search-result__title'
-  child-link: '//div.l-search-result-list_item//a/@href'
+  parent: "//html//body//div//main//div//div[@class='row']//section[@class='l-search-result-list']"
+  child-name: "//div//div[@class='c-search-result__text-wrapper']//span[@class='c-search-result__title'][text()]"
+  child-link: "//div[@class='l-search-result-list_item']//a/@href"
   entry:
     info-1:
       parent: '//html//body//form//table'
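For reference, a minimal sketch of how the reworked parent/child-name/child-link selectors above can be applied to a previously downloaded list page with lxml. The nesting key ("entry-list") and the saved file path are assumptions for illustration, not taken verbatim from the repository:

import yaml
from lxml import html

# load the selector configuration edited in this commit
with open("spiders/config.yaml") as f:
    config = yaml.safe_load(f)

# "entry-list" is an assumed key name; the real nesting in config.yaml may differ
entry_list = config["foerderinfo.bund.de"]["entry-list"]
parent = entry_list["parent"]
child_name = entry_list["child-name"]
child_link = entry_list["child-link"]

# parse a previously downloaded list page (file name assumed)
tree = html.parse("spiders/pages/foerderinfo.bund.de1entryList.html")

# mirror the parent + "[n]" + child pattern used in fdb_spider.py
for n in range(len(tree.xpath(parent))):
    name_nodes = tree.xpath(parent + "[" + str(n + 1) + "]" + child_name)
    links = tree.xpath(parent + "[" + str(n + 1) + "]" + child_link)
    names = [node.text_content().strip() for node in name_nodes]
    print(names, links)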

+16 -5  spiders/fdb_spider.py

@@ -8,6 +8,8 @@ import urllib.request, urllib.error, urllib.parse
 from lxml import etree
 import lxml.html
 import lxml.html.soupparser
+from lxml import html

 class fdb_spider(object):
@@ -90,15 +92,22 @@ class fdb_spider(object):
                 "spiders/pages/" + fdb + str(i) + "entryList.html"
             )
+        except Exception as e:
+            tree = html.parse("spiders/pages/" + fdb + str(i) + "entryList.html")
+            print(
+                "parsing the html file did not work with the soupparser; broken html will not be fixed by the fallback parser. The original error message is:",
+                e,
+            )
+        try:
             print('oioioioioioioioioioioiOIOI')
-            for e in tree.iter():
-                print(e.tag)
+            #for e in tree.iter():
+            #    print(e.tag)
             #
-            for e in tree.xpath('//html//body//form//table//tr//td//table//tr'):
+            for e in tree.xpath("//html//body//div//main//div//div[@class='row']//section[@class='l-search-result-list']//div//div[@class='c-search-result__text-wrapper']//span[@class='c-search-result__title'][text()]"):
                 print(etree.tostring(e).decode())
@@ -110,7 +119,7 @@ class fdb_spider(object):
         fdb_conf_entry_list_parent = fdb_conf_entry_list.get("parent")
         fdb_conf_entry_list_child_name = fdb_conf_entry_list.get("child-name")
         fdb_conf_entry_list_child_link = fdb_conf_entry_list.get("child-link")
+        print('blabliblub')
         for n in range(len(tree.xpath(fdb_conf_entry_list_parent))):
             name = tree.xpath(
                 fdb_conf_entry_list_parent
@@ -119,6 +128,8 @@ class fdb_spider(object):
                 + "]"
                 + fdb_conf_entry_list_child_name
             )
+            print('oi ' + name + ' oi')
+            print('blablidubbiduub')
             link = tree.xpath(
                 fdb_conf_entry_list_parent
                 + "["

+1 -0  spiders/output/foerderinfo.bund.de1entryList.txt

@@ -0,0 +1 @@
+{}
