specifying the links, new exception clause if soupparser does not work

alpcentaur 2023-11-07 14:55:05 +00:00
parent a99881796a
commit a846ce04cc
6 changed files with 34 additions and 8 deletions
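
In short: the spider first tries the BeautifulSoup-backed lxml.html.soupparser on the saved entry-list page and, if that raises, the new exception clause re-parses the same file with plain lxml.html. A minimal sketch of that fallback pattern, assuming an illustrative helper name and example path (the real code builds the path from fdb and str(i)):

# Sketch of the fallback the new exception clause introduces; the function
# name and the example path are assumptions, not the spider's actual code.
import lxml.html.soupparser
from lxml import html


def parse_entry_list_page(path):
    try:
        # BeautifulSoup-backed parser: tolerant of broken markup,
        # but it can itself fail on some pages
        return lxml.html.soupparser.parse(path)
    except Exception as e:
        print("parsing with the soupparser did not work, falling back to lxml.html:", e)
        # plain lxml HTML parser as the fallback
        return html.parse(path)


# hypothetical usage:
# tree = parse_entry_list_page("spiders/pages/foerderinfo.bund.de1entryList.html")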

requirements.txt (new file, 14 additions)

@@ -0,0 +1,14 @@
backoff==2.2.1
beautifulsoup4==4.11.2
certifi==2022.12.7
charset-normalizer==3.0.1
idna==3.4
lxml==4.9.2
PyJWT==2.6.0
PyYAML==6.0
requests==2.28.2
requests-oauthlib==1.3.1
six==1.16.0
soupsieve==2.4
ujson==5.7.0
urllib3==1.26.14

Binary file not shown.


@@ -10,9 +10,9 @@ foerderinfo.bund.de:
     link1: 'https://www.foerderinfo.bund.de/SiteGlobals/Forms/foerderinfo/expertensuche/Servicesuche_Formular.html?gtp=33498_list%253D'
     link2: '#searchResults'
     iteration-var-list: '[1,2,3,4,5,6,7,8]'
-    parent: '//html//body//form//table//tr//td//column//div.row//section.l-search-result-list'
-    child-name: '//div.l-search-result-list_item//a//span.c-search-result__title'
-    child-link: '//div.l-search-result-list_item//a/@href'
+    parent: "//html//body//div//main//div//div[@class='row']//section[@class='l-search-result-list']"
+    child-name: "//div//div[@class='c-search-result__text-wrapper']//span[@class='c-search-result__title'][text()]"
+    child-link: "//div[@class='l-search-result-list_item']//a/@href"
   entry:
     info-1:
       parent: '//html//body//form//table'
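
These three expressions are consumed together: the spider indexes into the parent node set and appends the child-name / child-link expression to each indexed parent, as the loop further down in this commit shows. A short sketch of that composition, where the parsed file path and the 1-based XPath index are assumptions for illustration:

from lxml import html

parent = "//html//body//div//main//div//div[@class='row']//section[@class='l-search-result-list']"
child_name = "//div//div[@class='c-search-result__text-wrapper']//span[@class='c-search-result__title'][text()]"
child_link = "//div[@class='l-search-result-list_item']//a/@href"

# hypothetical saved page; the spider reads "spiders/pages/" + fdb + str(i) + "entryList.html"
tree = html.parse("spiders/pages/foerderinfo.bund.de1entryList.html")

for n in range(len(tree.xpath(parent))):
    # index into the n-th parent, then evaluate the child expressions below it
    name = tree.xpath(parent + "[" + str(n + 1) + "]" + child_name)
    link = tree.xpath(parent + "[" + str(n + 1) + "]" + child_link)
    print(name, link)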


@@ -8,6 +8,8 @@ import urllib.request, urllib.error, urllib.parse
 from lxml import etree
 import lxml.html
 import lxml.html.soupparser
+from lxml import html
 class fdb_spider(object):
@@ -90,15 +92,22 @@ class fdb_spider(object):
                     "spiders/pages/" + fdb + str(i) + "entryList.html"
                 )
             except Exception as e:
+                tree = html.parse("spiders/pages/" + fdb + str(i) + "entryList.html")
                 print(
                     "parsing the xml files did not work with the soupparser. Broken html will not be fixed as it could have been, thanks to efficient particular html languages. The original error message is:",
                     e,
                 )
             try:
                 print('oioioioioioioioioioioiOIOI')
-                for e in tree.iter():
-                    print(e.tag)
+                #for e in tree.iter():
+                #    print(e.tag)
+                #
-                for e in tree.xpath('//html//body//form//table//tr//td//table//tr'):
+                for e in tree.xpath("//html//body//div//main//div//div[@class='row']//section[@class='l-search-result-list']//div//div[@class='c-search-result__text-wrapper']//span[@class='c-search-result__title'][text()]"):
                     print(etree.tostring(e).decode())
@@ -110,7 +119,7 @@ class fdb_spider(object):
             fdb_conf_entry_list_parent = fdb_conf_entry_list.get("parent")
             fdb_conf_entry_list_child_name = fdb_conf_entry_list.get("child-name")
             fdb_conf_entry_list_child_link = fdb_conf_entry_list.get("child-link")
+            print('blabliblub')
             for n in range(len(tree.xpath(fdb_conf_entry_list_parent))):
                 name = tree.xpath(
                     fdb_conf_entry_list_parent
@@ -119,6 +128,8 @@ class fdb_spider(object):
                     + "["
                     + "]"
                     + fdb_conf_entry_list_child_name
                 )
+                print('oi ' + name + ' oi')
+                print('blablidubbiduub')
                 link = tree.xpath(
                     fdb_conf_entry_list_parent
                     + "["