diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..6927859
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,14 @@
+backoff==2.2.1
+beautifulsoup4==4.11.2
+certifi==2022.12.7
+charset-normalizer==3.0.1
+idna==3.4
+lxml==4.9.2
+PyJWT==2.6.0
+PyYAML==6.0
+requests==2.28.2
+requests-oauthlib==1.3.1
+six==1.16.0
+soupsieve==2.4
+ujson==5.7.0
+urllib3==1.26.14
diff --git a/spiders/__pycache__/fdb_spider.cpython-311.pyc b/spiders/__pycache__/fdb_spider.cpython-311.pyc
new file mode 100644
index 0000000..7e1e819
Binary files /dev/null and b/spiders/__pycache__/fdb_spider.cpython-311.pyc differ
diff --git a/spiders/__pycache__/fdb_spider.cpython-39.pyc b/spiders/__pycache__/fdb_spider.cpython-39.pyc
index cf6c13f..25a1496 100644
Binary files a/spiders/__pycache__/fdb_spider.cpython-39.pyc and b/spiders/__pycache__/fdb_spider.cpython-39.pyc differ
diff --git a/spiders/config.yaml b/spiders/config.yaml
index 8894226..9a9ae81 100644
--- a/spiders/config.yaml
+++ b/spiders/config.yaml
@@ -10,9 +10,9 @@ foerderinfo.bund.de:
     link1: 'https://www.foerderinfo.bund.de/SiteGlobals/Forms/foerderinfo/expertensuche/Servicesuche_Formular.html?gtp=33498_list%253D'
     link2: '#searchResults'
     iteration-var-list: '[1,2,3,4,5,6,7,8]'
-    parent: '//html//body//form//table//tr//td//column//div.row//section.l-search-result-list'
-    child-name: '//div.l-search-result-list_item//a//span.c-search-result__title'
-    child-link: '//div.l-search-result-list_item//a/@href'
+    parent: "//html//body//div//main//div//div[@class='row']//section[@class='l-search-result-list']"
+    child-name: "//div//div[@class='c-search-result__text-wrapper']//span[@class='c-search-result__title'][text()]"
+    child-link: "//div[@class='l-search-result-list_item']//a/@href"
   entry:
     info-1:
       parent: '//html//body//form//table'
diff --git a/spiders/fdb_spider.py b/spiders/fdb_spider.py
index 935315f..f263fa6 100644
--- a/spiders/fdb_spider.py
+++ b/spiders/fdb_spider.py
@@ -8,6 +8,8 @@ import urllib.request, urllib.error, urllib.parse
 from lxml import etree
 import lxml.html
 import lxml.html.soupparser
+from lxml import html
+
 
 
 class fdb_spider(object):
@@ -90,15 +92,22 @@ class fdb_spider(object):
                     "spiders/pages/" + fdb + str(i) + "entryList.html"
                 )
-
+            except Exception as e:
+                tree = html.parse("spiders/pages/" + fdb + str(i) + "entryList.html")
+                print(
+                    "parsing the html file with the soupparser did not work, so broken html will not be repaired as it otherwise could have been. The original error message is:",
+                    e,
+                )
+
+            try:
                 print('oioioioioioioioioioioiOIOI')
-                for e in tree.iter():
+                #for e in tree.iter():
-                    print(e.tag)
+                #    print(e.tag)
 #
-                for e in tree.xpath('//html//body//form//table//tr//td//table//tr'):
+                for e in tree.xpath("//html//body//div//main//div//div[@class='row']//section[@class='l-search-result-list']//div//div[@class='c-search-result__text-wrapper']//span[@class='c-search-result__title'][text()]"):
                     print(etree.tostring(e).decode())
@@ -110,7 +119,7 @@ class fdb_spider(object):
             fdb_conf_entry_list_parent = fdb_conf_entry_list.get("parent")
             fdb_conf_entry_list_child_name = fdb_conf_entry_list.get("child-name")
             fdb_conf_entry_list_child_link = fdb_conf_entry_list.get("child-link")
-
+            print('blabliblub')
             for n in range(len(tree.xpath(fdb_conf_entry_list_parent))):
                 name = tree.xpath(
                     fdb_conf_entry_list_parent
@@ -119,6 +128,8 @@ class fdb_spider(object):
                     + "]"
                     + fdb_conf_entry_list_child_name
                 )
+                print('oi ' + name + ' oi')
+                print('blablidubbiduub')
                 link = tree.xpath(
                     fdb_conf_entry_list_parent
                     + "["
diff --git a/spiders/output/foerderinfo.bund.de1entryList.txt b/spiders/output/foerderinfo.bund.de1entryList.txt
index e69de29..9e26dfe 100644
--- a/spiders/output/foerderinfo.bund.de1entryList.txt
+++ b/spiders/output/foerderinfo.bund.de1entryList.txt
@@ -0,0 +1 @@
+{}
\ No newline at end of file
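For reference, a minimal sketch of how the new XPath expressions from spiders/config.yaml could be checked against a saved result page, outside the spider. The file name and the relative-path lookup (prefixing the child expressions with ".") are assumptions for illustration; only the three XPath strings are taken from the diff above, and fdb_spider itself indexes into the parent list instead.

    from lxml import html

    # Hypothetical local copy of a result page; fdb_spider saves pages as
    # spiders/pages/<fdb><i>entryList.html, so this name is only an example.
    PAGE = "spiders/pages/foerderinfo.bund.de1entryList.html"

    # XPath expressions copied from the updated spiders/config.yaml.
    PARENT = "//html//body//div//main//div//div[@class='row']//section[@class='l-search-result-list']"
    CHILD_NAME = "//div//div[@class='c-search-result__text-wrapper']//span[@class='c-search-result__title'][text()]"
    CHILD_LINK = "//div[@class='l-search-result-list_item']//a/@href"

    tree = html.parse(PAGE)

    for section in tree.xpath(PARENT):
        # Prefixing with "." evaluates the child expressions relative to the section.
        names = section.xpath("." + CHILD_NAME)
        links = section.xpath("." + CHILD_LINK)
        for name, link in zip(names, links):
            print(name.text_content().strip(), "->", link)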