
specifying the links, new exception clause if soupparser does not work

onlinkgen
alpcentaur 10 months ago
commit a846ce04cc
6 changed files with 34 additions and 8 deletions
  1. +14  -0   requirements.txt
  2. BIN       spiders/__pycache__/fdb_spider.cpython-311.pyc
  3. BIN       spiders/__pycache__/fdb_spider.cpython-39.pyc
  4. +3   -3   spiders/config.yaml
  5. +16  -5   spiders/fdb_spider.py
  6. +1   -0   spiders/output/foerderinfo.bund.de1entryList.txt

+14 -0  requirements.txt

@@ -0,0 +1,14 @@
+backoff==2.2.1
+beautifulsoup4==4.11.2
+certifi==2022.12.7
+charset-normalizer==3.0.1
+idna==3.4
+lxml==4.9.2
+PyJWT==2.6.0
+PyYAML==6.0
+requests==2.28.2
+requests-oauthlib==1.3.1
+six==1.16.0
+soupsieve==2.4
+ujson==5.7.0
+urllib3==1.26.14

BIN  spiders/__pycache__/fdb_spider.cpython-311.pyc


BIN  spiders/__pycache__/fdb_spider.cpython-39.pyc


+3 -3  spiders/config.yaml

@@ -10,9 +10,9 @@ foerderinfo.bund.de:
   link1: 'https://www.foerderinfo.bund.de/SiteGlobals/Forms/foerderinfo/expertensuche/Servicesuche_Formular.html?gtp=33498_list%253D'
   link2: '#searchResults'
   iteration-var-list: '[1,2,3,4,5,6,7,8]'
-  parent: '//html//body//form//table//tr//td//column//div.row//section.l-search-result-list'
-  child-name: '//div.l-search-result-list_item//a//span.c-search-result__title'
-  child-link: '//div.l-search-result-list_item//a/@href'
+  parent: "//html//body//div//main//div//div[@class='row']//section[@class='l-search-result-list']"
+  child-name: "//div//div[@class='c-search-result__text-wrapper']//span[@class='c-search-result__title'][text()]"
+  child-link: "//div[@class='l-search-result-list_item']//a/@href"
   entry:
     info-1:
       parent: '//html//body//form//table'
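For reference, a minimal sketch of how the reworked parent/child-name/child-link selectors above can be applied to a previously downloaded list page with lxml. The nesting key ("entry-list") and the saved file path are assumptions for illustration, not taken verbatim from the repository:

import yaml
from lxml import html

# load the selector configuration edited in this commit
with open("spiders/config.yaml") as f:
    config = yaml.safe_load(f)

# "entry-list" is an assumed key name; the real nesting in config.yaml may differ
entry_list = config["foerderinfo.bund.de"]["entry-list"]
parent = entry_list["parent"]
child_name = entry_list["child-name"]
child_link = entry_list["child-link"]

# parse a previously downloaded list page (file name assumed)
tree = html.parse("spiders/pages/foerderinfo.bund.de1entryList.html")

# mirror the parent + "[n]" + child pattern used in fdb_spider.py
for n in range(len(tree.xpath(parent))):
    name_nodes = tree.xpath(parent + "[" + str(n + 1) + "]" + child_name)
    links = tree.xpath(parent + "[" + str(n + 1) + "]" + child_link)
    names = [node.text_content().strip() for node in name_nodes]
    print(names, links)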

+16 -5  spiders/fdb_spider.py

@@ -8,6 +8,8 @@ import urllib.request, urllib.error, urllib.parse
 from lxml import etree
 import lxml.html
 import lxml.html.soupparser
+from lxml import html

 class fdb_spider(object):
@@ -90,15 +92,22 @@ class fdb_spider(object):
                 "spiders/pages/" + fdb + str(i) + "entryList.html"
             )
+        except Exception as e:
+            tree = html.parse("spiders/pages/" + fdb + str(i) + "entryList.html")
+            print(
+                "parsing the html file did not work with the soupparser; broken html will not be fixed by the fallback parser. The original error message is:",
+                e,
+            )
+        try:
             print('oioioioioioioioioioioiOIOI')
-            for e in tree.iter():
-                print(e.tag)
+            #for e in tree.iter():
+            #    print(e.tag)
             #
-            for e in tree.xpath('//html//body//form//table//tr//td//table//tr'):
+            for e in tree.xpath("//html//body//div//main//div//div[@class='row']//section[@class='l-search-result-list']//div//div[@class='c-search-result__text-wrapper']//span[@class='c-search-result__title'][text()]"):
                 print(etree.tostring(e).decode())
@@ -110,7 +119,7 @@ class fdb_spider(object):
         fdb_conf_entry_list_parent = fdb_conf_entry_list.get("parent")
         fdb_conf_entry_list_child_name = fdb_conf_entry_list.get("child-name")
         fdb_conf_entry_list_child_link = fdb_conf_entry_list.get("child-link")
+        print('blabliblub')
         for n in range(len(tree.xpath(fdb_conf_entry_list_parent))):
             name = tree.xpath(
                 fdb_conf_entry_list_parent
@@ -119,6 +128,8 @@ class fdb_spider(object):
                 + "]"
                 + fdb_conf_entry_list_child_name
             )
+            print('oi ' + name + ' oi')
+            print('blablidubbiduub')
             link = tree.xpath(
                 fdb_conf_entry_list_parent
                 + "["

+1 -0  spiders/output/foerderinfo.bund.de1entryList.txt

@@ -0,0 +1 @@
+{}
