Specify the links; add a new exception clause for when soupparser does not work
This commit is contained in:
parent
a99881796a
commit
a846ce04cc
6 changed files with 34 additions and 8 deletions
14
requirements.txt
Normal file
14
requirements.txt
Normal file
|
@ -0,0 +1,14 @@
|
|||
backoff==2.2.1
|
||||
beautifulsoup4==4.11.2
|
||||
certifi==2022.12.7
|
||||
charset-normalizer==3.0.1
|
||||
idna==3.4
|
||||
lxml==4.9.2
|
||||
PyJWT==2.6.0
|
||||
PyYAML==6.0
|
||||
requests==2.28.2
|
||||
requests-oauthlib==1.3.1
|
||||
six==1.16.0
|
||||
soupsieve==2.4
|
||||
ujson==5.7.0
|
||||
urllib3==1.26.14
|
BIN
spiders/__pycache__/fdb_spider.cpython-311.pyc
Normal file
BIN
spiders/__pycache__/fdb_spider.cpython-311.pyc
Normal file
Binary file not shown.
Binary file not shown.
|
@ -10,9 +10,9 @@ foerderinfo.bund.de:
|
|||
link1: 'https://www.foerderinfo.bund.de/SiteGlobals/Forms/foerderinfo/expertensuche/Servicesuche_Formular.html?gtp=33498_list%253D'
|
||||
link2: '#searchResults'
|
||||
iteration-var-list: '[1,2,3,4,5,6,7,8]'
|
||||
parent: '//html//body//form//table//tr//td//column//div.row//section.l-search-result-list'
|
||||
child-name: '//div.l-search-result-list_item//a//span.c-search-result__title'
|
||||
child-link: '//div.l-search-result-list_item//a/@href'
|
||||
parent: "//html//body//div//main//div//div[@class='row']//section[@class='l-search-result-list']"
|
||||
child-name: "//div//div[@class='c-search-result__text-wrapper']//span[@class='c-search-result__title'][text()]"
|
||||
child-link: "//div[@class='l-search-result-list_item']//a/@href"
|
||||
entry:
|
||||
info-1:
|
||||
parent: '//html//body//form//table'
|
||||
|
|
|
@ -8,6 +8,8 @@ import urllib.request, urllib.error, urllib.parse
|
|||
from lxml import etree
|
||||
import lxml.html
|
||||
import lxml.html.soupparser
|
||||
from lxml import html
|
||||
|
||||
|
||||
|
||||
class fdb_spider(object):
|
||||
|
@ -90,15 +92,22 @@ class fdb_spider(object):
|
|||
"spiders/pages/" + fdb + str(i) + "entryList.html"
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
tree = html.parse("spiders/pages/" + fdb + str(i) + "entryList.html")
|
||||
print(
|
||||
"parsing the xml files did not work with the soupparser. Broken html will not be fixed as it could have been, thanks to efficient particular html languages. The original error message is:",
|
||||
e,
|
||||
)
|
||||
|
||||
try:
|
||||
|
||||
print('oioioioioioioioioioioiOIOI')
|
||||
|
||||
for e in tree.iter():
|
||||
#for e in tree.iter():
|
||||
|
||||
print(e.tag)
|
||||
# print(e.tag)
|
||||
#
|
||||
for e in tree.xpath('//html//body//form//table//tr//td//table//tr'):
|
||||
for e in tree.xpath("//html//body//div//main//div//div[@class='row']//section[@class='l-search-result-list']//div//div[@class='c-search-result__text-wrapper']//span[@class='c-search-result__title'][text()]"):
|
||||
|
||||
print(etree.tostring(e).decode())
|
||||
|
||||
|
@ -110,7 +119,7 @@ class fdb_spider(object):
|
|||
fdb_conf_entry_list_parent = fdb_conf_entry_list.get("parent")
|
||||
fdb_conf_entry_list_child_name = fdb_conf_entry_list.get("child-name")
|
||||
fdb_conf_entry_list_child_link = fdb_conf_entry_list.get("child-link")
|
||||
|
||||
print('blabliblub')
|
||||
for n in range(len(tree.xpath(fdb_conf_entry_list_parent))):
|
||||
name = tree.xpath(
|
||||
fdb_conf_entry_list_parent
|
||||
|
@ -119,6 +128,8 @@ class fdb_spider(object):
|
|||
+ "]"
|
||||
+ fdb_conf_entry_list_child_name
|
||||
)
|
||||
print('oi ' + name + ' oi')
|
||||
print('blablidubbiduub')
|
||||
link = tree.xpath(
|
||||
fdb_conf_entry_list_parent
|
||||
+ "["
|
||||
|
|
|
@ -0,0 +1 @@
|
|||
{}
|
Loading…
Reference in a new issue