Specify the entry-list XPaths (parent, child-name, child-link) more precisely; add a fallback except clause for when soupparser does not work

This commit is contained in:
alpcentaur 2023-11-07 14:55:05 +00:00
parent a99881796a
commit a846ce04cc
6 changed files with 34 additions and 8 deletions

14
requirements.txt Normal file
View file

@ -0,0 +1,14 @@
backoff==2.2.1
beautifulsoup4==4.11.2
certifi==2022.12.7
charset-normalizer==3.0.1
idna==3.4
lxml==4.9.2
PyJWT==2.6.0
PyYAML==6.0
requests==2.28.2
requests-oauthlib==1.3.1
six==1.16.0
soupsieve==2.4
ujson==5.7.0
urllib3==1.26.14

Binary file not shown.

View file

@ -10,9 +10,9 @@ foerderinfo.bund.de:
link1: 'https://www.foerderinfo.bund.de/SiteGlobals/Forms/foerderinfo/expertensuche/Servicesuche_Formular.html?gtp=33498_list%253D' link1: 'https://www.foerderinfo.bund.de/SiteGlobals/Forms/foerderinfo/expertensuche/Servicesuche_Formular.html?gtp=33498_list%253D'
link2: '#searchResults' link2: '#searchResults'
iteration-var-list: '[1,2,3,4,5,6,7,8]' iteration-var-list: '[1,2,3,4,5,6,7,8]'
parent: '//html//body//form//table//tr//td//column//div.row//section.l-search-result-list' parent: "//html//body//div//main//div//div[@class='row']//section[@class='l-search-result-list']"
child-name: '//div.l-search-result-list_item//a//span.c-search-result__title' child-name: "//div//div[@class='c-search-result__text-wrapper']//span[@class='c-search-result__title'][text()]"
child-link: '//div.l-search-result-list_item//a/@href' child-link: "//div[@class='l-search-result-list_item']//a/@href"
entry: entry:
info-1: info-1:
parent: '//html//body//form//table' parent: '//html//body//form//table'

View file

@ -8,6 +8,8 @@ import urllib.request, urllib.error, urllib.parse
from lxml import etree from lxml import etree
import lxml.html import lxml.html
import lxml.html.soupparser import lxml.html.soupparser
from lxml import html
class fdb_spider(object): class fdb_spider(object):
@ -90,15 +92,22 @@ class fdb_spider(object):
"spiders/pages/" + fdb + str(i) + "entryList.html" "spiders/pages/" + fdb + str(i) + "entryList.html"
) )
except Exception as e:
tree = html.parse("spiders/pages/" + fdb + str(i) + "entryList.html")
print(
"parsing the xml files did not work with the soupparser. Broken html will not be fixed as it could have been, thanks to efficient particular html languages. The original error message is:",
e,
)
try:
print('oioioioioioioioioioioiOIOI') print('oioioioioioioioioioioiOIOI')
for e in tree.iter(): #for e in tree.iter():
print(e.tag) # print(e.tag)
# #
for e in tree.xpath('//html//body//form//table//tr//td//table//tr'): for e in tree.xpath("//html//body//div//main//div//div[@class='row']//section[@class='l-search-result-list']//div//div[@class='c-search-result__text-wrapper']//span[@class='c-search-result__title'][text()]"):
print(etree.tostring(e).decode()) print(etree.tostring(e).decode())
@ -110,7 +119,7 @@ class fdb_spider(object):
fdb_conf_entry_list_parent = fdb_conf_entry_list.get("parent") fdb_conf_entry_list_parent = fdb_conf_entry_list.get("parent")
fdb_conf_entry_list_child_name = fdb_conf_entry_list.get("child-name") fdb_conf_entry_list_child_name = fdb_conf_entry_list.get("child-name")
fdb_conf_entry_list_child_link = fdb_conf_entry_list.get("child-link") fdb_conf_entry_list_child_link = fdb_conf_entry_list.get("child-link")
print('blabliblub')
for n in range(len(tree.xpath(fdb_conf_entry_list_parent))): for n in range(len(tree.xpath(fdb_conf_entry_list_parent))):
name = tree.xpath( name = tree.xpath(
fdb_conf_entry_list_parent fdb_conf_entry_list_parent
@ -119,6 +128,8 @@ class fdb_spider(object):
+ "]" + "]"
+ fdb_conf_entry_list_child_name + fdb_conf_entry_list_child_name
) )
print('oi ' + name + ' oi')
print('blablidubbiduub')
link = tree.xpath( link = tree.xpath(
fdb_conf_entry_list_parent fdb_conf_entry_list_parent
+ "[" + "["