From 5ac07d151a3fd9f91daac743dc484260f658ad87 Mon Sep 17 00:00:00 2001 From: alpcentaur Date: Tue, 31 Oct 2023 17:41:44 +0000 Subject: [PATCH] added first config.yaml template and started creating folder structure --- .gitignore | 2 ++ spiders/config.yaml | 20 ++++++++++++++++++++ fdb_spider.py => spiders/fdb_spider.py | 2 +- 3 files changed, 23 insertions(+), 1 deletion(-) create mode 100644 .gitignore create mode 100644 spiders/config.yaml rename fdb_spider.py => spiders/fdb_spider.py (99%) diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..8b8b7d9 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +/venv + diff --git a/spiders/config.yaml b/spiders/config.yaml new file mode 100644 index 0000000..da83c38 --- /dev/null +++ b/spiders/config.yaml @@ -0,0 +1,20 @@ +# Settings for the PEP crawler per country to crawl +# Follow the syntax and don't use tbody as it gets added by the browser (when researching xpath through inspector) + +# xpath syntax: https://www.w3schools.com/xml/xpath_syntax.asp +# lxml xpath syntax: https://www.geeksforgeeks.org/web-scraping-using-lxml-and-xpath-in-python/ + +foerderinfo.bund.de: + domain: 'http://foerderinfo.bund.de' + entryList: + link: '' + parent: '//html//body//form//table//tr//td//table//tr' + child-name: '//td//a/text()' + child-link: '//td//a/@href' + member: + info-1: + parent: '//html//body//form//table' + #child-name: '//html//body//form//table//tr[1]//td[2]//span' + #child-sum: '//html//body//form//table//tr[2]//td[1]//span//img' + #child-deadline: '//html/body/form/table/tr[2]/td[3]/span + label.1' + diff --git a/fdb_spider.py b/spiders/fdb_spider.py similarity index 99% rename from fdb_spider.py rename to spiders/fdb_spider.py index 052df02..a4930b8 100644 --- a/fdb_spider.py +++ b/spiders/fdb_spider.py @@ -27,7 +27,7 @@ class fdb_spider(object): for key in self.config: if key in list_of_fdbs: try: - entry_list = self.config.get(key).get("entry_list") + entry_list = 
self.config.get(key).get("entry-list") except Exception as e: print( "There is a problem with the configuration variable entryList in the config.yaml - the original error message is:",