diff --git a/spiders/__pycache__/fdb_spider.cpython-39.pyc b/spiders/__pycache__/fdb_spider.cpython-39.pyc index 25a1496..2b5f1a9 100644 Binary files a/spiders/__pycache__/fdb_spider.cpython-39.pyc and b/spiders/__pycache__/fdb_spider.cpython-39.pyc differ diff --git a/spiders/config.yaml b/spiders/config.yaml index 9a9ae81..6ddc76f 100644 --- a/spiders/config.yaml +++ b/spiders/config.yaml @@ -10,9 +10,9 @@ foerderinfo.bund.de: link1: 'https://www.foerderinfo.bund.de/SiteGlobals/Forms/foerderinfo/expertensuche/Servicesuche_Formular.html?gtp=33498_list%253D' link2: '#searchResults' iteration-var-list: '[1,2,3,4,5,6,7,8]' - parent: "//html//body//div//main//div//div[@class='row']//section[@class='l-search-result-list']" - child-name: "//div//div[@class='c-search-result__text-wrapper']//span[@class='c-search-result__title'][text()]" - child-link: "//div[@class='l-search-result-list_item']//a/@href" + parent: "//html//body//div//main//div//div[@class='row']//section[@class='l-search-result-list']//div[@class='l-search-result-list__item']" + child-name: "//div[@class='c-search-result__text-wrapper']//span[@class='c-search-result__title']/text()" + child-link: "/a[@class='c-search-result']/@href" entry: info-1: parent: '//html//body//form//table' @@ -20,3 +20,4 @@ foerderinfo.bund.de: #child-sum: '//html//body//form//table//tr[2]//td[1]//span//img' #child-deadline: '//html/body/form/table/tr[2]/td[3]/span + label.1' +foerderinfo.bund.de-mobilitaet: diff --git a/spiders/fdb_spider.py b/spiders/fdb_spider.py index f263fa6..703688f 100644 --- a/spiders/fdb_spider.py +++ b/spiders/fdb_spider.py @@ -72,6 +72,78 @@ class fdb_spider(object): f.write(web_content) f.close + def find_config_parameter(self, list_of_fdbs): + for fdb in list_of_fdbs: + + try: + iteration_var_list = eval(self.config.get(fdb).get("entry-list").get("iteration-var-list")) + except Exception as e: + print( + "There is a problem with the configuration variable entryList iteration var list in the config.yaml", + e, + ) + + fdb_conf = self.config.get(fdb) + fdb_domain = fdb_conf.get("domain") + fdb_conf_entry_list = fdb_conf.get("entry-list") + fdb_conf_entry_list_parent = fdb_conf_entry_list.get("parent") + fdb_conf_entry_list_child_name = fdb_conf_entry_list.get("child-name") + fdb_conf_entry_list_child_link = fdb_conf_entry_list.get("child-link") + + for i in iteration_var_list: + print(i) + + + + try: + # use soupparser to handle broken html + + tree = lxml.html.soupparser.parse( + "spiders/pages/" + fdb + str(i) + "entryList.html" + ) + + except Exception as e: + tree = html.parse("spiders/pages/" + fdb + str(i) + "entryList.html") + print( + "parsing the xml files did not work with the soupparser. Broken html will not be fixed as it could have been", + e, + ) + + try: + + print('this is the n looped elements of the parent specified in config.yaml:') + + #print('entrylistparent', fdb_conf_entry_list_parent) + + #print(tree.xpath("//html//body//div//main//div//div[@class='row']//section[@class='l-search-result-list']")) + + #print(etree.tostring(tree.xpath(fdb_conf_entry_list_parent)).decode()) + + for n in range(len(tree.xpath(fdb_conf_entry_list_parent))): + print('-----------------------------------------------------------------------------------------------------------------------------------------') + print(etree.tostring(tree.xpath(fdb_conf_entry_list_parent)[n]).decode()) + + print('this is the first actual name element:') + + name_element = tree.xpath(fdb_conf_entry_list_parent + fdb_conf_entry_list_child_name) + print(name_element) + for name in name_element: + print(name) + + print('this is the first actual link element:') + + link_element = tree.xpath(fdb_conf_entry_list_parent + fdb_conf_entry_list_child_link) + print(link_element) + #for link in link_element: + # print(link) + + except Exception as e: + print( + "parsing the html did not work.", + e, + ) + + def parse_entry_list_data2dictionary(self, list_of_fdbs): for fdb in list_of_fdbs: @@ -101,15 +173,18 @@ class fdb_spider(object): try: - print('oioioioioioioioioioioiOIOI') + #print('this is the n looped elements of the parent specified in config.yaml:') #for e in tree.iter(): # print(e.tag) # - for e in tree.xpath("//html//body//div//main//div//div[@class='row']//section[@class='l-search-result-list']//div//div[@class='c-search-result__text-wrapper']//span[@class='c-search-result__title'][text()]"): + #for e in tree.xpath("//html//body//div//main//div//div[@class='row']//section[@class='l-search-result-list']//div//div[@class='c-search-result__text-wrapper']//span[@class='c-search-result__title'][text()]"): + #for e in tree.xpath("//html//body//div//main//div//div[@class='row']//section[@class='l-search-result-list']//div//div[@class='c-search-result__text-wrapper']//span[@class='c-search-result__title']"): + + # print(etree.tostring(e).decode()) + - print(etree.tostring(e).decode()) dictionary_entry_list = {} @@ -120,35 +195,37 @@ class fdb_spider(object): fdb_conf_entry_list_child_name = fdb_conf_entry_list.get("child-name") fdb_conf_entry_list_child_link = fdb_conf_entry_list.get("child-link") print('blabliblub') + print('len', len(tree.xpath(fdb_conf_entry_list_parent))) for n in range(len(tree.xpath(fdb_conf_entry_list_parent))): + print('oi inside the loop') name = tree.xpath( fdb_conf_entry_list_parent - + "[" - + str(n) - + "]" + fdb_conf_entry_list_child_name - ) - print('oi ' + name + ' oi') + )[n] + print('oi ', name) print('blablidubbiduub') link = tree.xpath( fdb_conf_entry_list_parent - + "[" - + str(n) - + "]" + # + "[" + # + str(n) + # + "]" + fdb_conf_entry_list_child_link - ) + )[n] print('oi' + name) if len(name) > 0: dictionary_entry_list[n] = {} - dictionary_entry_list[n]["name"] = name[0] + dictionary_entry_list[n]["name"] = name - if fdb_domain in link[0]: - dictionary_entry_list[n]["link"] = link[0] + if fdb_domain in link: + dictionary_entry_list[n]["link"] = link - if fdb_domain not in link[0]: - dictionary_entry_list[n]["link"] = fdb_domain + link[0] + if fdb_domain not in link: + if link[-1] == '/': + dictionary_entry_list[n]["link"] = fdb_domain + link + else: + dictionary_entry_list[n]["link"] = fdb_domain + '/' + link except Exception as e: print( diff --git a/spiders/output/foerderinfo.bund.de1entryList.txt b/spiders/output/foerderinfo.bund.de1entryList.txt index 9e26dfe..b1c823d 100644 --- a/spiders/output/foerderinfo.bund.de1entryList.txt +++ b/spiders/output/foerderinfo.bund.de1entryList.txt @@ -1 +1 @@ -{} \ No newline at end of file +{0: {'name': 'Newsletter', 'link': 'http://foerderinfo.bund.de/foerderinfo/de/news/newsletter/newsletter.html'}, 1: {'name': 'Wettbewerbe, Preise', 'link': 'http://foerderinfo.bund.de/foerderinfo/de/news/wettbewerbe-preise/wettbewerbe-preise.html'}, 2: {'name': 'Veranstaltungen', 'link': 'http://foerderinfo.bund.de/foerderinfo/de/news/veranstaltungen/veranstaltungen.html'}, 3: {'name': 'Projektträger in der Forschungsförderung', 'link': 'http://foerderinfo.bund.de/foerderinfo/de/beratung/projekttraeger/projekttraeger-in-der-forschungsfoerderung.html'}, 4: {'name': 'Leichte Sprache', 'link': 'http://foerderinfo.bund.de/foerderinfo/de/services/leichtesprache/leichte-sprache.html'}, 5: {'name': 'Ausführliche Informationen', 'link': 'http://foerderinfo.bund.de/foerderinfo/de/services/leichtesprache/ausfuehrliche-informationen.html'}, 6: {'name': 'Erklärung zur Barrierefreiheit', 'link': 'http://foerderinfo.bund.de/foerderinfo/de/services/leichtesprache/erklaerung-zur-barrierefreiheit.html'}, 7: {'name': 'Darum geht es auf dieser Seite', 'link': 'http://foerderinfo.bund.de/foerderinfo/de/services/leichtesprache/darum-geht-es-auf-dieser-seite.html'}, 8: {'name': 'FAQ', 'link': 'http://foerderinfo.bund.de/foerderinfo/de/beratung/faq/faq.html'}, 9: {'name': 'Forschungs- und Innovationsförderung', 'link': 'http://foerderinfo.bund.de/foerderinfo/de/beratung/forschungs-und-innovationsfoerderung/forschungs-und-innovationsfoerderung.html'}, 10: {'name': 'Glossar', 'link': 'http://foerderinfo.bund.de/foerderinfo/de/beratung/glossar/glossar.html'}, 11: {'name': 'Bei uns sind Sie bestens beraten!', 'link': 'http://foerderinfo.bund.de/foerderinfo/de/beratung/erstberatung/bei-uns-sind-sie-bestens-beraten_.html'}, 12: {'name': 'Unser Service', 'link': 'http://foerderinfo.bund.de/foerderinfo/de/beratung/unser-service/unser-service.html'}, 13: {'name': 'Was wir tun', 'link': 'http://foerderinfo.bund.de/foerderinfo/de/beratung/was-wir-tun/was-wir-tun.html'}, 14: {'name': '„Ich hab‘ da mal eine Idee“ – Die Förderberatung des Bundes im Gespräch', 'link': 'http://foerderinfo.bund.de/foerderinfo/de/_documents/ich-hab-da-mal-eine-idee.html'}} \ No newline at end of file