From 06fa81e5493706ea59c14978f21eb790be1d4c6f Mon Sep 17 00:00:00 2001 From: alpcentaur Date: Fri, 10 Nov 2023 01:12:49 +0000 Subject: [PATCH] added function find config parameter and changed core spider --- spiders/__pycache__/fdb_spider.cpython-39.pyc | Bin 6103 -> 7323 bytes spiders/config.yaml | 7 +- spiders/fdb_spider.py | 111 +++++++++++++++--- .../output/foerderinfo.bund.de1entryList.txt | 2 +- 4 files changed, 99 insertions(+), 21 deletions(-) diff --git a/spiders/__pycache__/fdb_spider.cpython-39.pyc b/spiders/__pycache__/fdb_spider.cpython-39.pyc index 25a1496f21a9571d3caa2ebb44b7c516ae798d89..2b5f1a92af0ff7c0de586ff616255d6f285ccce2 100644 GIT binary patch delta 2252 zcmbVO&2Jk;6rY)_*XzxC{ZW6^X=#=as-`6$P18?8KcEezEd>-r+ejs{v+HCNXPt~U zm15R))l^6vkW$T|5){d~@CT49Qg56PAT9_WP=&Z4aY9H)QK7sycA6Fy4)tb#`}WOy zGyC4`@3;A3@YADaJ{r{t{H}Gs>Fhsr+q_3dZ+~;pu#H<}LbIdeq|aOr(;I3o0YL_w zufbXQjS>>j(`1!iy)PQ!@j98M4HmF9S_=iC2D25gRB2JL!a*2(A+R)QsbEE<6$VQW zI^;}awN76NI#(zq)gC`ItImcSnym#aOZ*7>v&4>M31k?Th<8pQe#a_Z&QMZQdLU)U zSN|*AS8QF}kL;{hKLJy&;Ol;8U^I+CTcay*oF-TI)}k#0n5dQ*B$n9Z!k;6B#QxCt5NioKPh%R=H16338BWbtmhpr5fI zs3!v{u|Qd2IC_QH;JvOa-fdln$Pi>KDmH-~ieKb$*cFajR4s^8dSRrg+U5DY>owJd zJa;^QL!X*)OLo7PpLaIxlIzX!ZJ3zC#5wVuK3A`v@n_tM1%LjGV|iAoTwZW&%PBeY zFipiO7v(@}R;#k$Ou0oD0$k6UDtkqDdSE#}UpiL(sQ;OM^=HcZ|D@{f$4e@@yy9E= zDSs(nvalVSwH?E_-CMOk5s1Co3fv{WHg>VTxVRBLuJgP%?KC?|jwim1?qGch@kexz zIBsS#&22@yFo|uMgx>Mx$&y>~n_|}7L7U=3vm5Ypv%Ah;g!VTTpF2*IdJ_gd)MUHi zO$F~J3|xQH$D_Ds5=_iY7hTVm%ed2%urZL}JN!HhRW1V%sxw9{YEp*B-(4Yt>YE4C z>4c`R4vi)sYD%1TJqlt>(=t?}7SossSB)CdG9aooQPsloDhsad@PVg3gQ$kPjO(ne z73?Ypt++xnVk5jV_WZ-~)!%I;9ms?n_Gt3)$M@GO*Uy z1GUi^bV@b@W&9^S$nURez|b~t)j9?T2C^_Pa|Fq1n?P4g1>Q*-%L#+%qZnYT9S>qz zDFd(|MA?ZsWtH*^E(yYcIZA>UUUS4wf*lFs*FY}#_d&ev-PZQf)p1A);&uu`({l`h z9$Pu8Ip>*9aJ;7!fJ^i z(J(wGV~FMJ=uO9=e{6MCVPFO=LuX{o@6!BI_? zUEq5K_FZZLN<_5-d=s%&q`g!qxc1Ugp~=cFhff9JuK+X^e2ag52)_+FSp67Y)r#v) z%Qy3?d96>_osv;0F-)Bd7@72o^wJlH=!$KabFZFoMvF zfHdPT12ogNJLS7&FAtpDVkSR`0f!I{??OC+a1`Mf!YBe3!cU2B&7-OSvvNzAv4J;Y z3VV!Hz?Bq5?kLEVhj4VTzZ6Apc@*{lJSy8zG($4W021P9(j;3Ku@7j^30&kf!Wp?7 zLJ~o$HGYA`Kw_1ShiLzA&=(cXL0(I;+NE^uBUbW01gj}6Y1{We>HRuj$A|-LumC-+)O+1xA6XL1Z+AV z7k{P)Rrp+kEJ4#U;?+!y;yYCO;>7vnT0*aI*X#wiX!9 delta 1391 zcmZux-*4Mg6u$SyO6 zzv-HhOezF^FU()Gu20<4exTF$emNz*OD5Ib6#XbJ!emnYk`WTn3uK2?J`vvxT$O-K ziObs7_BCHJd)t(fAl^|-xeKC7t|j@BpXjK8)TNzdmtcJJflXSy0l7&#EMQ&Qk^{NJ z%sya!5u?DAKmnfuED^B;Fg0Q-uq5ym=(261*tYQ2iBz>%BwaBdV_6k zmINttpn!%}Am$+0eL)&Mw_#meIpB|@xhJ9&iyUfU%!7PT?9k2X5^+yR#EH?gy*k-Lvkm+P#7jQvcyrRs_ zez=J1o03W3eus9~(^u(sx#-!?daf*7S!jN*>udgoqw7`A+|qT^zOCyUhV4G4U@y&QxjYUIK7ud`FxfBOR3{es^T0#N=av-?ns&4{ zgKw?Cuzctu4HaYuCn{$Ty-!HX;=VdW%i=qA1n^gNLf+DA9eP#-xdNRPpXaX8uXq2>9b=%B3#E}MRK9}n z5i_=JB2&WKEBfN?*BOVq;#?B%g zL%@CW352&0W)aRIoENVbt@HxYiwLhFTo&IJb^4RIlq`xz#X6ORH>4-#Foe70i^3Sr kKgT8DDR>+BFDg^HPo9#?@>t}b)c6JQ$Iu*|7L&vLU!n4DlK=n! diff --git a/spiders/config.yaml b/spiders/config.yaml index 9a9ae81..6ddc76f 100644 --- a/spiders/config.yaml +++ b/spiders/config.yaml @@ -10,9 +10,9 @@ foerderinfo.bund.de: link1: 'https://www.foerderinfo.bund.de/SiteGlobals/Forms/foerderinfo/expertensuche/Servicesuche_Formular.html?gtp=33498_list%253D' link2: '#searchResults' iteration-var-list: '[1,2,3,4,5,6,7,8]' - parent: "//html//body//div//main//div//div[@class='row']//section[@class='l-search-result-list']" - child-name: "//div//div[@class='c-search-result__text-wrapper']//span[@class='c-search-result__title'][text()]" - child-link: "//div[@class='l-search-result-list_item']//a/@href" + parent: "//html//body//div//main//div//div[@class='row']//section[@class='l-search-result-list']//div[@class='l-search-result-list__item']" + child-name: "//div[@class='c-search-result__text-wrapper']//span[@class='c-search-result__title']/text()" + child-link: "/a[@class='c-search-result']/@href" entry: info-1: parent: '//html//body//form//table' @@ -20,3 +20,4 @@ foerderinfo.bund.de: #child-sum: '//html//body//form//table//tr[2]//td[1]//span//img' #child-deadline: '//html/body/form/table/tr[2]/td[3]/span + label.1' +foerderinfo.bund.de-mobilitaet: diff --git a/spiders/fdb_spider.py b/spiders/fdb_spider.py index f263fa6..703688f 100644 --- a/spiders/fdb_spider.py +++ b/spiders/fdb_spider.py @@ -72,6 +72,78 @@ class fdb_spider(object): f.write(web_content) f.close + def find_config_parameter(self, list_of_fdbs): + for fdb in list_of_fdbs: + + try: + iteration_var_list = eval(self.config.get(fdb).get("entry-list").get("iteration-var-list")) + except Exception as e: + print( + "There is a problem with the configuration variable entryList iteration var list in the config.yaml", + e, + ) + + fdb_conf = self.config.get(fdb) + fdb_domain = fdb_conf.get("domain") + fdb_conf_entry_list = fdb_conf.get("entry-list") + fdb_conf_entry_list_parent = fdb_conf_entry_list.get("parent") + fdb_conf_entry_list_child_name = fdb_conf_entry_list.get("child-name") + fdb_conf_entry_list_child_link = fdb_conf_entry_list.get("child-link") + + for i in iteration_var_list: + print(i) + + + + try: + # use soupparser to handle broken html + + tree = lxml.html.soupparser.parse( + "spiders/pages/" + fdb + str(i) + "entryList.html" + ) + + except Exception as e: + tree = html.parse("spiders/pages/" + fdb + str(i) + "entryList.html") + print( + "parsing the xml files did not work with the soupparser. Broken html will not be fixed as it could have been", + e, + ) + + try: + + print('this is the n looped elements of the parent specified in config.yaml:') + + #print('entrylistparent', fdb_conf_entry_list_parent) + + #print(tree.xpath("//html//body//div//main//div//div[@class='row']//section[@class='l-search-result-list']")) + + #print(etree.tostring(tree.xpath(fdb_conf_entry_list_parent)).decode()) + + for n in range(len(tree.xpath(fdb_conf_entry_list_parent))): + print('-----------------------------------------------------------------------------------------------------------------------------------------') + print(etree.tostring(tree.xpath(fdb_conf_entry_list_parent)[n]).decode()) + + print('this is the first actual name element:') + + name_element = tree.xpath(fdb_conf_entry_list_parent + fdb_conf_entry_list_child_name) + print(name_element) + for name in name_element: + print(name) + + print('this is the first actual link element:') + + link_element = tree.xpath(fdb_conf_entry_list_parent + fdb_conf_entry_list_child_link) + print(link_element) + #for link in link_element: + # print(link) + + except Exception as e: + print( + "parsing the html did not work.", + e, + ) + + def parse_entry_list_data2dictionary(self, list_of_fdbs): for fdb in list_of_fdbs: @@ -101,15 +173,18 @@ class fdb_spider(object): try: - print('oioioioioioioioioioioiOIOI') + #print('this is the n looped elements of the parent specified in config.yaml:') #for e in tree.iter(): # print(e.tag) # - for e in tree.xpath("//html//body//div//main//div//div[@class='row']//section[@class='l-search-result-list']//div//div[@class='c-search-result__text-wrapper']//span[@class='c-search-result__title'][text()]"): + #for e in tree.xpath("//html//body//div//main//div//div[@class='row']//section[@class='l-search-result-list']//div//div[@class='c-search-result__text-wrapper']//span[@class='c-search-result__title'][text()]"): + #for e in tree.xpath("//html//body//div//main//div//div[@class='row']//section[@class='l-search-result-list']//div//div[@class='c-search-result__text-wrapper']//span[@class='c-search-result__title']"): + + # print(etree.tostring(e).decode()) + - print(etree.tostring(e).decode()) dictionary_entry_list = {} @@ -120,35 +195,37 @@ class fdb_spider(object): fdb_conf_entry_list_child_name = fdb_conf_entry_list.get("child-name") fdb_conf_entry_list_child_link = fdb_conf_entry_list.get("child-link") print('blabliblub') + print('len', len(tree.xpath(fdb_conf_entry_list_parent))) for n in range(len(tree.xpath(fdb_conf_entry_list_parent))): + print('oi inside the loop') name = tree.xpath( fdb_conf_entry_list_parent - + "[" - + str(n) - + "]" + fdb_conf_entry_list_child_name - ) - print('oi ' + name + ' oi') + )[n] + print('oi ', name) print('blablidubbiduub') link = tree.xpath( fdb_conf_entry_list_parent - + "[" - + str(n) - + "]" + # + "[" + # + str(n) + # + "]" + fdb_conf_entry_list_child_link - ) + )[n] print('oi' + name) if len(name) > 0: dictionary_entry_list[n] = {} - dictionary_entry_list[n]["name"] = name[0] + dictionary_entry_list[n]["name"] = name - if fdb_domain in link[0]: - dictionary_entry_list[n]["link"] = link[0] + if fdb_domain in link: + dictionary_entry_list[n]["link"] = link - if fdb_domain not in link[0]: - dictionary_entry_list[n]["link"] = fdb_domain + link[0] + if fdb_domain not in link: + if link[-1] == '/': + dictionary_entry_list[n]["link"] = fdb_domain + link + else: + dictionary_entry_list[n]["link"] = fdb_domain + '/' + link except Exception as e: print( diff --git a/spiders/output/foerderinfo.bund.de1entryList.txt b/spiders/output/foerderinfo.bund.de1entryList.txt index 9e26dfe..b1c823d 100644 --- a/spiders/output/foerderinfo.bund.de1entryList.txt +++ b/spiders/output/foerderinfo.bund.de1entryList.txt @@ -1 +1 @@ -{} \ No newline at end of file +{0: {'name': 'Newsletter', 'link': 'http://foerderinfo.bund.de/foerderinfo/de/news/newsletter/newsletter.html'}, 1: {'name': 'Wettbewerbe, Preise', 'link': 'http://foerderinfo.bund.de/foerderinfo/de/news/wettbewerbe-preise/wettbewerbe-preise.html'}, 2: {'name': 'Veranstaltungen', 'link': 'http://foerderinfo.bund.de/foerderinfo/de/news/veranstaltungen/veranstaltungen.html'}, 3: {'name': 'Projektträger in der Forschungsförderung', 'link': 'http://foerderinfo.bund.de/foerderinfo/de/beratung/projekttraeger/projekttraeger-in-der-forschungsfoerderung.html'}, 4: {'name': 'Leichte Sprache', 'link': 'http://foerderinfo.bund.de/foerderinfo/de/services/leichtesprache/leichte-sprache.html'}, 5: {'name': 'Ausführliche Informationen', 'link': 'http://foerderinfo.bund.de/foerderinfo/de/services/leichtesprache/ausfuehrliche-informationen.html'}, 6: {'name': 'Erklärung zur Barrierefreiheit', 'link': 'http://foerderinfo.bund.de/foerderinfo/de/services/leichtesprache/erklaerung-zur-barrierefreiheit.html'}, 7: {'name': 'Darum geht es auf dieser Seite', 'link': 'http://foerderinfo.bund.de/foerderinfo/de/services/leichtesprache/darum-geht-es-auf-dieser-seite.html'}, 8: {'name': 'FAQ', 'link': 'http://foerderinfo.bund.de/foerderinfo/de/beratung/faq/faq.html'}, 9: {'name': 'Forschungs- und Innovationsförderung', 'link': 'http://foerderinfo.bund.de/foerderinfo/de/beratung/forschungs-und-innovationsfoerderung/forschungs-und-innovationsfoerderung.html'}, 10: {'name': 'Glossar', 'link': 'http://foerderinfo.bund.de/foerderinfo/de/beratung/glossar/glossar.html'}, 11: {'name': 'Bei uns sind Sie bestens beraten!', 'link': 'http://foerderinfo.bund.de/foerderinfo/de/beratung/erstberatung/bei-uns-sind-sie-bestens-beraten_.html'}, 12: {'name': 'Unser Service', 'link': 'http://foerderinfo.bund.de/foerderinfo/de/beratung/unser-service/unser-service.html'}, 13: {'name': 'Was wir tun', 'link': 'http://foerderinfo.bund.de/foerderinfo/de/beratung/was-wir-tun/was-wir-tun.html'}, 14: {'name': '„Ich hab‘ da mal eine Idee“ – Die Förderberatung des Bundes im Gespräch', 'link': 'http://foerderinfo.bund.de/foerderinfo/de/_documents/ich-hab-da-mal-eine-idee.html'}} \ No newline at end of file