From ff23c22e3ca8cd32674fd060c19a09db90718293 Mon Sep 17 00:00:00 2001 From: alpcentaur Date: Mon, 13 Nov 2023 16:44:11 +0000 Subject: [PATCH] added working bund.de-bekanntmachungen config with new example of xpath contains --- main.py | 7 ++- spiders/__pycache__/fdb_spider.cpython-39.pyc | Bin 7323 -> 7764 bytes spiders/config.yaml | 24 +++++++++- spiders/fdb_spider.py | 42 ++++++++++++++++-- 4 files changed, 66 insertions(+), 7 deletions(-) diff --git a/main.py b/main.py index 519b63d..e83d588 100644 --- a/main.py +++ b/main.py @@ -1,7 +1,8 @@ from spiders.fdb_spider import * config = "spiders/config.yaml" -list_of_fdbs = ["foerderinfo.bund.de"] +#list_of_fdbs = ["foerderinfo.bund.de"] +list_of_fdbs = ["foerderinfo.bund.de-bekanntmachungen"] # doing the crawling of government websites @@ -10,9 +11,11 @@ spider = fdb_spider(config) # spider.download_entry_list_pages_of_funding_databases(list_of_fdbs) +#spider.find_config_parameter(list_of_fdbs) + spider.parse_entry_list_data2dictionary(list_of_fdbs) -spider.download_entry_data_htmls(list_of_fdbs) +# spider.download_entry_data_htmls(list_of_fdbs) # spider.parse_entry_data2dictionary(list_of_fdbs) diff --git a/spiders/__pycache__/fdb_spider.cpython-39.pyc b/spiders/__pycache__/fdb_spider.cpython-39.pyc index 2b5f1a92af0ff7c0de586ff616255d6f285ccce2..3c02d28ba8df8a1a8a6cf7c8347a277afc51bd53 100644 GIT binary patch delta 1880 zcmaJ>&2Jk;6rWjd{IT9$f35ATV-gdkqN!ABXh@Sbv`w1w5z?ro&@_>PYiAwe#C77$ zf=D#RLb-6NB8>!aBi<+ho_?sSjvz#1zq<<7J4H&BMy#n8j+Y%v+PLdsIVVXZOZt~v)7n^~W zWCeF=Bg~M6fT_#}8oKaVLvv)NZBt6N`kk&7|4N`0v=k<#i4($jnpoj9fed|?)TU(O z_;%>#DM}hLWaJ(HacW1lR3>jrl*|(TbHHeB{RSbik)t?arngiEue1%HNwRRN5rJ79 zU&E0dt)(B#vxo_sw<5a)=XPlQHPA(ny9yQ(FBgSFM7``VGM8H-CX~$}Q-m}I(pX0d z%#(_kx2SvKI4I*CB`{AZVxCeJYbHRM=qQ09pC$E&qF&PG3?+F{uoS#~;-D;Z#DX$@ zDAyml&pgRPN?o?dO)_Zkd!bJ=`^w^SrD~n7)RxxvRhO@q?aG?9cS4z>&5W~L*~r1a zvs}*Aig(L7p|Q)g^IQLzt5#~O$Iap5N9M9e*K{{KGTXaev^OfXcg2qU`|tKYwR{)W zn^*Awha+6EGtwd72PNTI?JTYG>sqdhDpc|Dy7pX;PnCj_PJ;(e>Wyhs;-72h%+%55 z`Qfm6zJcbsB|{f2^K1Ghy~n@PCvr@I_6cJ%2&Mu<-@2{_|MFy{zX@F>HNcRbP5rGf z3p3#p7BxvD#@vXG&JmHg7m0Kt;3q2zE`{y^<~q~O(mEOgX5uWC7LDO3=xHmlD(^I@ zeWyZLfJMg$i{aBGmH|BO@g(qs$5X&V%y=KJ3wwYWUNr4R=?<*L467SfFjpne%%D0g zR)-kqY6c5gCPX^X9EPD$@4$VL_xyj~yD!lG8U8qs9dR_P$Gc>@#3hr3i{_basgY#K zmJU}>V4Ln@mnT;ul+@qxx}tY)HM&B(?dRcFv(Es`E2V0&RIQY%@0IK!{!?t&Xv^rF zwk%xdcfG}Xk(AYt0&~6%}GL@+G7kqruA9C0KjTBccOHV=Vm-gkLicLXPs_u;&AJe`G z+;9rv4Y4&s93g@5I)9SnbeOZ0NvHWIsoRo4_kK;KB|5=_=4rp1$S;^TlHv?*5@yaI zT;aRsc;>&#FN!|IpPGFG|ElUqOneJr8o@0jzmzFQ-^TGR1gyM0$G^^;kwz$=)=d6q qrWGB>=rx2{gn3@frunx$NqsK39vrY`2oCD@b^fI1GR^bBto=8F_rnAL delta 1360 zcmZ`&&u<$=6rR~^?6ti+yNY17g)R8eV(0{s!mR8H2Vb{r#* zr-fAGR$$b#r8ywcBJ3mLP*B7HZd~{SfKwz2>KTNDI3Os3_jcV`4zQAC#R>c_PrO{-JM$TPCz?&d(uC(IJk6gdA(GCM4lVwus@}5tGkR!kAXqQ7 z5<-`WHl^gsq!_G5s?o0D8&b;<5kq^1_^}M>(Uy6J)E9Lk20C=pqok#SM?pj$#TL3B zSDzYV*FJz*5MRVaLYiGu8a>*EF;9w5wJg{*@pPdJtIO`~ODhX!cI_U)y$)^c2cCuV zkRii6F}OPmtPG;DmJMsW@9{2pz~1TmF7Y4@2Ph6w&XC53HB`OTDe(?m6N^ zl2`S{&G20y1@LI-K3OI^9rZ`-{jE>pi*)I6ak0|O!(Uu1=Py)vQ{+o);!>%Wua`E; z`EsqiQLc;UuRNMptJK&3<&CRbsd07Pyg-xcXY-c2ZRN~u9b>JS$W-HOpX%j2)1>ix zLsM_DClb`4revRb>e%BR(9shL535UQ>|O5L8?4HG_qum_=lkVBY8*oUxJ z_0p+hLmaNfTiqGs{{WJ6Il2G< diff --git a/spiders/config.yaml b/spiders/config.yaml index 6ddc76f..16d2c41 100644 --- a/spiders/config.yaml +++ b/spiders/config.yaml @@ -13,6 +13,9 @@ foerderinfo.bund.de: parent: "//html//body//div//main//div//div[@class='row']//section[@class='l-search-result-list']//div[@class='l-search-result-list__item']" child-name: "//div[@class='c-search-result__text-wrapper']//span[@class='c-search-result__title']/text()" child-link: "/a[@class='c-search-result']/@href" + child-info: "//" + child-period: "/" + child-sponsor: "/" entry: info-1: parent: '//html//body//form//table' @@ -20,4 +23,23 @@ foerderinfo.bund.de: #child-sum: '//html//body//form//table//tr[2]//td[1]//span//img' #child-deadline: '//html/body/form/table/tr[2]/td[3]/span + label.1' -foerderinfo.bund.de-mobilitaet: +foerderinfo.bund.de-bekanntmachungen: + domain: 'http://foerderinfo.bund.de' + entry-list: + link1: 'https://www.foerderinfo.bund.de/SiteGlobals/Forms/foerderinfo/bekanntmachungen/Bekanntmachungen_Formular.html?gtp=407348_list%253D' + link2: '#searchResults' + iteration-var-list: '[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31]' + #parent: "//html//body//div//main//div//div[@class='row']//div[@class='large-12']//a[@class='c-teaser']" + parent: "//html//body//div//main//div[@class='row']//a[contains(@class, 'c-teaser--announcement')]" + child-name: "//div[@class='c-teaser__text-wrapper']//span[@class='c-teaser__title']/text()" + child-link: "/@href" + child-info: "//div[@class='c-teaser__text-wrapper']//div[@class='c-teaser__text']/p/text()" + #child-period: "//div[@class='c-teaser__text-wrapper']//small[@class='c-topline']/span[@class='c-topline__item']/text()" + child-period: "//div[@class='c-teaser__text-wrapper']//small//span/time/text()" + child-sponsor: "//div[@class='c-teaser__text-wrapper']//small[@class='c-topline']//span[@class='c-topline__item']/span[@class='c-topline__category']/text()" + entry: + info-1: + parent: '//html//body//form//table' + #child-name: '//html//body//form//table//tr[1]//td[2]//span' + #child-sum: '//html//body//form//table//tr[2]//td[1]//span//img' + #child-deadline: '//html/body/form/table/tr[2]/td[3]/span + label.1' diff --git a/spiders/fdb_spider.py b/spiders/fdb_spider.py index 703688f..8aa6ae9 100644 --- a/spiders/fdb_spider.py +++ b/spiders/fdb_spider.py @@ -89,6 +89,9 @@ class fdb_spider(object): fdb_conf_entry_list_parent = fdb_conf_entry_list.get("parent") fdb_conf_entry_list_child_name = fdb_conf_entry_list.get("child-name") fdb_conf_entry_list_child_link = fdb_conf_entry_list.get("child-link") + fdb_conf_entry_list_child_info = fdb_conf_entry_list.get("child-info") + fdb_conf_entry_list_child_period = fdb_conf_entry_list.get("child-period") + for i in iteration_var_list: print(i) @@ -123,19 +126,33 @@ class fdb_spider(object): print('-----------------------------------------------------------------------------------------------------------------------------------------') print(etree.tostring(tree.xpath(fdb_conf_entry_list_parent)[n]).decode()) - print('this is the first actual name element:') + print('this is the name children:') name_element = tree.xpath(fdb_conf_entry_list_parent + fdb_conf_entry_list_child_name) print(name_element) - for name in name_element: - print(name) + #for name in name_element: + # print(name) + print(len(name_element)) - print('this is the first actual link element:') + print('this is the link children:') link_element = tree.xpath(fdb_conf_entry_list_parent + fdb_conf_entry_list_child_link) print(link_element) #for link in link_element: # print(link) + print(len(link_element)) + + print('this is the info children:') + + info_element = tree.xpath(fdb_conf_entry_list_parent + fdb_conf_entry_list_child_info) + print(info_element) + print(len(info_element)) + + print('this is the period children:') + + period_element = tree.xpath(fdb_conf_entry_list_parent + fdb_conf_entry_list_child_period) + print(period_element) + print(len(period_element)) except Exception as e: print( @@ -194,6 +211,10 @@ class fdb_spider(object): fdb_conf_entry_list_parent = fdb_conf_entry_list.get("parent") fdb_conf_entry_list_child_name = fdb_conf_entry_list.get("child-name") fdb_conf_entry_list_child_link = fdb_conf_entry_list.get("child-link") + fdb_conf_entry_list_child_info = fdb_conf_entry_list.get("child-info") + fdb_conf_entry_list_child_period = fdb_conf_entry_list.get("child-period") + + print('blabliblub') print('len', len(tree.xpath(fdb_conf_entry_list_parent))) for n in range(len(tree.xpath(fdb_conf_entry_list_parent))): @@ -202,6 +223,17 @@ class fdb_spider(object): fdb_conf_entry_list_parent + fdb_conf_entry_list_child_name )[n] + + info = tree.xpath( + fdb_conf_entry_list_parent + + fdb_conf_entry_list_child_info + )[n] + + period = tree.xpath( + fdb_conf_entry_list_parent + + fdb_conf_entry_list_child_period + )[n] + print('oi ', name) print('blablidubbiduub') link = tree.xpath( @@ -217,6 +249,8 @@ class fdb_spider(object): if len(name) > 0: dictionary_entry_list[n] = {} dictionary_entry_list[n]["name"] = name + dictionary_entry_list[n]["info"] = info + dictionary_entry_list[n]["period"] = period if fdb_domain in link: dictionary_entry_list[n]["link"] = link