From a0075e429d281862531cb63423efb74d7a20dbe2 Mon Sep 17 00:00:00 2001
From: alpcentaur
Date: Mon, 27 Nov 2023 15:10:11 +0000
Subject: [PATCH] added a further database to config.yaml and a new exception
 handler for downloading JS-generated HTML pages

---
 main.py                                       |  8 ++---
 spiders/__pycache__/fdb_spider.cpython-39.pyc | Bin 11209 -> 11670 bytes
 spiders/config.yaml                           | 26 +++++++++++++++
 spiders/fdb_spider.py                         | 31 ++++++++++++++++--
 4 files changed, 58 insertions(+), 7 deletions(-)

diff --git a/main.py b/main.py
index 15dcd94..94f1f88 100644
--- a/main.py
+++ b/main.py
@@ -1,15 +1,15 @@
 from spiders.fdb_spider import *

 config = "spiders/config.yaml"

-#list_of_fdbs = ["foerderinfo.bund.de"]
-list_of_fdbs = ["foerderinfo.bund.de-bekanntmachungen"]
+list_of_fdbs = ["giz"]
+#list_of_fdbs = ["foerderinfo.bund.de-bekanntmachungen"]


 # doing the crawling of government websites

 spider = fdb_spider(config)

-# spider.download_entry_list_pages_of_funding_databases(list_of_fdbs)
+spider.download_entry_list_pages_of_funding_databases(list_of_fdbs)

 #spider.find_config_parameter(list_of_fdbs)
@@ -17,5 +17,5 @@ spider = fdb_spider(config)

 #spider.download_entry_data_htmls(list_of_fdbs)

-spider.parse_entry_data2dictionary(list_of_fdbs)
+#spider.parse_entry_data2dictionary(list_of_fdbs)

diff --git a/spiders/__pycache__/fdb_spider.cpython-39.pyc b/spiders/__pycache__/fdb_spider.cpython-39.pyc
index 767558c00d875f8d726932e7f2d8360ad70e705c..8d567f13fe5c379e4eefee13371337d0029daa96 100644
GIT binary patch
delta 1731
[base85-encoded binary delta omitted]

delta 1305
[base85-encoded binary delta omitted]
diff --git a/spiders/config.yaml b/spiders/config.yaml
index 1fa1174..12cdb83 100644
--- a/spiders/config.yaml
+++ b/spiders/config.yaml
@@ -47,3 +47,29 @@ foerderinfo.bund.de-bekanntmachungen:
       #child-deadline: '//html/body/form/table/tr[2]/td[3]/span + label.1'
     unifalse:
       wordlist: "['Mobilität', 'Energie', 'Off-grid', 'regenerative Energien', 'Solar', 'Energienetze', 'Elektromobilität']"
+
+giz:
+  domain: 'https://ausschreibungen.giz.de'
+  entry-list:
+    link1: 'https://ausschreibungen.giz.de/Satellite/company/welcome.do?method=showTable&fromSearch=1&tableSortPROJECT_RESULT=2&tableSortAttributePROJECT_RESULT=publicationDate&selectedTablePagePROJECT_RESULT='
+    link2: ''
+    iteration-var-list: '[1,2,3,4,5,6,7]'
+    #parent: "//html//body//div//main//div//div[@class='row']//div[@class='large-12']//a[@class='c-teaser']"
+    parent: "//html//body//div//div//table[contains(@class, 'csx-new-table')]//tbody"
+    child-name: "//tr//td[2]/text()"
+    child-link: "/tr//td[5]/a/@href"
+    child-info: "/tr//td[3]/text()"
+    child-period: "/tr/td[1]/text()"
+    #child-period: "//div[@class='c-teaser__text-wrapper']//small//span/time/text()"
+    child-sponsor: "//tr/td[4]/text()"
+  entry:
+    general:
+      uniform: 'FALSE'
+    unitrue:
+      parent: '//html//body//form//table'
+      #child-name: '//html//body//form//table//tr[1]//td[2]//span'
+      #child-sum: '//html//body//form//table//tr[2]//td[1]//span//img'
+      #child-deadline: '//html/body/form/table/tr[2]/td[3]/span + label.1'
+    unifalse:
+      wordlist: "['Mobilität', 'Energie', 'Off-grid', 'regenerative Energien', 'Solar', 'Energienetze', 'Elektromobilität']"
+
diff --git a/spiders/fdb_spider.py b/spiders/fdb_spider.py
index 4f97c90..3b5978e 100644
--- a/spiders/fdb_spider.py
+++ b/spiders/fdb_spider.py
@@ -69,10 +69,35 @@ class fdb_spider(object):
             # download the html page of the List of entrys

             response = urllib.request.urlopen(entry_list_link1 + str(i) + entry_list_link2)

-            web_content = response.read().decode("UTF-8")
-
+            # web_content = response.read().decode("UTF-8")
+
+            # read the body once; a second response.read() would only return b""
+            raw_content = response.read()
+
+            try:
+                web_content = raw_content.decode("UTF-8")
+            except Exception as e:
+                try:
+                    web_content = raw_content.decode("latin-1")
+                    print(
+                        "decoding the response as utf-8 did not work, trying latin-1 now - the original error message is:",
+                        e,
+                    )
+                except Exception as ex:
+                    print(ex)
+
+
             # save interim results to files
-
+            if len(web_content) < 10:
+                print("getting the html page through urllib did not work, trying the requests library's get function")
+                try:
+                    res = requests.get(entry_list_link1 + str(i) + entry_list_link2)
+                    web_content = res.text
+                except Exception as e:
+                    print("the requests library did not work either, the original error is:", e)
+
+            print(web_content)
+
             f = open("spiders/pages/" + key + str(i) + "entryList.html", "w+")
             f.write(web_content)
             f.close
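
Note (not part of the patch): a minimal standalone sketch of the download fallback that the fdb_spider.py hunk above introduces. The function name fetch_list_page and the bare url argument are illustrative only; the sketch assumes the requests package is installed and imported in fdb_spider.py, and it folds the two decode attempts into a single fallback, since latin-1 can decode any byte sequence.

    import urllib.request

    import requests


    def fetch_list_page(url):
        # first attempt: plain urllib download
        response = urllib.request.urlopen(url)
        raw = response.read()  # read the body once; it cannot be re-read later

        # decode as utf-8, fall back to latin-1 (which accepts any byte value)
        try:
            web_content = raw.decode("UTF-8")
        except UnicodeDecodeError:
            web_content = raw.decode("latin-1")

        # if urllib came back with a (nearly) empty page, retry with requests
        if len(web_content) < 10:
            web_content = requests.get(url).text

        return web_content

In the spider itself the URL is built as entry_list_link1 + str(i) + entry_list_link2, exactly as in the hunk above, and the result is written to spiders/pages/<key><i>entryList.html.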
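
Note (not part of the patch): a sketch of how the new giz block in spiders/config.yaml could be consumed, assuming the yaml, json and lxml packages. The file path spiders/pages/giz1entryList.html follows the naming used in fdb_spider.py, and prefixing the child expressions with "." to make them relative to each matched parent is an assumption; the actual parsing code of fdb_spider is not shown in this diff.

    import json

    import yaml
    from lxml import etree

    with open("spiders/config.yaml") as f:
        config = yaml.safe_load(f)

    giz_list = config["giz"]["entry-list"]

    # the list pages are link1 + page number + link2, one page per iteration value
    pages = json.loads(giz_list["iteration-var-list"])  # '[1,2,3,4,5,6,7]' -> [1, ..., 7]
    first_url = giz_list["link1"] + str(pages[0]) + giz_list["link2"]
    print(first_url)

    # apply the configured XPaths to a previously downloaded list page
    with open("spiders/pages/giz1entryList.html") as f:
        tree = etree.HTML(f.read())

    for tbody in tree.xpath(giz_list["parent"]):
        names = tbody.xpath("." + giz_list["child-name"])  # child-name -> td[2]
        links = tbody.xpath("." + giz_list["child-link"])  # child-link -> td[5]/a/@href
        print(list(zip(names, links))[:3])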