From cf3bb5268425052533700f1c0404a6e2df8e7842 Mon Sep 17 00:00:00 2001 From: alpcentaur Date: Mon, 15 Jan 2024 19:09:28 +0000 Subject: [PATCH] corrected link glueing for pdf links for loop --- main.py | 9 +- .../__pycache__/fdb_spider.cpython-311.pyc | Bin 40443 -> 41511 bytes spiders/config.yaml | 4 +- spiders/config.yaml.save | 110 ++++++++++++++++++ spiders/fdb_spider.py | 18 ++- 5 files changed, 133 insertions(+), 8 deletions(-) create mode 100644 spiders/config.yaml.save diff --git a/main.py b/main.py index 75e8597..dd0dee2 100644 --- a/main.py +++ b/main.py @@ -5,20 +5,21 @@ import sys config = "spiders/config.yaml" #list_of_fdbs = eval(sys.argv[1]) -list_of_fdbs = ["giz","evergabe-online"] +#list_of_fdbs = ["giz","evergabe-online"] +list_of_fdbs = ["giz"] # doing the crawling of government websites spider = fdb_spider(config) -spider.download_entry_list_pages_of_funding_databases(list_of_fdbs) +#spider.download_entry_list_pages_of_funding_databases(list_of_fdbs) #spider.find_config_parameter(list_of_fdbs) -spider.parse_entry_list_data2dictionary(list_of_fdbs) +#spider.parse_entry_list_data2dictionary(list_of_fdbs) -spider.download_entry_data_htmls(list_of_fdbs) +#spider.download_entry_data_htmls(list_of_fdbs) spider.parse_entry_data2dictionary(list_of_fdbs) diff --git a/spiders/__pycache__/fdb_spider.cpython-311.pyc b/spiders/__pycache__/fdb_spider.cpython-311.pyc index bac2262ab2508455f68da51e4111f83d9014ae46..48677c6df6a0cbd091f9a487b68cbf2667514bef 100644 GIT binary patch delta 3563 zcmcgvYfKy26}~fL8$U3{nDGk(0|piYv4`*q;S~}RHZPuS^L`k{1P8F`cp!;)4B0$Z z+Jt85Ua3)ctIDd>EZszETKXfa^uen8gRMwqZ`@tV3YA3LBx+R2RH>Ft`=fWpBrr}^ ztM*5)JxAX?=iGD8`Rp$rk;5N8jeN4}Wz_6F`u+xw6%R?Eo5rC_J$6COo=7>Ios{lO8R>0fC zUl8geXQzMi-2g_Yj&RfF00J^r0k3f7*kd@ul@)MuVc*1HNIvr#4#0KvBnAdlfY-xg zMJ6a8Nv|+ko*6)hl&m!IWb#U%*$yxO_l)G$T0-Oj;3G~G2f!)po7OQ_o*V^}1l!EX z;odGQJQ(P#um;i2p>$r&>(+8$Y->1Vl*G;ENvLnnR5Ce2Fm@}XPg@$uYZxjHJ&6%A z`O55oQ$CvmPypE6kYarxo@8=`t$BqI4j$yJ8Zjg`iLp0S&~@4hvC*wqK0JGxhTLf0 zp>-57_5qPd_9GOfU@cWfx2nIzo0-Cw=t-mpOyNa*%_c;E@llgQ>Q4x{vzOOr%oHW6 zguXK-(<{6=`B)fZ$O?~!EbwWU4*tcFcCrN=AwyZbna|RIwGB~{iQo#}$`k~(h<)n> zi2)%s-l_rXlM9(82^7D$DTfkNO`60*k96sZ!MBoa}oU&Mwco7@JB3l1( zs+1_*g zhR@_>W%JpH6lW-hD;I2u!RK%r0f1vVqmni8Ha;h8g&*28A$iuBkfToaW(#|s^0 z3n>^&$R??qM*Z~tDiR*!jF=7b!E%#vKqlYq9veQH&LKkZ758J4{sawztPj2da`P zuuHPGORY?G;*` z?XBB*8`B~^b_@uY$l9+c>P_ieyWOoc znh65~x`E?W`1Hc<#s%$&&&#nRo@2fJ-a&>Hez@4HNCfjvcsS1MJ3*~Fh<1wcGL+BH zQK$Uu0GkbeeX*`BXyE$Y9OVsAT)&4B-8NI~J3@?Bit&2_;sW>Zu*c0(oS*9PP^0ca z6la*}qXxac0YS<5Px%J@Zl(-k#!QTY|Gs$mc+lkKs1s;2uMc^&?-W0R2&VRU`uwbi z8ua^zsi4N)$9dS*Nr{le^T-Jc>**a~1Kv>&s)h3oQh`Btpno$3_rFmzeg2K>I93E* zZ{F7}pp+L>y(8Q)l;4O4l9#R`%ZJ-6koKg*D^3Oc{L)UtH5~wF!Mq_i4q9a@^D>gK z=JE$AK|+frLHOn6_m#&9eGj$_j!*w=LWiaFVV3-5&C6(Bo5?22q`i`v>_ zw)TjvJ&vbjyXK3_ey)CBJzub8nZO?ird{bC z7gyEY@vwk+>S3+)cvY{Mjox$5v$lSrJPc(ES6Tr z0ftQL8eO|dz9{o(H#-GW09`9>6;mh(%mt-J3@EQ*Eif$E!H;vCNNVUZR&_M zbwq1BW3`=eATMte#m(mVnr$XnO9g23)n15RN^5keJSRdpIR?MC^}vme4YZ#tEEZ zs|~v}nmIgkk4ZUaQtt(txjb@Doo231j_@6|X>Y#rPQJP=Rdc7z*p{OCV~QN%PpUGz z)ylhSb$7YuZl-dbS$@!^Sa7Ki z8C46l8He=as?YUuG+EMRxYHC%d88XxEmfx?d=Hno7210WLcl6az-kevse5SEJ#$7+ zsp?({*`v|iYf44Z(1vc5{~ zw;ogy!lVX6lLwnbT(9oWQ9V>-^jlO9&E$z%&BG0;0WKG%kylCtqV1!~O^7oMpHG delta 2490 zcmcgsZA?>F7(TbRxAeA@Ld)&Py)7-2q7_=HSSa737&n~=c9WS{-Nr?K;OC`?;=SPf zU{kB>JjpcMRAw@m$_xvCT#O4@G%?czP%@Ys4gvRLKbEq%nQTAYxh)J@G28y^qPC!K;!tQ=j;1E{~~EvBFkFL?b;RSRpKXhXj)SNI0JaB z5`+|fuOJ}-Y~}Lc_q!)?(%;ql^`1rym-|2J^)@1ab53|Q;Knp?ERX|TO^yP}C~`-J zKm~{y3;?U+sZI$1SI|E?Fu>}0YA*;$xQp<3lLbC&s&eUD(4vscD|q!R6KkBsf{vxp zE-Zv`^(4evXt=U@IbjyJK`WmFuQjK)%@M(-HAD{N1fF6ovox%F+P{Z4u-19}6!HVs zdQ>vICNi*Kt|uba7|$SYz{L!?N|@VbgxB|3+h~F24Lq#`vt&p>G)npWL4qMjV*u@V z(SZ~@H^rG$7R3vhPEnQi{1H4gs~z6lpN*x%@cuWa>)3O|2@kid$1}QZaHPe8Wx`GE z3!(Ku0b}Qlyt)$!zY-BIrilR(uvr2FcM9ox6K|>newSblxW%>T;LY&z0W-ny<_;P@ zvZcbY16n5&TpV1w(_sSv`Ai%DnSe71GIoK$#Iz1Nn|ERitm55zszEBob5x{xFfIiZ z2^HgXyAF2qWI}2P5B|)nU>~8^Iyx~V6#b(^NaNE)zim;R--4*(`VCqICdK}o_Z7i{ z|L0VZ@UUP`nJ+0CdRv_-*>QTT=p`9+Y)+gzJyK<{rA!;t@R=h?hf3`b=zwJ=LjZUy zPebx$kbddj6S_q~``v$@eAm~*(<=w_F6=f!rN z2M9La2FDKCQFoqw+Xa)4*iqcle~2r_Jh10TA?)0!*2Fbge)K>MTM$#w=_D{GoN}08 z=x8$h_>c-d*q}DC-iAowk^EyZ@(ZV>|8mHjQ1pMKQY0`Wh*ce1awIkSYRY4T{>zkY z#l<|smc(8F`g)^|-UxQ$`|ypR8AH7g`-VzOP<5s&A#YccW#T*}51xb8_F}9YeNyUF zWbjf*5iDYX4>*>=yY1H#IPq)Eoq@OBDJHlN5r%c|Zi1F$SyIjnKyT%-ZV6_GQzyoy z--(Ok!82(3XoV40*Q?=GmkeGy`LcF<5&#Fmh}IGY0h}~UAq$C9UnXqBRcNve-a2(z zz7|h;3!4IKdz;@=V+lUYm@x%>@MMVggcZQ<=vsYo^@z1(3YXZ^o&aRV(kxF1i9P-7 z%HQO`oY9f_88K*b4Vhd6CfAt74i~=f!$)imXgPfzqv2Pl8#Wh~-DvD987!|JDz6?a ztQjh-2?MFSEMj^pMhaH+)$}bNDp++p?_NRGKta{WD&N4XwF9r!B0IF|^=paO634uy zVL2#y3A=3{sID2PUXOlv9fQTb9(nH&ER~RT5>a_LAnBz)*qxBNE|OxH5wniw-na#x zKbP0`jaR=WS$-#3v8G6Qht{p}DDQX(g!@gY>)i5wx1!ph>@U_-Ey9lhfi^rgRA{zl%YMqHwn?Dv!gB0axb;FXtqCVk zE$6q~Q<&WGfc{A@hg4WXCSHX?uMcCxOyW#w*@XVTfUV8`4chR-U|IJwp2z DQBYEx diff --git a/spiders/config.yaml b/spiders/config.yaml index a8b25d3..201d189 100644 --- a/spiders/config.yaml +++ b/spiders/config.yaml @@ -57,8 +57,8 @@ giz: link1: 'https://ausschreibungen.giz.de/Satellite/company/welcome.do?method=showTable&fromSearch=1&tableSortPROJECT_RESULT=2&tableSortAttributePROJECT_RESULT=publicationDate&selectedTablePagePROJECT_RESULT=' link2: '' jsdomain: 'None' - iteration-var-list: "[1,2,3,4,5,6,7]" - #iteration-var-list: "[1,2]" + #iteration-var-list: "[1,2,3,4,5,6,7]" + iteration-var-list: "[1,2]" #parent: "//html//body//div//main//div//div[@class='row']//div[@class='large-12']//a[@class='c-teaser']" parent: "//html//body//div//div//table[contains(@class, 'csx-new-table')]//tbody//tr" child-name: "//td[3]//text()" diff --git a/spiders/config.yaml.save b/spiders/config.yaml.save new file mode 100644 index 0000000..bdcde9f --- /dev/null +++ b/spiders/config.yaml.save @@ -0,0 +1,110 @@ +# Settings for the PEP crawler per country to crawl +# Follow the syntax and dont use tbody as it gets added by the browser (when researching xpath through inspector) +# xpath syntax: https://www.w3schools.com/xml/xpath_syntax.asp +# lxml xpath syntax: https://www.geeksforgeeks.org/web-scraping-using-lxml-and-xpath-in-python/ +foerderinfo.bund.de: + domain: 'http://foerderinfo.bund.de' + entry-list: + link1: 'https://www.foerderinfo.bund.de/SiteGlobals/Forms/foerderinfo/expertensuche/Servicesuche_Formular.html?gtp=33498_list%253D' + link2: '#searchResults' + iteration-var-list: '[1,2,3,4,5]' + #iteration-var-list: '[1,2]' + parent: "//html//body//div//main//div//div[@class='row']//section[@class='l-search-result-list']//div[@class='l-search-result-list__item']" + child-name: "//div[@class='c-search-result__text-wrapper']//span[@class='c-search-result__title']/text()" + jsdomain: 'None' + javascript-link: "" + child-link: "/a[@class='c-search-result']/@href" + child-info: "//" + child-period: "/" + child-sponsor: "/" + entry: + info-1: + parent: '//html//body//form//table' + #child-name: '//html//body//form//table//tr[1]//td[2]//span' + #child-sum: '//html//body//form//table//tr[2]//td[1]//span//img' + #child-deadline: '//html/body/form/table/tr[2]/td[3]/span + label.1' +foerderinfo.bund.de-bekanntmachungen: + domain: 'http://foerderinfo.bund.de' + entry-list: + link1: 'https://www.foerderinfo.bund.de/SiteGlobals/Forms/foerderinfo/bekanntmachungen/Bekanntmachungen_Formular.html?gtp=407348_list%253D' + link2: '#searchResults' + # here jsdomain has to be specified None with this syntax, if html pages of entrylists are not depending on javascript actions + jsdomain: 'None' + iteration-var-list: '[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]' + #iteration-var-list: '[1,2]' + #parent: "//html//body//div//main//div//div[@class='row']//div[@class='large-12']//a[@class='c-teaser']" + parent: "//html//body//div//main//div[@class='row']//a[contains(@class, 'c-teaser--announcement')]" + child-name: "//div[@class='c-teaser__text-wrapper']//span[@class='c-teaser__title']/text()" + javascript-link: "" + child-link: "/@href" + child-info: "//div[@class='c-teaser__text-wrapper']//div[@class='c-teaser__text']/p/text()" + #child-period: "//div[@class='c-teaser__text-wrapper']//small[@class='c-topline']/span[@class='c-topline__item']/text()" + child-period: "//div[@class='c-teaser__text-wrapper']//small//span/time/text()" + child-sponsor: "//div[@class='c-teaser__text-wrapper']//small[@class='c-topline']//span[@class='c-topline__item']/span[@class='c-topline__category']/text()" + entry: + general: + uniform: 'FALSE' + unitrue: + parent: '//html//body//form//table' + #child-name: '//html//body//form//table//tr[1]//td[2]//span' + #child-sum: '//html//body//form//table//tr[2]//td[1]//span//img' + #child-deadline: '//html/body/form/table/tr[2]/td[3]/span + label.1' + unifalse: + wordlist: "['Mobilität', 'Energie', 'Off-grid', 'regenerative Energien', 'Solar', 'Energienetze', 'Elektromobilität']" +giz: + domain: 'https://ausschreibungen.giz.de' + entry-list: + link1: 'https://ausschreibungen.giz.de/Satellite/company/welcome.do?method=showTable&fromSearch=1&tableSortPROJECT_RESULT=2&tableSortAttributePROJECT_RESULT=publicationDate&selectedTablePagePROJECT_RESULT=' + link2: '' + jsdomain: 'None' + #iteration-var-list: "[1,2,3,4,5,6,7]" + iteration-var-list: "[1,2]" + #parent: "//html//body//div//main//div//div[@class='row']//div[@class='large-12']//a[@class='c-teaser']" + parent: "//html//body//div//div//table[contains(@class, 'csx-new-table')]//tbody//tr" + child-name: "//td[3]//text()" + child-link: "//a/@href" + javascript-link: "/td[6]/a" + child-info: "/td[4]/text()[1]" + child-period: "//td[2]/abbr/text()" + #child-period: "//div[@class='c-teaser__text-wrapper']//small//span/time/text()" + child-sponsor: "/tr/td[4]/text()" + entry: + general: + uniform: 'TRUE' + unitrue: + #parent: '//html//body//form//table' + text: '/html/body/div[2]/div[4]/div/div[5]/div/table/tbody/tr/td[5]/a/@href' + text: '/html/body/div[2]/div[4]/div/div[5]/div/table/tbody/tr/td[5]/a + #child-sum: '//html//body//form//table//tr[2]//td[1]//span//img' + #child-deadline: '//html/body/form/table/tr[2]/td[3]/span + label.1' + unifalse: + wordlist: "['Mobilität', 'Energie', 'Off-grid', 'regenerative Energien', 'Solar', 'Energienetze', 'Elektromobilität']" +evergabe-online: + domain: 'https://www.evergabe-online.de/' + entry-list: + link1: 'https://www.evergabe-online.de/search.html?101-1.-searchPanel-results-searchResults-results-topToolbars-toolbars-1-span-navigator-navigation-' + link2: '-pageLink' + jsdomain: 'https://www.evergabe-online.de/search.html' + jslink1: '/html/body/div[8]/main/div[4]/div/div/div[2]/table/thead/tr[1]/td/div[2]/div/span[' + jslink2: ']' + jsiteration-var-list: "[1,2, 3, 4, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,6,7,8,9,10]" + #jsiteration-var-list: "[1,2, 3]" + iteration-var-list: "[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102]" + #iteration-var-list: "[1, 2, 3]" + parent: "/html/body/div[8]/main/div[4]/div/div/div[2]/table/tbody//tr" + child-name: "//td[1]/div/a/text()" + child-link: "//td[1]/div/a/@href" + javascript-link: "" + child-info: "//td[3]/div/text()" + child-period: "//td[5]/text()" + child-sponsor: "//td[2]/div/text()" + entry: + general: + uniform: 'FALSE' + unitrue: + #parent: '//html//body//form//table' + #text: '/html/body/div[2]/div[4]/div/div[5]/div/table/tbody/tr/td[5]/a/@href' + #child-sum: '//html//body//form//table//tr[2]//td[1]//span//img' + #child-deadline: '//html/body/form/table/tr[2]/td[3]/span + label.1' + unifalse: + wordlist: "['Mobilität', 'Energie', 'Off-grid', 'regenerative Energien', 'Solar', 'Energienetze', 'Elektromobilität']" diff --git a/spiders/fdb_spider.py b/spiders/fdb_spider.py index cfd2785..90c9855 100644 --- a/spiders/fdb_spider.py +++ b/spiders/fdb_spider.py @@ -718,6 +718,14 @@ class fdb_spider(object): file_name = "spiders/pages/" + fdb + str(i) + "/" + str(entry_id) + ".pdf" entry_link = dictionary_entry_list[entry_id]["link"] + print('that is the child: ' + child) + if 'http' in child: + try: + response = requests.get(child) + except Exception as e: + print(child + ' does not appear to be valid pdf link to download, original message is ' + e) + + if 'http' not in child: if 'javascript' or 'js' not in entry_link and 'http' in entry_link: try: @@ -731,15 +739,21 @@ class fdb_spider(object): if entry_domain[-1] == '/': pdf_link = entry_domain[:-1] + child[1:] if entry_domain[-1] != '/': + #print('it got into OIOIOIOOIOI') + #print('before loop ', entry_domain) + cut_value = 0 for n in range(len(entry_domain)): if entry_domain[-n] != '/': - entry_domain = entry_domain[:-1] + cut_value += 1 + else: break - + entry_domain = entry_domain[:-cut_value] + #print('after loop ', entry_domain) pdf_link = entry_domain + child[1:] + #print('the pdf link after recursive until slash: ', pdf_link) if child[0] == '/': if entry_domain[-1] == '/':