Browse Source

Changes for the new database dtvp; new try/except blocks that try to click away cookie pop-ups

master
alpcentaur 8 months ago
parent
commit
d284fef015
5 changed files with 289 additions and 12 deletions
  1. +218
    -0
      log.txt
  2. +5
    -2
      main.py
  3. BIN
      spiders/__pycache__/fdb_spider.cpython-311.pyc
  4. +40
    -6
      spiders/config.yaml
  5. +26
    -4
      spiders/fdb_spider.py

+ 218
- 0
log.txt View File

@ -0,0 +1,218 @@
trying to get element
0
scrolling..
clicking..
length of the window handles 1
trying to get element
1
scrolling..
clicking..
length of the window handles 1
trying to get element
2
scrolling..
clicking..
length of the window handles 1
trying to get element
3
scrolling..
clicking..
length of the window handles 1
trying to get element
4
scrolling..
clicking..
length of the window handles 1
trying to get element
5
scrolling..
clicking..
length of the window handles 1
trying to get element
6
scrolling..
clicking..
length of the window handles 1
trying to get element
7
scrolling..
clicking..
length of the window handles 1
trying to get element
8
scrolling..
clicking..
length of the window handles 1
trying to get element
9
scrolling..
clicking..
length of the window handles 1
trying to get element
10
scrolling..
clicking..
length of the window handles 1
trying to get element
11
scrolling..
clicking..
length of the window handles 1
trying to get element
12
scrolling..
clicking..
length of the window handles 1
trying to get element
13
scrolling..
clicking..
length of the window handles 1
trying to get element
14
scrolling..
clicking..
length of the window handles 1
trying to get element
15
scrolling..
clicking..
length of the window handles 1
trying to get element
16
scrolling..
clicking..
length of the window handles 1
trying to get element
17
scrolling..
clicking..
length of the window handles 1
trying to get element
18
scrolling..
clicking..
length of the window handles 1
trying to get element
19
scrolling..
clicking..
length of the window handles 1
trying to get element
20
scrolling..
clicking..
length of the window handles 1
trying to get element
21
scrolling..
clicking..
trying to get element
0
scrolling..
clicking..
length of the window handles 1
trying to get element
1
scrolling..
clicking..
length of the window handles 1
trying to get element
2
scrolling..
clicking..
length of the window handles 1
trying to get element
3
scrolling..
clicking..
length of the window handles 1
trying to get element
4
scrolling..
clicking..
length of the window handles 1
trying to get element
5
scrolling..
clicking..
length of the window handles 1
trying to get element
6
scrolling..
clicking..
length of the window handles 1
trying to get element
7
scrolling..
clicking..
length of the window handles 1
trying to get element
8
scrolling..
clicking..
length of the window handles 1
trying to get element
9
scrolling..
clicking..
length of the window handles 1
trying to get element
10
scrolling..
clicking..
length of the window handles 1
trying to get element
11
scrolling..
clicking..
length of the window handles 1
trying to get element
12
scrolling..
clicking..
length of the window handles 1
trying to get element
13
scrolling..
clicking..
length of the window handles 1
trying to get element
14
scrolling..
clicking..
length of the window handles 1
trying to get element
15
scrolling..
clicking..
length of the window handles 1
trying to get element
16
scrolling..
clicking..
length of the window handles 1
trying to get element
17
scrolling..
clicking..
length of the window handles 1
trying to get element
18
scrolling..
clicking..
length of the window handles 1
trying to get element
19
scrolling..
clicking..
length of the window handles 1
trying to get element
20
scrolling..
clicking..
length of the window handles 1
trying to get element
21
scrolling..
clicking..

+ 5
- 2
main.py View File

@ -6,7 +6,10 @@ import sys
config = "spiders/config.yaml" config = "spiders/config.yaml"
#list_of_fdbs = eval(sys.argv[1]) #list_of_fdbs = eval(sys.argv[1])
#list_of_fdbs = ["giz","evergabe-online","foerderinfo.bund.de-bekanntmachungen"] #list_of_fdbs = ["giz","evergabe-online","foerderinfo.bund.de-bekanntmachungen"]
#list_of_fdbs = ["giz","evergabe-online"]
#list_of_fdbs = ["foerderinfo.bund.de-bekanntmachungen"]
list_of_fdbs = ["ted.europa.eu"] list_of_fdbs = ["ted.europa.eu"]
#list_of_fdbs = ["dtvp"]
# doing the crawling of government websites # doing the crawling of government websites
@ -19,7 +22,7 @@ spider.download_entry_list_pages_of_funding_databases(list_of_fdbs)
spider.parse_entry_list_data2dictionary(list_of_fdbs) spider.parse_entry_list_data2dictionary(list_of_fdbs)
spider.download_entry_data_htmls(list_of_fdbs)
#spider.download_entry_data_htmls(list_of_fdbs)
spider.parse_entry_data2dictionary(list_of_fdbs)
#spider.parse_entry_data2dictionary(list_of_fdbs)

BIN
spiders/__pycache__/fdb_spider.cpython-311.pyc View File


+ 40
- 6
spiders/config.yaml
File diff suppressed because it is too large
View File


+ 26
- 4
spiders/fdb_spider.py View File

@ -167,6 +167,14 @@ class fdb_spider(object):
# driver = webdriver.Chrome() # driver = webdriver.Chrome()
driver.implicitly_wait(5) driver.implicitly_wait(5)
driver.get(entry_jsdomain) driver.get(entry_jsdomain)
try:
accept_button = driver.find_element("xpath","//button[contains(text(), 'akzeptieren')]")
accept_button.click()
except Exception as e:
print(e, 'no cookies to accept..')
pass
for i in range(len(entry_jsiteration_var_list)): for i in range(len(entry_jsiteration_var_list)):
time.sleep(1) time.sleep(1)
print('trying to get element') print('trying to get element')
@ -248,11 +256,11 @@ class fdb_spider(object):
print('this is the n looped elements of the parent specified in config.yaml:') print('this is the n looped elements of the parent specified in config.yaml:')
#print('entrylistparent', fdb_conf_entry_list_parent)
print('entrylistparent', fdb_conf_entry_list_parent)
#print(tree.xpath("//html//body//div//main//div//div[@class='row']//section[@class='l-search-result-list']"))
print(tree.xpath("//html//body//div"))
#print(etree.tostring(tree.xpath(fdb_conf_entry_list_parent)).decode())
print(etree.tostring(tree.xpath(fdb_conf_entry_list_parent)[0]).decode())
for n in range(len(tree.xpath(fdb_conf_entry_list_parent))): for n in range(len(tree.xpath(fdb_conf_entry_list_parent))):
print('-----------------------------------------------------------------------------------------------------------------------------------------') print('-----------------------------------------------------------------------------------------------------------------------------------------')
@ -482,7 +490,7 @@ class fdb_spider(object):
#service_args = ['--verbose'] #service_args = ['--verbose']
#driver = webdriver.Chrome('/usr/bin/chromium') #driver = webdriver.Chrome('/usr/bin/chromium')
options = webdriver.ChromeOptions() options = webdriver.ChromeOptions()
options.add_argument('headless')
#options.add_argument('headless')
options.add_argument("--remote-debugging-port=9222") options.add_argument("--remote-debugging-port=9222")
options.add_argument('--no-sandbox') options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage') options.add_argument('--disable-dev-shm-usage')
@ -540,6 +548,17 @@ class fdb_spider(object):
print(entry_link) print(entry_link)
if 'javascript' in entry_link or fdb_conf_entry_list_javascript_link != 'NONE': if 'javascript' in entry_link or fdb_conf_entry_list_javascript_link != 'NONE':
try:
accept_button = driver.find_element("xpath","//button[contains(text(), 'akzeptieren')]")
accept_button.click()
except Exception as e:
print(e, 'no cookies to accept..')
pass
driver.execute_script("scroll(0, 600)")
print('oioioi',fdb_conf_entry_list_parent, entry_id, fdb_conf_entry_list_javascript_link) print('oioioi',fdb_conf_entry_list_parent, entry_id, fdb_conf_entry_list_javascript_link)
element = driver.find_element( element = driver.find_element(
"xpath", "xpath",
@ -560,6 +579,9 @@ class fdb_spider(object):
#element = driver.find_element("xpath", "//html") #element = driver.find_element("xpath", "//html")
#web_content = element.text #web_content = element.text
#entry_domain = driver.getCurrentUrl() #entry_domain = driver.getCurrentUrl()
entry_domain = driver.current_url entry_domain = driver.current_url

Loading…
Cancel
Save