Changes for the new database dtvp; new try/except blocks that try to click away cookie pop-ups
This commit is contained in:
parent
7ba196b0c2
commit
d284fef015
5 changed files with 289 additions and 12 deletions
218
log.txt
Normal file
218
log.txt
Normal file
|
@ -0,0 +1,218 @@
|
|||
trying to get element
|
||||
0
|
||||
scrolling..
|
||||
clicking..
|
||||
length of the window handles 1
|
||||
trying to get element
|
||||
1
|
||||
scrolling..
|
||||
clicking..
|
||||
length of the window handles 1
|
||||
trying to get element
|
||||
2
|
||||
scrolling..
|
||||
clicking..
|
||||
length of the window handles 1
|
||||
trying to get element
|
||||
3
|
||||
scrolling..
|
||||
clicking..
|
||||
length of the window handles 1
|
||||
trying to get element
|
||||
4
|
||||
scrolling..
|
||||
clicking..
|
||||
length of the window handles 1
|
||||
trying to get element
|
||||
5
|
||||
scrolling..
|
||||
clicking..
|
||||
length of the window handles 1
|
||||
trying to get element
|
||||
6
|
||||
scrolling..
|
||||
clicking..
|
||||
length of the window handles 1
|
||||
trying to get element
|
||||
7
|
||||
scrolling..
|
||||
clicking..
|
||||
length of the window handles 1
|
||||
trying to get element
|
||||
8
|
||||
scrolling..
|
||||
clicking..
|
||||
length of the window handles 1
|
||||
trying to get element
|
||||
9
|
||||
scrolling..
|
||||
clicking..
|
||||
length of the window handles 1
|
||||
trying to get element
|
||||
10
|
||||
scrolling..
|
||||
clicking..
|
||||
length of the window handles 1
|
||||
trying to get element
|
||||
11
|
||||
scrolling..
|
||||
clicking..
|
||||
length of the window handles 1
|
||||
trying to get element
|
||||
12
|
||||
scrolling..
|
||||
clicking..
|
||||
length of the window handles 1
|
||||
trying to get element
|
||||
13
|
||||
scrolling..
|
||||
clicking..
|
||||
length of the window handles 1
|
||||
trying to get element
|
||||
14
|
||||
scrolling..
|
||||
clicking..
|
||||
length of the window handles 1
|
||||
trying to get element
|
||||
15
|
||||
scrolling..
|
||||
clicking..
|
||||
length of the window handles 1
|
||||
trying to get element
|
||||
16
|
||||
scrolling..
|
||||
clicking..
|
||||
length of the window handles 1
|
||||
trying to get element
|
||||
17
|
||||
scrolling..
|
||||
clicking..
|
||||
length of the window handles 1
|
||||
trying to get element
|
||||
18
|
||||
scrolling..
|
||||
clicking..
|
||||
length of the window handles 1
|
||||
trying to get element
|
||||
19
|
||||
scrolling..
|
||||
clicking..
|
||||
length of the window handles 1
|
||||
trying to get element
|
||||
20
|
||||
scrolling..
|
||||
clicking..
|
||||
length of the window handles 1
|
||||
trying to get element
|
||||
21
|
||||
scrolling..
|
||||
clicking..
|
||||
trying to get element
|
||||
0
|
||||
scrolling..
|
||||
clicking..
|
||||
length of the window handles 1
|
||||
trying to get element
|
||||
1
|
||||
scrolling..
|
||||
clicking..
|
||||
length of the window handles 1
|
||||
trying to get element
|
||||
2
|
||||
scrolling..
|
||||
clicking..
|
||||
length of the window handles 1
|
||||
trying to get element
|
||||
3
|
||||
scrolling..
|
||||
clicking..
|
||||
length of the window handles 1
|
||||
trying to get element
|
||||
4
|
||||
scrolling..
|
||||
clicking..
|
||||
length of the window handles 1
|
||||
trying to get element
|
||||
5
|
||||
scrolling..
|
||||
clicking..
|
||||
length of the window handles 1
|
||||
trying to get element
|
||||
6
|
||||
scrolling..
|
||||
clicking..
|
||||
length of the window handles 1
|
||||
trying to get element
|
||||
7
|
||||
scrolling..
|
||||
clicking..
|
||||
length of the window handles 1
|
||||
trying to get element
|
||||
8
|
||||
scrolling..
|
||||
clicking..
|
||||
length of the window handles 1
|
||||
trying to get element
|
||||
9
|
||||
scrolling..
|
||||
clicking..
|
||||
length of the window handles 1
|
||||
trying to get element
|
||||
10
|
||||
scrolling..
|
||||
clicking..
|
||||
length of the window handles 1
|
||||
trying to get element
|
||||
11
|
||||
scrolling..
|
||||
clicking..
|
||||
length of the window handles 1
|
||||
trying to get element
|
||||
12
|
||||
scrolling..
|
||||
clicking..
|
||||
length of the window handles 1
|
||||
trying to get element
|
||||
13
|
||||
scrolling..
|
||||
clicking..
|
||||
length of the window handles 1
|
||||
trying to get element
|
||||
14
|
||||
scrolling..
|
||||
clicking..
|
||||
length of the window handles 1
|
||||
trying to get element
|
||||
15
|
||||
scrolling..
|
||||
clicking..
|
||||
length of the window handles 1
|
||||
trying to get element
|
||||
16
|
||||
scrolling..
|
||||
clicking..
|
||||
length of the window handles 1
|
||||
trying to get element
|
||||
17
|
||||
scrolling..
|
||||
clicking..
|
||||
length of the window handles 1
|
||||
trying to get element
|
||||
18
|
||||
scrolling..
|
||||
clicking..
|
||||
length of the window handles 1
|
||||
trying to get element
|
||||
19
|
||||
scrolling..
|
||||
clicking..
|
||||
length of the window handles 1
|
||||
trying to get element
|
||||
20
|
||||
scrolling..
|
||||
clicking..
|
||||
length of the window handles 1
|
||||
trying to get element
|
||||
21
|
||||
scrolling..
|
||||
clicking..
|
7
main.py
7
main.py
|
@ -6,7 +6,10 @@ import sys
|
|||
config = "spiders/config.yaml"
|
||||
#list_of_fdbs = eval(sys.argv[1])
|
||||
#list_of_fdbs = ["giz","evergabe-online","foerderinfo.bund.de-bekanntmachungen"]
|
||||
#list_of_fdbs = ["giz","evergabe-online"]
|
||||
#list_of_fdbs = ["foerderinfo.bund.de-bekanntmachungen"]
|
||||
list_of_fdbs = ["ted.europa.eu"]
|
||||
#list_of_fdbs = ["dtvp"]
|
||||
|
||||
|
||||
# doing the crawling of government websites
|
||||
|
@ -19,7 +22,7 @@ spider.download_entry_list_pages_of_funding_databases(list_of_fdbs)
|
|||
|
||||
spider.parse_entry_list_data2dictionary(list_of_fdbs)
|
||||
|
||||
spider.download_entry_data_htmls(list_of_fdbs)
|
||||
#spider.download_entry_data_htmls(list_of_fdbs)
|
||||
|
||||
spider.parse_entry_data2dictionary(list_of_fdbs)
|
||||
#spider.parse_entry_data2dictionary(list_of_fdbs)
|
||||
|
||||
|
|
Binary file not shown.
File diff suppressed because one or more lines are too long
|
@ -167,6 +167,14 @@ class fdb_spider(object):
|
|||
# driver = webdriver.Chrome()
|
||||
driver.implicitly_wait(5)
|
||||
driver.get(entry_jsdomain)
|
||||
|
||||
try:
|
||||
accept_button = driver.find_element("xpath","//button[contains(text(), 'akzeptieren')]")
|
||||
accept_button.click()
|
||||
except Exception as e:
|
||||
print(e, 'no cookies to accept..')
|
||||
pass
|
||||
|
||||
for i in range(len(entry_jsiteration_var_list)):
|
||||
time.sleep(1)
|
||||
print('trying to get element')
|
||||
|
@ -248,11 +256,11 @@ class fdb_spider(object):
|
|||
|
||||
print('this is the n looped elements of the parent specified in config.yaml:')
|
||||
|
||||
#print('entrylistparent', fdb_conf_entry_list_parent)
|
||||
print('entrylistparent', fdb_conf_entry_list_parent)
|
||||
|
||||
#print(tree.xpath("//html//body//div//main//div//div[@class='row']//section[@class='l-search-result-list']"))
|
||||
print(tree.xpath("//html//body//div"))
|
||||
|
||||
#print(etree.tostring(tree.xpath(fdb_conf_entry_list_parent)).decode())
|
||||
print(etree.tostring(tree.xpath(fdb_conf_entry_list_parent)[0]).decode())
|
||||
|
||||
for n in range(len(tree.xpath(fdb_conf_entry_list_parent))):
|
||||
print('-----------------------------------------------------------------------------------------------------------------------------------------')
|
||||
|
@ -482,7 +490,7 @@ class fdb_spider(object):
|
|||
#service_args = ['--verbose']
|
||||
#driver = webdriver.Chrome('/usr/bin/chromium')
|
||||
options = webdriver.ChromeOptions()
|
||||
options.add_argument('headless')
|
||||
#options.add_argument('headless')
|
||||
options.add_argument("--remote-debugging-port=9222")
|
||||
options.add_argument('--no-sandbox')
|
||||
options.add_argument('--disable-dev-shm-usage')
|
||||
|
@ -540,6 +548,17 @@ class fdb_spider(object):
|
|||
print(entry_link)
|
||||
|
||||
if 'javascript' in entry_link or fdb_conf_entry_list_javascript_link != 'NONE':
|
||||
|
||||
|
||||
try:
|
||||
accept_button = driver.find_element("xpath","//button[contains(text(), 'akzeptieren')]")
|
||||
accept_button.click()
|
||||
except Exception as e:
|
||||
print(e, 'no cookies to accept..')
|
||||
pass
|
||||
|
||||
driver.execute_script("scroll(0, 600)")
|
||||
|
||||
print('oioioi',fdb_conf_entry_list_parent, entry_id, fdb_conf_entry_list_javascript_link)
|
||||
element = driver.find_element(
|
||||
"xpath",
|
||||
|
@ -560,6 +579,9 @@ class fdb_spider(object):
|
|||
#element = driver.find_element("xpath", "//html")
|
||||
#web_content = element.text
|
||||
|
||||
|
||||
|
||||
|
||||
#entry_domain = driver.getCurrentUrl()
|
||||
entry_domain = driver.current_url
|
||||
|
||||
|
|
Loading…
Reference in a new issue