Changes for the new dtvp database; add try/except blocks that attempt to click away cookie pop-ups
This commit is contained in:
parent
7ba196b0c2
commit
d284fef015
5 changed files with 289 additions and 12 deletions
218
log.txt
Normal file
218
log.txt
Normal file
|
@ -0,0 +1,218 @@
|
||||||
|
trying to get element
|
||||||
|
0
|
||||||
|
scrolling..
|
||||||
|
clicking..
|
||||||
|
length of the window handles 1
|
||||||
|
trying to get element
|
||||||
|
1
|
||||||
|
scrolling..
|
||||||
|
clicking..
|
||||||
|
length of the window handles 1
|
||||||
|
trying to get element
|
||||||
|
2
|
||||||
|
scrolling..
|
||||||
|
clicking..
|
||||||
|
length of the window handles 1
|
||||||
|
trying to get element
|
||||||
|
3
|
||||||
|
scrolling..
|
||||||
|
clicking..
|
||||||
|
length of the window handles 1
|
||||||
|
trying to get element
|
||||||
|
4
|
||||||
|
scrolling..
|
||||||
|
clicking..
|
||||||
|
length of the window handles 1
|
||||||
|
trying to get element
|
||||||
|
5
|
||||||
|
scrolling..
|
||||||
|
clicking..
|
||||||
|
length of the window handles 1
|
||||||
|
trying to get element
|
||||||
|
6
|
||||||
|
scrolling..
|
||||||
|
clicking..
|
||||||
|
length of the window handles 1
|
||||||
|
trying to get element
|
||||||
|
7
|
||||||
|
scrolling..
|
||||||
|
clicking..
|
||||||
|
length of the window handles 1
|
||||||
|
trying to get element
|
||||||
|
8
|
||||||
|
scrolling..
|
||||||
|
clicking..
|
||||||
|
length of the window handles 1
|
||||||
|
trying to get element
|
||||||
|
9
|
||||||
|
scrolling..
|
||||||
|
clicking..
|
||||||
|
length of the window handles 1
|
||||||
|
trying to get element
|
||||||
|
10
|
||||||
|
scrolling..
|
||||||
|
clicking..
|
||||||
|
length of the window handles 1
|
||||||
|
trying to get element
|
||||||
|
11
|
||||||
|
scrolling..
|
||||||
|
clicking..
|
||||||
|
length of the window handles 1
|
||||||
|
trying to get element
|
||||||
|
12
|
||||||
|
scrolling..
|
||||||
|
clicking..
|
||||||
|
length of the window handles 1
|
||||||
|
trying to get element
|
||||||
|
13
|
||||||
|
scrolling..
|
||||||
|
clicking..
|
||||||
|
length of the window handles 1
|
||||||
|
trying to get element
|
||||||
|
14
|
||||||
|
scrolling..
|
||||||
|
clicking..
|
||||||
|
length of the window handles 1
|
||||||
|
trying to get element
|
||||||
|
15
|
||||||
|
scrolling..
|
||||||
|
clicking..
|
||||||
|
length of the window handles 1
|
||||||
|
trying to get element
|
||||||
|
16
|
||||||
|
scrolling..
|
||||||
|
clicking..
|
||||||
|
length of the window handles 1
|
||||||
|
trying to get element
|
||||||
|
17
|
||||||
|
scrolling..
|
||||||
|
clicking..
|
||||||
|
length of the window handles 1
|
||||||
|
trying to get element
|
||||||
|
18
|
||||||
|
scrolling..
|
||||||
|
clicking..
|
||||||
|
length of the window handles 1
|
||||||
|
trying to get element
|
||||||
|
19
|
||||||
|
scrolling..
|
||||||
|
clicking..
|
||||||
|
length of the window handles 1
|
||||||
|
trying to get element
|
||||||
|
20
|
||||||
|
scrolling..
|
||||||
|
clicking..
|
||||||
|
length of the window handles 1
|
||||||
|
trying to get element
|
||||||
|
21
|
||||||
|
scrolling..
|
||||||
|
clicking..
|
||||||
|
trying to get element
|
||||||
|
0
|
||||||
|
scrolling..
|
||||||
|
clicking..
|
||||||
|
length of the window handles 1
|
||||||
|
trying to get element
|
||||||
|
1
|
||||||
|
scrolling..
|
||||||
|
clicking..
|
||||||
|
length of the window handles 1
|
||||||
|
trying to get element
|
||||||
|
2
|
||||||
|
scrolling..
|
||||||
|
clicking..
|
||||||
|
length of the window handles 1
|
||||||
|
trying to get element
|
||||||
|
3
|
||||||
|
scrolling..
|
||||||
|
clicking..
|
||||||
|
length of the window handles 1
|
||||||
|
trying to get element
|
||||||
|
4
|
||||||
|
scrolling..
|
||||||
|
clicking..
|
||||||
|
length of the window handles 1
|
||||||
|
trying to get element
|
||||||
|
5
|
||||||
|
scrolling..
|
||||||
|
clicking..
|
||||||
|
length of the window handles 1
|
||||||
|
trying to get element
|
||||||
|
6
|
||||||
|
scrolling..
|
||||||
|
clicking..
|
||||||
|
length of the window handles 1
|
||||||
|
trying to get element
|
||||||
|
7
|
||||||
|
scrolling..
|
||||||
|
clicking..
|
||||||
|
length of the window handles 1
|
||||||
|
trying to get element
|
||||||
|
8
|
||||||
|
scrolling..
|
||||||
|
clicking..
|
||||||
|
length of the window handles 1
|
||||||
|
trying to get element
|
||||||
|
9
|
||||||
|
scrolling..
|
||||||
|
clicking..
|
||||||
|
length of the window handles 1
|
||||||
|
trying to get element
|
||||||
|
10
|
||||||
|
scrolling..
|
||||||
|
clicking..
|
||||||
|
length of the window handles 1
|
||||||
|
trying to get element
|
||||||
|
11
|
||||||
|
scrolling..
|
||||||
|
clicking..
|
||||||
|
length of the window handles 1
|
||||||
|
trying to get element
|
||||||
|
12
|
||||||
|
scrolling..
|
||||||
|
clicking..
|
||||||
|
length of the window handles 1
|
||||||
|
trying to get element
|
||||||
|
13
|
||||||
|
scrolling..
|
||||||
|
clicking..
|
||||||
|
length of the window handles 1
|
||||||
|
trying to get element
|
||||||
|
14
|
||||||
|
scrolling..
|
||||||
|
clicking..
|
||||||
|
length of the window handles 1
|
||||||
|
trying to get element
|
||||||
|
15
|
||||||
|
scrolling..
|
||||||
|
clicking..
|
||||||
|
length of the window handles 1
|
||||||
|
trying to get element
|
||||||
|
16
|
||||||
|
scrolling..
|
||||||
|
clicking..
|
||||||
|
length of the window handles 1
|
||||||
|
trying to get element
|
||||||
|
17
|
||||||
|
scrolling..
|
||||||
|
clicking..
|
||||||
|
length of the window handles 1
|
||||||
|
trying to get element
|
||||||
|
18
|
||||||
|
scrolling..
|
||||||
|
clicking..
|
||||||
|
length of the window handles 1
|
||||||
|
trying to get element
|
||||||
|
19
|
||||||
|
scrolling..
|
||||||
|
clicking..
|
||||||
|
length of the window handles 1
|
||||||
|
trying to get element
|
||||||
|
20
|
||||||
|
scrolling..
|
||||||
|
clicking..
|
||||||
|
length of the window handles 1
|
||||||
|
trying to get element
|
||||||
|
21
|
||||||
|
scrolling..
|
||||||
|
clicking..
|
7
main.py
7
main.py
|
@ -6,7 +6,10 @@ import sys
|
||||||
config = "spiders/config.yaml"
|
config = "spiders/config.yaml"
|
||||||
#list_of_fdbs = eval(sys.argv[1])
|
#list_of_fdbs = eval(sys.argv[1])
|
||||||
#list_of_fdbs = ["giz","evergabe-online","foerderinfo.bund.de-bekanntmachungen"]
|
#list_of_fdbs = ["giz","evergabe-online","foerderinfo.bund.de-bekanntmachungen"]
|
||||||
|
#list_of_fdbs = ["giz","evergabe-online"]
|
||||||
|
#list_of_fdbs = ["foerderinfo.bund.de-bekanntmachungen"]
|
||||||
list_of_fdbs = ["ted.europa.eu"]
|
list_of_fdbs = ["ted.europa.eu"]
|
||||||
|
#list_of_fdbs = ["dtvp"]
|
||||||
|
|
||||||
|
|
||||||
# doing the crawling of government websites
|
# doing the crawling of government websites
|
||||||
|
@ -19,7 +22,7 @@ spider.download_entry_list_pages_of_funding_databases(list_of_fdbs)
|
||||||
|
|
||||||
spider.parse_entry_list_data2dictionary(list_of_fdbs)
|
spider.parse_entry_list_data2dictionary(list_of_fdbs)
|
||||||
|
|
||||||
spider.download_entry_data_htmls(list_of_fdbs)
|
#spider.download_entry_data_htmls(list_of_fdbs)
|
||||||
|
|
||||||
spider.parse_entry_data2dictionary(list_of_fdbs)
|
#spider.parse_entry_data2dictionary(list_of_fdbs)
|
||||||
|
|
||||||
|
|
Binary file not shown.
File diff suppressed because one or more lines are too long
|
@ -167,6 +167,14 @@ class fdb_spider(object):
|
||||||
# driver = webdriver.Chrome()
|
# driver = webdriver.Chrome()
|
||||||
driver.implicitly_wait(5)
|
driver.implicitly_wait(5)
|
||||||
driver.get(entry_jsdomain)
|
driver.get(entry_jsdomain)
|
||||||
|
|
||||||
|
try:
|
||||||
|
accept_button = driver.find_element("xpath","//button[contains(text(), 'akzeptieren')]")
|
||||||
|
accept_button.click()
|
||||||
|
except Exception as e:
|
||||||
|
print(e, 'no cookies to accept..')
|
||||||
|
pass
|
||||||
|
|
||||||
for i in range(len(entry_jsiteration_var_list)):
|
for i in range(len(entry_jsiteration_var_list)):
|
||||||
time.sleep(1)
|
time.sleep(1)
|
||||||
print('trying to get element')
|
print('trying to get element')
|
||||||
|
@ -248,11 +256,11 @@ class fdb_spider(object):
|
||||||
|
|
||||||
print('this is the n looped elements of the parent specified in config.yaml:')
|
print('this is the n looped elements of the parent specified in config.yaml:')
|
||||||
|
|
||||||
#print('entrylistparent', fdb_conf_entry_list_parent)
|
print('entrylistparent', fdb_conf_entry_list_parent)
|
||||||
|
|
||||||
#print(tree.xpath("//html//body//div//main//div//div[@class='row']//section[@class='l-search-result-list']"))
|
print(tree.xpath("//html//body//div"))
|
||||||
|
|
||||||
#print(etree.tostring(tree.xpath(fdb_conf_entry_list_parent)).decode())
|
print(etree.tostring(tree.xpath(fdb_conf_entry_list_parent)[0]).decode())
|
||||||
|
|
||||||
for n in range(len(tree.xpath(fdb_conf_entry_list_parent))):
|
for n in range(len(tree.xpath(fdb_conf_entry_list_parent))):
|
||||||
print('-----------------------------------------------------------------------------------------------------------------------------------------')
|
print('-----------------------------------------------------------------------------------------------------------------------------------------')
|
||||||
|
@ -482,7 +490,7 @@ class fdb_spider(object):
|
||||||
#service_args = ['--verbose']
|
#service_args = ['--verbose']
|
||||||
#driver = webdriver.Chrome('/usr/bin/chromium')
|
#driver = webdriver.Chrome('/usr/bin/chromium')
|
||||||
options = webdriver.ChromeOptions()
|
options = webdriver.ChromeOptions()
|
||||||
options.add_argument('headless')
|
#options.add_argument('headless')
|
||||||
options.add_argument("--remote-debugging-port=9222")
|
options.add_argument("--remote-debugging-port=9222")
|
||||||
options.add_argument('--no-sandbox')
|
options.add_argument('--no-sandbox')
|
||||||
options.add_argument('--disable-dev-shm-usage')
|
options.add_argument('--disable-dev-shm-usage')
|
||||||
|
@ -540,6 +548,17 @@ class fdb_spider(object):
|
||||||
print(entry_link)
|
print(entry_link)
|
||||||
|
|
||||||
if 'javascript' in entry_link or fdb_conf_entry_list_javascript_link != 'NONE':
|
if 'javascript' in entry_link or fdb_conf_entry_list_javascript_link != 'NONE':
|
||||||
|
|
||||||
|
|
||||||
|
try:
|
||||||
|
accept_button = driver.find_element("xpath","//button[contains(text(), 'akzeptieren')]")
|
||||||
|
accept_button.click()
|
||||||
|
except Exception as e:
|
||||||
|
print(e, 'no cookies to accept..')
|
||||||
|
pass
|
||||||
|
|
||||||
|
driver.execute_script("scroll(0, 600)")
|
||||||
|
|
||||||
print('oioioi',fdb_conf_entry_list_parent, entry_id, fdb_conf_entry_list_javascript_link)
|
print('oioioi',fdb_conf_entry_list_parent, entry_id, fdb_conf_entry_list_javascript_link)
|
||||||
element = driver.find_element(
|
element = driver.find_element(
|
||||||
"xpath",
|
"xpath",
|
||||||
|
@ -560,6 +579,9 @@ class fdb_spider(object):
|
||||||
#element = driver.find_element("xpath", "//html")
|
#element = driver.find_element("xpath", "//html")
|
||||||
#web_content = element.text
|
#web_content = element.text
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
#entry_domain = driver.getCurrentUrl()
|
#entry_domain = driver.getCurrentUrl()
|
||||||
entry_domain = driver.current_url
|
entry_domain = driver.current_url
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue