added new database ted.europa.eu, created new case for slow downloading, integrated scrolling into entrylistpagesdownload
This commit is contained in:
parent
094f092291
commit
a0dd469f25
4 changed files with 109 additions and 36 deletions
4
main.py
4
main.py
|
@ -5,8 +5,8 @@ import sys
|
|||
|
||||
config = "spiders/config.yaml"
|
||||
#list_of_fdbs = eval(sys.argv[1])
|
||||
list_of_fdbs = ["giz","evergabe-online","foerderinfo.bund.de-bekanntmachungen"]
|
||||
#list_of_fdbs = ["giz"]
|
||||
#list_of_fdbs = ["giz","evergabe-online","foerderinfo.bund.de-bekanntmachungen"]
|
||||
list_of_fdbs = ["ted.europa.eu"]
|
||||
|
||||
|
||||
# doing the crawling of government websites
|
||||
|
|
Binary file not shown.
File diff suppressed because one or more lines are too long
|
@ -146,7 +146,7 @@ class fdb_spider(object):
|
|||
else:
|
||||
from selenium import webdriver
|
||||
from selenium.webdriver.chrome.service import Service
|
||||
|
||||
#from selenium.webdriver.common.action_chains import ActionChains
|
||||
from pyvirtualdisplay import Display
|
||||
display = Display(visible=0, size=(800, 800))
|
||||
display.start()
|
||||
|
@ -164,6 +164,7 @@ class fdb_spider(object):
|
|||
service = Service(executable_path='/usr/bin/chromedriver')
|
||||
driver = webdriver.Chrome(options=options, service=service)
|
||||
# driver = webdriver.Chrome()
|
||||
driver.implicitly_wait(10)
|
||||
driver.get(entry_jsdomain)
|
||||
for i in range(len(entry_jsiteration_var_list)):
|
||||
time.sleep(2)
|
||||
|
@ -176,8 +177,14 @@ class fdb_spider(object):
|
|||
+ entry_list_jslink2
|
||||
)
|
||||
print(entry_iteration_var_list[i])
|
||||
time.sleep(2)
|
||||
time.sleep(1)
|
||||
print('scrolling..')
|
||||
|
||||
# scroll into view, because otherwise with javascript generated elements
|
||||
# it can be that clicking returns an error
|
||||
driver.execute_script("arguments[0].scrollIntoView();", element)
|
||||
print('clicking..')
|
||||
time.sleep(1)
|
||||
element.click()
|
||||
time.sleep(2)
|
||||
#window_after = driver.window_handles[1]
|
||||
|
@ -476,6 +483,7 @@ class fdb_spider(object):
|
|||
options.add_argument('--disable-dev-shm-usage')
|
||||
service = Service(executable_path='/usr/bin/chromedriver')
|
||||
driver = webdriver.Chrome(options=options, service=service)
|
||||
driver.implicitly_wait(10)
|
||||
#driver = webdriver.Chrome()
|
||||
for fdb in list_of_fdbs:
|
||||
print('spidering ' + fdb + ' ..')
|
||||
|
@ -503,12 +511,22 @@ class fdb_spider(object):
|
|||
try:
|
||||
fdb_conf_entry_list_javascript_link = fdb_conf_entry_list.get("javascript-link")
|
||||
except Exception as e:
|
||||
fdb_conf_entry_list_javascript_link = 'NONE'
|
||||
print('the javascript link in the config is missing, original error message is:', e)
|
||||
try:
|
||||
fdb_conf_entry_list_slow_downloading = fdb_conf_entry_list.get("slow-downloading")
|
||||
except Exception as e:
|
||||
print('the slow-downloading parameter is not set, original error message is:', e)
|
||||
fdb_conf_entry_list_link1 = fdb_conf_entry_list.get("link1")
|
||||
fdb_conf_entry_list_link2 = fdb_conf_entry_list.get("link2")
|
||||
|
||||
if fdb_conf_entry_list_slow_downloading == 'FALSE':
|
||||
|
||||
driver.get(fdb_conf_entry_list_link1 + str(i) + fdb_conf_entry_list_link2)
|
||||
|
||||
else:
|
||||
pass
|
||||
|
||||
for entry_id in dictionary_entry_list:
|
||||
print(entry_id)
|
||||
entry_link = dictionary_entry_list[entry_id]["link"]
|
||||
|
@ -516,7 +534,7 @@ class fdb_spider(object):
|
|||
# download the html page of the entry
|
||||
print(entry_link)
|
||||
|
||||
if 'javascript' in entry_link:
|
||||
if 'javascript' in entry_link or fdb_conf_entry_list_javascript_link != 'NONE':
|
||||
print('oioioi',fdb_conf_entry_list_parent, entry_id, fdb_conf_entry_list_javascript_link)
|
||||
element = driver.find_element(
|
||||
"xpath",
|
||||
|
@ -556,9 +574,25 @@ class fdb_spider(object):
|
|||
driver.switch_to.window(window_before)
|
||||
|
||||
|
||||
if 'javascript' not in entry_link and '.pdf' not in entry_link:
|
||||
if 'javascript' not in entry_link and '.pdf' not in entry_link and fdb_conf_entry_list_javascript_link == 'NONE':
|
||||
print('blabuuuuuba')
|
||||
#print('oi')
|
||||
|
||||
|
||||
if fdb_conf_entry_list_slow_downloading == 'TRUE':
|
||||
|
||||
try:
|
||||
|
||||
print("trying to get slowly entry link " , entry_link)
|
||||
driver.get(entry_link)
|
||||
time.sleep(3)
|
||||
web_content = driver.page_source
|
||||
|
||||
except Exception as e:
|
||||
print("getting the html behind the entry link did not work, ori message is:", e)
|
||||
|
||||
else:
|
||||
|
||||
try:
|
||||
# defining cookie to not end up in endless loop because of cookie banners pointing to redirects
|
||||
url = entry_link
|
||||
|
@ -591,7 +625,7 @@ class fdb_spider(object):
|
|||
|
||||
# save interim results to files
|
||||
|
||||
if '.pdf' in entry_link:
|
||||
if '.pdf' in entry_link and fdb_conf_entry_list_javascript_link == 'NONE':
|
||||
|
||||
file_name = "spiders/pages/" + fdb + str(i) + "/" + str(entry_id) + ".html"
|
||||
response = requests.get(entry_link)
|
||||
|
@ -606,6 +640,10 @@ class fdb_spider(object):
|
|||
file_name = "spiders/pages/" + fdb + str(i) + "/" + str(entry_id) + ".html"
|
||||
|
||||
wget_wrote = False
|
||||
|
||||
|
||||
|
||||
|
||||
if web_content == 'NONE':
|
||||
print('other downloading approaches did not work, trying requests')
|
||||
|
||||
|
|
Loading…
Reference in a new issue