added new database ted.europa.eu, created new case of slow downloading, integrated scrolling into entry list pages download

alpcentaur 2024-02-09 18:38:49 +00:00
parent 094f092291
commit a0dd469f25
4 changed files with 109 additions and 36 deletions


@@ -5,8 +5,8 @@ import sys
 config = "spiders/config.yaml"
 #list_of_fdbs = eval(sys.argv[1])
-list_of_fdbs = ["giz","evergabe-online","foerderinfo.bund.de-bekanntmachungen"]
-#list_of_fdbs = ["giz"]
+#list_of_fdbs = ["giz","evergabe-online","foerderinfo.bund.de-bekanntmachungen"]
+list_of_fdbs = ["ted.europa.eu"]
 # doing the crawling of government websites

File diff suppressed because one or more lines are too long
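
The suppressed diff is presumably spiders/config.yaml, which gains the new ted.europa.eu entry. Below is a minimal sketch of the shape that entry plausibly takes, inferred only from the keys the spider code reads further down (link1, link2, javascript-link, slow-downloading); the URLs, the xpath, and the nesting under entry-list are assumptions, not the real configuration:

    # sketch only: the real entry lives in the suppressed config.yaml diff
    import yaml

    example_config = """
    ted.europa.eu:
      entry-list:                                        # assumed nesting
        link1: 'https://ted.europa.eu/en/search?page='   # placeholder URL
        link2: '&sort=publication-date'                  # placeholder URL suffix
        javascript-link: "//a[@class='entry-detail']"    # placeholder xpath
        slow-downloading: 'TRUE'   # the spider compares the literal strings 'TRUE'/'FALSE'
    """

    fdb_conf = yaml.safe_load(example_config)
    entry_list = fdb_conf['ted.europa.eu']['entry-list']
    print(entry_list.get('slow-downloading'))   # -> 'TRUE'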


@@ -146,7 +146,7 @@ class fdb_spider(object):
         else:
             from selenium import webdriver
             from selenium.webdriver.chrome.service import Service
+            #from selenium.webdriver.common.action_chains import ActionChains
             from pyvirtualdisplay import Display
             display = Display(visible=0, size=(800, 800))
             display.start()
@@ -164,6 +164,7 @@ class fdb_spider(object):
             service = Service(executable_path='/usr/bin/chromedriver')
             driver = webdriver.Chrome(options=options, service=service)
             # driver = webdriver.Chrome()
+            driver.implicitly_wait(10)
             driver.get(entry_jsdomain)
             for i in range(len(entry_jsiteration_var_list)):
                 time.sleep(2)
@@ -176,8 +177,14 @@ class fdb_spider(object):
                     + entry_list_jslink2
                 )
                 print(entry_iteration_var_list[i])
-                time.sleep(2)
+                time.sleep(1)
+                print('scrolling..')
+                # scroll into view, because otherwise with javascript generated elements
+                # it can be that clicking returns an error
+                driver.execute_script("arguments[0].scrollIntoView();", element)
                 print('clicking..')
+                time.sleep(1)
                 element.click()
                 time.sleep(2)
                 #window_after = driver.window_handles[1]
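
The scrollIntoView call added above is the scrolling integration named in the commit message. For illustration, a self-contained sketch of the scroll-then-click pattern, assuming the same headless Chrome setup as this spider; the URL and xpath are placeholders:

    # minimal scroll-then-click sketch; URL and xpath are placeholders
    import time
    from selenium import webdriver
    from selenium.webdriver.chrome.service import Service

    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    service = Service(executable_path='/usr/bin/chromedriver')
    driver = webdriver.Chrome(options=options, service=service)
    driver.implicitly_wait(10)

    driver.get('https://example.org')                          # placeholder
    element = driver.find_element('xpath', "//a[@id='next']")  # placeholder
    # javascript-rendered elements can sit outside the viewport, where a
    # click can fail (e.g. ElementClickInterceptedException), so scroll first
    driver.execute_script('arguments[0].scrollIntoView();', element)
    time.sleep(1)
    element.click()
    driver.quit()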
@@ -476,6 +483,7 @@ class fdb_spider(object):
         options.add_argument('--disable-dev-shm-usage')
         service = Service(executable_path='/usr/bin/chromedriver')
         driver = webdriver.Chrome(options=options, service=service)
+        driver.implicitly_wait(10)
         #driver = webdriver.Chrome()
         for fdb in list_of_fdbs:
             print('spidering ' + fdb + ' ..')
@@ -503,11 +511,21 @@ class fdb_spider(object):
                 try:
                     fdb_conf_entry_list_javascript_link = fdb_conf_entry_list.get("javascript-link")
                 except Exception as e:
+                    fdb_conf_entry_list_javascript_link = 'NONE'
                     print('the javascript link in the config is missing, original error message is:', e)
+                try:
+                    fdb_conf_entry_list_slow_downloading = fdb_conf_entry_list.get("slow-downloading")
+                except Exception as e:
+                    print('the slow-downloading parameter is not set, original error message is:', e)
                 fdb_conf_entry_list_link1 = fdb_conf_entry_list.get("link1")
                 fdb_conf_entry_list_link2 = fdb_conf_entry_list.get("link2")
-                driver.get(fdb_conf_entry_list_link1 + str(i) + fdb_conf_entry_list_link2)
+                if fdb_conf_entry_list_slow_downloading == 'FALSE':
+                    driver.get(fdb_conf_entry_list_link1 + str(i) + fdb_conf_entry_list_link2)
+                else:
+                    pass
                 for entry_id in dictionary_entry_list:
                     print(entry_id)
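
Note how the new gate behaves for each value of the flag: dict.get returns None rather than raising when a key is missing, so the except branch never fires for an absent slow-downloading key, and anything other than the literal string 'FALSE' skips the up-front driver.get. A small behavior sketch:

    # behavior sketch of the new gate; conf stands in for fdb_conf_entry_list
    for conf in ({'slow-downloading': 'FALSE'}, {'slow-downloading': 'TRUE'}, {}):
        flag = conf.get('slow-downloading')   # returns None if unset, never raises
        if flag == 'FALSE':
            print(flag, '-> load the paginated entry list up front with driver.get')
        else:
            print(flag, '-> pass here; entry pages are fetched one by one later')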
@@ -516,7 +534,7 @@ class fdb_spider(object):
                     # download the html page of the entry
                     print(entry_link)
-                    if 'javascript' in entry_link:
+                    if 'javascript' in entry_link or fdb_conf_entry_list_javascript_link != 'NONE':
                         print('oioioi',fdb_conf_entry_list_parent, entry_id, fdb_conf_entry_list_javascript_link)
                         element = driver.find_element(
                             "xpath",
@@ -556,42 +574,58 @@ class fdb_spider(object):
                         driver.switch_to.window(window_before)

-                    if 'javascript' not in entry_link and '.pdf' not in entry_link:
+                    if 'javascript' not in entry_link and '.pdf' not in entry_link and fdb_conf_entry_list_javascript_link == 'NONE':
                         print('blabuuuuuba')
                         #print('oi')
-                        try:
-                            # defining cookie to not end up in endless loop because of cookie banners pointing to redirects
-                            url = entry_link
-                            req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0', 'Cookie':'myCookie=oioioioi'})
-                            response = urllib.request.urlopen(req)
-                            print('response from first one', response)
-                        except Exception as e:
-                            print('cookie giving then downloading did not work, original error is:', e)
-                            try:
-                                response = urllib.request.urlopen(entry_link.encode('ascii', errors='xmlcharrefreplace').decode('ascii'))
-                                print(
-                                    "opening the link did not work, try to encode to ascii replacing xmlcharrefs now and reopen - the original error message is:",
-                                    e,
-                                )
-                            except Exception as ex:
-                                print(entry_link, entry_link.encode('ascii', errors='xmlcharrefreplace').decode('ascii'), ex )
-                        try:
-                            web_content = response.read().decode("UTF-8")
-                        except Exception as e:
-                            try:
-                                web_content = response.read().decode("latin-1")
-                                print(
-                                    "decoding the respone in utf8 did not work, try to decode latin1 now - the original error message is:",
-                                    e,
-                                )
-                            except Exception as ex:
-                                print(ex)
+                        if fdb_conf_entry_list_slow_downloading == 'TRUE':
+                            try:
+                                print("trying to get slowly entry link " , entry_link)
+                                driver.get(entry_link)
+                                time.sleep(3)
+                                web_content = driver.page_source
+                            except Exception as e:
+                                print("getting the html behind the entry link did not work, ori message is:", e)
+                        else:
+                            try:
+                                # defining cookie to not end up in endless loop because of cookie banners pointing to redirects
+                                url = entry_link
+                                req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0', 'Cookie':'myCookie=oioioioi'})
+                                response = urllib.request.urlopen(req)
+                                print('response from first one', response)
+                            except Exception as e:
+                                print('cookie giving then downloading did not work, original error is:', e)
+                                try:
+                                    response = urllib.request.urlopen(entry_link.encode('ascii', errors='xmlcharrefreplace').decode('ascii'))
+                                    print(
+                                        "opening the link did not work, try to encode to ascii replacing xmlcharrefs now and reopen - the original error message is:",
+                                        e,
+                                    )
+                                except Exception as ex:
+                                    print(entry_link, entry_link.encode('ascii', errors='xmlcharrefreplace').decode('ascii'), ex )
+                            try:
+                                web_content = response.read().decode("UTF-8")
+                            except Exception as e:
+                                try:
+                                    web_content = response.read().decode("latin-1")
+                                    print(
+                                        "decoding the respone in utf8 did not work, try to decode latin1 now - the original error message is:",
+                                        e,
+                                    )
+                                except Exception as ex:
+                                    print(ex)

                     # save interim results to files
-                    if '.pdf' in entry_link:
+                    if '.pdf' in entry_link and fdb_conf_entry_list_javascript_link == 'NONE':
                         file_name = "spiders/pages/" + fdb + str(i) + "/" + str(entry_id) + ".html"
                         response = requests.get(entry_link)
@@ -606,6 +640,10 @@ class fdb_spider(object):
                         file_name = "spiders/pages/" + fdb + str(i) + "/" + str(entry_id) + ".html"
                         wget_wrote = False
                         if web_content == 'NONE':
                             print('other downloading approaches did not work, trying requests')
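
Taken together, the commit leaves the entry download as a three-stage fallback: selenium with sleeps when slow-downloading is 'TRUE', urllib with a cookie header (plus an ascii-reencoded retry) otherwise, and requests once web_content is still 'NONE'. A condensed sketch of that chain; download_entry is a hypothetical helper, not a function in this repo:

    # condensed sketch of the fallback chain; download_entry is hypothetical
    import time
    import urllib.request
    import requests

    def download_entry(driver, entry_link, slow_downloading):
        web_content = 'NONE'
        if slow_downloading == 'TRUE':
            try:
                driver.get(entry_link)          # let the page render slowly
                time.sleep(3)
                web_content = driver.page_source
            except Exception as e:
                print('selenium fetch failed:', e)
        else:
            try:
                req = urllib.request.Request(
                    entry_link,
                    headers={'User-Agent': 'Mozilla/5.0', 'Cookie': 'myCookie=oioioioi'},
                )
                web_content = urllib.request.urlopen(req).read().decode('utf-8')
            except Exception as e:
                print('urllib fetch failed:', e)
        if web_content == 'NONE':
            # last resort, mirroring the requests branch at the end of the hunk
            web_content = requests.get(entry_link).text
        return web_content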