added new database ted.europa.eu, created new case of slow downloading, integrated scrolling into entrylistpagesdownload
This commit is contained in:
parent
094f092291
commit
a0dd469f25
4 changed files with 109 additions and 36 deletions
4
main.py
4
main.py
|
@ -5,8 +5,8 @@ import sys
|
||||||
|
|
||||||
config = "spiders/config.yaml"
|
config = "spiders/config.yaml"
|
||||||
#list_of_fdbs = eval(sys.argv[1])
|
#list_of_fdbs = eval(sys.argv[1])
|
||||||
list_of_fdbs = ["giz","evergabe-online","foerderinfo.bund.de-bekanntmachungen"]
|
#list_of_fdbs = ["giz","evergabe-online","foerderinfo.bund.de-bekanntmachungen"]
|
||||||
#list_of_fdbs = ["giz"]
|
list_of_fdbs = ["ted.europa.eu"]
|
||||||
|
|
||||||
|
|
||||||
# doing the crawling of government websites
|
# doing the crawling of government websites
|
||||||
|
|
Binary file not shown.
File diff suppressed because one or more lines are too long
|
@ -146,7 +146,7 @@ class fdb_spider(object):
|
||||||
else:
|
else:
|
||||||
from selenium import webdriver
|
from selenium import webdriver
|
||||||
from selenium.webdriver.chrome.service import Service
|
from selenium.webdriver.chrome.service import Service
|
||||||
|
#from selenium.webdriver.common.action_chains import ActionChains
|
||||||
from pyvirtualdisplay import Display
|
from pyvirtualdisplay import Display
|
||||||
display = Display(visible=0, size=(800, 800))
|
display = Display(visible=0, size=(800, 800))
|
||||||
display.start()
|
display.start()
|
||||||
|
@ -164,6 +164,7 @@ class fdb_spider(object):
|
||||||
service = Service(executable_path='/usr/bin/chromedriver')
|
service = Service(executable_path='/usr/bin/chromedriver')
|
||||||
driver = webdriver.Chrome(options=options, service=service)
|
driver = webdriver.Chrome(options=options, service=service)
|
||||||
# driver = webdriver.Chrome()
|
# driver = webdriver.Chrome()
|
||||||
|
driver.implicitly_wait(10)
|
||||||
driver.get(entry_jsdomain)
|
driver.get(entry_jsdomain)
|
||||||
for i in range(len(entry_jsiteration_var_list)):
|
for i in range(len(entry_jsiteration_var_list)):
|
||||||
time.sleep(2)
|
time.sleep(2)
|
||||||
|
@ -176,8 +177,14 @@ class fdb_spider(object):
|
||||||
+ entry_list_jslink2
|
+ entry_list_jslink2
|
||||||
)
|
)
|
||||||
print(entry_iteration_var_list[i])
|
print(entry_iteration_var_list[i])
|
||||||
time.sleep(2)
|
time.sleep(1)
|
||||||
|
print('scrolling..')
|
||||||
|
|
||||||
|
# scroll into view, because otherwise with javascript generated elements
|
||||||
|
# it can be that clicking returns an error
|
||||||
|
driver.execute_script("arguments[0].scrollIntoView();", element)
|
||||||
print('clicking..')
|
print('clicking..')
|
||||||
|
time.sleep(1)
|
||||||
element.click()
|
element.click()
|
||||||
time.sleep(2)
|
time.sleep(2)
|
||||||
#window_after = driver.window_handles[1]
|
#window_after = driver.window_handles[1]
|
||||||
|
@ -476,6 +483,7 @@ class fdb_spider(object):
|
||||||
options.add_argument('--disable-dev-shm-usage')
|
options.add_argument('--disable-dev-shm-usage')
|
||||||
service = Service(executable_path='/usr/bin/chromedriver')
|
service = Service(executable_path='/usr/bin/chromedriver')
|
||||||
driver = webdriver.Chrome(options=options, service=service)
|
driver = webdriver.Chrome(options=options, service=service)
|
||||||
|
driver.implicitly_wait(10)
|
||||||
#driver = webdriver.Chrome()
|
#driver = webdriver.Chrome()
|
||||||
for fdb in list_of_fdbs:
|
for fdb in list_of_fdbs:
|
||||||
print('spidering ' + fdb + ' ..')
|
print('spidering ' + fdb + ' ..')
|
||||||
|
@ -503,11 +511,21 @@ class fdb_spider(object):
|
||||||
try:
|
try:
|
||||||
fdb_conf_entry_list_javascript_link = fdb_conf_entry_list.get("javascript-link")
|
fdb_conf_entry_list_javascript_link = fdb_conf_entry_list.get("javascript-link")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
fdb_conf_entry_list_javascript_link = 'NONE'
|
||||||
print('the javascript link in the config is missing, original error message is:', e)
|
print('the javascript link in the config is missing, original error message is:', e)
|
||||||
|
try:
|
||||||
|
fdb_conf_entry_list_slow_downloading = fdb_conf_entry_list.get("slow-downloading")
|
||||||
|
except Exception as e:
|
||||||
|
print('the slow-downloading parameter is not set, original error message is:', e)
|
||||||
fdb_conf_entry_list_link1 = fdb_conf_entry_list.get("link1")
|
fdb_conf_entry_list_link1 = fdb_conf_entry_list.get("link1")
|
||||||
fdb_conf_entry_list_link2 = fdb_conf_entry_list.get("link2")
|
fdb_conf_entry_list_link2 = fdb_conf_entry_list.get("link2")
|
||||||
|
|
||||||
driver.get(fdb_conf_entry_list_link1 + str(i) + fdb_conf_entry_list_link2)
|
if fdb_conf_entry_list_slow_downloading == 'FALSE':
|
||||||
|
|
||||||
|
driver.get(fdb_conf_entry_list_link1 + str(i) + fdb_conf_entry_list_link2)
|
||||||
|
|
||||||
|
else:
|
||||||
|
pass
|
||||||
|
|
||||||
for entry_id in dictionary_entry_list:
|
for entry_id in dictionary_entry_list:
|
||||||
print(entry_id)
|
print(entry_id)
|
||||||
|
@ -516,7 +534,7 @@ class fdb_spider(object):
|
||||||
# download the html page of the entry
|
# download the html page of the entry
|
||||||
print(entry_link)
|
print(entry_link)
|
||||||
|
|
||||||
if 'javascript' in entry_link:
|
if 'javascript' in entry_link or fdb_conf_entry_list_javascript_link != 'NONE':
|
||||||
print('oioioi',fdb_conf_entry_list_parent, entry_id, fdb_conf_entry_list_javascript_link)
|
print('oioioi',fdb_conf_entry_list_parent, entry_id, fdb_conf_entry_list_javascript_link)
|
||||||
element = driver.find_element(
|
element = driver.find_element(
|
||||||
"xpath",
|
"xpath",
|
||||||
|
@ -556,42 +574,58 @@ class fdb_spider(object):
|
||||||
driver.switch_to.window(window_before)
|
driver.switch_to.window(window_before)
|
||||||
|
|
||||||
|
|
||||||
if 'javascript' not in entry_link and '.pdf' not in entry_link:
|
if 'javascript' not in entry_link and '.pdf' not in entry_link and fdb_conf_entry_list_javascript_link == 'NONE':
|
||||||
print('blabuuuuuba')
|
print('blabuuuuuba')
|
||||||
#print('oi')
|
#print('oi')
|
||||||
try:
|
|
||||||
# defining cookie to not end up in endless loop because of cookie banners pointing to redirects
|
|
||||||
url = entry_link
|
|
||||||
req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0', 'Cookie':'myCookie=oioioioi'})
|
|
||||||
response = urllib.request.urlopen(req)
|
|
||||||
print('response from first one', response)
|
|
||||||
except Exception as e:
|
|
||||||
print('cookie giving then downloading did not work, original error is:', e)
|
|
||||||
try:
|
|
||||||
response = urllib.request.urlopen(entry_link.encode('ascii', errors='xmlcharrefreplace').decode('ascii'))
|
|
||||||
print(
|
|
||||||
"opening the link did not work, try to encode to ascii replacing xmlcharrefs now and reopen - the original error message is:",
|
|
||||||
e,
|
|
||||||
)
|
|
||||||
except Exception as ex:
|
|
||||||
print(entry_link, entry_link.encode('ascii', errors='xmlcharrefreplace').decode('ascii'), ex )
|
|
||||||
|
|
||||||
|
|
||||||
try:
|
if fdb_conf_entry_list_slow_downloading == 'TRUE':
|
||||||
web_content = response.read().decode("UTF-8")
|
|
||||||
except Exception as e:
|
|
||||||
try:
|
try:
|
||||||
web_content = response.read().decode("latin-1")
|
|
||||||
print(
|
print("trying to get slowly entry link " , entry_link)
|
||||||
"decoding the respone in utf8 did not work, try to decode latin1 now - the original error message is:",
|
driver.get(entry_link)
|
||||||
e,
|
time.sleep(3)
|
||||||
)
|
web_content = driver.page_source
|
||||||
except Exception as ex:
|
|
||||||
print(ex)
|
except Exception as e:
|
||||||
|
print("getting the html behind the entry link did not work, ori message is:", e)
|
||||||
|
|
||||||
|
else:
|
||||||
|
|
||||||
|
try:
|
||||||
|
# defining cookie to not end up in endless loop because of cookie banners pointing to redirects
|
||||||
|
url = entry_link
|
||||||
|
req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0', 'Cookie':'myCookie=oioioioi'})
|
||||||
|
response = urllib.request.urlopen(req)
|
||||||
|
print('response from first one', response)
|
||||||
|
except Exception as e:
|
||||||
|
print('cookie giving then downloading did not work, original error is:', e)
|
||||||
|
try:
|
||||||
|
response = urllib.request.urlopen(entry_link.encode('ascii', errors='xmlcharrefreplace').decode('ascii'))
|
||||||
|
print(
|
||||||
|
"opening the link did not work, try to encode to ascii replacing xmlcharrefs now and reopen - the original error message is:",
|
||||||
|
e,
|
||||||
|
)
|
||||||
|
except Exception as ex:
|
||||||
|
print(entry_link, entry_link.encode('ascii', errors='xmlcharrefreplace').decode('ascii'), ex )
|
||||||
|
|
||||||
|
|
||||||
|
try:
|
||||||
|
web_content = response.read().decode("UTF-8")
|
||||||
|
except Exception as e:
|
||||||
|
try:
|
||||||
|
web_content = response.read().decode("latin-1")
|
||||||
|
print(
|
||||||
|
"decoding the respone in utf8 did not work, try to decode latin1 now - the original error message is:",
|
||||||
|
e,
|
||||||
|
)
|
||||||
|
except Exception as ex:
|
||||||
|
print(ex)
|
||||||
|
|
||||||
# save interim results to files
|
# save interim results to files
|
||||||
|
|
||||||
if '.pdf' in entry_link:
|
if '.pdf' in entry_link and fdb_conf_entry_list_javascript_link == 'NONE':
|
||||||
|
|
||||||
file_name = "spiders/pages/" + fdb + str(i) + "/" + str(entry_id) + ".html"
|
file_name = "spiders/pages/" + fdb + str(i) + "/" + str(entry_id) + ".html"
|
||||||
response = requests.get(entry_link)
|
response = requests.get(entry_link)
|
||||||
|
@ -606,6 +640,10 @@ class fdb_spider(object):
|
||||||
file_name = "spiders/pages/" + fdb + str(i) + "/" + str(entry_id) + ".html"
|
file_name = "spiders/pages/" + fdb + str(i) + "/" + str(entry_id) + ".html"
|
||||||
|
|
||||||
wget_wrote = False
|
wget_wrote = False
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
if web_content == 'NONE':
|
if web_content == 'NONE':
|
||||||
print('other downloading approaches did not work, trying requests')
|
print('other downloading approaches did not work, trying requests')
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue