@@ -146,7 +146,7 @@ class fdb_spider(object):
         else:
             from selenium import webdriver
             from selenium.webdriver.chrome.service import Service
             #from selenium.webdriver.common.action_chains import ActionChains
             from pyvirtualdisplay import Display
             display = Display(visible=0, size=(800, 800))
             display.start()
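
This hunk and the next switch the spider to a real Chrome instance rendered inside a virtual X display, so JavaScript-heavy entry lists can be loaded on a headless server. A minimal, self-contained sketch of that setup, assuming chromedriver is installed at /usr/bin/chromedriver and using a placeholder URL in place of the configured entry_jsdomain (only the --disable-dev-shm-usage flag is visible in this diff; any other Chrome options are set elsewhere):

from pyvirtualdisplay import Display
from selenium import webdriver
from selenium.webdriver.chrome.service import Service

# virtual framebuffer so Chrome can render without a physical screen
display = Display(visible=0, size=(800, 800))
display.start()

options = webdriver.ChromeOptions()
options.add_argument('--disable-dev-shm-usage')  # the only flag shown in this diff

service = Service(executable_path='/usr/bin/chromedriver')
driver = webdriver.Chrome(options=options, service=service)
driver.implicitly_wait(10)  # let find_element wait up to 10 s for late-rendered nodes

driver.get('https://example.org')  # placeholder for entry_jsdomain
print(driver.title)

driver.quit()
display.stop()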

@@ -164,6 +164,7 @@ class fdb_spider(object):
             service = Service(executable_path='/usr/bin/chromedriver')
             driver = webdriver.Chrome(options=options, service=service)
             # driver = webdriver.Chrome()
+            driver.implicitly_wait(10)
             driver.get(entry_jsdomain)
             for i in range(len(entry_jsiteration_var_list)):
                 time.sleep(2)

@@ -176,8 +177,14 @@ class fdb_spider(object):
                     + entry_list_jslink2
                 )
                 print(entry_iteration_var_list[i])
                 time.sleep(2)
+                time.sleep(1)
+                print('scrolling..')
+                # scroll into view, because otherwise with javascript generated elements
+                # it can be that clicking returns an error
+                driver.execute_script("arguments[0].scrollIntoView();", element)
                 print('clicking..')
+                time.sleep(1)
                 element.click()
                 time.sleep(2)
                 #window_after = driver.window_handles[1]
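
The added lines above scroll each JavaScript-generated link into the viewport before clicking it; off-screen elements otherwise tend to raise "element not interactable" or click-intercepted errors. A small sketch of the same pattern, assuming the driver from the setup sketch above and a placeholder XPath in place of the expression built from entry_list_jslink1/entry_list_jslink2:

import time

# placeholder selector; the spider assembles the real XPath from its config
element = driver.find_element("xpath", "//a[contains(@class, 'entry-link')]")

# bring the element into the viewport first, then click
driver.execute_script("arguments[0].scrollIntoView();", element)
time.sleep(1)   # let the layout settle after scrolling
element.click()
time.sleep(2)   # wait for the click-triggered navigation or DOM update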

@@ -476,6 +483,7 @@ class fdb_spider(object):
         options.add_argument('--disable-dev-shm-usage')
         service = Service(executable_path='/usr/bin/chromedriver')
         driver = webdriver.Chrome(options=options, service=service)
+        driver.implicitly_wait(10)
         #driver = webdriver.Chrome()
         for fdb in list_of_fdbs:
             print('spidering ' + fdb + ' ..')

@@ -503,11 +511,21 @@ class fdb_spider(object):
                 try:
                     fdb_conf_entry_list_javascript_link = fdb_conf_entry_list.get("javascript-link")
                 except Exception as e:
+                    fdb_conf_entry_list_javascript_link = 'NONE'
                     print('the javascript link in the config is missing, original error message is:', e)
+                try:
+                    fdb_conf_entry_list_slow_downloading = fdb_conf_entry_list.get("slow-downloading")
+                except Exception as e:
+                    print('the slow-downloading parameter is not set, original error message is:', e)
                 fdb_conf_entry_list_link1 = fdb_conf_entry_list.get("link1")
                 fdb_conf_entry_list_link2 = fdb_conf_entry_list.get("link2")

-                driver.get(fdb_conf_entry_list_link1 + str(i) + fdb_conf_entry_list_link2)
+                if fdb_conf_entry_list_slow_downloading == 'FALSE':
+                    driver.get(fdb_conf_entry_list_link1 + str(i) + fdb_conf_entry_list_link2)
+                else:
+                    pass

                 for entry_id in dictionary_entry_list:
                     print(entry_id)
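
The new try/except blocks above give the optional config keys a usable default ('NONE' for javascript-link) and gate the eager driver.get() on slow-downloading. If fdb_conf_entry_list is dict-like (e.g. parsed from YAML), note that .get() returns None for a missing key rather than raising, so the except branch may never run; a sketch of the same defaulting written with .get() defaults — the 'FALSE' fallback for slow-downloading is an assumption about the intended behaviour, the other names come from the hunk above:

# sketch only: assumes fdb_conf_entry_list supports dict-style .get(key, default)
fdb_conf_entry_list_javascript_link = fdb_conf_entry_list.get("javascript-link", 'NONE')
fdb_conf_entry_list_slow_downloading = fdb_conf_entry_list.get("slow-downloading", 'FALSE')  # assumed default

fdb_conf_entry_list_link1 = fdb_conf_entry_list.get("link1")
fdb_conf_entry_list_link2 = fdb_conf_entry_list.get("link2")

if fdb_conf_entry_list_slow_downloading == 'FALSE':
    # fast path: fetch the paginated entry list page right away
    driver.get(fdb_conf_entry_list_link1 + str(i) + fdb_conf_entry_list_link2)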

@@ -516,7 +534,7 @@ class fdb_spider(object):
                     # download the html page of the entry
                     print(entry_link)

-                    if 'javascript' in entry_link:
+                    if 'javascript' in entry_link or fdb_conf_entry_list_javascript_link != 'NONE':
                         print('oioioi',fdb_conf_entry_list_parent, entry_id, fdb_conf_entry_list_javascript_link)
                         element = driver.find_element(
                             "xpath",
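
When an entry is only reachable through such a JavaScript link, the code in the next hunk clicks the element and juggles window handles (window_before, window_handles[1]) to read the opened page and return to the list. A compact sketch of that handle bookkeeping, assuming the driver from above and a placeholder selector; the exact branching here is illustrative, not copied from the spider:

import time

window_before = driver.current_window_handle

element = driver.find_element("xpath", "//div[@class='entry'][1]//a")  # placeholder selector
element.click()
time.sleep(2)

if len(driver.window_handles) > 1:
    # the click opened a new tab/window: read it, close it, return to the list
    driver.switch_to.window(driver.window_handles[1])
    web_content = driver.page_source
    driver.close()
    driver.switch_to.window(window_before)
else:
    # same window: the page itself navigated
    web_content = driver.page_source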

@@ -556,42 +574,58 @@ class fdb_spider(object):
                         driver.switch_to.window(window_before)

-                    if 'javascript' not in entry_link and '.pdf' not in entry_link:
+                    if 'javascript' not in entry_link and '.pdf' not in entry_link and fdb_conf_entry_list_javascript_link == 'NONE':
                         print('blabuuuuuba')
                         #print('oi')
                         try:
                             # defining cookie to not end up in endless loop because of cookie banners pointing to redirects
                             url = entry_link
                             req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0', 'Cookie':'myCookie=oioioioi'})
                             response = urllib.request.urlopen(req)
                             print('response from first one', response)
                         except Exception as e:
                             print('cookie giving then downloading did not work, original error is:', e)
                             try:
                                 response = urllib.request.urlopen(entry_link.encode('ascii', errors='xmlcharrefreplace').decode('ascii'))
                                 print(
                                     "opening the link did not work, try to encode to ascii replacing xmlcharrefs now and reopen - the original error message is:",
                                     e,
                                 )
                             except Exception as ex:
                                 print(entry_link, entry_link.encode('ascii', errors='xmlcharrefreplace').decode('ascii'), ex )

-                        try:
-                            web_content = response.read().decode("UTF-8")
-                        except Exception as e:
-                            try:
-                                web_content = response.read().decode("latin-1")
-                                print(
-                                    "decoding the respone in utf8 did not work, try to decode latin1 now - the original error message is:",
-                                    e,
-                                )
-                            except Exception as ex:
-                                print(ex)
+                        if fdb_conf_entry_list_slow_downloading == 'TRUE':
+                            try:
+                                print("trying to get slowly entry link " , entry_link)
+                                driver.get(entry_link)
+                                time.sleep(3)
+                                web_content = driver.page_source
+                            except Exception as e:
+                                print("getting the html behind the entry link did not work, ori message is:", e)
+                        else:
+                            try:
+                                web_content = response.read().decode("UTF-8")
+                            except Exception as e:
+                                try:
+                                    web_content = response.read().decode("latin-1")
+                                    print(
+                                        "decoding the respone in utf8 did not work, try to decode latin1 now - the original error message is:",
+                                        e,
+                                    )
+                                except Exception as ex:
+                                    print(ex)

                     # save interim results to files

-                    if '.pdf' in entry_link:
+                    if '.pdf' in entry_link and fdb_conf_entry_list_javascript_link == 'NONE':
                         file_name = "spiders/pages/" + fdb + str(i) + "/" + str(entry_id) + ".html"
                         response = requests.get(entry_link)
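
The reworked block above first tries a plain urllib fetch (browser-like User-Agent plus a dummy cookie so cookie-banner redirects cannot loop forever), then either hands the link to Selenium when slow-downloading is 'TRUE' or decodes the urllib response with a latin-1 fallback. A condensed sketch of that strategy; the helper name fetch_entry_html is illustrative, and driver and entry_link come from the surrounding loop:

import time
import urllib.request

def fetch_entry_html(entry_link, driver=None, slow=False):
    if slow and driver is not None:
        # let the browser execute the page's JavaScript, then take the rendered DOM
        driver.get(entry_link)
        time.sleep(3)
        return driver.page_source

    # plain HTTP fetch with the same header idea as in the hunk above
    req = urllib.request.Request(
        entry_link,
        headers={'User-Agent': 'Mozilla/5.0', 'Cookie': 'myCookie=oioioioi'},
    )
    raw = urllib.request.urlopen(req).read()
    try:
        return raw.decode('utf-8')
    except UnicodeDecodeError:
        # latin-1 maps every byte value, so this fallback always yields a string
        return raw.decode('latin-1')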

@@ -606,6 +640,10 @@ class fdb_spider(object):
                         file_name = "spiders/pages/" + fdb + str(i) + "/" + str(entry_id) + ".html"

                         wget_wrote = False

                         if web_content == 'NONE':
                             print('other downloading approaches did not work, trying requests')
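
The last hunk prepares a final fallback: when every other approach left web_content at 'NONE', the page is fetched once more with requests and saved under spiders/pages/ (the "save interim results to files" step). A short sketch of that step, assuming the file naming scheme shown above; the headers, timeout, and directory creation are robustness additions, not taken from the diff:

import os
import requests

if web_content == 'NONE':
    print('other downloading approaches did not work, trying requests')
    try:
        r = requests.get(entry_link, headers={'User-Agent': 'Mozilla/5.0'}, timeout=30)
        r.raise_for_status()
        web_content = r.text
    except Exception as e:
        print('requests fallback failed too, original error message is:', e)

file_name = "spiders/pages/" + fdb + str(i) + "/" + str(entry_id) + ".html"
os.makedirs(os.path.dirname(file_name), exist_ok=True)
with open(file_name, "w", encoding="utf-8") as f:
    f.write(web_content)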