@ -133,10 +133,22 @@ class fdb_spider(object):
f . close
f . close
else :
else :
from selenium import webdriver
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from pyvirtualdisplay import Display
display = Display ( visible = 0 , size = ( 800 , 800 ) )
display . start ( )
#outputdir = '.'
#service_log_path = "{}/chromedriver.log".format(outputdir)
#service_args = ['--verbose']
#driver = webdriver.Chrome('/usr/bin/chromium')
options = webdriver . ChromeOptions ( )
options = webdriver . ChromeOptions ( )
options . add_argument ( ' headless ' )
options . add_argument ( ' headless ' )
driver = webdriver . Chrome ( options = options )
options . add_argument ( " --remote-debugging-port=9222 " )
options . add_argument ( ' --no-sandbox ' )
options . add_argument ( ' --disable-dev-shm-usage ' )
service = Service ( executable_path = ' /usr/bin/chromedriver ' )
driver = webdriver . Chrome ( options = options , service = service )
def find_config_parameter ( self , list_of_fdbs ) :
def find_config_parameter ( self , list_of_fdbs ) :
@ -357,15 +369,21 @@ class fdb_spider(object):
if fdb_domain in link :
if fdb_domain in link :
dictionary_entry_list [ n ] [ " link " ] = link
dictionary_entry_list [ n ] [ " link " ] = link
if fdb_domain not in link and ( ' http: ' in link or ' www. ' in link or ' https: ' in link ) :
if fdb_domain not in link and ' http: ' in link :
dictionary_entry_list [ n ] [ " link " ] = link
if fdb_domain not in link and ' www. ' in link :
dictionary_entry_list [ n ] [ " link " ] = link
if fdb_domain not in link and ' https: ' in link :
dictionary_entry_list [ n ] [ " link " ] = link
dictionary_entry_list [ n ] [ " link " ] = link
if ' javascript: ' in link :
if ' javascript: ' in link :
dictionary_entry_list [ n ] [ " link " ] = link
dictionary_entry_list [ n ] [ " link " ] = link
else :
if fdb_domain not in link and ( ' http ' or ' https ' or ' www. ' ) not in link :
if link [ - 1 ] == ' / ' :
if link [ - 1 ] == ' / ' :
dictionary_entry_list [ n ] [ " link " ] = fdb_domain + link
dictionary_entry_list [ n ] [ " link " ] = fdb_domain + link
else :
else :
dictionary_entry_list [ n ] [ " link " ] = fdb_domain + ' / ' + link
dictionary_entry_list [ n ] [ " link " ] = fdb_domain + ' / ' + link
except Exception as e :
except Exception as e :
print (
print (
@ -382,10 +400,23 @@ class fdb_spider(object):
def download_entry_data_htmls ( self , list_of_fdbs ) :
def download_entry_data_htmls ( self , list_of_fdbs ) :
from selenium import webdriver
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from pyvirtualdisplay import Display
display = Display ( visible = 0 , size = ( 800 , 800 ) )
display . start ( )
#outputdir = '.'
#service_log_path = "{}/chromedriver.log".format(outputdir)
#service_args = ['--verbose']
#driver = webdriver.Chrome('/usr/bin/chromium')
options = webdriver . ChromeOptions ( )
options = webdriver . ChromeOptions ( )
options . add_argument ( ' headless ' )
options . add_argument ( ' headless ' )
driver = webdriver . Chrome ( options = options )
options . add_argument ( " --remote-debugging-port=9222 " )
options . add_argument ( ' --no-sandbox ' )
options . add_argument ( ' --disable-dev-shm-usage ' )
service = Service ( executable_path = ' /usr/bin/chromedriver ' )
driver = webdriver . Chrome ( options = options , service = service )
#driver = webdriver.Chrome()
for fdb in list_of_fdbs :
for fdb in list_of_fdbs :
try :
try :
@ -464,7 +495,7 @@ class fdb_spider(object):
driver . switch_to . window ( window_before )
driver . switch_to . window ( window_before )
if ( ' http ' or ' www ' ) in entry_link and ( ' javascript ' or ' js ' or ' .pdf ' ) not in enry_link :
if ( ' http ' or ' www ' ) in entry_link and ' javascript ' not in entry_link and ' .pdf ' not in ent ry_link :
try :
try :
# Define a cookie so that we do not end up in an endless loop caused by cookie banners pointing to redirects
# Define a cookie so that we do not end up in an endless loop caused by cookie banners pointing to redirects