Improve top-level JavaScript handling: click through the jsiteration-var-list pages with Selenium and save each page's HTML
This commit is contained in:
parent
5627c80177
commit
16199256e3
3 changed files with 55 additions and 16 deletions
Binary file not shown.
|
@ -82,15 +82,15 @@ evergabe-online:
|
||||||
jsdomain: 'https://www.evergabe-online.de/search.html'
|
jsdomain: 'https://www.evergabe-online.de/search.html'
|
||||||
jslink1: '/html/body/div[8]/main/div[4]/div/div/div[2]/table/thead/tr[1]/td/div[2]/div/span['
|
jslink1: '/html/body/div[8]/main/div[4]/div/div/div[2]/table/thead/tr[1]/td/div[2]/div/span['
|
||||||
jslink2: ']'
|
jslink2: ']'
|
||||||
|
jsiteration-var-list: "[1,2, 3, 4, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,6,7,8,9,10]"
|
||||||
iteration-var-list: "[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102]"
|
iteration-var-list: "[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102]"
|
||||||
parent: "/html/body/div[8]/main/div[4]/div/div/div[2]/table/tbody"
|
parent: "/html/body/div[8]/main/div[4]/div/div/div[2]/table/tbody//tr"
|
||||||
child-name: "//tr/td[1]/div/a/text()"
|
child-name: "//td[1]/div/a/text()"
|
||||||
child-link: "//tr/td[1]/div/a/@href"
|
child-link: "//td[1]/div/a/@href"
|
||||||
javascript-link: "/td[6]/a"
|
javascript-link: ""
|
||||||
child-info: "/td[4]/text()[1]"
|
child-info: "//td[3]/div/text()"
|
||||||
child-period: "//td[2]/abbr/text()"
|
child-period: "//td[5]/text()"
|
||||||
#child-period: "//div[@class='c-teaser__text-wrapper']//small//span/time/text()"
|
child-sponsor: "//td[2]/div/text()"
|
||||||
child-sponsor: "/tr/td[4]/text()"
|
|
||||||
entry:
|
entry:
|
||||||
general:
|
general:
|
||||||
uniform: 'TRUE'
|
uniform: 'TRUE'
|
||||||
|
|
|
@ -17,6 +17,9 @@ from trafilatura import extract
|
||||||
from pdfminer.high_level import extract_pages
|
from pdfminer.high_level import extract_pages
|
||||||
from pdfminer.layout import LTTextContainer
|
from pdfminer.layout import LTTextContainer
|
||||||
|
|
||||||
|
import time
|
||||||
|
|
||||||
|
|
||||||
class fdb_spider(object):
|
class fdb_spider(object):
|
||||||
def __init__(self, config_file):
|
def __init__(self, config_file):
|
||||||
with open(config_file, "r") as stream:
|
with open(config_file, "r") as stream:
|
||||||
|
@ -81,10 +84,17 @@ class fdb_spider(object):
|
||||||
e,
|
e,
|
||||||
)
|
)
|
||||||
try:
|
try:
|
||||||
entry_jsdomain = eval(entry_list.get("jsdomain"))
|
entry_jsiteration_var_list = eval(entry_list.get("jsiteration-var-list"))
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(
|
print(
|
||||||
"No iteration-var-list defined in config.yaml - the original error message is:",
|
"No jsiteration-var-list defined in config.yaml - the original error message is:",
|
||||||
|
e,
|
||||||
|
)
|
||||||
|
try:
|
||||||
|
entry_jsdomain = entry_list.get("jsdomain")
|
||||||
|
except Exception as e:
|
||||||
|
print(
|
||||||
|
"No jsdomain defined in config.yaml - the original error message is:",
|
||||||
e,
|
e,
|
||||||
)
|
)
|
||||||
entry_jsdomain = 'NONE'
|
entry_jsdomain = 'NONE'
|
||||||
|
@ -134,14 +144,16 @@ class fdb_spider(object):
|
||||||
else:
|
else:
|
||||||
from selenium import webdriver
|
from selenium import webdriver
|
||||||
from selenium.webdriver.chrome.service import Service
|
from selenium.webdriver.chrome.service import Service
|
||||||
|
|
||||||
from pyvirtualdisplay import Display
|
from pyvirtualdisplay import Display
|
||||||
display = Display(visible=0, size=(800, 800))
|
display = Display(visible=0, size=(800, 800))
|
||||||
display.start()
|
display.start()
|
||||||
|
|
||||||
#outputdir = '.'
|
##outputdir = '.'
|
||||||
#service_log_path = "{}/chromedriver.log".format(outputdir)
|
##service_log_path = "{}/chromedriver.log".format(outputdir)
|
||||||
#service_args = ['--verbose']
|
##service_args = ['--verbose']
|
||||||
#driver = webdriver.Chrome('/usr/bin/chromium')
|
##driver = webdriver.Chrome('/usr/bin/chromium')
|
||||||
|
|
||||||
options = webdriver.ChromeOptions()
|
options = webdriver.ChromeOptions()
|
||||||
options.add_argument('headless')
|
options.add_argument('headless')
|
||||||
options.add_argument("--remote-debugging-port=9222")
|
options.add_argument("--remote-debugging-port=9222")
|
||||||
|
@ -149,6 +161,33 @@ class fdb_spider(object):
|
||||||
options.add_argument('--disable-dev-shm-usage')
|
options.add_argument('--disable-dev-shm-usage')
|
||||||
service = Service(executable_path='/usr/bin/chromedriver')
|
service = Service(executable_path='/usr/bin/chromedriver')
|
||||||
driver = webdriver.Chrome(options=options, service=service)
|
driver = webdriver.Chrome(options=options, service=service)
|
||||||
|
# driver = webdriver.Chrome()
|
||||||
|
driver.get(entry_jsdomain)
|
||||||
|
for i in range(len(entry_jsiteration_var_list)):
|
||||||
|
time.sleep(2)
|
||||||
|
print('trying to get element')
|
||||||
|
try:
|
||||||
|
element = driver.find_element(
|
||||||
|
"xpath",
|
||||||
|
entry_list_jslink1
|
||||||
|
+ str(entry_jsiteration_var_list[i])
|
||||||
|
+ entry_list_jslink2
|
||||||
|
)
|
||||||
|
print(entry_iteration_var_list[i])
|
||||||
|
time.sleep(2)
|
||||||
|
print('clicking..')
|
||||||
|
element.click()
|
||||||
|
time.sleep(2)
|
||||||
|
#window_after = driver.window_handles[1]
|
||||||
|
print('length of the window handles', len(driver.window_handles))
|
||||||
|
#driver.switch_to.window(window_after)
|
||||||
|
web_content = driver.page_source
|
||||||
|
|
||||||
|
f = open("spiders/pages/" + key + str(entry_iteration_var_list[i]) + "entryList.html", "w+")
|
||||||
|
f.write(web_content)
|
||||||
|
f.close
|
||||||
|
except Exception as e:
|
||||||
|
print('the iteration var element for clicking the pages was not found.. the original message is:',e )
|
||||||
|
|
||||||
|
|
||||||
def find_config_parameter(self, list_of_fdbs):
|
def find_config_parameter(self, list_of_fdbs):
|
||||||
|
|
Loading…
Reference in a new issue