Improve JavaScript handling on the highest level
This commit is contained in:
parent
5627c80177
commit
16199256e3
3 changed files with 55 additions and 16 deletions
Binary file not shown.
|
@ -82,15 +82,15 @@ evergabe-online:
|
|||
jsdomain: 'https://www.evergabe-online.de/search.html'
|
||||
jslink1: '/html/body/div[8]/main/div[4]/div/div/div[2]/table/thead/tr[1]/td/div[2]/div/span['
|
||||
jslink2: ']'
|
||||
iteration-var-list: "[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102]"
|
||||
parent: "/html/body/div[8]/main/div[4]/div/div/div[2]/table/tbody"
|
||||
child-name: "//tr/td[1]/div/a/text()"
|
||||
child-link: "//tr/td[1]/div/a/@href"
|
||||
javascript-link: "/td[6]/a"
|
||||
child-info: "/td[4]/text()[1]"
|
||||
child-period: "//td[2]/abbr/text()"
|
||||
#child-period: "//div[@class='c-teaser__text-wrapper']//small//span/time/text()"
|
||||
child-sponsor: "/tr/td[4]/text()"
|
||||
jsiteration-var-list: "[1,2, 3, 4, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,6,7,8,9,10]"
|
||||
iteration-var-list: "[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102]"
|
||||
parent: "/html/body/div[8]/main/div[4]/div/div/div[2]/table/tbody//tr"
|
||||
child-name: "//td[1]/div/a/text()"
|
||||
child-link: "//td[1]/div/a/@href"
|
||||
javascript-link: ""
|
||||
child-info: "//td[3]/div/text()"
|
||||
child-period: "//td[5]/text()"
|
||||
child-sponsor: "//td[2]/div/text()"
|
||||
entry:
|
||||
general:
|
||||
uniform: 'TRUE'
|
||||
|
|
|
@ -17,6 +17,9 @@ from trafilatura import extract
|
|||
from pdfminer.high_level import extract_pages
|
||||
from pdfminer.layout import LTTextContainer
|
||||
|
||||
import time
|
||||
|
||||
|
||||
class fdb_spider(object):
|
||||
def __init__(self, config_file):
|
||||
with open(config_file, "r") as stream:
|
||||
|
@ -81,10 +84,17 @@ class fdb_spider(object):
|
|||
e,
|
||||
)
|
||||
try:
|
||||
entry_jsdomain = eval(entry_list.get("jsdomain"))
|
||||
entry_jsiteration_var_list = eval(entry_list.get("jsiteration-var-list"))
|
||||
except Exception as e:
|
||||
print(
|
||||
"No iteration-var-list defined in config.yaml - the original error message is:",
|
||||
"No jsiteration-var-list defined in config.yaml - the original error message is:",
|
||||
e,
|
||||
)
|
||||
try:
|
||||
entry_jsdomain = entry_list.get("jsdomain")
|
||||
except Exception as e:
|
||||
print(
|
||||
"No jsdomain defined in config.yaml - the original error message is:",
|
||||
e,
|
||||
)
|
||||
entry_jsdomain = 'NONE'
|
||||
|
@ -134,14 +144,16 @@ class fdb_spider(object):
|
|||
else:
|
||||
from selenium import webdriver
|
||||
from selenium.webdriver.chrome.service import Service
|
||||
|
||||
from pyvirtualdisplay import Display
|
||||
display = Display(visible=0, size=(800, 800))
|
||||
display.start()
|
||||
|
||||
#outputdir = '.'
|
||||
#service_log_path = "{}/chromedriver.log".format(outputdir)
|
||||
#service_args = ['--verbose']
|
||||
#driver = webdriver.Chrome('/usr/bin/chromium')
|
||||
##outputdir = '.'
|
||||
##service_log_path = "{}/chromedriver.log".format(outputdir)
|
||||
##service_args = ['--verbose']
|
||||
##driver = webdriver.Chrome('/usr/bin/chromium')
|
||||
|
||||
options = webdriver.ChromeOptions()
|
||||
options.add_argument('headless')
|
||||
options.add_argument("--remote-debugging-port=9222")
|
||||
|
@ -149,7 +161,34 @@ class fdb_spider(object):
|
|||
options.add_argument('--disable-dev-shm-usage')
|
||||
service = Service(executable_path='/usr/bin/chromedriver')
|
||||
driver = webdriver.Chrome(options=options, service=service)
|
||||
|
||||
# driver = webdriver.Chrome()
|
||||
driver.get(entry_jsdomain)
|
||||
for i in range(len(entry_jsiteration_var_list)):
|
||||
time.sleep(2)
|
||||
print('trying to get element')
|
||||
try:
|
||||
element = driver.find_element(
|
||||
"xpath",
|
||||
entry_list_jslink1
|
||||
+ str(entry_jsiteration_var_list[i])
|
||||
+ entry_list_jslink2
|
||||
)
|
||||
print(entry_iteration_var_list[i])
|
||||
time.sleep(2)
|
||||
print('clicking..')
|
||||
element.click()
|
||||
time.sleep(2)
|
||||
#window_after = driver.window_handles[1]
|
||||
print('length of the window handles', len(driver.window_handles))
|
||||
#driver.switch_to.window(window_after)
|
||||
web_content = driver.page_source
|
||||
|
||||
f = open("spiders/pages/" + key + str(entry_iteration_var_list[i]) + "entryList.html", "w+")
|
||||
f.write(web_content)
|
||||
f.close
|
||||
except Exception as e:
|
||||
print('the iteration var element for clicking the pages was not found.. the original message is:',e )
|
||||
|
||||
|
||||
def find_config_parameter(self, list_of_fdbs):
|
||||
for fdb in list_of_fdbs:
|
||||
|
|
Loading…
Reference in a new issue