Browse Source

Improve JavaScript handling at the highest level (paginated entry-list pages)

master
alpcentaur 10 months ago
parent
commit
16199256e3
3 changed files with 55 additions and 16 deletions
  1. BIN
      spiders/__pycache__/fdb_spider.cpython-39.pyc
  2. +9
    -9
      spiders/config.yaml
  3. +46
    -7
      spiders/fdb_spider.py

BIN
spiders/__pycache__/fdb_spider.cpython-39.pyc View File


+ 9
- 9
spiders/config.yaml View File

@ -82,15 +82,15 @@ evergabe-online:
jsdomain: 'https://www.evergabe-online.de/search.html' jsdomain: 'https://www.evergabe-online.de/search.html'
jslink1: '/html/body/div[8]/main/div[4]/div/div/div[2]/table/thead/tr[1]/td/div[2]/div/span[' jslink1: '/html/body/div[8]/main/div[4]/div/div/div[2]/table/thead/tr[1]/td/div[2]/div/span['
jslink2: ']' jslink2: ']'
iteration-var-list: "[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102]"
parent: "/html/body/div[8]/main/div[4]/div/div/div[2]/table/tbody"
child-name: "//tr/td[1]/div/a/text()"
child-link: "//tr/td[1]/div/a/@href"
javascript-link: "/td[6]/a"
child-info: "/td[4]/text()[1]"
child-period: "//td[2]/abbr/text()"
#child-period: "//div[@class='c-teaser__text-wrapper']//small//span/time/text()"
child-sponsor: "/tr/td[4]/text()"
jsiteration-var-list: "[1,2, 3, 4, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,6,7,8,9,10]"
iteration-var-list: "[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102]"
parent: "/html/body/div[8]/main/div[4]/div/div/div[2]/table/tbody//tr"
child-name: "//td[1]/div/a/text()"
child-link: "//td[1]/div/a/@href"
javascript-link: ""
child-info: "//td[3]/div/text()"
child-period: "//td[5]/text()"
child-sponsor: "//td[2]/div/text()"
entry: entry:
general: general:
uniform: 'TRUE' uniform: 'TRUE'

+ 46
- 7
spiders/fdb_spider.py View File

@ -17,6 +17,9 @@ from trafilatura import extract
from pdfminer.high_level import extract_pages from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextContainer from pdfminer.layout import LTTextContainer
import time
class fdb_spider(object): class fdb_spider(object):
def __init__(self, config_file): def __init__(self, config_file):
with open(config_file, "r") as stream: with open(config_file, "r") as stream:
@ -81,10 +84,17 @@ class fdb_spider(object):
e, e,
) )
try: try:
entry_jsdomain = eval(entry_list.get("jsdomain"))
entry_jsiteration_var_list = eval(entry_list.get("jsiteration-var-list"))
except Exception as e: except Exception as e:
print( print(
"No iteration-var-list defined in config.yaml - the original error message is:",
"No jsiteration-var-list defined in config.yaml - the original error message is:",
e,
)
try:
entry_jsdomain = entry_list.get("jsdomain")
except Exception as e:
print(
"No jsdomain defined in config.yaml - the original error message is:",
e, e,
) )
entry_jsdomain = 'NONE' entry_jsdomain = 'NONE'
@ -134,14 +144,16 @@ class fdb_spider(object):
else: else:
from selenium import webdriver from selenium import webdriver
from selenium.webdriver.chrome.service import Service from selenium.webdriver.chrome.service import Service
from pyvirtualdisplay import Display from pyvirtualdisplay import Display
display = Display(visible=0, size=(800, 800)) display = Display(visible=0, size=(800, 800))
display.start() display.start()
#outputdir = '.'
#service_log_path = "{}/chromedriver.log".format(outputdir)
#service_args = ['--verbose']
#driver = webdriver.Chrome('/usr/bin/chromium')
##outputdir = '.'
##service_log_path = "{}/chromedriver.log".format(outputdir)
##service_args = ['--verbose']
##driver = webdriver.Chrome('/usr/bin/chromium')
options = webdriver.ChromeOptions() options = webdriver.ChromeOptions()
options.add_argument('headless') options.add_argument('headless')
options.add_argument("--remote-debugging-port=9222") options.add_argument("--remote-debugging-port=9222")
@ -149,7 +161,34 @@ class fdb_spider(object):
options.add_argument('--disable-dev-shm-usage') options.add_argument('--disable-dev-shm-usage')
service = Service(executable_path='/usr/bin/chromedriver') service = Service(executable_path='/usr/bin/chromedriver')
driver = webdriver.Chrome(options=options, service=service) driver = webdriver.Chrome(options=options, service=service)
# driver = webdriver.Chrome()
driver.get(entry_jsdomain)
for i in range(len(entry_jsiteration_var_list)):
time.sleep(2)
print('trying to get element')
try:
element = driver.find_element(
"xpath",
entry_list_jslink1
+ str(entry_jsiteration_var_list[i])
+ entry_list_jslink2
)
print(entry_iteration_var_list[i])
time.sleep(2)
print('clicking..')
element.click()
time.sleep(2)
#window_after = driver.window_handles[1]
print('length of the window handles', len(driver.window_handles))
#driver.switch_to.window(window_after)
web_content = driver.page_source
f = open("spiders/pages/" + key + str(entry_iteration_var_list[i]) + "entryList.html", "w+")
f.write(web_content)
f.close
except Exception as e:
print('the iteration var element for clicking the pages was not found.. the original message is:',e )
def find_config_parameter(self, list_of_fdbs): def find_config_parameter(self, list_of_fdbs):
for fdb in list_of_fdbs: for fdb in list_of_fdbs:

Loading…
Cancel
Save