diff --git a/spiders/__pycache__/fdb_spider.cpython-39.pyc b/spiders/__pycache__/fdb_spider.cpython-39.pyc index a9293e6..f60df44 100644 Binary files a/spiders/__pycache__/fdb_spider.cpython-39.pyc and b/spiders/__pycache__/fdb_spider.cpython-39.pyc differ diff --git a/spiders/config.yaml b/spiders/config.yaml index dca8a01..c0be443 100644 --- a/spiders/config.yaml +++ b/spiders/config.yaml @@ -82,15 +82,15 @@ evergabe-online: jsdomain: 'https://www.evergabe-online.de/search.html' jslink1: '/html/body/div[8]/main/div[4]/div/div/div[2]/table/thead/tr[1]/td/div[2]/div/span[' jslink2: ']' - iteration-var-list: "[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102]" - parent: "/html/body/div[8]/main/div[4]/div/div/div[2]/table/tbody" - child-name: "//tr/td[1]/div/a/text()" - child-link: "//tr/td[1]/div/a/@href" - javascript-link: "/td[6]/a" - child-info: "/td[4]/text()[1]" - child-period: "//td[2]/abbr/text()" - #child-period: "//div[@class='c-teaser__text-wrapper']//small//span/time/text()" - child-sponsor: "/tr/td[4]/text()" + jsiteration-var-list: "[1,2, 3, 4, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,6,7,8,9,10]" + iteration-var-list: "[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102]" + parent: "/html/body/div[8]/main/div[4]/div/div/div[2]/table/tbody//tr" + child-name: "//td[1]/div/a/text()" + child-link: "//td[1]/div/a/@href" + javascript-link: "" + child-info: "//td[3]/div/text()" + child-period: "//td[5]/text()" + child-sponsor: "//td[2]/div/text()" entry: general: uniform: 'TRUE' diff --git a/spiders/fdb_spider.py b/spiders/fdb_spider.py index 023adf7..9569fcf 100644 --- a/spiders/fdb_spider.py +++ b/spiders/fdb_spider.py @@ -17,6 +17,9 @@ from trafilatura import extract from pdfminer.high_level import extract_pages from pdfminer.layout import LTTextContainer +import time + + class fdb_spider(object): def __init__(self, config_file): with open(config_file, "r") as stream: @@ -81,10 +84,17 @@ class fdb_spider(object): e, ) try: - entry_jsdomain = eval(entry_list.get("jsdomain")) + entry_jsiteration_var_list = eval(entry_list.get("jsiteration-var-list")) except Exception as e: print( - "No iteration-var-list defined in config.yaml - the original error message is:", + "No jsiteration-var-list defined in config.yaml - the original error message is:", + e, + ) + try: + entry_jsdomain = entry_list.get("jsdomain") + except Exception as e: + print( + "No jsdomain defined in config.yaml - the original error message is:", e, ) entry_jsdomain = 'NONE' @@ -134,14 +144,16 @@ class fdb_spider(object): else: from selenium import webdriver from selenium.webdriver.chrome.service import Service + from pyvirtualdisplay import Display display = Display(visible=0, size=(800, 800)) display.start() - #outputdir = '.' - #service_log_path = "{}/chromedriver.log".format(outputdir) - #service_args = ['--verbose'] - #driver = webdriver.Chrome('/usr/bin/chromium') + ##outputdir = '.' + ##service_log_path = "{}/chromedriver.log".format(outputdir) + ##service_args = ['--verbose'] + ##driver = webdriver.Chrome('/usr/bin/chromium') + options = webdriver.ChromeOptions() options.add_argument('headless') options.add_argument("--remote-debugging-port=9222") @@ -149,7 +161,34 @@ class fdb_spider(object): options.add_argument('--disable-dev-shm-usage') service = Service(executable_path='/usr/bin/chromedriver') driver = webdriver.Chrome(options=options, service=service) - + # driver = webdriver.Chrome() + driver.get(entry_jsdomain) + for i in range(len(entry_jsiteration_var_list)): + time.sleep(2) + print('trying to get element') + try: + element = driver.find_element( + "xpath", + entry_list_jslink1 + + str(entry_jsiteration_var_list[i]) + + entry_list_jslink2 + ) + print(entry_iteration_var_list[i]) + time.sleep(2) + print('clicking..') + element.click() + time.sleep(2) + #window_after = driver.window_handles[1] + print('length of the window handles', len(driver.window_handles)) + #driver.switch_to.window(window_after) + web_content = driver.page_source + + f = open("spiders/pages/" + key + str(entry_iteration_var_list[i]) + "entryList.html", "w+") + f.write(web_content) + f.close + except Exception as e: + print('the iteration var element for clicking the pages was not found.. the original message is:',e ) + def find_config_parameter(self, list_of_fdbs): for fdb in list_of_fdbs: