---
# Settings for the PEP crawler per country to crawl.
# Follow the syntax and don't use tbody, as it gets added by the browser
# (when researching xpath through the inspector).
# xpath syntax: https://www.w3schools.com/xml/xpath_syntax.asp
# lxml xpath syntax: https://www.geeksforgeeks.org/web-scraping-using-lxml-and-xpath-in-python/
#
# NOTE(review): the nesting below is inferred from the key names (listing keys
# under entry-list, per-entry keys under entry) - confirm it against the
# crawler's config loader.
greenjobs:
  domain: 'https://www.greenjobs.de'

  # Listing page settings.
  # Presumably the crawled URL is link1 + <iteration var> + link2 - TODO confirm.
  entry-list:
    link1: 'https://www.greenjobs.de/angebote/index.html?s=&loc=&countrycode=de&dist='
    link2: '0&lng=&lat='
    jsdomain: 'NONE'
    # jslink1: '/html/body/div[8]/main/div[4]/div/div/div[2]/table/thead/tr[1]/td/div[2]/div/span['
    # jslink2: ']'
    # jsiteration-var-list: "[1,2, 3, 4, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,6,7,8,9,10]"
    # The value is a string holding a list literal, not a YAML sequence.
    iteration-var-list: "['1']"
    # NOTE(review): this XPath contains tbody although the header advises
    # against it - confirm the served HTML really contains <tbody>.
    parent: "/html/body/div/div/div/div/main/div[2]/table/tbody//tr"
    # XPaths for the individual fields; presumably evaluated relative to each
    # node matched by parent - TODO confirm.
    child-name: "//td[1]/a/text()"
    child-link: "//td[1]/a/@href"
    # javascript-link: ""
    child-info: "//td[5]/text()"
    child-period: "//td[6]/text()"
    child-sponsor: "//td[3]/text()"

  # Settings for a single entry page.
  entry:
    general:
      # Kept as the string 'TRUE', not a YAML boolean.
      uniform: 'TRUE'
    # Presumably used when uniform is 'TRUE' - verify against the crawler.
    unitrue:
      # parent: '//html//body//form//table'
      # NOTE(review): this XPath also contains tbody - see the header advice.
      text: '/html/body/div[2]/div[4]/div/div[5]/div/table/tbody/tr/td[5]/a/@href'
      # child-sum: '//html//body//form//table//tr[2]//td[1]//span//img'
      # child-deadline: '//html/body/form/table/tr[2]/td[3]/span + label.1'
    # Presumably used when uniform is not 'TRUE' - verify against the crawler.
    unifalse:
      # The value is a string holding a list literal, not a YAML sequence.
      wordlist: "['Mobilität', 'Energie', 'Off-grid', 'regenerative Energien', 'Solar', 'Energienetze', 'Elektromobilität']"