You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

186 lines
6.7 KiB

import ast
import json
import os
import urllib.error
import urllib.parse
import urllib.request

import lxml.html
import lxml.html.soupparser
import yaml
from lxml import etree
  8. class fdb_spider(object):
  9. def __init__(self, config_file):
  10. with open(config_file, "r") as stream:
  11. try:
  12. self.config = yaml.safe_load(stream)
  13. except yaml.YAMLError as exc:
  14. print(exc)
  15. # input list of funding databases in form of yaml file ['foerderinfo.bund.de', 'ausschreibungen.giz.de', .. , 'usw']
  16. def download_link_list_pages_of_funding_databases(self, list_of_fdbs):
  17. # download only html pages of the funding databases specified in input
  18. for fdb in list_of_fdbs:
  19. for key in self.config:
  20. if key in list_of_fdbs:
  21. try:
  22. entry_list = self.config.get(key).get("entry_list")
  23. except Exception as e:
  24. print(
  25. "There is a problem with the configuration variable entryList in the config.yaml - the original error message is:",
  26. e,
  27. )
  28. try:
  29. entry_list_link = entry_list.get("link")
  30. except Exception as e:
  31. print(
  32. "No entryListLink defined in config.yaml - the original error message is:",
  33. e,
  34. )
  35. # download the html page of the List of entrys
  36. response = urllib.request.urlopen(entry_list_link)
  37. web_content = response.read().decode("UTF-8")
  38. # save interim results to files
  39. f = open("spiders/pages/" + key + "entryList.html", "w+")
  40. f.write(webContent)
  41. f.close
  42. def parse_entry_list_data2dictionary(self, list_of_fdbs):
  43. for fdb in list_of_fdbs:
  44. try:
  45. # use soupparser to handle broken html
  46. tree = lxml.html.soupparser.parse(
  47. "spiders/pages/" + fdb + "entryList.html"
  48. )
  49. # for e in tree.iter():
  50. #
  51. # print(e.tag)
  52. #
  53. # for e in tree.xpath('//html//body//form//table//tr//td//table//tr'):
  54. #
  55. # #print(etree.tostring(e).decode())
  56. dictionary_entry_list = {}
  57. fdb_conf = self.config.get(fdb)
  58. fdb_domain = fdb_conf.get("domain")
  59. fdb_conf_entry_list = fdb_conf.get("entryList")
  60. fdb_conf_entry_list_parent = fdb_conf_entry_list.get("parent")
  61. fdb_conf_entry_list_child_name = fdb_conf_entry_list.get("child-name")
  62. fdb_conf_entry_list_child_link = fdb_conf_entry_list.get("child-link")
  63. for n in range(len(tree.xpath(fdb_conf_entry_list_parent))):
  64. name = tree.xpath(
  65. fdb_conf_entry_list_parent
  66. + "["
  67. + str(n)
  68. + "]"
  69. + fdb_conf_entry_list_child_name
  70. )
  71. link = tree.xpath(
  72. fdb_conf_entry_list_parent
  73. + "["
  74. + str(n)
  75. + "]"
  76. + fdb_conf_entry_list_child_link
  77. )
  78. if len(name) > 0:
  79. dictionary_entry_list[n] = {}
  80. dictionary_entry_list[n]["name"] = name[0]
  81. if fdb_domain in link[0]:
  82. dictionary_entry_list[n]["link"] = link[0]
  83. if fdb_domain not in link[0]:
  84. dictionary_entry_list[n]["link"] = fdb_domain + link[0]
  85. except Exception as e:
  86. print(
  87. "parsing the html did not work. Possibly you first have to run download_link_list_pages_of_funding_databases(). The original error message is:",
  88. e,
  89. )
  90. # save interim results to files
  91. f = open("spiders/output/" + fdb + "entryList.txt", "w+")
  92. f.write(str(dictionary_entry_list))
  93. f.close
  94. def download_entry_data_htmls(self, list_of_fdbs):
  95. for fdb in list_of_fdbs:
  96. f = open("spiders/output/" + fdb + "entryList.txt")
  97. text = f.read()
  98. dictionary_entry_list = eval(text)
  99. for entry_id in dictionary_entry_list:
  100. entry_link = dictionary_entry_list[entry_id]["link"]
  101. # download the html page of the entry
  102. response = urllib.request.urlopen(entry_link)
  103. web_content = response.read().decode("UTF-8")
  104. # save interim results to files
  105. file_name = "spiders/pages/" + fdb + "/" + str(entry_id) + ".html"
  106. os.makedirs(os.path.dirname(file_name), exist_ok=True)
  107. f = open(file_name, "w+")
  108. f.write(web_content)
  109. f.close
  110. def parse_entry_data2dictionary(self, list_of_fdbs):
  111. for fdb in list_of_fdbs:
  112. print("started to parse data of entry of " + fdb + " ..")
  113. f = open("spiders/output/" + fdb + "entryList.txt")
  114. text = f.read()
  115. dictionary_entry_list = eval(text)
  116. fdb_conf = self.config.get(fdb)
  117. fdb_domain = fdb_conf.get("domain")
  118. fdb_conf_entry = fdb_conf.get("entry")
  119. fdb_conf_entry_info1 = fdb_conf_entry.get("info-1")
  120. fdb_conf_entry_info1_parent = fdb_conf_entry_info1.get("parent")
  121. fdb_conf_entry_info1_child_1 = fdb_conf_entry_info1.get(
  122. "child-1"
  123. )
  124. for entry_id in dictionary_entry_list:
  125. print(
  126. "started to parse data of entry with name "
  127. + dictionary_entry_list[entry_id]["name"]
  128. + " .."
  129. )
  130. file_name = "spiders/pages/" + fdb + "/" + str(entry_id) + ".html"
  131. tree = lxml.html.soupparser.parse(file_name)
  132. child_1 = tree.xpath(
  133. fdb_conf_entry_info1_parent
  134. + fdb_conf_entry_info1_child_1
  135. )
  136. print("oi", child_1)
  137. if len(child_1) > 0:
  138. dictionary_entry_list[entry_id]["child_1"] = child_1[
  139. 0
  140. ]
  141. f = open("spiders/output/" + fdb + "entryList.txt", "w+")
  142. f.write(str(dictionary_entry_list))
  143. f.close