You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

84 lines
2.9 KiB

4 years ago
  1. # -*- coding: utf-8 -*-
  2. from __future__ import absolute_import
  3. from collections import defaultdict
  4. import traceback
  5. import warnings
  6. from zope.interface import implementer
  7. from scrapy.interfaces import ISpiderLoader
  8. from scrapy.utils.misc import walk_modules
  9. from scrapy.utils.spider import iter_spider_classes
  @implementer(ISpiderLoader)
  class SpiderLoader(object):
      """
      SpiderLoader is a class which locates and loads spiders
      in a Scrapy project.

      On construction it walks every module named in the ``SPIDER_MODULES``
      setting and registers each spider class it finds under the class's
      ``name`` attribute. When several classes share a name, the one
      registered last is the one kept for lookups, and a ``UserWarning``
      listing all of the clashing classes is emitted.
      """

      def __init__(self, settings):
          """
          Read the loader configuration from ``settings`` and load all spiders.

          ``settings`` must provide ``getlist`` and ``getbool``; the keys read
          are ``SPIDER_MODULES`` and ``SPIDER_LOADER_WARN_ONLY``.
          """
          self.spider_modules = settings.getlist('SPIDER_MODULES')
          self.warn_only = settings.getbool('SPIDER_LOADER_WARN_ONLY')
          # spider name -> spider class (last class seen with that name wins)
          self._spiders = {}
          # spider name -> [(module name, class name), ...] for every class
          # seen with that name; used only to report duplicates
          self._found = defaultdict(list)
          self._load_all_spiders()

      def _check_name_duplicates(self):
          """
          Emit a single ``UserWarning`` describing every spider name that was
          registered by more than one class. Do nothing if there are none.
          """
          entry = " {cls} named {name!r} (in {module})"
          dupes = []
          for name, locations in self._found.items():
              if len(locations) < 2:
                  continue
              # One paragraph per duplicated name, one line per offending class.
              dupes.append("\n".join(
                  entry.format(module=mod, cls=cls, name=name)
                  for mod, cls in locations))

          if not dupes:
              return

          msg = ("There are several spiders with the same name:\n\n"
                 "{}\n\n This can cause unexpected behavior.".format(
                     "\n\n".join(dupes)))
          warnings.warn(msg, UserWarning)

      def _load_spiders(self, module):
          """
          Register every spider class yielded by ``iter_spider_classes`` for
          ``module``, recording where each one came from.
          """
          for spcls in iter_spider_classes(module):
              self._found[spcls.name].append((module.__name__, spcls.__name__))
              self._spiders[spcls.name] = spcls

      def _load_all_spiders(self):
          """
          Load spiders from every entry of ``self.spider_modules`` and then
          check for duplicated spider names.

          An ``ImportError`` raised while walking an entry is re-raised unless
          ``self.warn_only`` is true, in which case it is reported as a
          ``RuntimeWarning`` (including the traceback) and loading continues
          with the next entry.
          """
          for name in self.spider_modules:
              try:
                  for module in walk_modules(name):
                      self._load_spiders(module)
              except ImportError:
                  if not self.warn_only:
                      raise
                  msg = ("\n{tb}Could not load spiders from module '{modname}'. "
                         "See above traceback for details.".format(
                             modname=name, tb=traceback.format_exc()))
                  warnings.warn(msg, RuntimeWarning)
          self._check_name_duplicates()

      @classmethod
      def from_settings(cls, settings):
          """
          Alternate constructor: build a loader from ``settings``.
          """
          return cls(settings)

      def load(self, spider_name):
          """
          Return the Spider class for the given spider name. If the spider
          name is not found, raise a KeyError.
          """
          try:
              return self._spiders[spider_name]
          except KeyError:
              # Replace the bare key with a message that says what was missing.
              raise KeyError("Spider not found: {}".format(spider_name))

      def find_by_request(self, request):
          """
          Return the list of spider names that can handle the given request.
          """
          return [name for name, cls in self._spiders.items()
                  if cls.handles_request(request)]

      def list(self):
          """
          Return a list with the names of all spiders available in the project.
          """
          return list(self._spiders.keys())