  1. """Scrapy Shell
  2. See documentation in docs/topics/shell.rst
  3. """
  4. from __future__ import print_function
  5. import os
  6. import signal
  7. import warnings
  8. from twisted.internet import reactor, threads, defer
  9. from twisted.python import threadable
  10. from w3lib.url import any_to_uri
  11. from scrapy.crawler import Crawler
  12. from scrapy.exceptions import IgnoreRequest, ScrapyDeprecationWarning
  13. from scrapy.http import Request, Response
  14. from scrapy.item import BaseItem
  15. from scrapy.settings import Settings
  16. from scrapy.spiders import Spider
  17. from scrapy.utils.console import start_python_console
  18. from scrapy.utils.datatypes import SequenceExclude
  19. from scrapy.utils.misc import load_object
  20. from scrapy.utils.response import open_in_browser
  21. from scrapy.utils.conf import get_config
  22. from scrapy.utils.console import DEFAULT_PYTHON_SHELLS


class Shell(object):

    relevant_classes = (Crawler, Spider, Request, Response, BaseItem,
                        Settings)

    def __init__(self, crawler, update_vars=None, code=None):
        self.crawler = crawler
        self.update_vars = update_vars or (lambda x: None)
        self.item_class = load_object(crawler.settings['DEFAULT_ITEM_CLASS'])
        self.spider = None
        self.inthread = not threadable.isInIOThread()
        self.code = code
        self.vars = {}
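
    # Entry point: fetch the given url/request (or reuse the given response),
    # expose the shell variables, then either evaluate the passed code string
    # or drop into an interactive Python console.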
    def start(self, url=None, request=None, response=None, spider=None, redirect=True):
        # disable accidental Ctrl-C key press from shutting down the engine
        signal.signal(signal.SIGINT, signal.SIG_IGN)
        if url:
            self.fetch(url, spider, redirect=redirect)
        elif request:
            self.fetch(request, spider)
        elif response:
            request = response.request
            self.populate_vars(response, request, spider)
        else:
            self.populate_vars()
        if self.code:
            print(eval(self.code, globals(), self.vars))
        else:
            """
            Detect interactive shell setting in scrapy.cfg
            e.g.: ~/.config/scrapy.cfg or ~/.scrapy.cfg
            [settings]
            # shell can be one of ipython, bpython or python;
            # to be used as the interactive python console, if available.
            # (default is ipython, fallbacks in the order listed above)
            shell = python
            """
            cfg = get_config()
            section, option = 'settings', 'shell'
            env = os.environ.get('SCRAPY_PYTHON_SHELL')
            shells = []
            if env:
                shells += env.strip().lower().split(',')
            elif cfg.has_option(section, option):
                shells += [cfg.get(section, option).strip().lower()]
            else:  # try all by default
                shells += DEFAULT_PYTHON_SHELLS.keys()
            # always add standard shell as fallback
            shells += ['python']
            start_python_console(self.vars, shells=shells,
                                 banner=self.vars.pop('banner', ''))
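
    # Hand the request to the running engine (from the reactor thread) and
    # return a Deferred that fires with (response, spider) once the request
    # has been downloaded.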
    def _schedule(self, request, spider):
        spider = self._open_spider(request, spider)
        d = _request_deferred(request)
        d.addCallback(lambda x: (x, spider))
        self.crawler.engine.crawl(request, spider)
        return d
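
    # Reuse the shell's spider if one is already open; otherwise take the
    # crawler's spider (or create one) and open it in the engine, keeping it
    # open even when idle.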
    def _open_spider(self, request, spider):
        if self.spider:
            return self.spider

        if spider is None:
            spider = self.crawler.spider or self.crawler._create_spider()

        self.crawler.spider = spider
        self.crawler.engine.open_spider(spider, close_if_idle=False)
        self.spider = spider
        return spider
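
    # The fetch() shortcut exposed in the shell: build a Request from a URL
    # if needed, download it by blocking on the reactor thread, and refresh
    # the shell variables with the new request/response.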
    def fetch(self, request_or_url, spider=None, redirect=True, **kwargs):
        if isinstance(request_or_url, Request):
            request = request_or_url
        else:
            url = any_to_uri(request_or_url)
            request = Request(url, dont_filter=True, **kwargs)
            if redirect:
                request.meta['handle_httpstatus_list'] = SequenceExclude(range(300, 400))
            else:
                request.meta['handle_httpstatus_all'] = True
        response = None
        try:
            response, spider = threads.blockingCallFromThread(
                reactor, self._schedule, request, spider)
        except IgnoreRequest:
            pass
        self.populate_vars(response, request, spider)
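
    # (Re)build the namespace the interactive console sees: the scrapy module,
    # crawler, settings, current request/response and the helper shortcuts
    # (fetch, view, shelp).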
    def populate_vars(self, response=None, request=None, spider=None):
        import scrapy

        self.vars['scrapy'] = scrapy
        self.vars['crawler'] = self.crawler
        self.vars['item'] = self.item_class()
        self.vars['settings'] = self.crawler.settings
        self.vars['spider'] = spider
        self.vars['request'] = request
        self.vars['response'] = response
        self.vars['sel'] = _SelectorProxy(response)
        if self.inthread:
            self.vars['fetch'] = self.fetch
        self.vars['view'] = open_in_browser
        self.vars['shelp'] = self.print_help
        self.update_vars(self.vars)
        if not self.code:
            self.vars['banner'] = self.get_help()

    def print_help(self):
        print(self.get_help())

    def get_help(self):
        b = []
        b.append("Available Scrapy objects:")
        b.append("  scrapy     scrapy module (contains scrapy.Request, scrapy.Selector, etc)")
        for k, v in sorted(self.vars.items()):
            if self._is_relevant(v):
                b.append("  %-10s %s" % (k, v))
        b.append("Useful shortcuts:")
        if self.inthread:
            b.append("  fetch(url[, redirect=True]) "
                     "Fetch URL and update local objects "
                     "(by default, redirects are followed)")
            b.append("  fetch(req)                  "
                     "Fetch a scrapy.Request and update local objects ")
        b.append("  shelp()           Shell help (print this help)")
        b.append("  view(response)    View response in a browser")

        return "\n".join("[s] %s" % l for l in b)

    def _is_relevant(self, value):
        return isinstance(value, self.relevant_classes)


def inspect_response(response, spider):
    """Open a shell to inspect the given response"""
    Shell(spider.crawler).start(response=response, spider=spider)


def _request_deferred(request):
    """Wrap a request inside a Deferred.

    This function is harmful, do not use it until you know what you are doing.

    This returns a Deferred whose first pair of callbacks are the request
    callback and errback. The Deferred also triggers when the request
    callback/errback is executed (ie. when the request is downloaded)

    WARNING: Do not call request.replace() until after the deferred is
    called.
    """
    request_callback = request.callback
    request_errback = request.errback

    def _restore_callbacks(result):
        request.callback = request_callback
        request.errback = request_errback
        return result

    d = defer.Deferred()
    d.addBoth(_restore_callbacks)
    if request.callback:
        d.addCallbacks(request.callback, request.errback)

    request.callback, request.errback = d.callback, d.errback
    return d


class _SelectorProxy(object):

    def __init__(self, response):
        self._proxiedresponse = response

    def __getattr__(self, name):
        warnings.warn('"sel" shortcut is deprecated. Use "response.xpath()", '
                      '"response.css()" or "response.selector" instead',
                      category=ScrapyDeprecationWarning, stacklevel=2)
        return getattr(self._proxiedresponse.selector, name)