"""Scrapy Shell See documentation in docs/topics/shell.rst """ from __future__ import print_function import os import signal import warnings from twisted.internet import reactor, threads, defer from twisted.python import threadable from w3lib.url import any_to_uri from scrapy.crawler import Crawler from scrapy.exceptions import IgnoreRequest, ScrapyDeprecationWarning from scrapy.http import Request, Response from scrapy.item import BaseItem from scrapy.settings import Settings from scrapy.spiders import Spider from scrapy.utils.console import start_python_console from scrapy.utils.datatypes import SequenceExclude from scrapy.utils.misc import load_object from scrapy.utils.response import open_in_browser from scrapy.utils.conf import get_config from scrapy.utils.console import DEFAULT_PYTHON_SHELLS class Shell(object): relevant_classes = (Crawler, Spider, Request, Response, BaseItem, Settings) def __init__(self, crawler, update_vars=None, code=None): self.crawler = crawler self.update_vars = update_vars or (lambda x: None) self.item_class = load_object(crawler.settings['DEFAULT_ITEM_CLASS']) self.spider = None self.inthread = not threadable.isInIOThread() self.code = code self.vars = {} def start(self, url=None, request=None, response=None, spider=None, redirect=True): # disable accidental Ctrl-C key press from shutting down the engine signal.signal(signal.SIGINT, signal.SIG_IGN) if url: self.fetch(url, spider, redirect=redirect) elif request: self.fetch(request, spider) elif response: request = response.request self.populate_vars(response, request, spider) else: self.populate_vars() if self.code: print(eval(self.code, globals(), self.vars)) else: """ Detect interactive shell setting in scrapy.cfg e.g.: ~/.config/scrapy.cfg or ~/.scrapy.cfg [settings] # shell can be one of ipython, bpython or python; # to be used as the interactive python console, if available. # (default is ipython, fallbacks in the order listed above) shell = python """ cfg = get_config() section, option = 'settings', 'shell' env = os.environ.get('SCRAPY_PYTHON_SHELL') shells = [] if env: shells += env.strip().lower().split(',') elif cfg.has_option(section, option): shells += [cfg.get(section, option).strip().lower()] else: # try all by default shells += DEFAULT_PYTHON_SHELLS.keys() # always add standard shell as fallback shells += ['python'] start_python_console(self.vars, shells=shells, banner=self.vars.pop('banner', '')) def _schedule(self, request, spider): spider = self._open_spider(request, spider) d = _request_deferred(request) d.addCallback(lambda x: (x, spider)) self.crawler.engine.crawl(request, spider) return d def _open_spider(self, request, spider): if self.spider: return self.spider if spider is None: spider = self.crawler.spider or self.crawler._create_spider() self.crawler.spider = spider self.crawler.engine.open_spider(spider, close_if_idle=False) self.spider = spider return spider def fetch(self, request_or_url, spider=None, redirect=True, **kwargs): if isinstance(request_or_url, Request): request = request_or_url else: url = any_to_uri(request_or_url) request = Request(url, dont_filter=True, **kwargs) if redirect: request.meta['handle_httpstatus_list'] = SequenceExclude(range(300, 400)) else: request.meta['handle_httpstatus_all'] = True response = None try: response, spider = threads.blockingCallFromThread( reactor, self._schedule, request, spider) except IgnoreRequest: pass self.populate_vars(response, request, spider) def populate_vars(self, response=None, request=None, spider=None): import scrapy self.vars['scrapy'] = scrapy self.vars['crawler'] = self.crawler self.vars['item'] = self.item_class() self.vars['settings'] = self.crawler.settings self.vars['spider'] = spider self.vars['request'] = request self.vars['response'] = response self.vars['sel'] = _SelectorProxy(response) if self.inthread: self.vars['fetch'] = self.fetch self.vars['view'] = open_in_browser self.vars['shelp'] = self.print_help self.update_vars(self.vars) if not self.code: self.vars['banner'] = self.get_help() def print_help(self): print(self.get_help()) def get_help(self): b = [] b.append("Available Scrapy objects:") b.append(" scrapy scrapy module (contains scrapy.Request, scrapy.Selector, etc)") for k, v in sorted(self.vars.items()): if self._is_relevant(v): b.append(" %-10s %s" % (k, v)) b.append("Useful shortcuts:") if self.inthread: b.append(" fetch(url[, redirect=True]) " "Fetch URL and update local objects " "(by default, redirects are followed)") b.append(" fetch(req) " "Fetch a scrapy.Request and update local objects ") b.append(" shelp() Shell help (print this help)") b.append(" view(response) View response in a browser") return "\n".join("[s] %s" % l for l in b) def _is_relevant(self, value): return isinstance(value, self.relevant_classes) def inspect_response(response, spider): """Open a shell to inspect the given response""" Shell(spider.crawler).start(response=response, spider=spider) def _request_deferred(request): """Wrap a request inside a Deferred. This function is harmful, do not use it until you know what you are doing. This returns a Deferred whose first pair of callbacks are the request callback and errback. The Deferred also triggers when the request callback/errback is executed (ie. when the request is downloaded) WARNING: Do not call request.replace() until after the deferred is called. """ request_callback = request.callback request_errback = request.errback def _restore_callbacks(result): request.callback = request_callback request.errback = request_errback return result d = defer.Deferred() d.addBoth(_restore_callbacks) if request.callback: d.addCallbacks(request.callback, request.errback) request.callback, request.errback = d.callback, d.errback return d class _SelectorProxy(object): def __init__(self, response): self._proxiedresponse = response def __getattr__(self, name): warnings.warn('"sel" shortcut is deprecated. Use "response.xpath()", ' '"response.css()" or "response.selector" instead', category=ScrapyDeprecationWarning, stacklevel=2) return getattr(self._proxiedresponse.selector, name)