208 lines
7.4 KiB
Python
208 lines
7.4 KiB
Python
|
"""Scrapy Shell
|
||
|
|
||
|
See documentation in docs/topics/shell.rst
|
||
|
|
||
|
"""
|
||
|
from __future__ import print_function
|
||
|
|
||
|
import os
|
||
|
import signal
|
||
|
import warnings
|
||
|
|
||
|
from twisted.internet import reactor, threads, defer
|
||
|
from twisted.python import threadable
|
||
|
from w3lib.url import any_to_uri
|
||
|
|
||
|
from scrapy.crawler import Crawler
|
||
|
from scrapy.exceptions import IgnoreRequest, ScrapyDeprecationWarning
|
||
|
from scrapy.http import Request, Response
|
||
|
from scrapy.item import BaseItem
|
||
|
from scrapy.settings import Settings
|
||
|
from scrapy.spiders import Spider
|
||
|
from scrapy.utils.console import start_python_console
|
||
|
from scrapy.utils.datatypes import SequenceExclude
|
||
|
from scrapy.utils.misc import load_object
|
||
|
from scrapy.utils.response import open_in_browser
|
||
|
from scrapy.utils.conf import get_config
|
||
|
from scrapy.utils.console import DEFAULT_PYTHON_SHELLS
|
||
|
|
||
|
|
||
|
class Shell(object):
    """Console session bound to a crawler.

    A ``Shell`` downloads requests through the crawler's engine, publishes
    the outcome (request, response, spider, ...) in ``self.vars`` and then
    either evaluates a code string or opens an interactive Python console
    with those variables as its namespace.
    """

    # Values of these types are the ones listed by ``get_help()`` under
    # "Available Scrapy objects".
    relevant_classes = (Crawler, Spider, Request, Response, BaseItem,
                        Settings)

    def __init__(self, crawler, update_vars=None, code=None):
        """Prepare a shell for *crawler*.

        :param crawler: crawler whose ``settings``, ``engine`` and ``spider``
            attributes are used by this shell.
        :param update_vars: optional callable invoked with the variables
            dict every time it is (re)populated; defaults to a no-op.
        :param code: optional expression string; when truthy, ``start()``
            evaluates and prints it instead of opening a console.
        """
        self.crawler = crawler
        self.update_vars = update_vars or (lambda x: None)
        self.item_class = load_object(crawler.settings['DEFAULT_ITEM_CLASS'])
        self.spider = None
        # True when this object is created outside Twisted's I/O thread;
        # only then is the blocking ``fetch`` shortcut offered.
        self.inthread = not threadable.isInIOThread()
        self.code = code
        self.vars = {}

    def start(self, url=None, request=None, response=None, spider=None, redirect=True):
        """Populate the shell variables and run the code or the console.

        The first truthy argument among *url*, *request* and *response*
        decides how the variables are filled: a URL or request is fetched,
        a response is used as-is together with ``response.request``, and
        with none of them the variables are populated empty.

        *redirect* is forwarded to ``fetch`` only when a *url* is given.
        """
        # disable accidental Ctrl-C key press from shutting down the engine
        signal.signal(signal.SIGINT, signal.SIG_IGN)
        if url:
            self.fetch(url, spider, redirect=redirect)
        elif request:
            self.fetch(request, spider)
        elif response:
            request = response.request
            self.populate_vars(response, request, spider)
        else:
            self.populate_vars()
        if self.code:
            # NOTE(security): this evaluates whatever expression the caller
            # passed as ``code``; it must never come from untrusted input.
            print(eval(self.code, globals(), self.vars))
        else:
            shells = self._get_shell_names()
            start_python_console(self.vars, shells=shells,
                                 banner=self.vars.pop('banner', ''))

    def _get_shell_names(self):
        """Return the console names to try, in order of preference.

        The interactive shell can be chosen in scrapy.cfg,
        e.g.: ~/.config/scrapy.cfg or ~/.scrapy.cfg

            [settings]
            # shell can be one of ipython, bpython or python;
            # to be used as the interactive python console, if available.
            # (default is ipython, fallbacks in the order listed above)
            shell = python

        The ``SCRAPY_PYTHON_SHELL`` environment variable (a comma-separated
        list) takes precedence over the config file. Without either, every
        name in ``DEFAULT_PYTHON_SHELLS`` is tried. ``'python'`` is always
        appended as the last fallback.
        """
        cfg = get_config()
        section, option = 'settings', 'shell'
        env = os.environ.get('SCRAPY_PYTHON_SHELL')
        shells = []
        if env:
            # Strip every entry (not just the whole string) so that values
            # such as "ipython, bpython" still match the known shell names,
            # and drop empty entries left by stray commas.
            candidates = (name.strip() for name in env.lower().split(','))
            shells.extend(name for name in candidates if name)
        elif cfg.has_option(section, option):
            shells.append(cfg.get(section, option).strip().lower())
        else:  # try all by default
            shells.extend(DEFAULT_PYTHON_SHELLS.keys())
        # always add standard shell as fallback
        shells.append('python')
        return shells

    def _schedule(self, request, spider):
        """Hand *request* to the engine and return a Deferred for its result.

        The returned Deferred's result is paired with the spider that was
        opened, i.e. it yields a ``(result, spider)`` tuple on success.
        """
        spider = self._open_spider(request, spider)
        d = _request_deferred(request)
        d.addCallback(lambda x: (x, spider))
        self.crawler.engine.crawl(request, spider)
        return d

    def _open_spider(self, request, spider):
        """Return the spider used by this shell, opening it on first use.

        Once a spider has been opened it is cached in ``self.spider`` and
        returned for every later call, regardless of the *spider* argument.
        Otherwise *spider* is used if given, falling back to the crawler's
        current spider or a newly created one.
        """
        if self.spider:
            return self.spider

        if spider is None:
            spider = self.crawler.spider or self.crawler._create_spider()

        self.crawler.spider = spider
        # close_if_idle=False is passed so the spider stays open between
        # fetches (presumably the engine would otherwise close it when idle).
        self.crawler.engine.open_spider(spider, close_if_idle=False)
        self.spider = spider
        return spider

    def fetch(self, request_or_url, spider=None, redirect=True, **kwargs):
        """Download a request or URL and refresh the shell variables.

        :param request_or_url: a ``Request`` (used unchanged) or a URL/path
            that is converted with ``any_to_uri`` into a new, unfiltered
            ``Request``.
        :param spider: spider to use; see ``_open_spider`` for the fallback.
        :param redirect: only applies when a URL is given. If true, the
            request's ``handle_httpstatus_list`` meta key is set to every
            status except 300-399; if false, ``handle_httpstatus_all`` is
            set instead.
        :param kwargs: extra ``Request`` arguments, used only for URLs.

        The call blocks the calling thread until the reactor has processed
        the request. If the request is ignored (``IgnoreRequest``) the
        variables are populated with ``response`` set to ``None``.
        """
        if isinstance(request_or_url, Request):
            request = request_or_url
        else:
            url = any_to_uri(request_or_url)
            request = Request(url, dont_filter=True, **kwargs)
            if redirect:
                request.meta['handle_httpstatus_list'] = SequenceExclude(range(300, 400))
            else:
                request.meta['handle_httpstatus_all'] = True
        response = None
        try:
            response, spider = threads.blockingCallFromThread(
                reactor, self._schedule, request, spider)
        except IgnoreRequest:
            pass
        self.populate_vars(response, request, spider)

    def populate_vars(self, response=None, request=None, spider=None):
        """Fill ``self.vars`` with the objects and shortcuts of the shell.

        After the built-in entries are set, ``self.update_vars`` is called
        with the dict so callers can add or override entries. The help text
        is stored under ``'banner'`` unless the shell runs a code string.
        """
        import scrapy

        self.vars['scrapy'] = scrapy
        self.vars['crawler'] = self.crawler
        self.vars['item'] = self.item_class()
        self.vars['settings'] = self.crawler.settings
        self.vars['spider'] = spider
        self.vars['request'] = request
        self.vars['response'] = response
        self.vars['sel'] = _SelectorProxy(response)
        if self.inthread:
            self.vars['fetch'] = self.fetch
        self.vars['view'] = open_in_browser
        self.vars['shelp'] = self.print_help
        self.update_vars(self.vars)
        if not self.code:
            self.vars['banner'] = self.get_help()

    def print_help(self):
        """Print the text returned by ``get_help()``."""
        print(self.get_help())

    def get_help(self):
        """Return the help text: available objects and useful shortcuts.

        Every line of the result is prefixed with ``"[s] "``.
        """
        b = []
        b.append("Available Scrapy objects:")
        b.append(" scrapy scrapy module (contains scrapy.Request, scrapy.Selector, etc)")
        for k, v in sorted(self.vars.items()):
            if self._is_relevant(v):
                b.append(" %-10s %s" % (k, v))
        b.append("Useful shortcuts:")
        if self.inthread:
            b.append(" fetch(url[, redirect=True]) "
                     "Fetch URL and update local objects "
                     "(by default, redirects are followed)")
            b.append(" fetch(req) "
                     "Fetch a scrapy.Request and update local objects ")
        b.append(" shelp() Shell help (print this help)")
        b.append(" view(response) View response in a browser")

        return "\n".join("[s] %s" % line for line in b)

    def _is_relevant(self, value):
        """Tell whether *value* is an instance of one of ``relevant_classes``."""
        return isinstance(value, self.relevant_classes)
|
||
|
|
||
|
|
||
|
def inspect_response(response, spider):
    """Open a shell to inspect the given response.

    ``Shell.start`` replaces the SIGINT handler with ``signal.SIG_IGN``.
    The handler that was active before the shell was opened is saved here
    and reinstalled when the shell returns (or raises), so that Ctrl-C
    keeps working for the caller once the inspection is over.

    :param response: response to expose in the shell.
    :param spider: spider that produced it; its ``crawler`` attribute is
        the crawler the shell is bound to.
    """
    previous_handler = signal.getsignal(signal.SIGINT)
    try:
        Shell(spider.crawler).start(response=response, spider=spider)
    finally:
        # getsignal() returns None when the previous handler was not
        # installed from Python; that value cannot be passed back to
        # signal.signal(), so there is nothing to restore in that case.
        if previous_handler is not None:
            signal.signal(signal.SIGINT, previous_handler)
|
||
|
|
||
|
|
||
|
def _request_deferred(request):
    """Return a Deferred that stands in for the callbacks of *request*.

    Dangerous helper: only use it if you understand exactly what it does.

    The request's ``callback`` and ``errback`` are temporarily replaced by
    the Deferred's own ``callback``/``errback``, so the Deferred fires when
    the request's callback or errback would have been invoked (i.e. once
    the request has been downloaded). The first step of the Deferred's
    chain puts the original callbacks back on the request; the next one,
    if the request had a callback, runs the original callback/errback pair.

    WARNING: Do not call request.replace() until after the deferred is called.
    """
    original_callback, original_errback = request.callback, request.errback

    def _put_back(outcome):
        # Runs for success and failure alike and passes the outcome through
        # untouched, after restoring the request's own callbacks.
        request.callback, request.errback = original_callback, original_errback
        return outcome

    deferred = defer.Deferred()
    deferred.addBoth(_put_back)
    if original_callback:
        deferred.addCallbacks(original_callback, original_errback)

    # From now on, invoking the request's callback/errback fires the Deferred.
    request.callback = deferred.callback
    request.errback = deferred.errback
    return deferred
|
||
|
|
||
|
|
||
|
class _SelectorProxy(object):
    """Backing object of the deprecated ``sel`` shell shortcut.

    Attributes not found on the proxy itself are looked up on the wrapped
    response's ``selector``, after emitting a deprecation warning.
    """

    def __init__(self, response):
        """Remember the response whose selector is proxied."""
        self._proxiedresponse = response

    def __getattr__(self, name):
        """Warn about the deprecation, then delegate to the selector."""
        message = ('"sel" shortcut is deprecated. Use "response.xpath()", '
                   '"response.css()" or "response.selector" instead')
        # stacklevel=2 attributes the warning to the code touching ``sel``.
        warnings.warn(message, category=ScrapyDeprecationWarning, stacklevel=2)
        selector = self._proxiedresponse.selector
        return getattr(selector, name)
|