336 lines
13 KiB
Python
336 lines
13 KiB
Python
import six
|
|
import signal
|
|
import logging
|
|
import warnings
|
|
|
|
import sys
|
|
from twisted.internet import reactor, defer
|
|
from zope.interface.verify import verifyClass, DoesNotImplement
|
|
|
|
from scrapy.core.engine import ExecutionEngine
|
|
from scrapy.resolver import CachingThreadedResolver
|
|
from scrapy.interfaces import ISpiderLoader
|
|
from scrapy.extension import ExtensionManager
|
|
from scrapy.settings import overridden_settings, Settings
|
|
from scrapy.signalmanager import SignalManager
|
|
from scrapy.exceptions import ScrapyDeprecationWarning
|
|
from scrapy.utils.ossignal import install_shutdown_handlers, signal_names
|
|
from scrapy.utils.misc import load_object
|
|
from scrapy.utils.log import (
|
|
LogCounterHandler, configure_logging, log_scrapy_info,
|
|
get_scrapy_root_handler, install_scrapy_root_handler)
|
|
from scrapy import signals
|
|
|
|
# Module-level logger shared by the classes and helpers in this module.
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class Crawler(object):
    """Bind one spider class to its own settings, signals, stats and engine.

    The constructor builds the per-crawl components (signal manager, stats
    collector, log formatter, extensions) from a private copy of the
    settings and then freezes that copy. :meth:`crawl` instantiates the
    spider and the engine and starts them; :meth:`stop` stops the engine.
    """

    def __init__(self, spidercls, settings=None):
        """Set up the crawler for ``spidercls``.

        :param spidercls: spider class; its ``update_settings`` hook is
            called with this crawler's copy of the settings
        :param settings: a settings object, a plain ``dict`` or ``None``;
            the last two are wrapped in :class:`~scrapy.settings.Settings`
        """
        if settings is None or isinstance(settings, dict):
            settings = Settings(settings)

        self.spidercls = spidercls
        # Work on a copy so the spider's overrides never leak back into the
        # settings object handed in by the caller.
        self.settings = settings.copy()
        self.spidercls.update_settings(self.settings)

        overrides = dict(overridden_settings(self.settings))
        logger.info("Overridden settings: %(settings)r",
                    {'settings': overrides})

        self.signals = SignalManager(self)
        stats_cls = load_object(self.settings['STATS_CLASS'])
        self.stats = stats_cls(self)

        counter_handler = LogCounterHandler(
            self, level=self.settings.get('LOG_LEVEL'))
        logging.root.addHandler(counter_handler)
        if get_scrapy_root_handler() is not None:
            # A scrapy root handler is already installed: refresh it so it
            # reflects this crawler's settings.
            install_scrapy_root_handler(self.settings)
        # The callable is stored on the instance so that it is not garbage
        # collected once __init__ returns.
        self.__remove_handler = (
            lambda: logging.root.removeHandler(counter_handler))
        self.signals.connect(self.__remove_handler, signals.engine_stopped)

        formatter_cls = load_object(self.settings['LOG_FORMATTER'])
        self.logformatter = formatter_cls.from_crawler(self)
        self.extensions = ExtensionManager.from_crawler(self)

        # Every component above has been built; freeze the settings.
        self.settings.freeze()
        self.crawling = False
        self.spider = None
        self.engine = None

    @property
    def spiders(self):
        """Deprecated accessor for a spider loader.

        The loader is created on first access (emitting a deprecation
        warning) and cached on the instance for later accesses.
        """
        if hasattr(self, '_spiders'):
            return self._spiders

        warnings.warn("Crawler.spiders is deprecated, use "
                      "CrawlerRunner.spider_loader or instantiate "
                      "scrapy.spiderloader.SpiderLoader with your "
                      "settings.",
                      category=ScrapyDeprecationWarning, stacklevel=2)
        self._spiders = _get_spider_loader(self.settings.frozencopy())
        return self._spiders

    @defer.inlineCallbacks
    def crawl(self, *args, **kwargs):
        """Create the spider and the engine, then start crawling.

        ``args`` and ``kwargs`` are forwarded to the spider's
        ``from_crawler`` constructor. If anything fails during start-up the
        ``crawling`` flag is reset, the engine (when already created) is
        closed, and the original exception is re-raised.
        """
        assert not self.crawling, "Crawling already taking place"
        self.crawling = True

        try:
            self.spider = self._create_spider(*args, **kwargs)
            self.engine = self._create_engine()
            requests_iter = iter(self.spider.start_requests())
            yield self.engine.open_spider(self.spider, requests_iter)
            yield defer.maybeDeferred(self.engine.start)
        except Exception:
            # In Python 2, re-raising after a ``yield`` discards the
            # original traceback (https://bugs.python.org/issue7563), so
            # the exception info is captured up front and re-raised through
            # six. The workaround also works on Python 3 but is slower and
            # unnecessary there, so a plain ``raise`` is used instead.
            if six.PY2:
                exc_info = sys.exc_info()

            self.crawling = False
            if self.engine is not None:
                yield self.engine.close()

            if six.PY2:
                six.reraise(*exc_info)
            raise

    def _create_spider(self, *args, **kwargs):
        """Instantiate the spider class through its ``from_crawler`` hook."""
        spider_factory = self.spidercls.from_crawler
        return spider_factory(self, *args, **kwargs)

    def _create_engine(self):
        """Build the execution engine, passing a callback that stops this
        crawler."""
        return ExecutionEngine(self, lambda _: self.stop())

    @defer.inlineCallbacks
    def stop(self):
        """Stop the engine if a crawl is in progress; otherwise do nothing."""
        if not self.crawling:
            return
        self.crawling = False
        yield defer.maybeDeferred(self.engine.stop)
|
|
|
|
|
|
class CrawlerRunner(object):
    """
    Helper that creates, tracks and runs crawlers inside a Twisted
    `reactor`_ that has already been set up.

    A CrawlerRunner must be instantiated with a
    :class:`~scrapy.settings.Settings` object.

    Scrapy uses this class itself, so it is only needed when writing
    scripts that drive the crawling process by hand. See
    :ref:`run-from-script` for an example.
    """

    @property
    def crawlers(self):
        """Set of :class:`crawlers <scrapy.crawler.Crawler>` started by
        :meth:`crawl` and managed by this class."""
        return self._crawlers

    def __init__(self, settings=None):
        """Store the settings and build the spider loader from them.

        ``settings`` may be a settings object, a plain ``dict`` or ``None``;
        the last two are wrapped in :class:`~scrapy.settings.Settings`.
        """
        if settings is None or isinstance(settings, dict):
            settings = Settings(settings)
        self.settings = settings
        self.spider_loader = _get_spider_loader(settings)
        # Crawlers currently managed, and the deferreds of running crawls.
        self._crawlers = set()
        self._active = set()

    @property
    def spiders(self):
        """Deprecated alias of :attr:`spider_loader`."""
        warnings.warn("CrawlerRunner.spiders attribute is renamed to "
                      "CrawlerRunner.spider_loader.",
                      category=ScrapyDeprecationWarning, stacklevel=2)
        return self.spider_loader

    def crawl(self, crawler_or_spidercls, *args, **kwargs):
        """
        Run a crawler with the provided arguments.

        The crawler's :meth:`~Crawler.crawl` method is called and the
        crawler is tracked so that it can be stopped later.

        When `crawler_or_spidercls` is not a
        :class:`~scrapy.crawler.Crawler` instance, a crawler is created
        from it, treating it as the spider class (or spider name).

        Returns a deferred that is fired when the crawling is finished.

        :param crawler_or_spidercls: already created crawler, or a spider class
            or spider's name inside the project to create it
        :type crawler_or_spidercls: :class:`~scrapy.crawler.Crawler` instance,
            :class:`~scrapy.spiders.Spider` subclass or string

        :param list args: arguments to initialize the spider

        :param dict kwargs: keyword arguments to initialize the spider
        """
        return self._crawl(
            self.create_crawler(crawler_or_spidercls), *args, **kwargs)

    def _crawl(self, crawler, *args, **kwargs):
        """Start ``crawler`` and keep it registered until its crawl ends."""
        self.crawlers.add(crawler)
        crawl_deferred = crawler.crawl(*args, **kwargs)
        self._active.add(crawl_deferred)

        def _untrack(result):
            # Runs on success and on failure alike; the result is passed
            # through untouched for later callbacks.
            self.crawlers.discard(crawler)
            self._active.discard(crawl_deferred)
            return result

        crawl_deferred.addBoth(_untrack)
        return crawl_deferred

    def create_crawler(self, crawler_or_spidercls):
        """
        Return a :class:`~scrapy.crawler.Crawler` object.

        * A Crawler passed as `crawler_or_spidercls` is returned as-is.
        * For a Spider subclass, a new Crawler is constructed for it.
        * For a string, the spider with that name is looked up in the
          Scrapy project (using the spider loader) and a Crawler instance
          is created for it.
        """
        if not isinstance(crawler_or_spidercls, Crawler):
            return self._create_crawler(crawler_or_spidercls)
        return crawler_or_spidercls

    def _create_crawler(self, spidercls):
        """Resolve a spider name if needed and wrap the class in a Crawler."""
        if isinstance(spidercls, six.string_types):
            spidercls = self.spider_loader.load(spidercls)
        return Crawler(spidercls, self.settings)

    def stop(self):
        """
        Stops simultaneously all the crawling jobs taking place.

        Returns a deferred that is fired when they all have ended.
        """
        # Iterate over a snapshot of the managed crawlers.
        stopping = [crawler.stop() for crawler in list(self.crawlers)]
        return defer.DeferredList(stopping)

    @defer.inlineCallbacks
    def join(self):
        """
        join()

        Returns a deferred that is fired when all managed :attr:`crawlers` have
        completed their executions.
        """
        # Re-check after every wait: the set of active crawls is consulted
        # again until it is empty.
        while self._active:
            yield defer.DeferredList(self._active)
|
|
|
|
|
|
class CrawlerProcess(CrawlerRunner):
    """
    A class to run multiple scrapy crawlers in a process simultaneously.

    This class extends :class:`~scrapy.crawler.CrawlerRunner` by adding support
    for starting a Twisted `reactor`_ and handling shutdown signals, like the
    keyboard interrupt command Ctrl-C. It also configures top-level logging.

    This utility should be a better fit than
    :class:`~scrapy.crawler.CrawlerRunner` if you aren't running another
    Twisted `reactor`_ within your application.

    The CrawlerProcess object must be instantiated with a
    :class:`~scrapy.settings.Settings` object.

    :param install_root_handler: whether to install root logging handler
        (default: True)

    This class shouldn't be needed (since Scrapy is responsible for using it
    accordingly) unless writing scripts that manually handle the crawling
    process. See :ref:`run-from-script` for an example.
    """

    def __init__(self, settings=None, install_root_handler=True):
        """Initialise the runner, then install the shutdown signal handlers
        and configure logging from the resulting settings."""
        super(CrawlerProcess, self).__init__(settings)
        install_shutdown_handlers(self._signal_shutdown)
        configure_logging(self.settings, install_root_handler)
        log_scrapy_info(self.settings)

    def _signal_shutdown(self, signum, _):
        """Handle the first shutdown signal with a graceful stop.

        The handlers are switched to :meth:`_signal_kill` first, so that a
        second signal forces an unclean shutdown.
        """
        install_shutdown_handlers(self._signal_kill)
        signame = signal_names[signum]
        logger.info("Received %(signame)s, shutting down gracefully. "
                    "Send again to force unclean shutdown",
                    {'signame': signame})
        # The stop is handed over to the reactor instead of being run
        # directly inside the signal handler.
        reactor.callFromThread(self._graceful_stop_reactor)

    def _signal_kill(self, signum, _):
        """Handle a repeated shutdown signal by stopping the reactor at once.

        Further shutdown signals are ignored from this point on.
        """
        install_shutdown_handlers(signal.SIG_IGN)
        signame = signal_names[signum]
        logger.info('Received %(signame)s twice, forcing unclean shutdown',
                    {'signame': signame})
        reactor.callFromThread(self._stop_reactor)

    def start(self, stop_after_crawl=True):
        """
        This method starts a Twisted `reactor`_, adjusts its pool size to
        :setting:`REACTOR_THREADPOOL_MAXSIZE`, and installs a DNS cache based
        on :setting:`DNSCACHE_ENABLED` and :setting:`DNSCACHE_SIZE`.

        If `stop_after_crawl` is True, the reactor will be stopped after all
        crawlers have finished, using :meth:`join`.

        :param boolean stop_after_crawl: stop or not the reactor when all
            crawlers have finished
        """
        if stop_after_crawl:
            d = self.join()
            # Don't start the reactor if the deferreds are already fired
            if d.called:
                return
            d.addBoth(self._stop_reactor)

        reactor.installResolver(self._get_dns_resolver())
        tp = reactor.getThreadPool()
        tp.adjustPoolsize(
            maxthreads=self.settings.getint('REACTOR_THREADPOOL_MAXSIZE'))
        # Stop all managed crawlers before the reactor shuts down.
        reactor.addSystemEventTrigger('before', 'shutdown', self.stop)
        reactor.run(installSignalHandlers=False)  # blocking call

    def _get_dns_resolver(self):
        """Build the DNS resolver for the reactor.

        The cache size comes from :setting:`DNSCACHE_SIZE` when
        :setting:`DNSCACHE_ENABLED` is true and is 0 otherwise; the timeout
        comes from :setting:`DNS_TIMEOUT`.
        """
        if self.settings.getbool('DNSCACHE_ENABLED'):
            cache_size = self.settings.getint('DNSCACHE_SIZE')
        else:
            cache_size = 0
        return CachingThreadedResolver(
            reactor=reactor,
            cache_size=cache_size,
            timeout=self.settings.getfloat('DNS_TIMEOUT')
        )

    def _graceful_stop_reactor(self):
        """Stop every crawler, then stop the reactor whatever the outcome.

        Returns the deferred produced by :meth:`stop`.
        """
        d = self.stop()
        d.addBoth(self._stop_reactor)
        return d

    def _stop_reactor(self, _=None):
        """Stop the reactor, tolerating a reactor that cannot be stopped.

        The optional argument lets this method be used as a deferred
        callback; its value is ignored.
        """
        try:
            reactor.stop()
        except RuntimeError:  # raised if already stopped or in shutdown stage
            pass
|
|
|
|
|
|
def _get_spider_loader(settings):
    """Get SpiderLoader instance from settings.

    The deprecated ``SPIDER_MANAGER_CLASS`` setting takes precedence when it
    holds a non-empty value (a deprecation warning is emitted); otherwise
    ``SPIDER_LOADER_CLASS`` is used. A second warning is emitted when the
    loaded class does not fully implement
    ``scrapy.interfaces.ISpiderLoader``.

    :param settings: settings object providing ``get`` and ``frozencopy``
    :returns: the object returned by the loader class's ``from_settings``,
        called with a frozen copy of ``settings``
    """
    # Read the deprecated option once and use the same truthiness test for
    # both the warning and the fallback, so that an empty value falls back
    # to SPIDER_LOADER_CLASS instead of being handed to load_object.
    deprecated_path = settings.get('SPIDER_MANAGER_CLASS')
    if deprecated_path:
        warnings.warn(
            'SPIDER_MANAGER_CLASS option is deprecated. '
            'Please use SPIDER_LOADER_CLASS.',
            category=ScrapyDeprecationWarning, stacklevel=2
        )
    cls_path = deprecated_path or settings.get('SPIDER_LOADER_CLASS')
    loader_cls = load_object(cls_path)
    try:
        verifyClass(ISpiderLoader, loader_cls)
    except DoesNotImplement:
        # An incomplete loader is tolerated: warn and carry on.
        warnings.warn(
            'SPIDER_LOADER_CLASS (previously named SPIDER_MANAGER_CLASS) does '
            'not fully implement scrapy.interfaces.ISpiderLoader interface. '
            'Please add all missing methods to avoid unexpected runtime errors.',
            category=ScrapyDeprecationWarning, stacklevel=2
        )
    return loader_cls.from_settings(settings.frozencopy())
|