from __future__ import print_function
|
|
import sys, os
|
|
import optparse
|
|
import cProfile
|
|
import inspect
|
|
import pkg_resources
|
|
|
|
import scrapy
|
|
from scrapy.crawler import CrawlerProcess
|
|
from scrapy.commands import ScrapyCommand
|
|
from scrapy.exceptions import UsageError
|
|
from scrapy.utils.misc import walk_modules
|
|
from scrapy.utils.project import inside_project, get_project_settings
|
|
from scrapy.utils.python import garbage_collect
|
|
from scrapy.settings.deprecated import check_deprecated_settings
|
|
|
|
def _iter_command_classes(module_name):
    """Yield every ScrapyCommand subclass defined directly in *module_name*
    (walking its submodules), excluding the ScrapyCommand base itself and
    any classes merely imported into those modules.
    """
    # TODO: add `name` attribute to commands and merge this function with
    # scrapy.utils.spider.iter_spider_classes
    for module in walk_modules(module_name):
        for obj in vars(module).values():
            if not inspect.isclass(obj):
                continue
            if not issubclass(obj, ScrapyCommand) or obj == ScrapyCommand:
                continue
            # skip classes that were imported from another module
            if obj.__module__ == module.__name__:
                yield obj
def _get_commands_from_module(module, inproject):
    """Return a {command_name: command_instance} dict for the commands found
    under *module*. Commands that require a project are skipped unless
    *inproject* is true; the command name is the last segment of the
    command's module path.
    """
    commands = {}
    for cmd_cls in _iter_command_classes(module):
        if not inproject and cmd_cls.requires_project:
            continue
        name = cmd_cls.__module__.split('.')[-1]
        commands[name] = cmd_cls()
    return commands
def _get_commands_from_entry_points(inproject, group='scrapy.commands'):
    """Return a {name: command_instance} dict for commands registered by
    third-party packages through the given setuptools entry-point *group*.

    Raises Exception if an entry point resolves to something that is not
    a class.
    """
    commands = {}
    for entry_point in pkg_resources.iter_entry_points(group):
        loaded = entry_point.load()
        if not inspect.isclass(loaded):
            raise Exception("Invalid entry point %s" % entry_point.name)
        commands[entry_point.name] = loaded()
    return commands
def _get_commands_dict(settings, inproject):
    """Collect all available commands: built-in scrapy.commands, entry-point
    commands, and (last, so they can override) the project's optional
    COMMANDS_MODULE.
    """
    commands = _get_commands_from_module('scrapy.commands', inproject)
    commands.update(_get_commands_from_entry_points(inproject))
    custom_module = settings['COMMANDS_MODULE']
    if custom_module:
        commands.update(_get_commands_from_module(custom_module, inproject))
    return commands
def _pop_command_name(argv):
|
|
i = 0
|
|
for arg in argv[1:]:
|
|
if not arg.startswith('-'):
|
|
del argv[i]
|
|
return arg
|
|
i += 1
|
|
|
|
def _print_header(settings, inproject):
    """Print the Scrapy version banner, mentioning the active project's
    BOT_NAME when run from inside a project.
    """
    version = scrapy.__version__
    if inproject:
        print("Scrapy %s - project: %s\n" % (version, settings['BOT_NAME']))
    else:
        print("Scrapy %s - no active project\n" % version)
def _print_commands(settings, inproject):
    """Print the tool usage line and the list of available commands,
    sorted by name, with a hint when more commands would be available
    inside a project directory.
    """
    _print_header(settings, inproject)
    print("Usage:")
    print("  scrapy <command> [options] [args]\n")
    print("Available commands:")
    for name, command in sorted(_get_commands_dict(settings, inproject).items()):
        print("  %-13s %s" % (name, command.short_desc()))
    if not inproject:
        print()
        print("  [ more ]      More commands available when run from project directory")
    print()
    print('Use "scrapy <command> -h" to see more info about a command')
def _print_unknown_command(settings, cmdname, inproject):
    """Print the version banner followed by an 'unknown command' notice
    for *cmdname* and a pointer to the command listing.
    """
    _print_header(settings, inproject)
    message = "Unknown command: %s\n" % cmdname
    print(message)
    print('Use "scrapy" to see available commands')
def _run_print_help(parser, func, *a, **kw):
|
|
try:
|
|
func(*a, **kw)
|
|
except UsageError as e:
|
|
if str(e):
|
|
parser.error(str(e))
|
|
if e.print_help:
|
|
parser.print_help()
|
|
sys.exit(2)
|
|
|
|
def execute(argv=None, settings=None):
    """Main entry point of the 'scrapy' command-line tool.

    Resolves settings, discovers the available commands, dispatches to the
    one named on the command line, and exits the process with the command's
    exit code. Never returns normally: every path ends in sys.exit().

    :param argv: argument list to parse; defaults to sys.argv (and is
        mutated: the command name is removed from it).
    :param settings: a Settings object; when None, taken from the legacy
        scrapy.conf singleton if already imported, else from the project.
    """
    if argv is None:
        argv = sys.argv

    # --- backwards compatibility for scrapy.conf.settings singleton ---
    # Only honor the singleton if user code already imported scrapy.conf;
    # importing it here unconditionally would trigger its side effects.
    if settings is None and 'scrapy.conf' in sys.modules:
        from scrapy import conf
        if hasattr(conf, 'settings'):
            settings = conf.settings
    # ------------------------------------------------------------------

    if settings is None:
        settings = get_project_settings()
    # set EDITOR from environment if available
    try:
        editor = os.environ['EDITOR']
    except KeyError: pass
    else:
        settings['EDITOR'] = editor
    check_deprecated_settings(settings)

    # --- backwards compatibility for scrapy.conf.settings singleton ---
    # Publish the resolved settings back onto the legacy singleton while
    # silencing the deprecation warning its import would emit.
    import warnings
    from scrapy.exceptions import ScrapyDeprecationWarning
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", ScrapyDeprecationWarning)
        from scrapy import conf
        conf.settings = settings
    # ------------------------------------------------------------------

    inproject = inside_project()
    cmds = _get_commands_dict(settings, inproject)
    cmdname = _pop_command_name(argv)  # NOTE: removes the name from argv
    parser = optparse.OptionParser(formatter=optparse.TitledHelpFormatter(), \
        conflict_handler='resolve')
    if not cmdname:
        # No command given: show the command listing and exit successfully.
        _print_commands(settings, inproject)
        sys.exit(0)
    elif cmdname not in cmds:
        _print_unknown_command(settings, cmdname, inproject)
        sys.exit(2)

    cmd = cmds[cmdname]
    parser.usage = "scrapy %s %s" % (cmdname, cmd.syntax())
    parser.description = cmd.long_desc()
    # Command-level defaults sit at 'command' priority, below cmdline opts.
    settings.setdict(cmd.default_settings, priority='command')
    cmd.settings = settings
    cmd.add_options(parser)
    opts, args = parser.parse_args(args=argv[1:])
    _run_print_help(parser, cmd.process_options, args, opts)

    cmd.crawler_process = CrawlerProcess(settings)
    _run_print_help(parser, _run_command, cmd, args, opts)
    sys.exit(cmd.exitcode)
def _run_command(cmd, args, opts):
|
|
if opts.profile:
|
|
_run_command_profiled(cmd, args, opts)
|
|
else:
|
|
cmd.run(args, opts)
|
|
|
|
def _run_command_profiled(cmd, args, opts):
|
|
if opts.profile:
|
|
sys.stderr.write("scrapy: writing cProfile stats to %r\n" % opts.profile)
|
|
loc = locals()
|
|
p = cProfile.Profile()
|
|
p.runctx('cmd.run(args, opts)', globals(), loc)
|
|
if opts.profile:
|
|
p.dump_stats(opts.profile)
|
|
|
|
# Allow running this module directly (python -m scrapy.cmdline).
if __name__ == '__main__':
    try:
        execute()
    finally:
        # Twisted prints errors in DebugInfo.__del__, but PyPy does not run gc.collect()
        # on exit: http://doc.pypy.org/en/latest/cpython_differences.html?highlight=gc.collect#differences-related-to-garbage-collection-strategies
        garbage_collect()