from __future__ import print_function
import sys
import os
import optparse
import cProfile
import inspect
import pkg_resources

import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.commands import ScrapyCommand
from scrapy.exceptions import UsageError
from scrapy.utils.misc import walk_modules
from scrapy.utils.project import inside_project, get_project_settings
from scrapy.utils.python import garbage_collect
from scrapy.settings.deprecated import check_deprecated_settings


def _iter_command_classes(module_name):
    # TODO: add `name` attribute to commands and merge this function with
    # scrapy.utils.spider.iter_spider_classes
    for module in walk_modules(module_name):
        for obj in vars(module).values():
            if inspect.isclass(obj) and \
                    issubclass(obj, ScrapyCommand) and \
                    obj.__module__ == module.__name__ and \
                    obj is not ScrapyCommand:
                yield obj


def _get_commands_from_module(module, inproject):
    d = {}
    for cmd in _iter_command_classes(module):
        if inproject or not cmd.requires_project:
            cmdname = cmd.__module__.split('.')[-1]
            d[cmdname] = cmd()
    return d


def _get_commands_from_entry_points(inproject, group='scrapy.commands'):
    # Commands provided by third-party packages via setuptools entry points.
    cmds = {}
    for entry_point in pkg_resources.iter_entry_points(group):
        obj = entry_point.load()
        if inspect.isclass(obj):
            cmds[entry_point.name] = obj()
        else:
            raise Exception("Invalid entry point %s" % entry_point.name)
    return cmds


def _get_commands_dict(settings, inproject):
    cmds = _get_commands_from_module('scrapy.commands', inproject)
    cmds.update(_get_commands_from_entry_points(inproject))
    cmds_module = settings['COMMANDS_MODULE']
    if cmds_module:
        cmds.update(_get_commands_from_module(cmds_module, inproject))
    return cmds


def _pop_command_name(argv):
    # Remove and return the first non-option argument (the command name),
    # leaving argv[0] and any options in place for the command's own parser.
    for i in range(1, len(argv)):
        if not argv[i].startswith('-'):
            return argv.pop(i)


def _print_header(settings, inproject):
    if inproject:
        print("Scrapy %s - project: %s\n" % (scrapy.__version__,
                                             settings['BOT_NAME']))
    else:
        print("Scrapy %s - no active project\n" % scrapy.__version__)


def _print_commands(settings, inproject):
    _print_header(settings, inproject)
    print("Usage:")
    print("  scrapy <command> [options] [args]\n")
    print("Available commands:")
    cmds = _get_commands_dict(settings, inproject)
    for cmdname, cmdclass in sorted(cmds.items()):
        print("  %-13s %s" % (cmdname, cmdclass.short_desc()))
    if not inproject:
        print()
        print("  [ more ]      More commands available when run from project directory")
    print()
    print('Use "scrapy <command> -h" to see more info about a command')


def _print_unknown_command(settings, cmdname, inproject):
    _print_header(settings, inproject)
    print("Unknown command: %s\n" % cmdname)
    print('Use "scrapy" to see available commands')


def _run_print_help(parser, func, *a, **kw):
    try:
        func(*a, **kw)
    except UsageError as e:
        if str(e):
            parser.error(str(e))
        if e.print_help:
            parser.print_help()
        sys.exit(2)


def execute(argv=None, settings=None):
    if argv is None:
        argv = sys.argv

    # --- backwards compatibility for scrapy.conf.settings singleton ---
    if settings is None and 'scrapy.conf' in sys.modules:
        from scrapy import conf
        if hasattr(conf, 'settings'):
            settings = conf.settings
    # ------------------------------------------------------------------

    if settings is None:
        settings = get_project_settings()
        # set EDITOR from environment if available
        try:
            editor = os.environ['EDITOR']
        except KeyError:
            pass
        else:
            settings['EDITOR'] = editor
    check_deprecated_settings(settings)

    # --- backwards compatibility for scrapy.conf.settings singleton ---
    import warnings
    from scrapy.exceptions import ScrapyDeprecationWarning
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", ScrapyDeprecationWarning)
        from scrapy import conf
        conf.settings = settings
    # ------------------------------------------------------------------

    inproject = inside_project()
    cmds = _get_commands_dict(settings, inproject)
    cmdname = _pop_command_name(argv)
    parser = optparse.OptionParser(formatter=optparse.TitledHelpFormatter(),
                                   conflict_handler='resolve')
    if not cmdname:
        _print_commands(settings, inproject)
        sys.exit(0)
    elif cmdname not in cmds:
        _print_unknown_command(settings, cmdname, inproject)
        sys.exit(2)

    cmd = cmds[cmdname]
    parser.usage = "scrapy %s %s" % (cmdname, cmd.syntax())
    parser.description = cmd.long_desc()
    settings.setdict(cmd.default_settings, priority='command')
    cmd.settings = settings
    cmd.add_options(parser)
    opts, args = parser.parse_args(args=argv[1:])
    _run_print_help(parser, cmd.process_options, args, opts)

    cmd.crawler_process = CrawlerProcess(settings)
    _run_print_help(parser, _run_command, cmd, args, opts)
    sys.exit(cmd.exitcode)


def _run_command(cmd, args, opts):
    if opts.profile:
        _run_command_profiled(cmd, args, opts)
    else:
        cmd.run(args, opts)


def _run_command_profiled(cmd, args, opts):
    if opts.profile:
        sys.stderr.write("scrapy: writing cProfile stats to %r\n" % opts.profile)
    loc = locals()
    p = cProfile.Profile()
    p.runctx('cmd.run(args, opts)', globals(), loc)
    if opts.profile:
        p.dump_stats(opts.profile)


if __name__ == '__main__':
    try:
        execute()
    finally:
        # Twisted prints errors in DebugInfo.__del__, but PyPy does not run
        # gc.collect() on exit:
        # http://doc.pypy.org/en/latest/cpython_differences.html?highlight=gc.collect#differences-related-to-garbage-collection-strategies
        garbage_collect()
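
# Illustrative sketch (not upstream code): execute() can also be driven
# programmatically, e.g. from a wrapper script, by passing an argv-style
# list. Note that execute() always terminates the process through
# sys.exit(), so it never returns normally; the spider name below is
# hypothetical.
#
#     from scrapy.cmdline import execute
#     execute(['scrapy', 'crawl', 'myspider'])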