# Copyright 2018 Yelp
# Copyright 2019 Yelp
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""A drop-in replacement for :command:`spark-submit` that can use mrjob's
runners. For example, you can submit your spark job to EMR just by adding
``-r emr``.

This also adds a few mrjob features that are not standard with
:command:`spark-submit`, such as ``--cmdenv``, ``--dirs``, and ``--setup``.

.. versionadded:: 0.6.7

.. versionchanged:: 0.6.8

   added ``local``, ``spark`` runners, made ``spark`` the default
   (was ``hadoop``)

.. versionchanged:: 0.7.1

   ``--archives`` and ``--dirs`` are supported on all masters (except local)

Usage::

    mrjob spark-submit [-r <runner>] [options] <python file | app jar>
    [app arguments]

Options::

  All runners:
    -r {emr,hadoop,local,spark}, --runner {emr,hadoop,local,spark}
                          Where to run the job (default: "spark")
    --class MAIN_CLASS    Your application's main class (for Java / Scala
                          apps).
    --name NAME           The name of your application.
    --jars LIBJARS        Comma-separated list of jars to include on the
                          driver and executor classpaths.
    --packages PACKAGES   Comma-separated list of maven coordinates of jars
                          to include on the driver and executor classpaths.
                          Will search the local maven repo, then maven
                          central and any additional remote repositories
                          given by --repositories. The format for the
                          coordinates should be groupId:artifactId:version.
    --exclude-packages EXCLUDE_PACKAGES
                          Comma-separated list of groupId:artifactId, to
                          exclude while resolving the dependencies provided
                          in --packages to avoid dependency conflicts.
    --repositories REPOSITORIES
                          Comma-separated list of additional remote
                          repositories to search for the maven coordinates
                          given with --packages.
    --py-files PY_FILES   Comma-separated list of .zip, .egg, or .py files to
                          place on the PYTHONPATH for Python apps.
    --files UPLOAD_FILES  Comma-separated list of files to be placed in the
                          working directory of each executor. Ignored on
                          local[*] master.
    --archives UPLOAD_ARCHIVES
                          Comma-separated list of archives to be extracted
                          into the working directory of each executor.
    --dirs UPLOAD_DIRS    Comma-separated list of directories to be archived
                          and then extracted into the working directory of
                          each executor.
    --cmdenv CMDENV       Arbitrary environment variable to set inside Spark,
                          in the format NAME=VALUE.
    --conf JOBCONF        Arbitrary Spark configuration property, in the
                          format PROP=VALUE.
    --setup SETUP         A command to run before each Spark executor in the
                          shell ("touch foo"). In cluster mode, runs before
                          the Spark driver as well. You may interpolate files
                          available via URL or on your local filesystem using
                          Hadoop Distributed Cache syntax (". setup.sh#"). To
                          interpolate archives (YARN only), use #/:
                          "cd foo.tar.gz#/; make".
    --properties-file PROPERTIES_FILE
                          Path to a file from which to load extra properties.
                          If not specified, this will look for
                          conf/spark-defaults.conf.
    --driver-memory DRIVER_MEMORY
                          Memory for driver (e.g. 1000M, 2G) (Default:
                          1024M).
    --driver-java-options DRIVER_JAVA_OPTIONS
                          Extra Java options to pass to the driver.
    --driver-library-path DRIVER_LIBRARY_PATH
                          Extra library path entries to pass to the driver.
    --driver-class-path DRIVER_CLASS_PATH
                          Extra class path entries to pass to the driver.
                          Note that jars added with --jars are automatically
                          included in the classpath.
    --executor-memory EXECUTOR_MEMORY
                          Memory per executor (e.g. 1000M, 2G) (Default: 1G).
    --proxy-user PROXY_USER
                          User to impersonate when submitting the
                          application. This argument does not work with
                          --principal / --keytab.
    -c CONF_PATHS, --conf-path CONF_PATHS
                          Path to alternate mrjob.conf file to read from
    --no-conf             Don't load mrjob.conf even if it's available
    -q, --quiet           Don't print anything to stderr
    -v, --verbose         print more messages to stderr
    -h, --help            show this message and exit

  Spark and Hadoop runners only:
    --master SPARK_MASTER
                          spark://host:port, mesos://host:port, yarn,
                          k8s://https://host:port, or local. Defaults to
                          local[*] on spark runner, yarn on hadoop runner.
    --deploy-mode SPARK_DEPLOY_MODE
                          Whether to launch the driver program locally
                          ("client") or on one of the worker machines inside
                          the cluster ("cluster") (Default: client).

  Cluster deploy mode only:
    --driver-cores DRIVER_CORES
                          Number of cores used by the driver (Default: 1).

  Spark standalone or Mesos with cluster deploy mode only:
    --supervise           If given, restarts the driver on failure.

  Spark standalone and Mesos only:
    --total-executor-cores TOTAL_EXECUTOR_CORES
                          Total cores for all executors.

  Spark standalone and YARN only:
    --executor-cores EXECUTOR_CORES
                          Number of cores per executor. (Default: 1 in YARN
                          mode, or all available cores on the worker in
                          standalone mode)

  YARN-only:
    --queue QUEUE_NAME    The YARN queue to submit to (Default: "default").
    --num-executors NUM_EXECUTORS
                          Number of executors to launch (Default: 2). If
                          dynamic allocation is enabled, the initial number
                          of executors will be at least NUM.
    --principal PRINCIPAL
                          Principal to be used to login to KDC, while
                          running on secure HDFS.
    --keytab KEYTAB       The full path to the file that contains the keytab
                          for the principal specified above. This keytab
                          will be copied to the node running the Application
                          Master via the Secure Distributed Cache, for
                          renewing the login tickets and the delegation
                          tokens periodically.

This also supports the same runner-specific switches as
:py:class:`~mrjob.job.MRJob`\\s (e.g. ``--hadoop-bin``, ``--region``).
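
For example, a hypothetical invocation (the script name, region, and input
path below are purely illustrative) might submit a Python Spark script to EMR
in a specific region::

    mrjob spark-submit -r emr --region us-west-2 my_spark_script.py input.txt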
""" from __future__ import print_function import os import sys from argparse import ArgumentParser from argparse import REMAINDER from argparse import SUPPRESS from logging import getLogger from mrjob.job import MRJob from mrjob.options import _RUNNER_OPTS from mrjob.options import _add_basic_args from mrjob.options import _add_runner_args from mrjob.options import _parse_raw_args from mrjob.runner import _runner_class from mrjob.step import SparkJarStep from mrjob.step import SparkScriptStep log = getLogger(__name__) _USAGE = ('%(prog)s spark-submit [-r ] [options]' ' [app arguments]') _DESCRIPTION = 'Submit a spark job to Hadoop or the cloud' _BASIC_HELP_EPILOG = ( 'To see help for a specific runner, use --help -r ') _DEPRECATED_OPT_HELP = ( 'To include help for deprecated options, add --deprecated') # for spark-submit args, just need switches and help message # (which can be patched into runner opts with same dest name) # then add runner opts (other than check_input_paths) but don't # display in default help message # the only runners that support spark scripts/jars _SPARK_RUNNERS = ('emr', 'hadoop', 'local', 'spark') # the default spark runner to use _DEFAULT_RUNNER = 'spark' # just find spark-submit and use it # our mostly similar version of spark-submit's args, arranged in to groups # for the --help message. Differences: # # spark_master (--master) is in its own "Spark and Hadoop runners only" group # added upload_dirs (--dirs) which is similar to --archives # # --runner and other basic options are patched into the first ("None") # argument group in _make_basic_help_parser(), below _SPARK_SUBMIT_ARG_GROUPS = [ (None, [ 'main_class', 'name', 'libjars', 'packages', 'exclude_packages', 'repositories', 'py_files', 'upload_files', 'upload_archives', 'upload_dirs', 'cmdenv', 'jobconf', 'setup', 'properties_file', 'driver_memory', 'driver_java_options', 'driver_library_path', 'driver_class_path', 'executor_memory', 'proxy_user', ]), ('Spark and Hadoop runners only', [ 'spark_master', 'spark_deploy_mode', ]), ('Cluster deploy mode only', [ 'driver_cores', ]), ('Spark standalone or Mesos with cluster deploy mode only', [ 'supervise', # --kill and --status aren't for launching jobs ]), ('Spark standalone and Mesos only', [ 'total_executor_cores', ]), ('Spark standalone and YARN only', [ 'executor_cores', ]), ('YARN-only', [ 'queue_name', 'num_executors', 'principal', 'keytab', ]), ] # lightly modified versions of help messages from spark-submit _SPARK_SUBMIT_ARG_HELP = dict( cmdenv=('Arbitrary environment variable to set inside Spark, in the' ' format NAME=VALUE.'), driver_class_path=('Extra class path entries to pass to the driver. Note' ' that jars added with --jars are automatically' ' included in the classpath.'), driver_cores='Number of cores used by the driver (Default: 1).', driver_java_options='Extra Java options to pass to the driver.', driver_library_path='Extra library path entries to pass to the driver.', driver_memory='Memory for driver (e.g. 1000M, 2G) (Default: 1024M).', exclude_packages=('Comma-separated list of groupId:artifactId, to exclude' ' while resolving the dependencies provided in' ' --packages to avoid dependency conflicts.'), executor_cores=('Number of cores per executor. (Default: 1 in YARN mode,' ' or all available cores on the worker in standalone' ' mode)'), executor_memory='Memory per executor (e.g. 
1000M, 2G) (Default: 1G).', jobconf=('Arbitrary Spark configuration property, in the format' ' PROP=VALUE.'), keytab=('The full path to the file that contains the keytab for the' ' principal specified above. This keytab will be copied to' ' the node running the Application Master via the Secure' ' Distributed Cache, for renewing the login tickets and the' ' delegation tokens periodically.'), libjars=('Comma-separated list of jars to include on the driver' 'and executor classpaths.'), main_class="Your application's main class (for Java / Scala apps).", name='The name of your application.', num_executors=('Number of executors to launch (Default: 2).' ' If dynamic allocation is enabled, the initial number of' ' executors will be at least NUM.'), packages=('Comma-separated list of maven coordinates of jars to include' ' on the driver and executor classpaths. Will search the local' ' maven repo, then maven central and any additional remote' ' repositories given by --repositories. The format for the' ' coordinates should be groupId:artifactId:version.'), principal=('Principal to be used to login to KDC, while running on' 'secure HDFS.'), properties_file=('Path to a file from which to load extra properties. If' ' not specified, this will look for' ' conf/spark-defaults.conf.'), proxy_user=('User to impersonate when submitting the application.' ' This argument does not work with --principal / --keytab.'), py_files=('Comma-separated list of .zip, .egg, or .py files to place' 'on the PYTHONPATH for Python apps.'), queue_name='The YARN queue to submit to (Default: "default").', repositories=('Comma-separated list of additional remote repositories to' ' search for the maven coordinates given with --packages.'), setup=('A command to run before each Spark executor in the' ' shell ("touch foo"). In cluster mode, runs before the Spark' ' driver as well. You may interpolate files' ' available via URL or on your local filesystem using' ' Hadoop Distributed Cache syntax (". setup.sh#"). To' ' interpolate archives (YARN only), use' ' #/: "cd foo.tar.gz#/; make.'), spark_deploy_mode=('Whether to launch the driver program locally' ' ("client") or on one of the worker machines inside' ' the cluster ("cluster") (Default: client).'), spark_master=('spark://host:port, mesos://host:port, yarn,' 'k8s://https://host:port, or local. Defaults' ' to local[*] on spark runner, yarn on hadoop runner.'), supervise='If given, restarts the driver on failure.', total_executor_cores='Total cores for all executors.', upload_archives=('Comma-separated list of archives to be extracted into' ' the working directory of each executor. Ignored on' ' local[*] master.'), upload_dirs=('Comma-separated list of directors to be archived and then' ' extracted into the working directory of each executor.' ' Ignored on local[*] master.'), upload_files=('Comma-separated list of files to be placed in the working' ' directory of each executor. 
_SPARK_SUBMIT_OPT_NAMES = {
    opt_name
    for _, opt_names in _SPARK_SUBMIT_ARG_GROUPS
    for opt_name in opt_names
}

_SPARK_SUBMIT_SWITCHES = dict(
    cmdenv='--cmdenv',
    driver_class_path='--driver-class-path',
    driver_cores='--driver-cores',
    driver_java_options='--driver-java-options',
    driver_library_path='--driver-library-path',
    driver_memory='--driver-memory',
    exclude_packages='--exclude-packages',
    executor_cores='--executor-cores',
    executor_memory='--executor-memory',
    jobconf='--conf',
    keytab='--keytab',
    libjars='--jars',
    main_class='--class',
    name='--name',
    num_executors='--num-executors',
    packages='--packages',
    principal='--principal',
    properties_file='--properties-file',
    proxy_user='--proxy-user',
    py_files='--py-files',
    queue_name='--queue',
    repositories='--repositories',
    setup='--setup',
    spark_deploy_mode='--deploy-mode',
    spark_master='--master',
    supervise='--supervise',
    total_executor_cores='--total-executor-cores',
    upload_archives='--archives',
    upload_dirs='--dirs',
    upload_files='--files',
)

# things that are different about specific spark submit args
_SPARK_SUBMIT_ARG_KWARGS = dict(
    supervise=dict(action='store_true'),
)

# not a runner opt or one that's passed straight through to spark
_STEP_OPT_NAMES = {'main_class'}

# arguments that are passed straight through to spark-submit
_SPARK_ARG_OPT_NAMES = (
    set(_SPARK_SUBMIT_SWITCHES) - set(_RUNNER_OPTS) - _STEP_OPT_NAMES)

_SWITCH_ALIASES = {
    '--master': '--spark-master',
    '--deploy-mode': '--spark-deploy-mode',
    '--jars': '--libjars',
    '--conf': '--jobconf',
}

# these options don't make any sense with Spark scripts
_HARD_CODED_OPTS = dict(
    check_input_paths=False,
    output_dir=None,
)


def main(cl_args=None):
    parser = _make_arg_parser()
    options = parser.parse_args(cl_args)

    runner_alias = options.runner or _DEFAULT_RUNNER
    runner_class = _runner_class(runner_alias)

    if options.help or not options.script_or_jar:
        _print_help(options, runner_class)
        sys.exit(0)

    MRJob.set_up_logging(
        quiet=options.quiet,
        verbose=options.verbose,
    )

    kwargs = _get_runner_opt_kwargs(options, runner_class)
    kwargs.update(_HARD_CODED_OPTS)
    kwargs['input_paths'] = [os.devnull]

    step = _get_step(options, parser, cl_args)
    kwargs['steps'] = [step.description()]

    runner = runner_class(**kwargs)

    try:
        runner.run()
    finally:
        runner.cleanup()


def _get_runner_opt_kwargs(options, runner_class):
    """Extract the options for the given runner class from *options*."""
    return {opt_name: getattr(options, opt_name)
            for opt_name in runner_class.OPT_NAMES
            if hasattr(options, opt_name)}


def _get_step(options, parser, cl_args):
    """Extract the step from the runner options."""
    args = options.args
    main_class = options.main_class
    spark_args = _get_spark_args(parser, cl_args)
    script_or_jar = options.script_or_jar

    if script_or_jar.lower().endswith('.jar'):
        return SparkJarStep(args=args, jar=script_or_jar,
                            main_class=main_class, spark_args=spark_args)
    elif script_or_jar.lower().split('.')[-1].startswith('py'):
        return SparkScriptStep(args=args, script=script_or_jar,
                               spark_args=spark_args)
    else:
        raise ValueError('%s appears not to be a JAR or Python script' %
                         options.script_or_jar)


def _get_spark_args(parser, cl_args):
    raw_args = _parse_raw_args(parser, cl_args)

    spark_args = []

    for dest, option_string, args in raw_args:
        if dest in _SPARK_ARG_OPT_NAMES:
            spark_args.append(option_string)
            spark_args.extend(args)

    return spark_args
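
# Illustrative example (assumes executor_memory is not also a runner opt):
# given cl_args like
# ['-r', 'emr', '--executor-memory', '2G', 'my_script.py', 'input.txt'],
# _get_spark_args() would return ['--executor-memory', '2G'], since
# --executor-memory passes straight through to spark-submit, while -r is
# consumed by mrjob itself.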
def _add_spark_submit_arg(parser, opt_name):
    opt_string = _SPARK_SUBMIT_SWITCHES[opt_name]

    kwargs = dict(dest=opt_name)

    # if opt_name is a mrjob opt, parse args like a MRJob would
    if opt_name in _RUNNER_OPTS:
        opt_alias = _SWITCH_ALIASES.get(opt_string, opt_string)

        for opt_strings, opt_kwargs in _RUNNER_OPTS[opt_name]['switches']:
            if opt_alias in opt_strings:
                kwargs.update(opt_kwargs)

    kwargs['help'] = _SPARK_SUBMIT_ARG_HELP[opt_name]
    kwargs.update(_SPARK_SUBMIT_ARG_KWARGS.get(opt_name) or {})

    parser.add_argument(opt_string, **kwargs)


def _make_arg_parser():
    # this parser is never used for help messages, but
    # will show usage on error
    parser = ArgumentParser(usage=_USAGE, add_help=False)

    # add positional arguments
    parser.add_argument(dest='script_or_jar', nargs='?')
    parser.add_argument(dest='args', nargs=REMAINDER)

    _add_basic_args(parser)
    _add_runner_alias_arg(parser)
    _add_help_arg(parser)
    _add_deprecated_arg(parser)

    # add runner opts
    runner_opt_names = set(_RUNNER_OPTS) - set(_HARD_CODED_OPTS)
    _add_runner_args(parser, opt_names=runner_opt_names)

    # add spark-specific opts (without colliding with runner opts)
    for opt_name, switch in _SPARK_SUBMIT_SWITCHES.items():
        if opt_name in _RUNNER_OPTS and switch not in _SWITCH_ALIASES:
            continue
        _add_spark_submit_arg(parser, opt_name)

    return parser


def _add_runner_alias_arg(parser):
    # we can't set default here because -r also affects help
    parser.add_argument(
        '-r', '--runner', dest='runner',
        choices=_SPARK_RUNNERS,
        help=('Where to run the job (default: "%s")' % _DEFAULT_RUNNER))


def _add_help_arg(parser):
    parser.add_argument(
        '-h', '--help', dest='help', action='store_true',
        help='show this message and exit')


def _add_deprecated_arg(parser):
    parser.add_argument(
        '--deprecated', dest='deprecated', action='store_true',
        help='include help for deprecated options')


def _print_help(options, runner_class):
    # if user specifies -r without -h, show basic help
    if options.help and options.runner:
        _print_help_for_runner(runner_class,
                               include_deprecated=options.deprecated)
    else:
        _print_basic_help(include_deprecated=options.deprecated)


def _print_help_for_runner(runner_class, include_deprecated=False):
    help_parser = ArgumentParser(usage=SUPPRESS, add_help=False)

    arg_group = help_parser.add_argument_group(
        'optional arguments for %s runner' % runner_class.alias)

    # don't include hard-coded opts or opts in basic help
    opt_names = runner_class.OPT_NAMES - set(_HARD_CODED_OPTS)

    # don't include switches already in basic help
    suppress_switches = set(_SPARK_SUBMIT_SWITCHES.values())

    # simplify description of aliases of switches in basic help
    customize_switches = {
        v: dict(help='Alias for %s' % k)
        for k, v in _SWITCH_ALIASES.items()
    }

    _add_runner_args(arg_group, opt_names,
                     include_deprecated=include_deprecated,
                     customize_switches=customize_switches,
                     suppress_switches=suppress_switches)

    help_parser.print_help()


def _print_basic_help(include_deprecated=False):
    _make_basic_help_parser(include_deprecated).print_help()

    if not include_deprecated:
        print()
        print(_DEPRECATED_OPT_HELP)


def _make_basic_help_parser(include_deprecated=False):
    """Make an arg parser that's used only for printing basic help.

    This prints help very similar to spark-submit itself. Runner args
    are not included unless they are also spark-submit args (e.g.
    --py-files)
    """
    help_parser = ArgumentParser(usage=_USAGE, description=_DESCRIPTION,
                                 epilog=_BASIC_HELP_EPILOG, add_help=False)

    _add_runner_alias_arg(help_parser)

    for group_desc, opt_names in _SPARK_SUBMIT_ARG_GROUPS:
        if group_desc is None:
            parser_or_group = help_parser
        else:
            parser_or_group = help_parser.add_argument_group(group_desc)

        for opt_name in opt_names:
            _add_spark_submit_arg(parser_or_group, opt_name)

        if group_desc is None:
            _add_basic_args(help_parser)
            _add_help_arg(help_parser)
            if include_deprecated:
                _add_deprecated_arg(help_parser)

    return help_parser


if __name__ == '__main__':
    main()