# Copyright 2018 Yelp
# Copyright 2019 Yelp
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""A drop-in replacement for :command:`spark-submit` that can use mrjob's
|
|
runners. For example, you can submit your spark job to EMR just by adding
|
|
``-r emr``.
|
|
|
|
This also adds a few mrjob features that are not standard with
|
|
:command:`spark-submit`, such as ``--cmdenv``, ``--dirs``, and ``--setup``.
|
|
|
|
.. versionadded:: 0.6.7
|
|
|
|
.. versionchanged:: 0.6.8
|
|
|
|
added ``local``, ``spark`` runners, made ``spark`` the default (was
|
|
``hadoop``)
|
|
|
|
.. versionchanged:: 0.7.1
|
|
|
|
``--archives`` and ``--dirs`` are supported on all masters (except local)
|
|
|
|
Usage::
|
|
|
|
mrjob spark-submit [-r <runner>] [options] <python file | app jar>
|
|
[app arguments]
|
|
|
|
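For example, a hypothetical EMR run (the script name and S3 paths below are
placeholders, not suggested defaults)::

    mrjob spark-submit -r emr my_script.py s3://my-bucket/in/ s3://my-bucket/out/
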
Options::

  All runners:
    -r {emr,hadoop,local,spark}, --runner {emr,hadoop,local,spark}
                          Where to run the job (default: "spark")
    --class MAIN_CLASS    Your application's main class (for Java / Scala apps).
    --name NAME           The name of your application.
    --jars LIBJARS        Comma-separated list of jars to include on the
                          driver and executor classpaths.
    --packages PACKAGES   Comma-separated list of maven coordinates of jars to
                          include on the driver and executor classpaths. Will
                          search the local maven repo, then maven central and
                          any additional remote repositories given by
                          --repositories. The format for the coordinates should
                          be groupId:artifactId:version.
    --exclude-packages EXCLUDE_PACKAGES
                          Comma-separated list of groupId:artifactId, to exclude
                          while resolving the dependencies provided in
                          --packages to avoid dependency conflicts.
    --repositories REPOSITORIES
                          Comma-separated list of additional remote repositories
                          to search for the maven coordinates given with
                          --packages.
    --py-files PY_FILES   Comma-separated list of .zip, .egg, or .py files to
                          place on the PYTHONPATH for Python apps.
    --files UPLOAD_FILES  Comma-separated list of files to be placed in the
                          working directory of each executor. Ignored on
                          local[*] master.
    --archives UPLOAD_ARCHIVES
                          Comma-separated list of archives to be extracted into
                          the working directory of each executor.
    --dirs UPLOAD_DIRS    Comma-separated list of directories to be archived and
                          then extracted into the working directory of each
                          executor.
    --cmdenv CMDENV       Arbitrary environment variable to set inside Spark, in
                          the format NAME=VALUE.
    --conf JOBCONF        Arbitrary Spark configuration property, in the format
                          PROP=VALUE.
    --setup SETUP         A command to run before each Spark executor in the
                          shell ("touch foo"). In cluster mode, runs before the
                          Spark driver as well. You may interpolate files
                          available via URL or on your local filesystem using
                          Hadoop Distributed Cache syntax (". setup.sh#"). To
                          interpolate archives (YARN only), use #/: "cd
                          foo.tar.gz#/; make".
    --properties-file PROPERTIES_FILE
                          Path to a file from which to load extra properties. If
                          not specified, this will look for conf/spark-
                          defaults.conf.
    --driver-memory DRIVER_MEMORY
                          Memory for driver (e.g. 1000M, 2G) (Default: 1024M).
    --driver-java-options DRIVER_JAVA_OPTIONS
                          Extra Java options to pass to the driver.
    --driver-library-path DRIVER_LIBRARY_PATH
                          Extra library path entries to pass to the driver.
    --driver-class-path DRIVER_CLASS_PATH
                          Extra class path entries to pass to the driver. Note
                          that jars added with --jars are automatically included
                          in the classpath.
    --executor-memory EXECUTOR_MEMORY
                          Memory per executor (e.g. 1000M, 2G) (Default: 1G).
    --proxy-user PROXY_USER
                          User to impersonate when submitting the application.
                          This argument does not work with --principal /
                          --keytab.
    -c CONF_PATHS, --conf-path CONF_PATHS
                          Path to alternate mrjob.conf file to read from
    --no-conf             Don't load mrjob.conf even if it's available
    -q, --quiet           Don't print anything to stderr
    -v, --verbose         print more messages to stderr
    -h, --help            show this message and exit

  Spark and Hadoop runners only:
    --master SPARK_MASTER
                          spark://host:port, mesos://host:port, yarn,
                          k8s://https://host:port, or local. Defaults to
                          local[*] on spark runner, yarn on hadoop runner.
    --deploy-mode SPARK_DEPLOY_MODE
                          Whether to launch the driver program locally
                          ("client") or on one of the worker machines inside the
                          cluster ("cluster") (Default: client).

  Cluster deploy mode only:
    --driver-cores DRIVER_CORES
                          Number of cores used by the driver (Default: 1).

  Spark standalone or Mesos with cluster deploy mode only:
    --supervise           If given, restarts the driver on failure.

  Spark standalone and Mesos only:
    --total-executor-cores TOTAL_EXECUTOR_CORES
                          Total cores for all executors.

  Spark standalone and YARN only:
    --executor-cores EXECUTOR_CORES
                          Number of cores per executor. (Default: 1 in YARN
                          mode, or all available cores on the worker in
                          standalone mode)

  YARN-only:
    --queue QUEUE_NAME    The YARN queue to submit to (Default: "default").
    --num-executors NUM_EXECUTORS
                          Number of executors to launch (Default: 2). If dynamic
                          allocation is enabled, the initial number of executors
                          will be at least NUM.
    --principal PRINCIPAL
                          Principal to be used to login to KDC, while running on
                          secure HDFS.
    --keytab KEYTAB       The full path to the file that contains the keytab for
                          the principal specified above. This keytab will be
                          copied to the node running the Application Master via
                          the Secure Distributed Cache, for renewing the login
                          tickets and the delegation tokens periodically.

This also supports the same runner-specific switches as
:py:class:`~mrjob.job.MRJob`\\s (e.g. ``--hadoop-bin``, ``--region``).

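For instance, a hypothetical invocation combining a runner-specific switch
with ordinary spark-submit options (all values are placeholders)::

    mrjob spark-submit -r emr --region us-west-2 my_script.py
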
"""
|
|
from __future__ import print_function

import os
import sys
from argparse import ArgumentParser
from argparse import REMAINDER
from argparse import SUPPRESS
from logging import getLogger

from mrjob.job import MRJob
from mrjob.options import _RUNNER_OPTS
from mrjob.options import _add_basic_args
from mrjob.options import _add_runner_args
from mrjob.options import _parse_raw_args
from mrjob.runner import _runner_class
from mrjob.step import SparkJarStep
from mrjob.step import SparkScriptStep

log = getLogger(__name__)


_USAGE = ('%(prog)s spark-submit [-r <runner>] [options]'
          ' <python file | app jar> [app arguments]')

_DESCRIPTION = 'Submit a spark job to Hadoop or the cloud'

_BASIC_HELP_EPILOG = (
    'To see help for a specific runner, use --help -r <runner name>')

_DEPRECATED_OPT_HELP = (
    'To include help for deprecated options, add --deprecated')


# for spark-submit args, just need switches and help message
# (which can be patched into runner opts with same dest name)

# then add runner opts (other than check_input_paths) but don't
# display in default help message

# the only runners that support spark scripts/jars
_SPARK_RUNNERS = ('emr', 'hadoop', 'local', 'spark')

# the default spark runner to use
_DEFAULT_RUNNER = 'spark'  # just find spark-submit and use it


# our mostly similar version of spark-submit's args, arranged into groups
# for the --help message. Differences:
#
# spark_master (--master) is in its own "Spark and Hadoop runners only" group
# added upload_dirs (--dirs) which is similar to --archives
#
# --runner and other basic options are patched into the first ("None")
# argument group in _make_basic_help_parser(), below
_SPARK_SUBMIT_ARG_GROUPS = [
    (None, [
        'main_class',
        'name',
        'libjars',
        'packages',
        'exclude_packages',
        'repositories',
        'py_files',
        'upload_files',
        'upload_archives',
        'upload_dirs',
        'cmdenv',
        'jobconf',
        'setup',
        'properties_file',
        'driver_memory',
        'driver_java_options',
        'driver_library_path',
        'driver_class_path',
        'executor_memory',
        'proxy_user',
    ]),
    ('Spark and Hadoop runners only', [
        'spark_master',
        'spark_deploy_mode',
    ]),
    ('Cluster deploy mode only', [
        'driver_cores',
    ]),
    ('Spark standalone or Mesos with cluster deploy mode only', [
        'supervise',
        # --kill and --status aren't for launching jobs
    ]),
    ('Spark standalone and Mesos only', [
        'total_executor_cores',
    ]),
    ('Spark standalone and YARN only', [
        'executor_cores',
    ]),
    ('YARN-only', [
        'queue_name',
        'num_executors',
        'principal',
        'keytab',
    ]),
]

# lightly modified versions of help messages from spark-submit
_SPARK_SUBMIT_ARG_HELP = dict(
    cmdenv=('Arbitrary environment variable to set inside Spark, in the'
            ' format NAME=VALUE.'),
    driver_class_path=('Extra class path entries to pass to the driver. Note'
                       ' that jars added with --jars are automatically'
                       ' included in the classpath.'),
    driver_cores='Number of cores used by the driver (Default: 1).',
    driver_java_options='Extra Java options to pass to the driver.',
    driver_library_path='Extra library path entries to pass to the driver.',
    driver_memory='Memory for driver (e.g. 1000M, 2G) (Default: 1024M).',
    exclude_packages=('Comma-separated list of groupId:artifactId, to exclude'
                      ' while resolving the dependencies provided in'
                      ' --packages to avoid dependency conflicts.'),
    executor_cores=('Number of cores per executor. (Default: 1 in YARN mode,'
                    ' or all available cores on the worker in standalone'
                    ' mode)'),
    executor_memory='Memory per executor (e.g. 1000M, 2G) (Default: 1G).',
    jobconf=('Arbitrary Spark configuration property, in the format'
             ' PROP=VALUE.'),
    keytab=('The full path to the file that contains the keytab for the'
            ' principal specified above. This keytab will be copied to'
            ' the node running the Application Master via the Secure'
            ' Distributed Cache, for renewing the login tickets and the'
            ' delegation tokens periodically.'),
    libjars=('Comma-separated list of jars to include on the driver'
             ' and executor classpaths.'),
    main_class="Your application's main class (for Java / Scala apps).",
    name='The name of your application.',
    num_executors=('Number of executors to launch (Default: 2).'
                   ' If dynamic allocation is enabled, the initial number of'
                   ' executors will be at least NUM.'),
    packages=('Comma-separated list of maven coordinates of jars to include'
              ' on the driver and executor classpaths. Will search the local'
              ' maven repo, then maven central and any additional remote'
              ' repositories given by --repositories. The format for the'
              ' coordinates should be groupId:artifactId:version.'),
    principal=('Principal to be used to login to KDC, while running on'
               ' secure HDFS.'),
    properties_file=('Path to a file from which to load extra properties. If'
                     ' not specified, this will look for'
                     ' conf/spark-defaults.conf.'),
    proxy_user=('User to impersonate when submitting the application.'
                ' This argument does not work with --principal / --keytab.'),
    py_files=('Comma-separated list of .zip, .egg, or .py files to place'
              ' on the PYTHONPATH for Python apps.'),
    queue_name='The YARN queue to submit to (Default: "default").',
    repositories=('Comma-separated list of additional remote repositories to'
                  ' search for the maven coordinates given with --packages.'),
    setup=('A command to run before each Spark executor in the'
           ' shell ("touch foo"). In cluster mode, runs before the Spark'
           ' driver as well. You may interpolate files'
           ' available via URL or on your local filesystem using'
           ' Hadoop Distributed Cache syntax (". setup.sh#"). To'
           ' interpolate archives (YARN only), use'
           ' #/: "cd foo.tar.gz#/; make".'),
    spark_deploy_mode=('Whether to launch the driver program locally'
                       ' ("client") or on one of the worker machines inside'
                       ' the cluster ("cluster") (Default: client).'),
    spark_master=('spark://host:port, mesos://host:port, yarn,'
                  ' k8s://https://host:port, or local. Defaults'
                  ' to local[*] on spark runner, yarn on hadoop runner.'),
    supervise='If given, restarts the driver on failure.',
    total_executor_cores='Total cores for all executors.',
    upload_archives=('Comma-separated list of archives to be extracted into'
                     ' the working directory of each executor. Ignored on'
                     ' local[*] master.'),
    upload_dirs=('Comma-separated list of directories to be archived and then'
                 ' extracted into the working directory of each executor.'
                 ' Ignored on local[*] master.'),
    upload_files=('Comma-separated list of files to be placed in the working'
                  ' directory of each executor. Ignored on local[*] master.'),
)

_SPARK_SUBMIT_OPT_NAMES = {
    opt_name for _, opt_names in _SPARK_SUBMIT_ARG_GROUPS
    for opt_name in opt_names
}

_SPARK_SUBMIT_SWITCHES = dict(
    cmdenv='--cmdenv',
    driver_class_path='--driver-class-path',
    driver_cores='--driver-cores',
    driver_java_options='--driver-java-options',
    driver_library_path='--driver-library-path',
    driver_memory='--driver-memory',
    exclude_packages='--exclude-packages',
    executor_cores='--executor-cores',
    executor_memory='--executor-memory',
    jobconf='--conf',
    keytab='--keytab',
    libjars='--jars',
    main_class='--class',
    name='--name',
    num_executors='--num-executors',
    packages='--packages',
    principal='--principal',
    properties_file='--properties-file',
    proxy_user='--proxy-user',
    py_files='--py-files',
    queue_name='--queue',
    repositories='--repositories',
    setup='--setup',
    spark_deploy_mode='--deploy-mode',
    spark_master='--master',
    supervise='--supervise',
    total_executor_cores='--total-executor-cores',
    upload_archives='--archives',
    upload_dirs='--dirs',
    upload_files='--files',
)

# things that are different about specific spark submit args
_SPARK_SUBMIT_ARG_KWARGS = dict(
    supervise=dict(action='store_true'),
)

# not a runner opt or one that's passed straight through to spark
_STEP_OPT_NAMES = {'main_class'}

# arguments that are passed straight through to spark-submit
_SPARK_ARG_OPT_NAMES = (
    set(_SPARK_SUBMIT_SWITCHES) - set(_RUNNER_OPTS) - _STEP_OPT_NAMES)

_SWITCH_ALIASES = {
    '--master': '--spark-master',
    '--deploy-mode': '--spark-deploy-mode',
    '--jars': '--libjars',
    '--conf': '--jobconf',
}

# these options don't make any sense with Spark scripts
_HARD_CODED_OPTS = dict(
    check_input_paths=False,
    output_dir=None,
)


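# Usage sketch (hypothetical argument values): main() accepts an explicit
# argument list, just as the console entry point would pass it, e.g.
#
#   main(['-r', 'local', 'my_spark_script.py', '/tmp/input.txt'])
#
# which parses the switches defined above, wraps the script in a single
# Spark step, and runs it with the chosen runner.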
def main(cl_args=None):
    parser = _make_arg_parser()
    options = parser.parse_args(cl_args)

    runner_alias = options.runner or _DEFAULT_RUNNER
    runner_class = _runner_class(runner_alias)

    if options.help or not options.script_or_jar:
        _print_help(options, runner_class)
        sys.exit(0)

    MRJob.set_up_logging(
        quiet=options.quiet,
        verbose=options.verbose,
    )

    kwargs = _get_runner_opt_kwargs(options, runner_class)
    kwargs.update(_HARD_CODED_OPTS)

    kwargs['input_paths'] = [os.devnull]

    step = _get_step(options, parser, cl_args)
    kwargs['steps'] = [step.description()]

    runner = runner_class(**kwargs)

    try:
        runner.run()
    finally:
        runner.cleanup()


def _get_runner_opt_kwargs(options, runner_class):
    """Extract the options for the given runner class from *options*."""
    return {opt_name: getattr(options, opt_name)
            for opt_name in runner_class.OPT_NAMES
            if hasattr(options, opt_name)}


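# For illustration (hypothetical file names): in _get_step() below,
# 'wordcount.jar' becomes a SparkJarStep (optionally with --class), while
# 'wordcount.py' (or .pyc) becomes a SparkScriptStep; any other extension
# raises a ValueError.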
def _get_step(options, parser, cl_args):
    """Extract the step from the runner options."""
    args = options.args
    main_class = options.main_class
    spark_args = _get_spark_args(parser, cl_args)
    script_or_jar = options.script_or_jar

    if script_or_jar.lower().endswith('.jar'):
        return SparkJarStep(args=args,
                            jar=script_or_jar,
                            main_class=main_class,
                            spark_args=spark_args)
    elif script_or_jar.lower().split('.')[-1].startswith('py'):
        return SparkScriptStep(args=args,
                               script=script_or_jar,
                               spark_args=spark_args)
    else:
        raise ValueError('%s appears not to be a JAR or Python script' %
                         options.script_or_jar)


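# Illustrative sketch (hypothetical command line): given roughly
#
#   cl_args = ['--driver-memory', '2G', '--py-files', 'lib.zip', 'job.py']
#
# _get_spark_args() keeps only switches whose dest is in _SPARK_ARG_OPT_NAMES,
# so --driver-memory would be passed straight through to spark-submit, while
# switches that mrjob's runners handle themselves (e.g. --py-files, assuming
# py_files is a runner opt) would not appear in the returned list.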
def _get_spark_args(parser, cl_args):
    raw_args = _parse_raw_args(parser, cl_args)

    spark_args = []

    for dest, option_string, args in raw_args:
        if dest in _SPARK_ARG_OPT_NAMES:
            spark_args.append(option_string)
            spark_args.extend(args)

    return spark_args


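# Note (a summary of the lookup below, not extra behavior): for a switch like
# --master, _SWITCH_ALIASES maps it to mrjob's --spark-master, so the argument
# keeps spark-submit's spelling on the command line but borrows the parsing
# kwargs (action, type, etc.) defined for the corresponding runner opt.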
def _add_spark_submit_arg(parser, opt_name):
    opt_string = _SPARK_SUBMIT_SWITCHES[opt_name]

    kwargs = dict(dest=opt_name)

    # if opt_name is a mrjob opt, parse args like a MRJob would
    if opt_name in _RUNNER_OPTS:
        opt_alias = _SWITCH_ALIASES.get(opt_string, opt_string)

        for opt_strings, opt_kwargs in _RUNNER_OPTS[opt_name]['switches']:
            if opt_alias in opt_strings:
                kwargs.update(opt_kwargs)

    kwargs['help'] = _SPARK_SUBMIT_ARG_HELP[opt_name]
    kwargs.update(_SPARK_SUBMIT_ARG_KWARGS.get(opt_name) or {})

    parser.add_argument(opt_string, **kwargs)


def _make_arg_parser():
    # this parser is never used for help messages, but
    # will show usage on error
    parser = ArgumentParser(usage=_USAGE, add_help=False)

    # add positional arguments
    parser.add_argument(dest='script_or_jar', nargs='?')
    parser.add_argument(dest='args', nargs=REMAINDER)

    _add_basic_args(parser)
    _add_runner_alias_arg(parser)
    _add_help_arg(parser)
    _add_deprecated_arg(parser)

    # add runner opts
    runner_opt_names = set(_RUNNER_OPTS) - set(_HARD_CODED_OPTS)
    _add_runner_args(parser, opt_names=runner_opt_names)

    # add spark-specific opts (without colliding with runner opts)
    for opt_name, switch in _SPARK_SUBMIT_SWITCHES.items():
        if opt_name in _RUNNER_OPTS and switch not in _SWITCH_ALIASES:
            continue
        _add_spark_submit_arg(parser, opt_name)

    return parser


def _add_runner_alias_arg(parser):
    # we can't set default here because -r also affects help
    parser.add_argument(
        '-r', '--runner', dest='runner',
        choices=_SPARK_RUNNERS,
        help=('Where to run the job (default: "%s")'
              % _DEFAULT_RUNNER))


def _add_help_arg(parser):
    parser.add_argument(
        '-h', '--help', dest='help', action='store_true',
        help='show this message and exit')


def _add_deprecated_arg(parser):
    parser.add_argument(
        '--deprecated', dest='deprecated', action='store_true',
        help='include help for deprecated options')


def _print_help(options, runner_class):
    if options.help and options.runner:
        # user gave both -h and -r: show help for that specific runner
        _print_help_for_runner(runner_class,
                               include_deprecated=options.deprecated)
    else:
        # otherwise (including -r without -h), show basic help
        _print_basic_help(include_deprecated=options.deprecated)


def _print_help_for_runner(runner_class, include_deprecated=False):
    help_parser = ArgumentParser(usage=SUPPRESS, add_help=False)

    arg_group = help_parser.add_argument_group(
        'optional arguments for %s runner' % runner_class.alias)

    # don't include hard-coded opts or opts in basic help
    opt_names = runner_class.OPT_NAMES - set(_HARD_CODED_OPTS)

    # don't include switches already in basic help
    suppress_switches = set(_SPARK_SUBMIT_SWITCHES.values())

    # simplify description of aliases of switches in basic help
    customize_switches = {
        v: dict(help='Alias for %s' % k)
        for k, v in _SWITCH_ALIASES.items()
    }

    _add_runner_args(arg_group, opt_names,
                     include_deprecated=include_deprecated,
                     customize_switches=customize_switches,
                     suppress_switches=suppress_switches)

    help_parser.print_help()


def _print_basic_help(include_deprecated=False):
    _make_basic_help_parser(include_deprecated).print_help()

    if not include_deprecated:
        print()
        print(_DEPRECATED_OPT_HELP)


def _make_basic_help_parser(include_deprecated=False):
    """Make an arg parser that's used only for printing basic help.

    This prints help very similar to spark-submit itself. Runner args
    are not included unless they are also spark-submit args (e.g. --py-files)
    """
    help_parser = ArgumentParser(usage=_USAGE, description=_DESCRIPTION,
                                 epilog=_BASIC_HELP_EPILOG, add_help=False)

    _add_runner_alias_arg(help_parser)

    for group_desc, opt_names in _SPARK_SUBMIT_ARG_GROUPS:
        if group_desc is None:
            parser_or_group = help_parser
        else:
            parser_or_group = help_parser.add_argument_group(group_desc)

        for opt_name in opt_names:
            _add_spark_submit_arg(parser_or_group, opt_name)

        if group_desc is None:
            _add_basic_args(help_parser)
            _add_help_arg(help_parser)
            if include_deprecated:
                _add_deprecated_arg(help_parser)

    return help_parser


if __name__ == '__main__':
    main()