# -*- coding: utf-8 -*-
# Copyright 2009-2016 Yelp and Contributors
# Copyright 2017 Yelp
# Copyright 2018 Yelp, Google, Inc., and Contributors
# Copyright 2019 Yelp
# Copyright 2020 Affirm, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Functions to populate py:class:`~argparse.ArgumentParser``
objects with categorized command line parameters.
"""
from __future__ import print_function
import json
import re
from argparse import Action
from argparse import ArgumentParser
from argparse import SUPPRESS
from logging import getLogger
from mrjob.conf import combine_cmds
from mrjob.conf import combine_dicts
from mrjob.conf import combine_envs
from mrjob.conf import combine_jobconfs
from mrjob.conf import combine_lists
from mrjob.conf import combine_paths
from mrjob.conf import combine_path_lists
from mrjob.parse import _parse_port_range_list
from mrjob.util import shlex_split
log = getLogger(__name__)
#: cleanup options:
#:
#: * ``'ALL'``: delete logs and local and remote temp files; stop cluster
#: if on EMR and the job is not done when cleanup is run.
#: * ``'CLOUD_TMP'``: delete temp files on cloud storage (e.g. S3) only
#: * ``'CLUSTER'``: terminate the cluster if on EMR and the job is not done
#: on cleanup
#: * ``'HADOOP_TMP'``: delete temp files on HDFS only
#: * ``'JOB'``: stop job if on EMR and the job is not done when cleanup runs
#: * ``'LOCAL_TMP'``: delete local temp files only
#: * ``'LOGS'``: delete logs only
#: * ``'NONE'``: delete nothing
#: * ``'TMP'``: delete local, HDFS, and cloud storage temp files, but not logs
CLEANUP_CHOICES = [
'ALL',
'CLOUD_TMP',
'CLUSTER',
'HADOOP_TMP',
'JOB',
'LOCAL_TMP',
'LOGS',
'NONE',
'TMP',
]
# used to detect values that were probably meant to be JSON (but are malformed)
_PROBABLY_JSON_RE = re.compile(r'^\s*[\{\[\"].*$')
# names of runners
_RUNNER_ALIASES = {
'dataproc',
'emr',
'hadoop',
'inline',
'local',
'spark',
}
### custom actions ###
def _default_to(namespace, dest, value):
"""Helper function; set the given attribute to *value* if it's None."""
if getattr(namespace, dest) is None:
setattr(namespace, dest, value)
# these actions are only used by _add_runner_args(), so we can assume *value*
# is a string
class _KeyValueAction(Action):
"""action for KEY=VALUE pairs"""
# used for --cmdenv, --jobconf, and more
def __call__(self, parser, namespace, value, option_string=None):
try:
k, v = value.split('=', 1)
except ValueError:
parser.error('%s argument %r is not of the form KEY=VALUE' % (
option_string, value))
_default_to(namespace, self.dest, {})
getattr(namespace, self.dest)[k] = v
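# Illustrative sketch (hypothetical parser setup, not part of this module):
# how _KeyValueAction accumulates repeated switches into a single dict.
#
#     parser = ArgumentParser()
#     parser.add_argument('--cmdenv', dest='cmdenv',
#                         action=_KeyValueAction, default=None)
#     args = parser.parse_args(['--cmdenv', 'TZ=UTC', '--cmdenv', 'FOO=bar'])
#     # args.cmdenv == {'TZ': 'UTC', 'FOO': 'bar'}
#     # '--cmdenv TZ' (no '=') would call parser.error()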
class _KeyNoneValueAction(Action):
"""action to set KEY to None"""
def __call__(self, parser, namespace, value, option_string=None):
_default_to(namespace, self.dest, {})
getattr(namespace, self.dest)[value] = None
class _CleanupAction(Action):
"""action to parse a comma-separated list of cleanup constants."""
def __call__(self, parser, namespace, value, option_string=None):
result = []
for choice in value.split(','):
if choice in CLEANUP_CHOICES:
result.append(choice)
else:
parser.error(
'%s got %s, which is not one of: %s' %
(option_string, choice, ', '.join(CLEANUP_CHOICES)))
if 'NONE' in result and len(set(result)) > 1:
parser.error(
'%s: Cannot clean up both nothing and something!' %
option_string)
setattr(namespace, self.dest, result)
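# Illustrative sketch (hypothetical parser, not from this module): parsing a
# comma-separated cleanup list with _CleanupAction.
#
#     parser.add_argument('--cleanup', dest='cleanup',
#                         action=_CleanupAction, default=None)
#     parser.parse_args(['--cleanup', 'TMP,LOGS'])    # cleanup == ['TMP', 'LOGS']
#     parser.parse_args(['--cleanup', 'NONE,LOGS'])   # parser.error(): can't mix NONE with others
#     parser.parse_args(['--cleanup', 'EVERYTHING'])  # parser.error(): not in CLEANUP_CHOICES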
class _CommaSeparatedListAction(Action):
"""action to parse a comma-separated list of subnets.
This eliminates whitespace
"""
def __call__(self, parser, namespace, value, option_string=None):
items = [s.strip() for s in value.split(',') if s]
setattr(namespace, self.dest, items)
class _AppendCommaSeparatedItemsAction(Action):
"""action to parse a comma-separated list and append
each of them to an existing list.
This eliminates whitespace
"""
def __call__(self, parser, namespace, value, option_string=None):
_default_to(namespace, self.dest, [])
items = [s.strip() for s in value.split(',') if s]
getattr(namespace, self.dest).extend(items)
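# Illustrative sketch (hypothetical parser, not from this module): repeated
# switches using _AppendCommaSeparatedItemsAction keep extending one list.
#
#     parser.add_argument('--files', dest='upload_files',
#                         action=_AppendCommaSeparatedItemsAction, default=None)
#     args = parser.parse_args(['--files', 'foo.db, bar.txt', '--files', 'baz.json'])
#     # args.upload_files == ['foo.db', 'bar.txt', 'baz.json']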
class _AppendArgsAction(Action):
"""action to parse one or more arguments and append them to a list."""
def __call__(self, parser, namespace, value, option_string=None):
_default_to(namespace, self.dest, [])
args = shlex_split(value)
getattr(namespace, self.dest).extend(args)
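# Illustrative sketch (hypothetical parser, not from this module):
# _AppendArgsAction shell-splits each value with shlex_split() and extends
# the same list.
#
#     parser.add_argument('--hadoop-args', dest='hadoop_extra_args',
#                         action=_AppendArgsAction, default=None)
#     args = parser.parse_args(['--hadoop-args=-fs file:///',
#                               '--hadoop-args=-verbose'])
#     # args.hadoop_extra_args == ['-fs', 'file:///', '-verbose']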
class _AppendJSONAction(Action):
"""action to parse JSON and append it to a list."""
def __call__(self, parser, namespace, value, option_string=None):
_default_to(namespace, self.dest, [])
try:
j = json.loads(value)
except ValueError as e:
parser.error('Malformed JSON passed to %s: %s' % (
option_string, str(e)))
getattr(namespace, self.dest).append(j)
class _KeyJSONValueAction(Action):
"""action for KEY=<json> pairs. Allows value to be a string, as long
as it doesn't start with ``[``, ``{``, or ``"``."""
# used for --extra-cluster-param
def __call__(self, parser, namespace, value, option_string=None):
try:
k, v = value.split('=', 1)
except ValueError:
parser.error('%s argument %r is not of the form KEY=VALUE' % (
option_string, value))
try:
v = json.loads(v)
except ValueError:
if _PROBABLY_JSON_RE.match(v):
parser.error('%s argument %r is not valid JSON' % (
option_string, value))
_default_to(namespace, self.dest, {})
getattr(namespace, self.dest)[k] = v
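# Illustrative sketch (hypothetical values, not from this module): how
# _KeyJSONValueAction decides between JSON and plain strings.
#
#     '--extra-cluster-param', 'VisibleToAllUsers=true'  -> {'VisibleToAllUsers': True}
#     '--extra-cluster-param', 'AutoTerminate=null'      -> {'AutoTerminate': None}
#     '--extra-cluster-param', 'KeyName=my-key-pair'     -> {'KeyName': 'my-key-pair'}
#         (not valid JSON, but doesn't look like JSON, so kept as a string)
#     '--extra-cluster-param', 'Tags=[oops'              -> parser.error()
#         (starts with '[', so it was probably meant to be JSON)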
class _JSONAction(Action):
"""action to parse a JSON"""
def __call__(self, parser, namespace, value, option_string=None):
try:
j = json.loads(value)
except ValueError as e:
parser.error('Malformed JSON passed to %s: %s' % (
option_string, str(e)))
setattr(namespace, self.dest, j)
class _PortRangeAction(Action):
"""action to parse --ssh-bind-ports"""
def __call__(self, parser, namespace, value, option_string=None):
try:
ports = _parse_port_range_list(value)
except ValueError as e:
parser.error('%s: invalid port range list %r: \n%s' %
(option_string, value, e.args[0]))
setattr(namespace, self.dest, ports)
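# Illustrative sketch (hypothetical value, not from this module):
# _PortRangeAction hands the raw string to _parse_port_range_list(), which
# turns something like '40001:40003,40100' into the individual ports in those
# ranges (the exact return format is defined in mrjob.parse); a malformed list
# (e.g. non-numeric ports) is reported via parser.error() instead of a traceback.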
### mux opts ###
# these are used by MRJob to determine what part of a job to run
#
# this just maps dest to the args and kwargs to ArgumentParser.add_argument
# (minus the dest keyword arg)
_STEP_OPTS = dict(
run_combiner=(
['--combiner'],
dict(
action='store_true',
help='run a combiner',
),
),
run_mapper=(
['--mapper'],
dict(
action='store_true',
help='run a mapper'
),
),
run_reducer=(
['--reducer'],
dict(
action='store_true',
help='run a reducer',
),
),
run_spark=(
['--spark'],
dict(
action='store_true',
help='run Spark code',
),
),
step_num=(
['--step-num'],
dict(
type=int,
default=0,
help='which step to execute (default is 0)',
),
),
)
# don't show these unless someone types --help -v --deprecated
_DEPRECATED_STEP_OPTS = set() # none at the moment
# don't show these unless someone types --help --deprecated
_DEPRECATED_NON_RUNNER_OPTS = {'deprecated'}
### runner opts ###
# map from runner option name to dict with the following keys (all optional):
# cloud_role:
# 'connect' if needed when interacting with cloud services at all
# 'launch' if needed when creating a new cluster
# (cloud runner options with no cloud role are only needed when running jobs)
# combiner: combiner func from mrjob.conf used to combine option values.
# (if left blank, we use combine_values())
# deprecated: if true, this option is deprecated and slated for removal
# deprecated_aliases: list of old names for this option slated for removal
# switches: list of switches to add to ArgumentParser for this option. Items
# have the format (['--switch-names', ...], dict(**kwargs)), where kwargs
# can be:
# action -- action to pass to add_argument() (e.g. 'store_true')
# deprecated -- if True, this switch is deprecated and slated for removal
# deprecated_aliases -- list of old '--switch-names' slated for removal
# help -- help string to pass to add_argument()
# type -- option type for add_argument() to enforce (e.g. float).
# You can't set the ArgumentParser's default; we use [] if *action* is
# 'append' and None otherwise.
#
# the list of which options apply to which runner is in the runner class
# itself (e.g. EMRJobRunner.OPT_NAMES)
_RUNNER_OPTS = dict(
add_steps_in_batch=dict(
switches=[
(['--add-steps-in-batch'], dict(
action='store_true',
help='For multi-step jobs, submit all steps at once',
)),
(['--no-add-steps-in-batch'], dict(
action='store_false',
help=('For multi-step jobs, submit steps successively after'
' the previous one completes'),
)),
]
),
additional_emr_info=dict(
cloud_role='launch',
switches=[
(['--additional-emr-info'], dict(
help='A JSON string for selecting additional features on EMR',
)),
],
),
applications=dict(
cloud_role='launch',
combiner=combine_lists,
switches=[
(['--applications', '--application'], dict(
action=_AppendCommaSeparatedItemsAction,
help=('Additional applications to run on 4.x and 5.x'
' AMIs, separated by commas (e.g.'
' "Ganglia,Spark")'),
)),
],
),
aws_access_key_id=dict(
cloud_role='connect',
),
aws_secret_access_key=dict(
cloud_role='connect',
),
aws_session_token=dict(
cloud_role='connect',
),
bootstrap=dict(
cloud_role='launch',
combiner=combine_lists,
switches=[
(['--bootstrap'], dict(
action='append',
help=('A shell command to set up libraries etc. before any'
' steps (e.g. "sudo apt-get -qy install python3"). You'
' may interpolate files available via URL or locally'
' with Hadoop Distributed Cache syntax'
' ("sudo yum install -y foo.rpm#")'),
)),
],
),
bootstrap_actions=dict(
cloud_role='launch',
combiner=combine_lists,
switches=[
(['--bootstrap-action'], dict(
action='append',
help=('Raw bootstrap action scripts to run before any of the'
' other bootstrap steps. You can use --bootstrap-action'
' more than once. Local scripts will be automatically'
' uploaded to S3. To add arguments, just use quotes:'
' "foo.sh arg1 arg2"'),
)),
],
),
bootstrap_mrjob=dict(
cloud_role='launch',
switches=[
(['--bootstrap-mrjob'], dict(
action='store_true',
help=("Automatically zip up the mrjob library and install it"
" when we run the mrjob. This is the default. Use"
" --no-bootstrap-mrjob if you've already installed"
" mrjob on your Hadoop cluster."),
)),
(['--no-bootstrap-mrjob'], dict(
action='store_false',
help=("Don't automatically zip up the mrjob library and"
" install it when we run this job. Use this if you've"
" already installed mrjob on your Hadoop cluster."),
)),
],
),
bootstrap_python=dict(
cloud_role='launch',
switches=[
(['--bootstrap-python'], dict(
action='store_true',
help=('Attempt to install a compatible version of Python'
' at bootstrap time. Currently this only does anything'
' for Python 3, for which it is enabled by default.'),
)),
(['--no-bootstrap-python'], dict(
action='store_false',
help=("Don't automatically try to install a compatible version"
" of Python at bootstrap time."),
)),
],
),
bootstrap_spark=dict(
cloud_role='launch',
switches=[
(['--bootstrap-spark'], dict(
action='store_true',
help="Auto-install Spark on the cluster (even if not needed)."
)),
(['--no-bootstrap-spark'], dict(
action='store_false',
help="Don't auto-install Spark on the cluster."
)),
],
),
check_input_paths=dict(
switches=[
(['--check-input-paths'], dict(
action='store_true',
help='Check input paths exist before running (the default)',
)),
(['--no-check-input-paths'], dict(
action='store_false',
help='Skip the checks to ensure all input paths exist',
)),
],
),
check_cluster_every=dict(
switches=[
(['--check-cluster-every'], dict(
help=('How often (in seconds) to check status of your'
' job/cluster'),
type=float,
)),
],
),
cleanup=dict(
switches=[
(['--cleanup'], dict(
action=_CleanupAction,
help=('Comma-separated list of which directories to delete'
' when a job succeeds, e.g. TMP,LOGS. Choices:'
' %s (default: ALL)' % ', '.join(CLEANUP_CHOICES)),
)),
],
),
cleanup_on_failure=dict(
switches=[
(['--cleanup-on-failure'], dict(
action=_CleanupAction,
help=('Comma-separated list of which directories to delete'
' when a job fails, e.g. TMP,LOGS. Choices:'
' %s (default: NONE)' % ', '.join(CLEANUP_CHOICES)),
)),
],
),
cloud_fs_sync_secs=dict(
cloud_role='launch',
switches=[
(['--cloud-fs-sync-secs'], dict(
help=('How long to wait for remote FS to reach eventual'
' consistency. This'
' is typically less than a second but the'
' default is 5.0 to be safe.'),
type=float,
)),
],
),
cloud_log_dir=dict(
cloud_role='launch',
combiner=combine_paths,
switches=[
(['--cloud-log-dir'], dict(
help='URI on remote FS to write logs into',
)),
],
),
cloud_tmp_dir=dict(
cloud_role='launch',
combiner=combine_paths,
switches=[
(['--cloud-tmp-dir'], dict(
help='URI on remote FS to use as our temp directory.',
)),
],
),
cloud_part_size_mb=dict(
cloud_role='launch',
deprecated_aliases=['cloud_upload_part_size'],
switches=[
(['--cloud-part-size-mb'], dict(
deprecated_aliases=['--cloud-upload-part-size'],
help=('Upload files to cloud FS in parts no bigger than this'
' many MiB. Default is 100 MiB. Set to 0 to'
' disable multipart uploading entirely.'),
type=float,
)),
],
),
cluster_id=dict(
switches=[
(['--cluster-id'], dict(
help='ID of an existing cluster to run our job on',
)),
],
),
cluster_properties=dict(
cloud_role='launch',
combiner=combine_dicts,
switches=[
(['--cluster-property'], dict(
action=_KeyValueAction,
help=('Properties to set in Hadoop config files on Dataproc.'
' Args take the form file_prefix:property=value.'
' You can use --cluster-property multiple times.'
' For more info, see'
' https://cloud.google.com/dataproc/docs/concepts'
'/configuring-clusters/cluster-properties'),
)),
],
),
cmdenv=dict(
combiner=combine_envs,
switches=[
(['--cmdenv'], dict(
action=_KeyValueAction,
help=('Set an environment variable for your job inside Hadoop '
'streaming/Spark. Must take the form KEY=VALUE.'
' You can use --cmdenv multiple times.'),
)),
],
),
core_instance_config=dict(
cloud_role='launch',
switches=[
(['--core-instance-config'], dict(
action=_JSONAction,
help=('detailed JSON dict of configs for the core'
' (worker) instances'
' on Dataproc, including disk config. For format, see'
' https://cloud.google.com/dataproc/docs/reference/rest'
'/v1/projects.regions.clusters#InstanceGroupConfig'
' (except that fields in your JSON should use'
' snake_case, not camelCase).')
)),
],
),
core_instance_bid_price=dict(
cloud_role='launch',
switches=[
(['--core-instance-bid-price'], dict(
help=('Bid price to specify for core nodes when'
' setting them up as EC2 spot instances (you probably'
' only want to do this for task instances).'),
)),
],
),
core_instance_type=dict(
cloud_role='launch',
switches=[
(['--core-instance-type'], dict(
help='Type of GCE/EC2 core instance(s) to launch',
)),
],
),
docker_client_config=dict(
switches=[
(['--docker-client-config'], dict(
help='Path of Docker client config, used to connect to ECR',
)),
],
),
docker_image=dict(
switches=[
(['--docker-image'], dict(
help='ID of Docker image to run tasks inside',
)),
(['--no-docker'], dict(
action='store_const',
const='',
help="Don't run tasks inside Docker",
)),
],
),
docker_mounts=dict(
combiner=combine_lists,
switches=[
(['--docker-mount'], dict(
action='append',
help=('Volume to mount into docker, e.g. '
' "/etc/passwd:/etc/passwd:ro". May be used multiple'
' times'),
)),
]
),
ebs_root_volume_gb=dict(
cloud_role='launch',
switches=[
(['--ebs-root-volume-gb'], dict(
help=('Size of root EBS volume, in GiB. Must be an integer.'
' Set to 0 to use the default'),
type=int,
)),
],
),
ec2_endpoint=dict(
cloud_role='connect',
switches=[
(['--ec2-endpoint'], dict(
help=('Force mrjob to connect to EC2 on this endpoint'
' (e.g. ec2.us-west-1.amazonaws.com).'
' Default is to infer this from region.'),
)),
],
),
ec2_key_pair=dict(
cloud_role='launch',
switches=[
(['--ec2-key-pair'], dict(
help='Name of the SSH key pair you set up for EMR',
)),
],
),
ec2_key_pair_file=dict(
combiner=combine_paths,
switches=[
(['--ec2-key-pair-file'], dict(
help='Path to file containing SSH key for EMR',
)),
],
),
emr_action_on_failure=dict(
cloud_role='launch',
switches=[
(['--emr-action-on-failure'], dict(
help=('Action to take when a step fails'
' (e.g. TERMINATE_CLUSTER, CANCEL_AND_WAIT, CONTINUE)'),
)),
],
),
emr_configurations=dict(
cloud_role='launch',
combiner=combine_lists,
switches=[
(['--emr-configuration'], dict(
action=_AppendJSONAction,
help=('Configuration to use on 4.x AMIs as a JSON-encoded'
' dict; see'
' http://docs.aws.amazon.com/ElasticMapReduce/latest/'
'ReleaseGuide/emr-configure-apps.html for examples'),
)),
],
),
emr_endpoint=dict(
cloud_role='connect',
switches=[
(['--emr-endpoint'], dict(
help=('Force mrjob to connect to EMR on this endpoint'
' (e.g. us-west-1.elasticmapreduce.amazonaws.com).'
' Default is to infer this from region.'),
)),
],
),
emulate_map_input_file=dict(
switches=[
(['--emulate-map-input-file'], dict(
action='store_true',
help=("In the first mapper, set $mapreduce_map_input_file to"
" the input file path, like Hadoop would, to support"
" jobs that use"
" jobconf_from_env('mapreduce.map.input.file')."
" Ignored if job sets HADOOP_INPUT_FORMAT."),
)),
(['--no-emulate-map-input-file'], dict(
action='store_false',
help=("Disables setting $mapreduce_map_input_file"),
)),
],
),
enable_emr_debugging=dict(
cloud_role='launch',
switches=[
(['--enable-emr-debugging'], dict(
action='store_true',
help='Enable storage of Hadoop logs in SimpleDB',
)),
(['--disable-emr-debugging'], dict(
action='store_false',
help=('Disable storage of Hadoop logs in SimpleDB (the'
' default)'),
)),
],
),
extra_cluster_params=dict(
cloud_role='launch',
combiner=combine_dicts,
switches=[
(['--extra-cluster-param'], dict(
action=_KeyJSONValueAction,
help=('extra parameter to pass to cloud API when creating'
' a cluster, to access features not currently supported'
' by mrjob. Takes the form <param>=<value>, where value'
' is JSON or a string. Use <param>=null to unset a'
' parameter'),
)),
],
),
gcloud_bin=dict(
combiner=combine_cmds,
switches=[
(['--gcloud-bin'], dict(help='path to gcloud binary')),
],
),
gcs_region=dict(
cloud_role='connect',
switches=[
(['--gcs-region'], dict(
help='region to create Google Cloud Storage buckets in',
)),
],
),
hadoop_bin=dict(
combiner=combine_cmds,
switches=[
(['--hadoop-bin'], dict(help='path to hadoop binary')),
],
),
hadoop_extra_args=dict(
combiner=combine_lists,
switches=[
(['--hadoop-args'], dict(
action=_AppendArgsAction,
help=('One or more arguments to pass to the hadoop binary.'
' (e.g. --hadoop-args="-fs file:///").'),
)),
],
),
hadoop_log_dirs=dict(
combiner=combine_path_lists,
switches=[
(['--hadoop-log-dirs'], dict(
action='append',
help=('Directory to search for hadoop logs in. You can use'
' --hadoop-log-dirs multiple times.'),
)),
],
),
hadoop_streaming_jar=dict(
combiner=combine_paths,
switches=[
(['--hadoop-streaming-jar'], dict(
help=('Path of your hadoop streaming jar (locally, or on'
' S3/HDFS). In EMR, use a file:// URI to refer to a jar'
' on the master node of your cluster.'),
)),
],
),
hadoop_tmp_dir=dict(
combiner=combine_paths,
switches=[
(['--hadoop-tmp-dir'], dict(
help='Temp space on HDFS (default is tmp/mrjob)',
)),
],
),
hadoop_version=dict(
switches=[
(['--hadoop-version'], dict(
help='Specific version of Hadoop to simulate',
)),
],
),
iam_endpoint=dict(
cloud_role='launch', # not 'connect'; only used to create clusters
switches=[
(['--iam-endpoint'], dict(
help=('Force mrjob to connect to IAM on this endpoint'
' (e.g. iam.us-gov.amazonaws.com)'),
)),
],
),
iam_instance_profile=dict(
cloud_role='launch',
switches=[
(['--iam-instance-profile'], dict(
help=('EC2 instance profile to use for the EMR cluster -- see'
' "Configure IAM Roles for Amazon EMR" in AWS docs'),
)),
],
),
iam_service_role=dict(
cloud_role='launch',
switches=[
(['--iam-service-role'], dict(
help=('IAM service role to use for the EMR cluster -- see'
' "Configure IAM Roles for Amazon EMR" in AWS docs')
)),
],
),
image_id=dict(
cloud_role='launch',
switches=[
(['--image-id'], dict(
help='ID of custom AWS machine image (AMI) to use',
)),
],
),
image_version=dict(
cloud_role='launch',
switches=[
(['--image-version'], dict(
help='version of EMR/Dataproc machine image to run',
)),
],
),
instance_groups=dict(
cloud_role='launch',
switches=[
(['--instance-groups'], dict(
action=_JSONAction,
help=('detailed JSON list of EMR instance configs, including'
' EBS configuration. See docs for --instance-groups'
' at http://docs.aws.amazon.com/cli/latest/reference'
'/emr/create-cluster.html'),
)),
],
),
instance_fleets=dict(
cloud_role='launch',
switches=[
(['--instance-fleets'], dict(
action=_JSONAction,
help=('detailed JSON list of instance fleets, including'
' EBS configuration. See docs for --instance-fleets'
' at http://docs.aws.amazon.com/cli/latest/reference'
'/emr/create-cluster.html'),
)),
],
),
instance_type=dict(
cloud_role='launch',
switches=[
(['--instance-type'], dict(
help=('Type of GCE/EC2 instance(s) to launch \n'
' GCE - e.g. n1-standard-1, n1-highcpu-4, n1-highmem-4'
' -- See'
' https://cloud.google.com/compute/docs/machine-types\n'
' EC2 - e.g. m1.medium, c3.xlarge, r3.xlarge '
' -- See http://aws.amazon.com/ec2/instance-types/'),
)),
],
),
jobconf=dict(
combiner=combine_jobconfs,
switches=[
(['-D', '--jobconf'], dict(
action=_KeyValueAction,
help=('passed through to hadoop streaming as -D and to Spark'
' as --conf. Should take the form KEY=VALUE'),
)),
],
),
label=dict(
cloud_role='launch',
switches=[
(['--label'], dict(
help='Alternate label for the job, to help us identify it.',
)),
],
),
libjars=dict(
combiner=combine_path_lists,
switches=[
(['--libjars'], dict(
action=_AppendCommaSeparatedItemsAction,
help=('Paths of JARs to pass to Hadoop with -libjars,'
' separated by commas. On EMR,'
' these can also be URIs; use file:/// to'
' reference JARs already on the EMR cluster.')
)),
],
),
local_tmp_dir=dict(
combiner=combine_paths,
switches=[
(['--local-tmp-dir'], dict(
help='temp directory on local filesystem',
)),
],
),
master_instance_bid_price=dict(
cloud_role='launch',
switches=[
(['--master-instance-bid-price'], dict(
help=('Bid price to specify for the master node when'
' setting it up as an EC2 spot instance (you probably'
' only want to do this for task instances).'),
)),
],
),
master_instance_config=dict(
cloud_role='launch',
switches=[
(['--master-instance-config'], dict(
action=_JSONAction,
help=('detailed JSON dict of configs for the master instance'
' on Dataproc including disk config. For format, see'
' https://cloud.google.com/dataproc/docs/reference/rest'
'/v1/projects.regions.clusters#InstanceGroupConfig'
' (except that fields in your JSON should use'
' snake_case, not camelCase).')
)),
],
),
master_instance_type=dict(
cloud_role='launch',
switches=[
(['--master-instance-type'], dict(
help='Type of GCE/EC2 master instance to launch',
)),
],
),
max_clusters_in_pool=dict(
switches=[
(['--max-clusters-in-pool'], dict(
help=("If attempting to join a pooled cluster, don't bail"
" out and create a new one if there are at least"
" this many clusters already in the pool."),
type=int,
)),
],
),
max_concurrent_steps=dict(
cloud_role='launch',
switches=[
(['--max-concurrent-steps'], dict(
help=("Maximum number of steps that may run on the cluster"
" at one time. Default is 1"),
type=int,
)),
],
),
max_mins_idle=dict(
cloud_role='launch',
switches=[
(['--max-mins-idle'], dict(
help=("If we create a cluster, have it automatically"
" terminate itself after it's been idle this many"
" minutes"),
type=float,
)),
],
),
# Spark runner only, only passed in on the command line (see #2040)
max_output_files=dict(
switches=[
(['--max-output-files'], dict(
help=('Maximum number of output files when running a'
' streaming job on Spark; just runs rdd.coalesce()'
' before outputting files'),
type=int,
)),
],
),
# min_available_* options aren't in the "launch" group because they
# are only used when joining a cluster, not creating one
min_available_mb=dict(
switches=[
(['--min-available-mb'], dict(
help=('When attempting to join a pooled cluster, check'
' the YARN resource manager to ensure at least'
' this much memory is available.'),
type=int,
)),
]
),
min_available_virtual_cores=dict(
switches=[
(['--min-available-virtual-cores'], dict(
help=('When attempting to join a pooled cluster, check'
' the YARN resource manager to ensure at least'
' this much CPU capacity is available.'),
type=float,
)),
]
),
network=dict(
cloud_role='launch',
switches=[
(['--network'], dict(
help=('URI of Google Compute Engine network to launch cluster'
" in. Can't be used with --subnet."),
)),
],
),
num_core_instances=dict(
cloud_role='launch',
switches=[
(['--num-core-instances'], dict(
help='Total number of core instances to launch',
type=int,
)),
],
),
num_task_instances=dict(
cloud_role='launch',
switches=[
(['--num-task-instances'], dict(
help='Total number of task instances to launch',
type=int,
)),
],
),
num_cores=dict(
cloud_role='launch',
switches=[
(['--num-cores'], dict(
help='Total number of cores to use while running in local mode',
type=int,
)),
],
),
owner=dict(
cloud_role='launch',
switches=[
(['--owner'], dict(
help='User who ran the job (default is the current user)',
)),
],
),
pool_clusters=dict(
cloud_role='launch',
switches=[
(['--pool-clusters'], dict(
action='store_true',
help=('Add to an existing cluster or create a new one that'
' does not terminate when the job completes.'),
)),
(['--no-pool-clusters'], dict(
action='store_false',
help="Don't run job on a pooled cluster (the default)",
)),
],
),
pool_jitter_seconds=dict(
switches=[
(['--pool-jitter-seconds'], dict(
help=('If --max-clusters-in-pool is set, before launching a'
' cluster, wait a random amount of time between 0 and'
' this many seconds and then double-check the number'
' of clusters in the pool before launching'),
type=int,
)),
],
),
pool_name=dict(
cloud_role='launch',
switches=[
(['--pool-name'], dict(
help='Specify a pool name to join. Default is "default"',
)),
],
),
pool_timeout_minutes=dict(
switches=[
(['--pool-timeout-minutes'], dict(
help=("If pooling can't join or create a cluster within this"
" many minutes, raise an exception. (0 means don't"
" timeout"),
type=int,
)),
],
),
pool_wait_minutes=dict(
switches=[
(['--pool-wait-minutes'], dict(
help=('Wait up to this many minutes for another job to finish'
' and free up its cluster; if one does, run this job on'
" that cluster. Otherwise create a new one. (0, the"
" default, means don't wait)"),
type=int,
)),
],
),
project_id=dict(
cloud_role='connect',
deprecated_aliases=['gcp_project'],
switches=[
(['--project-id'], dict(
deprecated_aliases=['--gcp-project'],
help=('Project to use when connecting to Google Cloud Services'
' and to run Cloud Dataproc jobs in')
)),
],
),
py_files=dict(
combiner=combine_path_lists,
switches=[
(['--py-files'], dict(
action=_AppendCommaSeparatedItemsAction,
help=('.zip or .egg files to add to PYTHONPATH,'
' separated by commas'),
)),
],
),
python_bin=dict(
combiner=combine_cmds,
switches=[
(['--python-bin'], dict(
help=('Alternate python command. You can include arguments,'
' e.g. --python-bin "python -v"'),
)),
],
),
read_logs=dict(
switches=[
(['--read-logs'], dict(
action='store_true',
help=('Parse logs generated by the job to get counters and'
' cause of error (the default).')
)),
(['--no-read-logs'], dict(
action='store_false',
help="Don't list or read logs generated by the job."
)),
],
),
region=dict(
cloud_role='connect',
switches=[
(['--region'], dict(
help='GCE/AWS region to run Dataproc/EMR jobs in.',
)),
],
),
release_label=dict(
cloud_role='launch',
switches=[
(['--release-label'], dict(
help=('Release Label (e.g. "emr-4.0.0"). Overrides'
' --image-version'),
)),
],
),
s3_endpoint=dict(
cloud_role='connect',
switches=[
(['--s3-endpoint'], dict(
help=("Force mrjob to connect to S3 on this endpoint (e.g."
" s3-us-west-1.amazonaws.com). You usually shouldn't"
" set this; by default mrjob will choose the correct"
" endpoint for each S3 bucket based on its location."),
)),
],
),
s3_region=dict(
cloud_role='connect',
switches=[
(['--s3-region'], dict(
help='AWS region to create S3 buckets in',
)),
],
),
service_account=dict(
cloud_role='launch',
switches=[
(['--service-account'], dict(
help=('Service account to use when creating a Dataproc'
' cluster. Usually takes the form'
' [account_id]@[project_id].iam.gserviceaccount.com.'
' Set to "" to use the default.'),
)),
],
),
service_account_scopes=dict(
cloud_role='launch',
switches=[
(['--service-account-scopes'], dict(
action=_CommaSeparatedListAction,
help=("A comma-separated list of service account scopes"
" on Dataproc, used to limit your cluster's access."
" For each scope, you can specify the"
" full URI or just the name (e.g. 'logging.write')"),
)),
],
),
setup=dict(
combiner=combine_lists,
switches=[
(['--setup'], dict(
action='append',
help=('A command to run before each mapper/reducer step in the'
' shell ("touch foo"). You may interpolate files'
' available via URL or on your local filesystem using'
' Hadoop Distributed Cache syntax (". setup.sh#"). To'
' interpolate archives, use #/: "cd foo.tar.gz#/; make"'),
)),
],
),
sh_bin=dict(
combiner=combine_cmds,
switches=[
(['--sh-bin'], dict(
help=('Alternate shell command for setup scripts. You may'
' include arguments, e.g. --sh-bin "bash -ex"'),
)),
],
),
skip_internal_protocol=dict(
switches=[
(['--skip-internal-protocol'], dict(
action='store_true',
help=("Don't use the job's internal protocol to communicate"
" between tasks internal to the job, instead relying"
" on Spark to encode and decode raw data structures.")
)),
(['--no-skip-internal-protocol'], dict(
action='store_false',
help='Use internal protocols as usual',
)),
],
),
sort_bin=dict(
combiner=combine_cmds,
switches=[
(['--sort-bin'], dict(
help=('Alternate shell command for the external sort binary.'
' You may include arguments, e.g. --sort-bin "sort -r"')
)),
],
),
spark_args=dict(
combiner=combine_lists,
switches=[
(['--spark-args'], dict(
action=_AppendArgsAction,
help=('One or more arguments to pass to spark-submit'
' (e.g. --spark-args="--properties-file my.conf").'),
)),
],
),
spark_deploy_mode=dict(
switches=[
(['--spark-deploy-mode'], dict(
help=('--deploy-mode argument to spark-submit (e.g.'
' "cluster". Default is "client"'),
)),
]
),
spark_master=dict(
switches=[
(['--spark-master'], dict(
help=('--master argument to spark-submit (e.g. '
'spark://host:port, local). Default is "yarn"'),
)),
],
),
spark_submit_bin=dict(
combiner=combine_cmds,
switches=[
(['--spark-submit-bin'], dict(
help='spark-submit binary. You may include arguments.'
)),
],
),
spark_tmp_dir=dict(
cloud_role='launch',
combiner=combine_paths,
switches=[
(['--spark-tmp-dir'], dict(
help=('optional URI visible to Spark executors to use as our'
' temp directory.'),
)),
],
),
ssh_add_bin=dict(
combiner=combine_cmds,
switches=[
(['--ssh-add-bin'], dict(
help=("Name/path of ssh-add binary. Arguments are allowed"
" (e.g. --ssh-bin 'ssh-add -v')"),
)),
],
),
ssh_bin=dict(
combiner=combine_cmds,
switches=[
(['--ssh-bin'], dict(
help=("Name/path of ssh binary. Arguments are allowed (e.g."
" --ssh-bin 'ssh -v')"),
)),
],
),
ssh_bind_ports=dict(
switches=[
(['--ssh-bind-ports'], dict(
action=_PortRangeAction,
help=('A list of port ranges that are safe to listen on,'
' delimited by colons and commas, with syntax like'
' 2000[:2001][,2003,2005:2008,etc].'
' Defaults to 40001:40840.'),
)),
],
),
ssh_tunnel=dict(
switches=[
(['--ssh-tunnel'], dict(
action='store_true',
help=('Open an SSH tunnel to the Hadoop job tracker/resource'
' manager'),
)),
(['--no-ssh-tunnel'], dict(
action='store_false',
help=("Don't open an SSH tunnel to the Hadoop job"
" tracker/resource manager (the default)"),
)),
],
),
ssh_tunnel_is_open=dict(
switches=[
(['--ssh-tunnel-is-open'], dict(
action='store_true',
help=('Make ssh tunnel accessible from remote hosts (not just'
' localhost)'),
)),
(['--ssh-tunnel-is-closed'], dict(
action='store_false',
help=('Make ssh tunnel accessible from localhost only (the'
' default)'),
)),
],
),
subnet=dict(
cloud_role='launch',
switches=[
(['--subnet'], dict(
help=('ID of Amazon VPC subnet/URI of Google Compute Engine'
' subnetwork to launch cluster in.'),
)),
(['--subnets'], dict(
action=_CommaSeparatedListAction,
help=('Like --subnet, but with a comma-separated list, to'
' specify multiple subnets in conjunction with'
' --instance-fleets (EMR only)'),
)),
],
),
tags=dict(
cloud_role='launch',
combiner=combine_dicts,
switches=[
(['--tag'], dict(
action=_KeyValueAction,
help=('Metadata tags to apply to the EMR cluster; '
'should take the form KEY=VALUE. You can use --tag '
'multiple times'),
)),
],
),
task_instance_bid_price=dict(
cloud_role='launch',
switches=[
(['--task-instance-bid-price'], dict(
help=('Bid price to specify for task nodes when'
' setting them up as EC2 spot instances'),
)),
],
),
task_instance_config=dict(
cloud_role='launch',
switches=[
(['--task-instance-config'], dict(
action=_JSONAction,
help=('detailed JSON dict of configs for the task'
' (secondary worker) instances'
' on Dataproc including disk config. For format, see'
' https://cloud.google.com/dataproc/docs/reference/rest'
'/v1/projects.regions.clusters#InstanceGroupConfig'
' (except that fields in your JSON should use'
' snake_case, not camelCase).')
)),
],
),
task_instance_type=dict(
cloud_role='launch',
switches=[
(['--task-instance-type'], dict(
help='Type of GCE/EC2 task instance(s) to launch',
)),
],
),
task_python_bin=dict(
combiner=combine_cmds,
switches=[
(['--task-python-bin'], dict(
help=('Name/path of alternate python command to use to'
" run tasks (e.g. mappers); doesn't affect setup"
' wrapper scripts. Defaults to'
' current Python interpreter.'),
)),
],
),
upload_archives=dict(
combiner=combine_path_lists,
switches=[
(['--archives'], dict(
action=_AppendCommaSeparatedItemsAction,
help=('Archives to unpack in the working directory of the'
' script, separated by commas. Use "#" to assign a'
' different name to each directory (e.g. '
'"foo-libs.zip#lib,bar.tar.gz#bar")'),
)),
],
),
upload_dirs=dict(
combiner=combine_path_lists,
switches=[
(['--dirs'], dict(
action=_AppendCommaSeparatedItemsAction,
help=('Directories to tarball and unpack in the working'
' directory of the script, separated by commas. Append'
' #<name> to each directory to assign a different name'
' (e.g. "foo#lib,bar#local-bar")'),
)),
],
),
upload_files=dict(
combiner=combine_path_lists,
switches=[
(['--files'], dict(
action=_AppendCommaSeparatedItemsAction,
help=('Files to copy to the working directory of the script,'
' separated by commas. Use "#"'
' to assign a different name to each file (e.g. '
'"foo.db#bar.db")'),
)),
],
),
zone=dict(
cloud_role='launch',
switches=[
(['--zone'], dict(
help=('GCE zone/AWS availability zone to run Dataproc/EMR jobs'
' in.'),
)),
],
),
)
def _combiners(opt_names, runner_alias=None):
return {
name: config['combiner']
for name, config in _RUNNER_OPTS.items()
if name in opt_names and 'combiner' in config
}
def _deprecated_aliases(opt_names):
results = {}
for name, config in _RUNNER_OPTS.items():
if name not in opt_names:
continue
if config.get('deprecated_aliases'):
for alias in config['deprecated_aliases']:
results[alias] = name
return results
def _filter_by_role(opt_names, *cloud_roles):
return {
opt_name
for opt_name, conf in _RUNNER_OPTS.items()
if opt_name in opt_names and conf.get('cloud_role') in cloud_roles
}
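# Illustrative sketch (hypothetical call, not from this module): picking out
# the opts needed just to connect to a cloud provider.
#
#     _filter_by_role({'region', 'image_version', 'cmdenv'}, 'connect')
#     # -> {'region'}
#     # ('image_version' has cloud_role 'launch'; 'cmdenv' has no cloud_role)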
def _add_runner_args(parser, opt_names=None, include_deprecated=True,
customize_switches=None, suppress_switches=None):
"""add switches for the given runner opts to the given
ArgumentParser, alphabetically by destination. If *opt_names* is
None, include all runner opts."""
if opt_names is None:
opt_names = set(_RUNNER_OPTS)
for opt_name in sorted(opt_names):
_add_runner_args_for_opt(
parser, opt_name,
include_deprecated=include_deprecated,
customize_switches=customize_switches,
suppress_switches=suppress_switches
)
def _add_runner_args_for_opt(parser, opt_name, include_deprecated=True,
customize_switches=None, suppress_switches=None):
"""Add switches for a single option (*opt_name*) to the given parser."""
if customize_switches is None:
customize_switches = {}
if suppress_switches is None:
suppress_switches = set()
conf = _RUNNER_OPTS[opt_name]
if conf.get('deprecated') and not include_deprecated:
return
switches = conf.get('switches') or []
def suppressed(switches):
return any(sw in suppress_switches for sw in switches)
for args, kwargs in switches:
kwargs = dict(kwargs)
# allow customization
for switch in args:
if switch in customize_switches:
kwargs.update(customize_switches[switch])
deprecated_aliases = kwargs.pop('deprecated_aliases', None)
deprecated = kwargs.pop('deprecated', False)
# add this switch
if (include_deprecated or not deprecated) and not suppressed(args):
kwargs['dest'] = opt_name
if kwargs.get('action') == 'append':
kwargs['default'] = []
else:
kwargs['default'] = None
parser.add_argument(*args, **kwargs)
# add a switch for deprecated aliases
if (deprecated_aliases and include_deprecated and
not suppressed(deprecated_aliases)):
help = 'Deprecated alias%s for %s' % (
('es' if len(deprecated_aliases) > 1 else ''),
args[-1])
parser.add_argument(
*deprecated_aliases,
**combine_dicts(kwargs, dict(help=help)))
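# Illustrative sketch (hypothetical parser, not from this module): customizing
# and suppressing switches while adding runner args.
#
#     parser = ArgumentParser()
#     _add_runner_args(
#         parser,
#         opt_names={'cmdenv', 'owner', 'label'},
#         customize_switches={'--owner': dict(help='who to blame')},
#         suppress_switches={'--label'},
#     )
#     # parser now accepts --cmdenv and --owner (with the customized help text)
#     # but not --label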
### non-runner switches ###
def _add_basic_args(parser):
"""Switches for all command line tools"""
parser.add_argument(
'-c', '--conf-path', dest='conf_paths',
action='append',
help='Path to alternate mrjob.conf file to read from')
parser.add_argument(
'--no-conf', dest='conf_paths', action='store_const', const=[],
help="Don't load mrjob.conf even if it's available")
parser.add_argument(
'-q', '--quiet', dest='quiet', default=None,
action='store_true',
help="Don't print anything to stderr")
parser.add_argument(
'-v', '--verbose', dest='verbose', default=None,
action='store_true', help='print more messages to stderr')
def _add_job_args(parser, include_deprecated=True, include_steps=True):
parser.add_argument(
'--cat-output', dest='cat_output',
default=None, action='store_true',
help="Stream job output to stdout")
parser.add_argument(
'--no-cat-output', dest='cat_output',
default=None, action='store_false',
help="Don't stream job output to stdout")
if include_deprecated:
parser.add_argument(
'--no-output', dest='cat_output',
default=None, action='store_false',
help='Deprecated alias for --no-cat-output')
parser.add_argument(
'-o', '--output-dir', dest='output_dir', default=None,
help='Where to put final job output. This must be an s3:// URL ' +
'for EMR, an HDFS path for Hadoop, or a system path for local, ' +
'and must be empty')
parser.add_argument(
'-r', '--runner', dest='runner',
choices=sorted(_RUNNER_ALIASES),
help=('Where to run the job; one of: %s' % ', '.join(
sorted(_RUNNER_ALIASES))))
parser.add_argument(
'--step-output-dir', dest='step_output_dir', default=None,
help=('A directory to store output from job steps other than'
' the last one. Useful for debugging. Currently'
' ignored by local runners.'))
if include_deprecated:
parser.add_argument(
'--deprecated', dest='deprecated', action='store_true',
help='include help for deprecated options')
parser.add_argument(
'-h', '--help', dest='help', action='store_true',
help='show this message and exit')
def _add_step_args(parser, include_deprecated=False):
"""Add switches that determine what part of the job a MRJob runs."""
for dest, (args, kwargs) in _STEP_OPTS.items():
if dest in _DEPRECATED_STEP_OPTS and not include_deprecated:
continue
kwargs = dict(dest=dest, **kwargs)
parser.add_argument(*args, **kwargs)
### other utilities for switches ###
def _print_help_for_runner(opt_names, include_deprecated=False):
help_parser = ArgumentParser(usage=SUPPRESS, add_help=False)
_add_runner_args(help_parser, opt_names,
include_deprecated=include_deprecated)
help_parser.print_help()
def _print_basic_help(option_parser, usage, include_deprecated=False,
include_steps=False):
"""Print all help for the parser. Unlike similar functions, this needs a
parser so that it can include custom options added by a
:py:class:`~mrjob.job.MRJob`.
"""
help_parser = ArgumentParser(usage=usage, add_help=False)
_add_basic_args(help_parser)
_add_job_args(help_parser, include_deprecated=include_deprecated,
include_steps=include_steps)
basic_dests = {action.dest for action in help_parser._actions}
# add other custom args added by the user
for action in option_parser._actions:
# option_parser already includes deprecated option dests
# this excludes deprecated switch aliases (e.g. --no-output)
if action.dest in basic_dests:
continue
# this excludes the --deprecated switch (which is explained below)
if action.dest in _DEPRECATED_NON_RUNNER_OPTS:
continue
# this excludes options that are shown with --help -r <runner>
if action.dest in _RUNNER_OPTS:
continue
# don't include steps if *include_steps* isn't set
if action.dest in _STEP_OPTS and not include_steps:
continue
# this excludes the ARGS option, which is already covered by usage
if not action.option_strings:
continue
# found a custom option. thanks, library user!
help_parser._add_action(action)
help_parser.print_help()
print()
print('To see help for a specific runner, use --help -r <runner name>')
print()
if not include_steps:
print('To include switches that control what part of a job runs,'
' use --help -v')
print()
if not include_deprecated:
print('To include help for deprecated options, add --deprecated')
print()
def _parse_raw_args(parser, args):
"""Simulate parsing by *parser*, return a list of tuples of
(dest, option_string, args).
If *args* contains unknown args or is otherwise malformed, we don't
raise an error (we leave this to the actual argument parser).
"""
results = []
class RawArgAction(Action):
def __call__(self, parser, namespace, values, option_string=None):
# ignore *namespace*, append to *results*
results.append((self.dest, option_string, values))
def error(msg):
raise ValueError(msg)
raw_parser = ArgumentParser(add_help=False)
raw_parser.error = error
for action in parser._actions:
# single args become single item lists
nargs = 1 if action.nargs is None else action.nargs
raw_parser.add_argument(*action.option_strings,
action=RawArgAction,
dest=action.dest,
nargs=nargs)
# leave errors to the real parser
raw_parser.parse_known_args(args)
return results
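# Illustrative sketch (hypothetical parser, not from this module): what
# _parse_raw_args() returns for a simple parser.
#
#     parser = ArgumentParser(add_help=False)
#     parser.add_argument('--cmdenv', dest='cmdenv', action='append')
#     parser.add_argument('-o', '--output-dir', dest='output_dir')
#     _parse_raw_args(parser, ['--cmdenv', 'FOO=bar', '-o', 'out/',
#                              '--cmdenv', 'BAZ=qux'])
#     # -> [('cmdenv', '--cmdenv', ['FOO=bar']),
#     #     ('output_dir', '-o', ['out/']),
#     #     ('cmdenv', '--cmdenv', ['BAZ=qux'])]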
def _alphabetize_actions(arg_parser):
"""Alphabetize arg parser actions for the sake of nicer help printouts."""
# based on https://stackoverflow.com/questions/12268602/sort-argparse-help-alphabetically # noqa
for g in arg_parser._action_groups:
g._group_actions.sort(key=lambda opt: opt.dest)