# -*- coding: utf-8 -*-
# Copyright 2009-2017 Yelp and Contributors
# Copyright 2018 Yelp
# Copyright 2019 Yelp and Contributors
# Copyright 2020 Affirm, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import hashlib
import json
import logging
import os
import os.path
import pipes
import posixpath
import re
import time
from collections import OrderedDict
from collections import defaultdict
from datetime import datetime
from datetime import timedelta
from math import ceil
from random import randint
try:
import botocore.client
import botocore.exceptions
botocore # quiet "redefinition of unused ..." warning from pyflakes
except ImportError:
botocore = None
try:
import boto3
boto3 # quiet "redefinition of unused ..." warning from pyflakes
except ImportError:
# don't require boto3; MRJobs don't actually need it when running
# inside hadoop streaming
boto3 = None
import mrjob
import mrjob.step
from mrjob.aws import _DEFAULT_AWS_REGION
from mrjob.aws import EC2_INSTANCE_TYPE_TO_MEMORY
from mrjob.aws import _boto3_now
from mrjob.aws import _boto3_paginate
from mrjob.aws import _wrap_aws_client
from mrjob.cloud import HadoopInTheCloudJobRunner
from mrjob.compat import map_version
from mrjob.compat import version_gte
from mrjob.conf import combine_dicts
from mrjob.fs.composite import CompositeFilesystem
from mrjob.fs.hadoop import HadoopFilesystem
from mrjob.fs.local import LocalFilesystem
from mrjob.fs.s3 import S3Filesystem
from mrjob.fs.s3 import _client_error_status
from mrjob.fs.s3 import _endpoint_url
from mrjob.fs.s3 import _get_bucket_region
from mrjob.fs.ssh import SSHFilesystem
from mrjob.hadoop import _DEFAULT_YARN_HDFS_LOG_DIR
from mrjob.iam import _FALLBACK_INSTANCE_PROFILE
from mrjob.iam import _FALLBACK_SERVICE_ROLE
from mrjob.iam import get_or_create_mrjob_instance_profile
from mrjob.iam import get_or_create_mrjob_service_role
from mrjob.logs.bootstrap import _check_for_nonzero_return_code
from mrjob.logs.bootstrap import _interpret_emr_bootstrap_stderr
from mrjob.logs.bootstrap import _ls_emr_bootstrap_stderr_logs
from mrjob.logs.counters import _pick_counters
from mrjob.logs.errors import _log_probable_cause_of_failure
from mrjob.logs.mixin import LogInterpretationMixin
from mrjob.logs.spark import _interpret_spark_logs
from mrjob.logs.step import _interpret_emr_step_stderr
from mrjob.logs.step import _interpret_emr_step_syslog
from mrjob.logs.step import _ls_emr_step_stderr_logs
from mrjob.logs.step import _ls_emr_step_syslogs
from mrjob.parse import is_s3_uri
from mrjob.parse import _parse_progress_from_job_tracker
from mrjob.parse import _parse_progress_from_resource_manager
from mrjob.pool import _attempt_to_lock_cluster
from mrjob.pool import _attempt_to_unlock_cluster
from mrjob.pool import _cluster_name_suffix
from mrjob.pool import _instance_fleets_satisfy
from mrjob.pool import _instance_groups_satisfy
from mrjob.pool import _parse_cluster_name_suffix
from mrjob.py2 import PY2
from mrjob.py2 import string_types
from mrjob.py2 import to_unicode
from mrjob.py2 import urljoin
from mrjob.py2 import urlopen
from mrjob.runner import _blank_out_conflicting_opts
from mrjob.setup import UploadDirManager
from mrjob.setup import WorkingDirManager
from mrjob.step import StepFailedException
from mrjob.step import _is_spark_step_type
from mrjob.util import shlex_split
from mrjob.util import strip_microseconds
from mrjob.util import random_identifier
log = logging.getLogger(__name__)
# how to set up the SSH tunnel for various AMI versions
_IMAGE_VERSION_TO_SSH_TUNNEL_CONFIG = {
'2': dict(
localhost=True,
name='job tracker',
path='/jobtracker.jsp',
port=9100,
),
'3': dict(
localhost=False,
name='resource manager',
path='/cluster',
port=9026,
),
'4': dict(
localhost=False,
name='resource manager',
path='/cluster',
port=8088,
),
}
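# (consumed by _ssh_tunnel_config() via mrjob.compat.map_version(), which
# returns the entry for the highest key that is <= the cluster's image
# version; e.g. image version 5.8.0 falls under the '4' entry)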
# if we SSH into a node, default place to look for logs
_EMR_LOG_DIR = '/mnt/var/log'
# Prior to AMI 2.4.8/3.1.1, there is a limit of 256 steps total per cluster.
# We issue a warning for users who continue to use pooling on these
# very old AMIs
_IMAGE_SUPPORTS_POOLING = {
'2': False,
'2.4.8': True,
'3': False,
'3.1.1': True,
}
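# (also looked up via map_version(); e.g. image version 2.4.7 maps to
# False, 2.4.9 to True, 3.0.4 to False, and 3.1.1 or later, including
# 4.x and up, to True)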
_MAX_SSH_RETRIES = 20
# ssh should fail right away if it can't bind a port
_WAIT_FOR_SSH_TO_FAIL = 1.0
# amount of time to wait between checks for available pooled clusters
_POOLING_SLEEP_INTERVAL = 30.01 # Add .01 seconds so minutes aren't spot on.
# bootstrap action which automatically terminates idle clusters
_MAX_MINS_IDLE_BOOTSTRAP_ACTION_PATH = os.path.join(
os.path.dirname(mrjob.__file__),
'bootstrap',
'terminate_idle_cluster_emr.sh')
# default AWS region to use for EMR. Using us-west-2 because it is the default
# for new (since October 10, 2012) accounts (see #1025)
_DEFAULT_EMR_REGION = 'us-west-2'
# default AMI to use on EMR. This may be updated with each version
_DEFAULT_IMAGE_VERSION = '6.0.0'
# first AMI version that we can't run bash -e on (see #1548)
_BAD_BASH_IMAGE_VERSION = '5.2.0'
# use this if bash -e works (/bin/sh is actually bash)
_GOOD_BASH_SH_BIN = ['/bin/sh', '-ex']
# use this if bash -e doesn't work
_BAD_BASH_SH_BIN = ['/bin/sh', '-x']
# Hadoop streaming jar on 1-3.x AMIs
_PRE_4_X_STREAMING_JAR = '/home/hadoop/contrib/streaming/hadoop-streaming.jar'
# intermediary jar used on 4.x AMIs
_4_X_COMMAND_RUNNER_JAR = 'command-runner.jar'
# path to spark-submit on 3.x AMIs. (On 4.x, it's just 'spark-submit')
_3_X_SPARK_SUBMIT = '/home/hadoop/spark/bin/spark-submit'
# bootstrap action to install Spark on 3.x AMIs (On 4.x+, we use
# Applications instead)
_3_X_SPARK_BOOTSTRAP_ACTION = (
'file:///usr/share/aws/emr/install-spark/install-spark')
# first AMI version to support Spark
_MIN_SPARK_AMI_VERSION = '3.8.0'
# first AMI version with Spark that supports Python 3
_MIN_SPARK_PY3_AMI_VERSION = '4.0.0'
# first AMI version that allows steps to run concurrently
_MIN_STEP_CONCURRENCY_AMI_VERSION = '5.28.0'
# we have to wait this many minutes for logs to transfer to S3 (or wait
# for the cluster to terminate). Docs say logs are transferred every 5
# minutes, but I've seen it take longer on the 4.3.0 AMI. Probably it's
# 5 minutes plus time to copy the logs, or something like that.
_S3_LOG_WAIT_MINUTES = 10
# minimum amount of memory to run spark jobs
#
# it's possible that we could get by with slightly less memory, but
# m1.medium (3.75) definitely doesn't work.
_MIN_SPARK_INSTANCE_MEMORY = 7.5
# these are the only kinds of instance roles that exist
_INSTANCE_ROLES = ('MASTER', 'CORE', 'TASK')
# where to find the history log in HDFS
_YARN_HDFS_HISTORY_LOG_DIR = 'hdfs:///tmp/hadoop-yarn/staging/history'
# mildly flexible regex to detect cluster self-termination. Termination of
# non-master nodes won't shut down the cluster, so we don't need to match that.
_CLUSTER_SELF_TERMINATED_RE = re.compile(
'^.*(node|instances) .* terminated.*$', re.I)
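# (e.g. a state-change reason like 'The master node was terminated'
# would match)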
# if this appears in an S3 object's "restore" field, the object
# is available to read even if it's Glacier-archived
_RESTORED_FROM_GLACIER = 'ongoing-request="false"'
# Amount of time in seconds before we timeout yarn api calls.
_YARN_API_TIMEOUT = 20
# which port to connect to the YARN resource manager on
_YARN_RESOURCE_MANAGER_PORT = 8088
# base path for YARN resource manager
_YRM_BASE_PATH = '/ws/v1/cluster'
# all the cluster states other than terminating/terminated. We need this list
# because the ListClusters call can't filter out unwanted cluster states;
# it can only accept a whitelist of desired ones
#
# valid states are here:
# https://docs.aws.amazon.com/emr/latest/APIReference/API_ListClusters.html
_ACTIVE_CLUSTER_STATES = ['STARTING', 'BOOTSTRAPPING', 'RUNNING', 'WAITING']
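# (passed as the ClusterStates parameter to ListClusters, e.g. when
# searching for a pooled cluster to join)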
# used to bail out and retry when a pooled cluster self-terminates
class _PooledClusterSelfTerminatedException(Exception):
pass
if PY2:
# this was introduced in Python 3.3
TimeoutError = OSError
class PoolTimeoutException(TimeoutError):
pass
class EMRJobRunner(HadoopInTheCloudJobRunner, LogInterpretationMixin):
"""Runs an :py:class:`~mrjob.job.MRJob` on Amazon Elastic MapReduce.
Invoked when you run your job with ``-r emr``.
:py:class:`EMRJobRunner` runs your job in an EMR cluster, which is
basically a temporary Hadoop cluster. Normally, it creates a cluster
just for your job; it's also possible to run your job in a specific
cluster by setting *cluster_id* or to automatically choose a
waiting cluster, creating one if none exists, by setting
*pool_clusters*.
Input, support, and jar files can be either local or on S3; use
``s3://...`` URLs to refer to files on S3.
This class has some useful utilities for talking directly to S3 and EMR,
so you may find it useful to instantiate it without a script::
from mrjob.emr import EMRJobRunner
emr_client = EMRJobRunner().make_emr_client()
clusters = emr_client.list_clusters()
...
"""
alias = 'emr'
OPT_NAMES = HadoopInTheCloudJobRunner.OPT_NAMES | {
'add_steps_in_batch',
'additional_emr_info',
'applications',
'aws_access_key_id',
'aws_secret_access_key',
'aws_session_token',
'bootstrap_actions',
'bootstrap_spark',
'cloud_log_dir',
'core_instance_bid_price',
'docker_client_config',
'docker_image',
'docker_mounts',
'ebs_root_volume_gb',
'ec2_endpoint',
'ec2_key_pair',
'ec2_key_pair_file',
'emr_action_on_failure',
'emr_configurations',
'emr_endpoint',
'enable_emr_debugging',
'hadoop_extra_args',
'iam_endpoint',
'iam_instance_profile',
'iam_service_role',
'instance_fleets',
'instance_groups',
'master_instance_bid_price',
'max_clusters_in_pool',
'max_concurrent_steps',
'min_available_mb',
'min_available_virtual_cores',
'pool_clusters',
'pool_jitter_seconds',
'pool_name',
'pool_timeout_minutes',
'pool_wait_minutes',
'release_label',
's3_endpoint',
'ssh_add_bin',
'ssh_bin',
'ssh_bind_ports',
'ssh_tunnel',
'ssh_tunnel_is_open',
'subnet',
'tags',
'task_instance_bid_price',
}
# supports everything (so far)
_STEP_TYPES = {
'jar', 'spark', 'spark_jar', 'spark_script', 'streaming'}
# everything that controls instance number, type, or price
_INSTANCE_OPT_NAMES = {
name for name in OPT_NAMES
if 'instance' in name and 'iam' not in name
}
def __init__(self, **kwargs):
""":py:class:`~mrjob.emr.EMRJobRunner` takes the same arguments as
:py:class:`~mrjob.runner.MRJobRunner`, plus some additional options
which can be defaulted in :ref:`mrjob.conf <mrjob.conf>`.
*aws_access_key_id* and *aws_secret_access_key* are required if you
haven't set them up already for boto3 (e.g. by setting the environment
variables :envvar:`AWS_ACCESS_KEY_ID` and
:envvar:`AWS_SECRET_ACCESS_KEY`)
A lengthy list of additional options can be found in
:doc:`guides/emr-opts.rst`.
"""
super(EMRJobRunner, self).__init__(**kwargs)
self._fix_s3_tmp_and_log_uri_opts()
# use job key to make a unique tmp dir
self._cloud_tmp_dir = self._opts['cloud_tmp_dir'] + self._job_key + '/'
# pick/validate output dir
if self._output_dir:
self._output_dir = self._check_and_fix_s3_dir(self._output_dir)
else:
self._output_dir = self._cloud_tmp_dir + 'output/'
# check AMI version
if self._opts['image_version'].startswith('1.'):
log.warning('1.x AMIs will not work because they use'
' Python 2.5. Use a later AMI version or mrjob v0.4.2')
elif not version_gte(self._opts['image_version'], '2.4.3'):
log.warning("AMIs prior to 2.4.3 probably will not work because"
" they don't support Python 2.7.")
elif not self._image_version_gte('5.7.0'):
if self._opts['image_id']:
log.warning('AMIs prior to 5.7.0 will probably not work'
' with custom machine images')
if self._opts['pool_clusters'] and not map_version(
self._opts['image_version'], _IMAGE_SUPPORTS_POOLING):
log.warning(
"Cluster pooling is not fully supported on AMIs prior to"
" 2.4.8/3.1.1 due to the limit on total number of steps")
if self._opts['max_concurrent_steps'] < 1:
raise ValueError('max_concurrent_steps must be at least 1')
# manage local files that we want to upload to S3. We'll add them
# to this manager just before we need them.
s3_files_dir = self._cloud_tmp_dir + 'files/'
self._upload_mgr = UploadDirManager(s3_files_dir)
# master node setup script (handled later by
# _add_master_node_setup_files_for_upload())
self._master_node_setup_mgr = WorkingDirManager()
self._master_node_setup_script_path = None
# where our own logs ended up (we'll find this out once we run the job)
self._s3_log_dir_uri = None
# did we create the cluster we're running on?
self._created_cluster = False
# did we acquire a lock on self._cluster_id? (used for pooling)
self._locked_cluster = None
# IDs of steps we have submitted to the cluster
self._step_ids = []
# we don't upload the ssh key to master until it's needed
self._ssh_key_is_copied = False
# map from cluster ID to a dictionary containing cached info about
# that cluster. Includes the following keys:
#
# - image_version
# - hadoop_version
# - master_public_dns
# - master_private_ip
#
# (we may do this for multiple cluster IDs if we join a pooled cluster
# that self-terminates)
self._cluster_to_cache = defaultdict(dict)
# set of cluster IDs for which we logged the master node's public DNS
self._logged_address_of_master = set()
# List of dicts (one for each step) potentially containing
# the keys 'history', 'step', and 'task'. These will also always
# contain 'step_id' (the s-XXXXXXXX step ID on EMR).
#
# This will be filled by _wait_for_steps_to_complete()
#
# This might work better as a dictionary.
self._log_interpretations = []
# log interpretation for master node setup step (currently we don't
# use this for anything; we just want to keep it out of
# self._log_interpretations)
self._mns_log_interpretation = None
# set of step numbers (0-indexed) where we already waited for logs to
# transfer to S3 (so we don't do it twice)
self._waited_for_logs_on_s3 = set()
# info used to match clusters; caches the result of _pool_hash_dict()
self._pool_hash_dict_cached = None
# add_steps_in_batch and concurrent steps don't mix
if (self._add_steps_in_batch() and
self._opts['max_concurrent_steps'] > 1):
log.warning('add_steps_in_batch will probably not work'
' with max_concurrent_steps > 1')
# min_available_* options require SSH
if ((self._opts['min_available_mb'] or
self._opts['min_available_virtual_cores']) and
not (self._opts['ec2_key_pair'] and
self._opts['ec2_key_pair_file'])):
raise ValueError('you must set up SSH (ec2_key_pair and'
' ec2_key_pair_file) to use the'
' min_available_* options')
### Options ###
@classmethod
def _default_opts(cls):
return combine_dicts(
super(EMRJobRunner, cls)._default_opts(),
dict(
bootstrap_python=None,
check_cluster_every=30,
cleanup_on_failure=['JOB'],
cloud_fs_sync_secs=5.0,
docker_client_config=None,
docker_image=None,
image_version=_DEFAULT_IMAGE_VERSION,
max_concurrent_steps=1,
min_available_mb=0,
min_available_virtual_cores=0,
num_core_instances=0,
num_task_instances=0,
pool_clusters=False,
pool_name='default',
pool_jitter_seconds=60,
pool_wait_minutes=0,
region=_DEFAULT_EMR_REGION,
)
)
def _combine_opts(self, opt_list):
"""Blank out overridden *instance_fleets* and *instance_groups*.
Convert image_version of 4.x and later to release_label."""
# blank out any instance_fleets/groups before the last config
# where they are set
opt_list = _blank_out_conflicting_opts(
opt_list,
['instance_fleets', 'instance_groups'],
self._INSTANCE_OPT_NAMES)
# now combine opts, with instance_groups/fleets blanked out
opts = super(EMRJobRunner, self)._combine_opts(opt_list)
# set release_label based on image_version
if (version_gte(opts['image_version'], '4') and
not opts['release_label']):
opts['release_label'] = 'emr-' + opts['image_version']
# don't keep two confs with the same Classification (see #2097)
opts['emr_configurations'] = _deduplicate_emr_configurations(
opts['emr_configurations'])
return opts
def _fix_opt(self, opt_key, opt_value, source):
"""Fix and check various EMR-specific options"""
opt_value = super(EMRJobRunner, self)._fix_opt(
opt_key, opt_value, source)
# *_instance_bid_price
if opt_key.endswith('_instance_bid_price'):
if not opt_value: # don't allow blank bid price
return None
try:
if not float(opt_value):
return None
except ValueError: # maybe EMR allows non-floats?
pass
return str(opt_value) # should be str, not a number
# additional_emr_info
elif opt_key == 'additional_emr_info' and not isinstance(
opt_value, string_types):
return json.dumps(opt_value)
# emr_configurations
elif opt_key == 'emr_configurations':
return [_fix_configuration_opt(c) for c in opt_value]
# region
elif opt_key == 'region':
# don't allow blank region
return opt_value or _DEFAULT_EMR_REGION
# subnet should be None, a string, or a multi-item list
elif opt_key == 'subnet':
return _fix_subnet_opt(opt_value)
else:
return opt_value
def _obfuscate_opt(self, opt_key, opt_value):
"""Obfuscate AWS credentials."""
# don't need to obfuscate empty values
if not opt_value:
return opt_value
if opt_key in ('aws_secret_access_key', 'aws_session_token'):
# don't expose any part of secret credentials
return '...'
elif opt_key == 'aws_access_key_id':
if isinstance(opt_value, string_types):
return '...' + opt_value[-4:]
else:
# don't expose aws_access_key_id if it was accidentally
# put in a list or something
return '...'
else:
return opt_value
def _image_version_gte(self, version):
"""Check if the requested image version is greater than
or equal to *version*. If the *release_label* opt is set,
look at that instead.
If you're checking the actual image version of a cluster, just
use :py:func:`~mrjob.compat.version_gte` and
:py:meth:`get_image_version`.
"""
if self._opts['release_label']:
return version_gte(
self._opts['release_label'].lstrip('emr-'), version)
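# (note: lstrip('emr-') strips leading *characters*, not a prefix, but
# that's safe here because release labels look like 'emr-5.20.0', where
# the version part starts with a digit)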
else:
return version_gte(self._opts['image_version'], version)
def _fix_s3_tmp_and_log_uri_opts(self):
"""Fill in cloud_tmp_dir and cloud_log_dir (in self._opts) if they
aren't already set.
Helper for __init__.
"""
# set cloud_tmp_dir by checking for existing buckets
if not self._opts['cloud_tmp_dir']:
self._set_cloud_tmp_dir()
log.info('Using %s as our temp dir on S3' %
self._opts['cloud_tmp_dir'])
self._opts['cloud_tmp_dir'] = self._check_and_fix_s3_dir(
self._opts['cloud_tmp_dir'])
# set cloud_log_dir
if self._opts['cloud_log_dir']:
self._opts['cloud_log_dir'] = self._check_and_fix_s3_dir(
self._opts['cloud_log_dir'])
else:
self._opts['cloud_log_dir'] = self._opts['cloud_tmp_dir'] + 'logs/'
def _set_cloud_tmp_dir(self):
"""Helper for _fix_s3_tmp_and_log_uri_opts"""
client = self.fs.s3.make_s3_client()
for bucket_name in self.fs.s3.get_all_bucket_names():
if not bucket_name.startswith('mrjob-'):
continue
bucket_region = _get_bucket_region(client, bucket_name)
if bucket_region == self._opts['region']:
# Regions are both specified and match
log.debug("using existing temp bucket %s" % bucket_name)
self._opts['cloud_tmp_dir'] = 's3://%s/tmp/' % bucket_name
return
# That may have all failed. If so, pick a name.
bucket_name = 'mrjob-' + random_identifier()
self._opts['cloud_tmp_dir'] = 's3://%s/tmp/' % bucket_name
log.info('Auto-created temp S3 bucket %s' % bucket_name)
self._wait_for_s3_eventual_consistency()
def _s3_log_dir(self):
"""Get the URI of the log directory for this job's cluster."""
if not self._s3_log_dir_uri:
cluster = self._describe_cluster()
log_uri = cluster.get('LogUri')
if log_uri:
self._s3_log_dir_uri = '%s%s/' % (
log_uri.replace('s3n://', 's3://'), self._cluster_id)
return self._s3_log_dir_uri
def _check_and_fix_s3_dir(self, s3_uri):
"""Helper for __init__"""
if not is_s3_uri(s3_uri):
raise ValueError('Invalid S3 URI: %r' % s3_uri)
if not s3_uri.endswith('/'):
s3_uri = s3_uri + '/'
return s3_uri
def _bash_is_bad(self):
# hopefully, there will eventually be an image version
# where this issue is fixed. See #1548
return self._image_version_gte(_BAD_BASH_IMAGE_VERSION)
def _default_sh_bin(self):
if self._bash_is_bad():
return _BAD_BASH_SH_BIN
else:
return _GOOD_BASH_SH_BIN
def _sh_pre_commands(self):
if self._bash_is_bad() and not self._opts['sh_bin']:
return ['set -e']
else:
return []
@property
def fs(self):
""":py:class:`~mrjob.fs.base.Filesystem` object for SSH, S3, and the
local filesystem.
"""
if self._fs is None:
self._fs = CompositeFilesystem()
if self._opts['ec2_key_pair_file']:
self._fs.add_fs('ssh', SSHFilesystem(
ssh_bin=self._ssh_bin(),
ssh_add_bin=self._ssh_add_bin(),
ec2_key_pair_file=self._opts['ec2_key_pair_file']))
self._fs.add_fs('s3', S3Filesystem(
aws_access_key_id=self._opts['aws_access_key_id'],
aws_secret_access_key=self._opts['aws_secret_access_key'],
aws_session_token=self._opts['aws_session_token'],
s3_endpoint=self._opts['s3_endpoint'],
s3_region=self._opts['region'],
part_size=self._upload_part_size()))
if self._opts['ec2_key_pair_file']:
# add hadoop fs after S3 because it tries to handle all URIs
# we'll set hadoop_bin later, once the cluster is set up
self._fs.add_fs('hadoop', HadoopFilesystem(hadoop_bin=[]))
self._fs.add_fs('local', LocalFilesystem())
return self._fs
def _run(self):
self._launch()
self._finish_run()
def _finish_run(self):
while True:
try:
self._wait_for_steps_to_complete()
break
except _PooledClusterSelfTerminatedException:
self._relaunch()
def _prepare_for_launch(self):
"""Set up files needed for the job."""
self._check_output_not_exists()
self._create_setup_wrapper_scripts()
self._add_bootstrap_files_for_upload()
self._add_master_node_setup_files_for_upload()
self._add_job_files_for_upload()
self._upload_local_files()
# make sure we can see the files we copied to S3
self._wait_for_s3_eventual_consistency()
def _launch(self):
"""Set up files and then launch our job on EMR."""
self._prepare_for_launch()
self._launch_emr_job()
def _relaunch(self):
# files are already in place; just start with a fresh cluster
assert not self._opts['cluster_id']
self._cluster_id = None
self._created_cluster = False
self._step_ids = []
# old SSH tunnel isn't valid for this cluster (see #1549)
if self._ssh_proc:
self._kill_ssh_tunnel()
# don't try to connect to HDFS on the old cluster
if hasattr(self.fs, 'hadoop'):
self.fs.hadoop.set_hadoop_bin([])
self._launch_emr_job()
def _check_input_path(self, path):
"""Add a custom check for S3 paths to ensure they're not in
Glacier (which causes a cryptic error). See #1887."""
# handle non-S3 paths the usual way
if not is_s3_uri(path):
super(EMRJobRunner, self)._check_input_path(path)
return
exists = False
for uri, obj in self.fs.s3._ls(path):
exists = True
# we currently just look for 'ongoing-request="false"'
# in the *restore* field and ignore the expiration date
# (if the object has expired, the *restore* field won't be set).
#
# See #1887 for more discussion of checking expiration.
if obj.storage_class == 'GLACIER' and not (
obj.restore and _RESTORED_FROM_GLACIER in obj.restore):
raise IOError(
'%s is archived in Glacier and'
' cannot be read as input!' % uri)
if not exists:
raise IOError(
'Input path %s does not exist!' % (path,))
def _check_output_not_exists(self):
"""Verify the output path does not already exist. This avoids
provisioning a cluster only to have Hadoop refuse to launch.
"""
try:
if self.fs.exists(self._output_dir):
raise IOError(
'Output path %s already exists!' % (self._output_dir,))
except botocore.exceptions.ClientError:
pass
def _add_bootstrap_files_for_upload(self, persistent=False):
"""Add files needed by the bootstrap script to self._upload_mgr.
Create the master bootstrap script if necessary.
persistent -- set by make_persistent_cluster()
"""
# all other files needed by the script are already in
# _bootstrap_dir_mgr
for path in self._bootstrap_dir_mgr.paths():
self._upload_mgr.add(path)
# now that we know where the above files live, we can create
# the master bootstrap script
self._create_master_bootstrap_script_if_needed()
if self._master_bootstrap_script_path:
self._upload_mgr.add(self._master_bootstrap_script_path)
# make sure bootstrap action scripts are on S3
for bootstrap_action in self._bootstrap_actions():
self._upload_mgr.add(bootstrap_action['path'])
# Add max-mins-idle script if we need it
if persistent or self._opts['pool_clusters']:
self._upload_mgr.add(_MAX_MINS_IDLE_BOOTSTRAP_ACTION_PATH)
def _add_master_node_setup_files_for_upload(self):
"""Add files necessary for the master node setup script to
self._master_node_setup_mgr() and self._upload_mgr().
Create the master node setup script if necessary.
"""
# currently, only used by libjars; see #1336 for how we might open
# this up more generally
for path in self._opts['libjars']:
# passthrough for libjars already on EMR
if path.startswith('file:///'):
continue
self._master_node_setup_mgr.add('file', path)
self._upload_mgr.add(path)
self._create_master_node_setup_script_if_needed()
if self._master_node_setup_script_path:
self._upload_mgr.add(self._master_node_setup_script_path)
def _add_job_files_for_upload(self):
"""Add files needed for running the job (setup and input)
to self._upload_mgr."""
for path in self._py_files():
self._upload_mgr.add(path)
if self._opts['hadoop_streaming_jar']:
self._upload_mgr.add(self._opts['hadoop_streaming_jar'])
# upload JARs and (Python) scripts run by steps
for step in self._get_steps():
for key in 'jar', 'script':
if step.get(key):
self._upload_mgr.add(step[key])
def _ssh_add_bin(self):
# the args of the ssh-add binary
return self._opts['ssh_add_bin'] or ['ssh-add']
def _ssh_bin(self):
# the args of the ssh binary
return self._opts['ssh_bin'] or ['ssh']
def _set_up_ssh_tunnel_and_hdfs(self):
if hasattr(self.fs, 'hadoop'):
self.fs.hadoop.set_hadoop_bin(self._ssh_hadoop_bin())
self._set_up_ssh_tunnel()
def _ssh_tunnel_config(self):
"""Look up AMI version, and return a dict with the following keys:
name: "job tracker" or "resource manager"
path: path to start page of job tracker/resource manager
port: port job tracker/resource manager is running on.
"""
return map_version(self.get_image_version(),
_IMAGE_VERSION_TO_SSH_TUNNEL_CONFIG)
def _job_tracker_host(self):
"""The host of the job tracker/resource manager, from the master node.
"""
tunnel_config = self._ssh_tunnel_config()
if tunnel_config['localhost']:
# Issue #1311: on the 2.x AMIs, we want to tunnel to the job
# tracker on localhost; otherwise it won't
# work on some VPC setups.
return 'localhost'
else:
# Issue #1397: on the 3.x and 4.x AMIs we want to tunnel to the
# resource manager on the master node's *internal* IP; otherwise
# it won't work on some VPC setups
return self._master_private_ip()
def _ssh_tunnel_args(self, bind_port):
for opt_name in ('ec2_key_pair', 'ec2_key_pair_file',
'ssh_bind_ports'):
if not self._opts[opt_name]:
log.warning(
" You must set %s in order to set up the SSH tunnel!"
% opt_name)
self._give_up_on_ssh_tunnel = True
return
host = self._address_of_master()
if not host:
return
return self._ssh_bin() + [
'-o', 'VerifyHostKeyDNS=no',
'-o', 'StrictHostKeyChecking=no',
'-o', 'ExitOnForwardFailure=yes',
'-o', 'UserKnownHostsFile=%s' % os.devnull,
] + self._ssh_tunnel_opts(bind_port) + [
'-i', self._opts['ec2_key_pair_file'],
'hadoop@%s' % host,
]
def _ssh_hadoop_bin(self):
if not self._opts['ec2_key_pair_file']:
return []
host = self._address_of_master()
if not host:
return []
return self._ssh_bin() + [
'-o', 'VerifyHostKeyDNS=no',
'-o', 'StrictHostKeyChecking=no',
'-o', 'ExitOnForwardFailure=yes',
'-o', 'UserKnownHostsFile=%s' % os.devnull,
'-i', self._opts['ec2_key_pair_file'],
'-q', # don't care about SSH warnings, we just want hadoop
'hadoop@%s' % host,
'hadoop',
]
def _job_tracker_url(self):
"""Not actually used to set up the SSH tunnel; used to run curl
over SSH to fetch from the job tracker directly."""
tunnel_config = self._ssh_tunnel_config()
return 'http://%s:%d%s' % (
self._job_tracker_host(),
tunnel_config['port'],
tunnel_config['path'])
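# (e.g. on a 4.x+ AMI this is http://<master node's internal IP>:8088/cluster,
# the YARN resource manager)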
### Running the job ###
def cleanup(self, mode=None):
super(EMRJobRunner, self).cleanup(mode=mode)
# always stop our SSH tunnel if it's still running
self._kill_ssh_tunnel()
# stop the cluster if it belongs to us (it may have stopped on its
# own already, but that's fine)
# don't stop it if it was created due to --pool because the user
# probably wants to use it again
if self._cluster_id and not self._opts['cluster_id'] \
and not self._opts['pool_clusters']:
log.info('Terminating cluster: %s' % self._cluster_id)
try:
self.make_emr_client().terminate_job_flows(
JobFlowIds=[self._cluster_id]
)
except Exception as e:
log.exception(e)
# TODO: otherwise, cancel any steps we submitted (#1570)
def _cleanup_cloud_tmp(self):
# delete all the files we created on S3
if self._cloud_tmp_dir:
try:
log.info('Removing s3 temp directory %s...' %
self._cloud_tmp_dir)
self.fs.rm(self._cloud_tmp_dir)
self._cloud_tmp_dir = None
except Exception as e:
log.exception(e)
def _cleanup_logs(self):
super(EMRJobRunner, self)._cleanup_logs()
# delete the log files, if it's a cluster we created (the logs
# belong to the cluster)
if self._s3_log_dir() and not self._opts['cluster_id'] \
and not self._opts['pool_clusters']:
try:
log.info('Removing log files in %s...' % self._s3_log_dir())
self.fs.rm(self._s3_log_dir())
except Exception as e:
log.exception(e)
def _cleanup_cluster(self):
if not self._cluster_id:
# If we don't have a cluster, then we can't terminate it.
return
emr_client = self.make_emr_client()
try:
log.info("Attempting to terminate cluster")
emr_client.terminate_job_flows(
JobFlowIds=[self._cluster_id]
)
except Exception as e:
# Something happened with boto3 and the user should know.
log.exception(e)
return
log.info('Cluster %s successfully terminated' % self._cluster_id)
def _wait_for_s3_eventual_consistency(self):
"""Sleep for a little while, to give S3 a chance to sync up.
"""
log.debug('Waiting %.1fs for S3 eventual consistency...' %
self._opts['cloud_fs_sync_secs'])
time.sleep(self._opts['cloud_fs_sync_secs'])
def _wait_for_cluster_to_terminate(self, cluster=None):
if not cluster:
cluster = self._describe_cluster()
log.info('Waiting for cluster (%s) to terminate...' %
cluster['Id'])
if (cluster['Status']['State'] == 'WAITING' and
cluster['AutoTerminate']):
raise Exception('Operation requires cluster to terminate, but'
' it may never do so.')
while True:
log.info(' %s' % cluster['Status']['State'])
if cluster['Status']['State'] in (
'TERMINATED', 'TERMINATED_WITH_ERRORS'):
return
time.sleep(self._opts['check_cluster_every'])
cluster = self._describe_cluster()
# instance types
def _instance_type(self, role):
"""What instance type should we use for the given role?
(one of 'MASTER', 'CORE', 'TASK')"""
if role not in _INSTANCE_ROLES:
raise ValueError
# explicitly set
if self._opts[role.lower() + '_instance_type']:
return self._opts[role.lower() + '_instance_type']
elif self._instance_is_worker(role):
# using *instance_type* here is defensive programming;
# if set, it should have already been popped into the worker
# instance type option(s) by _fix_instance_opts() above
return self._opts['instance_type'] or self._default_instance_type()
else:
return self._default_instance_type()
def _default_instance_type(self):
"""Default instance type if not set by the user."""
# m5.xlarge is available in all regions, but only works in AMI 5.13.0
# or later. See #2098.
if self._image_version_gte('5.13.0'):
return 'm5.xlarge'
else:
return 'm4.large'
def _instance_is_worker(self, role):
"""Do instances of the given role run tasks? True for non-master
instances, and for the master instance when it's the only instance."""
if role not in _INSTANCE_ROLES:
raise ValueError
return (role != 'MASTER' or
sum(self._num_instances(role)
for role in _INSTANCE_ROLES) == 1)
def _num_instances(self, role):
"""How many of the given instance type do we want?"""
if role not in _INSTANCE_ROLES:
raise ValueError
if role == 'MASTER':
return 1 # there can be only one
else:
return self._opts['num_' + role.lower() + '_instances']
def _instance_bid_price(self, role):
"""What's the bid price for the given role (if any)?"""
if role not in _INSTANCE_ROLES:
raise ValueError
return self._opts[role.lower() + '_instance_bid_price']
def _instance_groups(self):
"""Which instance groups do we want to request?
Returns the value of the ``InstanceGroups`` parameter
passed to the EMR API.
"""
if self._opts['instance_groups']:
return self._opts['instance_groups']
return [
_build_instance_group(
role=role,
instance_type=self._instance_type(role),
num_instances=self._num_instances(role),
bid_price=self._instance_bid_price(role),
)
for role in _INSTANCE_ROLES
if self._num_instances(role)
]
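# (a minimal sketch of the default request, assuming _build_instance_group()
# emits standard EMR InstanceGroupConfig fields: with no core/task instances
# this would be roughly
# [{'InstanceRole': 'MASTER', 'InstanceCount': 1,
#   'InstanceType': 'm5.xlarge', 'Market': 'ON_DEMAND'}]
# -- the exact dict comes from _build_instance_group(), defined later in
# this module)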
def _create_cluster(self, persistent=False):
"""Create an empty cluster on EMR, and return the ID of that
cluster.
If the ``tags`` option is set, also tags the cluster (which
is a separate API call).
persistent -- if this is true, create the cluster with the keep_alive
option, indicating the cluster will have to be manually terminated.
"""
log.debug('Creating Elastic MapReduce cluster')
emr_client = self.make_emr_client()
kwargs = self._cluster_kwargs(persistent)
log.debug('Calling run_job_flow(%s)' % (
', '.join('%s=%r' % (k, v)
for k, v in sorted(kwargs.items()))))
cluster_id = emr_client.run_job_flow(**kwargs)['JobFlowId']
log.info('Created new cluster %s' % cluster_id)
# set EMR tags for the cluster
tags = dict(self._opts['tags'])
# patch in version
tags['__mrjob_version'] = mrjob.__version__
# patch in cluster label and owner
tags['__mrjob_label'] = self._label()
tags['__mrjob_owner'] = self._owner()
# add pooling tags
if self._opts['pool_clusters']:
tags['__mrjob_pool_hash'] = self._pool_hash()
tags['__mrjob_pool_name'] = self._opts['pool_name']
self._add_tags(tags, cluster_id)
return cluster_id
def _add_tags(self, tags, cluster_id):
"""Add tags in the dict *tags* to cluster *cluster_id*. Do nothing
if *tags* is empty or ``None``"""
if not tags:
return
tags_items = sorted(tags.items())
self.make_emr_client().add_tags(
ResourceId=cluster_id,
Tags=[dict(Key=k, Value=v) for k, v in tags_items])
log.info('Added EMR tags to cluster %s: %s' % (
cluster_id,
', '.join('%s=%s' % (tag, value) for tag, value in tags_items)))
# TODO: could break this into sub-methods for clarity
def _cluster_kwargs(self, persistent=False):
"""Build kwargs for emr_client.run_job_flow()"""
kwargs = {}
kwargs['Name'] = self._job_key + self._cluster_name_pooling_suffix()
kwargs['LogUri'] = self._opts['cloud_log_dir']
if self._opts['release_label']:
kwargs['ReleaseLabel'] = self._opts['release_label']
else:
kwargs['AmiVersion'] = self._opts['image_version']
if self._opts['image_id']:
kwargs['CustomAmiId'] = self._opts['image_id']
# capitalizing Instances because it's just an API parameter
kwargs['Instances'] = Instances = {}
if self._opts['zone']:
Instances['Placement'] = dict(AvailabilityZone=self._opts['zone'])
if self._opts['instance_fleets']:
Instances['InstanceFleets'] = self._opts['instance_fleets']
else:
Instances['InstanceGroups'] = self._instance_groups()
# EBS Root volume size
if self._opts['ebs_root_volume_gb']:
kwargs['EbsRootVolumeSize'] = self._opts['ebs_root_volume_gb']
# bootstrap actions
kwargs['BootstrapActions'] = BootstrapActions = []
for i, bootstrap_action in enumerate(self._bootstrap_actions()):
uri = self._upload_mgr.uri(bootstrap_action['path'])
BootstrapActions.append(dict(
Name=('action %d' % i),
ScriptBootstrapAction=dict(
Path=uri,
Args=bootstrap_action['args'])))
if self._master_bootstrap_script_path:
uri = self._upload_mgr.uri(self._master_bootstrap_script_path)
BootstrapActions.append(dict(
Name='master',
ScriptBootstrapAction=dict(
Path=uri,
Args=[])))
if persistent or self._opts['pool_clusters']:
Instances['KeepJobFlowAliveWhenNoSteps'] = True
# use idle termination script on persistent clusters
# add it last, so that we don't count bootstrapping as idle time
uri = self._upload_mgr.uri(
_MAX_MINS_IDLE_BOOTSTRAP_ACTION_PATH)
# script takes args in (integer) seconds
ba_args = [str(int(self._opts['max_mins_idle'] * 60))]
BootstrapActions.append(dict(
Name='idle timeout',
ScriptBootstrapAction=dict(
Path=uri,
Args=ba_args)))
if self._opts['ec2_key_pair']:
Instances['Ec2KeyName'] = self._opts['ec2_key_pair']
kwargs['Steps'] = Steps = []
kwargs['StepConcurrencyLevel'] = self._opts['max_concurrent_steps']
if self._opts['enable_emr_debugging']:
# other steps are added separately
Steps.append(self._build_debugging_step())
if self._opts['additional_emr_info']:
kwargs['AdditionalInfo'] = self._opts['additional_emr_info']
kwargs['VisibleToAllUsers'] = True
kwargs['JobFlowRole'] = self._instance_profile()
kwargs['ServiceRole'] = self._service_role()
applications = self._applications()
if applications:
kwargs['Applications'] = [
dict(Name=a) for a in sorted(applications)]
emr_configurations = self._emr_configurations()
if emr_configurations:
kwargs['Configurations'] = emr_configurations
if self._opts['subnet']:
# handle lists of subnets (for instance fleets)
if isinstance(self._opts['subnet'], list):
Instances['Ec2SubnetIds'] = self._opts['subnet']
else:
Instances['Ec2SubnetId'] = self._opts['subnet']
return self._add_extra_cluster_params(kwargs)
def _instance_profile(self):
try:
return (self._opts['iam_instance_profile'] or
get_or_create_mrjob_instance_profile(
self.make_iam_client()))
except botocore.exceptions.ClientError as ex:
if _client_error_status(ex) != 403:
raise
log.warning(
"Can't access IAM API, trying default instance profile: %s" %
_FALLBACK_INSTANCE_PROFILE)
return _FALLBACK_INSTANCE_PROFILE
def _service_role(self):
try:
return (self._opts['iam_service_role'] or
get_or_create_mrjob_service_role(self.make_iam_client()))
except botocore.exceptions.ClientError as ex:
if _client_error_status(ex) != 403:
raise
log.warning(
"Can't access IAM API, trying default service role: %s" %
_FALLBACK_SERVICE_ROLE)
return _FALLBACK_SERVICE_ROLE
def _action_on_failure(self):
# don't terminate other people's clusters
if (self._opts['emr_action_on_failure']):
return self._opts['emr_action_on_failure']
elif not self._add_steps_in_batch():
# clusters running concurrent steps don't allow CANCEL_AND_WAIT
return 'CONTINUE'
elif (self._opts['cluster_id'] or
self._opts['pool_clusters']):
return 'CANCEL_AND_WAIT'
else:
return 'TERMINATE_CLUSTER'
def _add_steps_in_batch(self):
if self._opts['add_steps_in_batch'] is None:
# by default, add steps in batch only when concurrent steps
# are not possible
return not self._image_version_gte(_MIN_STEP_CONCURRENCY_AMI_VERSION)
else:
return self._opts['add_steps_in_batch']
def _steps_to_submit(self):
"""Return a list of step data structures to pass to ``boto3``"""
# quick, add the other steps before the job spins up and
# then shuts itself down! (in practice that won't happen
# for several minutes)
steps = []
if self._master_node_setup_script_path:
steps.append(self._build_master_node_setup_step())
for n in range(self._num_steps()):
steps.append(self._build_step(n))
return steps
def _build_step(self, step_num):
step = self._get_step(step_num)
if step['type'] == 'streaming':
method = self._streaming_step_hadoop_jar_step
elif step['type'] == 'jar':
method = self._jar_step_hadoop_jar_step
elif _is_spark_step_type(step['type']):
method = self._spark_step_hadoop_jar_step
else:
raise ValueError('Bad step type: %r' % (step['type'],))
hadoop_jar_step = method(step_num)
return dict(
ActionOnFailure=self._action_on_failure(),
HadoopJarStep=hadoop_jar_step,
Name=self._step_name(step_num),
)
def _streaming_step_hadoop_jar_step(self, step_num):
jar, step_arg_prefix = self._get_streaming_jar_and_step_arg_prefix()
args = (step_arg_prefix +
self._hadoop_streaming_jar_args(step_num))
return dict(Jar=jar, Args=args)
def _jar_step_hadoop_jar_step(self, step_num):
step = self._get_step(step_num)
jar = self._upload_uri_or_remote_path(step['jar'])
args = (
self._interpolate_jar_step_args(step['args'], step_num))
hadoop_jar_step = dict(Jar=jar, Args=args)
if step.get('main_class'):
hadoop_jar_step['MainClass'] = step['main_class']
return hadoop_jar_step
def _spark_step_hadoop_jar_step(self, step_num):
return dict(
Jar=self._spark_jar(),
Args=self._args_for_spark_step(step_num))
def _interpolate_spark_script_path(self, path):
if path in self._working_dir_mgr.paths():
return self._dest_in_wd_mirror(
path, self._working_dir_mgr.name('file', path)) or path
else:
return self._upload_mgr.uri(path)
def _find_spark_submit_bin(self):
if version_gte(self.get_image_version(), '4'):
return ['spark-submit']
else:
return [_3_X_SPARK_SUBMIT]
def _spark_master(self):
# hard-coded for EMR
return 'yarn'
def _spark_deploy_mode(self):
# hard-coded for EMR; otherwise it can't access S3
return 'cluster'
def _spark_jar(self):
if version_gte(self.get_image_version(), '4'):
return _4_X_COMMAND_RUNNER_JAR
else:
return self._script_runner_jar_uri()
def _step_name(self, step_num):
"""Return something like: ``'mr_your_job Step X of Y'``"""
return '%s: Step %d of %d' % (
self._job_key, step_num + 1, self._num_steps())
def _upload_uri_or_remote_path(self, path):
"""Return where *path* will be uploaded, or, if it starts with
``'file:///'``, a local path."""
if path.startswith('file:///'):
return path[7:] # keep leading slash
else:
return self._upload_mgr.uri(path)
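# (e.g. 'file:///home/hadoop/foo.jar' -> '/home/hadoop/foo.jar', since
# 'file://' is 7 characters; anything else maps to its S3 upload URI)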
def _build_master_node_setup_step(self):
name = '%s: Master node setup' % self._job_key
jar = self._script_runner_jar_uri()
step_args = [self._upload_mgr.uri(self._master_node_setup_script_path)]
return dict(
Name=name,
ActionOnFailure=self._action_on_failure(),
HadoopJarStep=dict(
Jar=jar,
Args=step_args,
)
)
def _libjar_paths(self):
results = []
# libjars should be in the working dir of the master node setup
# script path, unless they refer to paths directly (file:///)
for path in self._opts['libjars']:
if path.startswith('file:///'):
results.append(path[7:]) # keep leading slash
else:
results.append(posixpath.join(
self._master_node_setup_working_dir(),
self._master_node_setup_mgr.name('file', path)))
return results
def _get_streaming_jar_and_step_arg_prefix(self):
if self._opts['hadoop_streaming_jar']:
if self._opts['hadoop_streaming_jar'].startswith('file://'):
# special case: jar is already on EMR
# relative paths are OK (though maybe not useful)
return self._opts['hadoop_streaming_jar'][7:], []
else:
return self._upload_mgr.uri(
self._opts['hadoop_streaming_jar']), []
elif version_gte(self.get_image_version(), '4'):
# 4.x AMIs use an intermediary jar
return _4_X_COMMAND_RUNNER_JAR, ['hadoop-streaming']
else:
# 2.x and 3.x AMIs just use a regular old streaming jar
return _PRE_4_X_STREAMING_JAR, []
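# (summary: a custom hadoop_streaming_jar is used as-is, either as a local
# path on the cluster or via its S3 upload URI; 4.x+ AMIs use
# command-runner.jar with a 'hadoop-streaming' arg prefix; 2.x/3.x AMIs
# use the pre-4.x streaming jar path)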
def _launch_emr_job(self):
"""Create an empty cluster on EMR, and set self._cluster_id to
its ID.
"""
# step concurrency level of a cluster we added steps to, used
# for locking
step_concurrency_level = None
# try to find a cluster from the pool. basically auto-fill
# 'cluster_id' if possible and then follow normal behavior.
if (self._opts['pool_clusters'] and not self._cluster_id):
cluster_id, step_concurrency_level = self._find_cluster()
if cluster_id:
self._cluster_id = cluster_id
self._locked_cluster = True
# create a cluster if we're not already using an existing one
if not self._cluster_id:
self._cluster_id = self._create_cluster()
self._created_cluster = True
else:
log.info('Adding our job to existing cluster %s' %
self._cluster_id)
self._log_address_of_master_once()
# now that we know which cluster it is, check for Spark support
if self._has_spark_steps():
self._check_cluster_spark_support()
# define our steps
steps = self._steps_to_submit()
if self._add_steps_in_batch():
self._add_steps_to_cluster(steps)
else:
# later steps will be added one at a time
self._add_steps_to_cluster(steps[:1])
# if we locked a cluster with concurrent steps, we can release
# the lock immediately
if step_concurrency_level and step_concurrency_level > 1:
self._release_cluster_lock()
# log the cluster's current state (useful for seeing how quickly it changes)
cluster = self._describe_cluster()
log.debug('Cluster has state %s' % cluster['Status']['State'])
# SSH FS uses sudo if we're on AMI 4.3.0+ (see #1244)
if hasattr(self.fs, 'ssh') and version_gte(
self.get_image_version(), '4.3.0'):
self.fs.ssh.use_sudo_over_ssh()
def _release_cluster_lock(self):
if not self._locked_cluster:
return
emr_client = self.make_emr_client()
log.info(' releasing cluster lock')
# this can fail, but usually it's because the cluster
# started terminating, so only try releasing the lock once
_attempt_to_unlock_cluster(emr_client, self._cluster_id)
self._locked_cluster = False
def _add_steps_to_cluster(self, steps):
"""Add steps (from _steps_to_submit()) to our cluster and append their
IDs to self._step_ids"""
emr_client = self.make_emr_client()
steps_kwargs = dict(JobFlowId=self._cluster_id, Steps=steps)
log.debug('Calling add_job_flow_steps(%s)' % ','.join(
('%s=%r' % (k, v)) for k, v in steps_kwargs.items()))
step_ids = emr_client.add_job_flow_steps(**steps_kwargs)['StepIds']
self._step_ids.extend(step_ids)
def get_job_steps(self):
"""Fetch the steps submitted by this runner from the EMR API.
.. deprecated:: 0.7.4
.. versionadded:: 0.6.1
"""
log.warning(
'get_job_steps() is deprecated and will be removed in v0.8.0')
return _get_job_steps(
self.make_emr_client(), self.get_cluster_id(), self.get_job_key())
def _wait_for_steps_to_complete(self):
"""Wait for every step of the job to complete, one by one."""
steps = self._steps_to_submit()
# clear out log interpretations if they were filled somehow
self._log_interpretations = []
self._mns_log_interpretation = None
# open SSH tunnel if cluster is already ready
# (this happens with pooling). See #1115
cluster = self._describe_cluster()
if cluster['Status']['State'] in ('RUNNING', 'WAITING'):
self._set_up_ssh_tunnel_and_hdfs()
for i, step in enumerate(steps):
# if our step isn't already submitted, submit it
if len(self._step_ids) <= i:
self._add_steps_to_cluster(
steps[len(self._step_ids):i + 1])
step_id = self._step_ids[i]
step_name = step['Name'].split(': ')[-1]
# the master node setup script is treated as step -1
if self._master_node_setup_script_path:
step_num = i - 1
else:
step_num = i
log.info('Waiting for %s (%s) to complete...' %
(step_name, step_id))
self._wait_for_step_to_complete(step_id, step_num)
def _wait_for_step_to_complete(self, step_id, step_num=None):
"""Helper for _wait_for_steps_to_complete(). Wait for
step with the given ID to complete, and fetch counters.
If it fails, attempt to diagnose the error, and raise an
exception.
:param step_id: the s-XXXXXXX step ID on EMR
:param step_num: which step this is out of the steps
belonging to our job (0-indexed). Master node
setup script, if there is one, is step -1
This also adds an item to self._log_interpretations or sets
self._mns_log_interpretation
"""
log_interpretation = dict(step_id=step_id)
# suppress warnings about missing job ID for script-runner.jar
if step_num == -1:
log_interpretation['no_job'] = True
self._mns_log_interpretation = log_interpretation
else:
self._log_interpretations.append(log_interpretation)
emr_client = self.make_emr_client()
while True:
# don't antagonize EMR's throttling
log.debug('Waiting %.1f seconds...' %
self._opts['check_cluster_every'])
time.sleep(self._opts['check_cluster_every'])
# log address of the master node once if we have it
self._log_address_of_master_once()
step = emr_client.describe_step(
ClusterId=self._cluster_id, StepId=step_id)['Step']
if step['Status']['State'] == 'PENDING':
cluster = self._describe_cluster()
reason = _get_reason(cluster)
reason_desc = (': %s' % reason) if reason else ''
# we can open the ssh tunnel if cluster is ready (see #1115)
if cluster['Status']['State'] in ('RUNNING', 'WAITING'):
self._set_up_ssh_tunnel_and_hdfs()
log.info(' PENDING (cluster is %s%s)' % (
cluster['Status']['State'], reason_desc))
continue
elif step['Status']['State'] == 'RUNNING':
time_running_desc = ''
start = step['Status']['Timeline'].get('StartDateTime')
if start:
time_running_desc = ' for %s' % strip_microseconds(
_boto3_now() - start)
# now is the time to tunnel, if we haven't already
self._set_up_ssh_tunnel_and_hdfs()
log.info(' RUNNING%s' % time_running_desc)
# don't log progress for master node setup step, because
# it doesn't appear in job tracker
if step_num >= 0:
self._log_step_progress()
# it's safe to clean up our lock, cluster isn't WAITING
self._release_cluster_lock()
continue
# we're done, will return at the end of this
elif step['Status']['State'] == 'COMPLETED':
log.info(' COMPLETED')
# will fetch counters, below, and then return
else:
# step has failed somehow. *reason* seems to only be set
# when job is cancelled (e.g. 'Job terminated')
reason = _get_reason(step)
reason_desc = (' (%s)' % reason) if reason else ''
log.info(' %s%s' % (
step['Status']['State'], reason_desc))
# print cluster status; this might give more context on
# why the step didn't succeed
cluster = self._describe_cluster()
reason = _get_reason(cluster)
reason_desc = (': %s' % reason) if reason else ''
log.info('Cluster %s %s %s%s' % (
cluster['Id'],
'was' if 'ED' in cluster['Status']['State'] else 'is',
cluster['Status']['State'],
reason_desc))
if cluster['Status']['State'] in (
'TERMINATING', 'TERMINATED', 'TERMINATED_WITH_ERRORS'):
# was it caused by a pooled cluster self-terminating?
# (if so, raise _PooledClusterSelfTerminatedException)
self._check_for_pooled_cluster_self_termination(
cluster, step)
# was it caused by IAM roles?
self._check_for_missing_default_iam_roles(cluster)
# was it because a bootstrap action failed?
self._check_for_failed_bootstrap_action(cluster)
# spark steps require different log parsing. The master node
# setup script is a JAR step (albeit one that never produces
# counters)
step_type = (
self._get_step(step_num)['type'] if step_num >= 0 else 'jar')
# step is done (either COMPLETED, FAILED, or CANCELLED), so
# try to fetch counters. (Except for master node setup
# and Spark, which have no counters.)
if step['Status']['State'] != 'CANCELLED' and step_num >= 0:
self._log_counters(log_interpretation, step_num)
if step['Status']['State'] == 'COMPLETED':
return
if step['Status']['State'] == 'FAILED':
error = self._pick_error(log_interpretation, step_type)
if error:
_log_probable_cause_of_failure(log, error)
raise StepFailedException(
step_num=step_num, num_steps=self._num_steps(),
# "Step 0 of ... failed" looks weird
step_desc=(
'Master node setup step' if step_num == -1 else None))
def _log_address_of_master_once(self):
"""Log the master node's public DNS, if we haven't already"""
# Some users like to SSH in manually. See #2007
if not self._cluster_id:
return
if self._cluster_id in self._logged_address_of_master:
return
master_dns = self._address_of_master()
if not master_dns:
return
log.info(' master node is %s' % master_dns)
self._logged_address_of_master.add(self._cluster_id)
def _log_step_progress(self):
"""Tunnel to the job tracker/resource manager and log the
progress of the current step.
(This takes no arguments; we just assume the most recent running
job is ours, which should be correct for EMR.)
"""
progress_html = (self._progress_html_from_tunnel() or
self._progress_html_over_ssh())
if not progress_html:
return
tunnel_config = self._ssh_tunnel_config()
if tunnel_config['name'] == 'job tracker':
map_progress, reduce_progress = (
_parse_progress_from_job_tracker(progress_html))
if map_progress is not None:
log.info(' map %3d%% reduce %3d%%' % (
map_progress, reduce_progress))
else:
progress = _parse_progress_from_resource_manager(
progress_html)
if progress is not None:
log.info(' %5.1f%% complete' % progress)
def _progress_html_from_tunnel(self):
"""Fetch progress by calling :py:func:`urlopen` on our ssh tunnel, or
return ``None``."""
if not self._ssh_tunnel_url:
return None
tunnel_config = self._ssh_tunnel_config()
log.debug(' Fetching progress from %s at %s' % (
tunnel_config['name'], self._ssh_tunnel_url))
tunnel_handle = None
try:
tunnel_handle = urlopen(self._ssh_tunnel_url)
return tunnel_handle.read()
except Exception as e:
log.debug(' failed: %s' % str(e))
return None
finally:
if tunnel_handle:
tunnel_handle.close()
def _progress_html_over_ssh(self):
"""Fetch progress by running :command:`curl` over SSH, or return
``None``"""
host = self._address_of_master()
if not self._opts['ec2_key_pair_file']:
return None
if not host:
return None
tunnel_config = self._ssh_tunnel_config()
remote_url = self._job_tracker_url()
log.debug(' Fetching progress from %s over SSH' % (
tunnel_config['name']))
try:
stdout, _ = self.fs.ssh._ssh_run(host, ['curl', remote_url])
return stdout
except Exception as e:
log.debug(' failed: %s' % str(e))
return None
def _check_for_pooled_cluster_self_termination(self, cluster, step):
"""If failure could have been due to a pooled cluster self-terminating,
raise _PooledClusterSelfTerminatedException"""
# this check might not even be relevant
if not self._opts['pool_clusters']:
return
if self._opts['cluster_id']:
return
# if a cluster we created self-terminated, something is wrong with
# the way self-termination is set up (e.g. very low idle time)
if self._created_cluster:
return
# don't check for max_mins_idle because it's possible to
# join a self-terminating cluster without having max_mins_idle set
# on this runner (pooling only cares about the master bootstrap script,
# not other bootstrap actions)
# our step should be CANCELLED (not failed)
if step['Status']['State'] != 'CANCELLED':
return
# we *could* check if the step had a chance to start by checking if
# step.status.timeline.startdatetime is set. This shouldn't happen in
# practice, and if it did, we'd still be fine as long as the script
# didn't write data to the output dir, so it's not worth the extra
# code.
# cluster should have stopped because master node failed
# could also check for
# cluster.status.statechangereason.code == 'INSTANCE_FAILURE'
if not _CLUSTER_SELF_TERMINATED_RE.match(_get_reason(cluster)):
return
log.info('Pooled cluster self-terminated, trying again...')
raise _PooledClusterSelfTerminatedException
def _check_for_missing_default_iam_roles(self, cluster):
"""If cluster couldn't start due to missing IAM roles, tell
user what to do."""
if not cluster:
cluster = self._describe_cluster()
reason = _get_reason(cluster)
if any(reason.endswith('/%s is invalid' % role)
for role in (_FALLBACK_INSTANCE_PROFILE,
_FALLBACK_SERVICE_ROLE)):
log.warning(
'\n'
'Ask your admin to create the default EMR roles'
' by following:\n\n'
' http://docs.aws.amazon.com/ElasticMapReduce/latest'
'/DeveloperGuide/emr-iam-roles-creatingroles.html\n')
def _default_step_output_dir(self):
# put intermediate data in HDFS
return 'hdfs:///tmp/mrjob/%s/step-output' % self._job_key
### LOG PARSING (implementation of LogInterpretationMixin) ###
def _check_for_failed_bootstrap_action(self, cluster):
"""If our bootstrap actions failed, parse the stderr to find
out why."""
reason = _get_reason(cluster)
action_num_and_node_id = _check_for_nonzero_return_code(reason)
if not action_num_and_node_id:
return
if not self._read_logs():
return
# this doesn't really correspond to a step, so
# don't bother storing it in self._log_interpretations
bootstrap_interpretation = _interpret_emr_bootstrap_stderr(
self.fs, self._ls_bootstrap_stderr_logs(**action_num_and_node_id))
# should be 0 or 1 errors, since we're checking a single stderr file
if bootstrap_interpretation.get('errors'):
error = bootstrap_interpretation['errors'][0]
_log_probable_cause_of_failure(log, error)
def _ls_bootstrap_stderr_logs(self, action_num=None, node_id=None):
"""Wrapper for _ls_emr_bootstrap_stderr_logs(), with logging for each
log we parse."""
if not self._read_logs():
return
for match in _ls_emr_bootstrap_stderr_logs(
self.fs,
self._stream_bootstrap_log_dirs(
action_num=action_num, node_id=node_id),
action_num=action_num,
node_id=node_id):
log.info(' Parsing bootstrap stderr log: %s' % match['path'])
yield match
def _stream_bootstrap_log_dirs(self, action_num=None, node_id=None):
"""Stream a single directory on S3 containing the relevant bootstrap
stderr. Optionally, use *action_num* and *node_id* to narrow it down
further.
"""
if action_num is None or node_id is None:
s3_dir_name = 'node'
else:
s3_dir_name = posixpath.join(
'node', node_id, 'bootstrap-actions', str(action_num + 1))
# dir_name=None means don't try to SSH in.
#
# TODO: If the failure is on the master node, we could just look in
# /mnt/var/log/bootstrap-actions. However, if it's on a worker node,
# we'd have to look up its internal IP using the ListInstances
# API call. This *would* be a bit faster though. See #1346.
return self._stream_log_dirs(
'bootstrap logs',
dir_name=None, # don't SSH in
s3_dir_name=s3_dir_name)
def _stream_history_log_dirs(self, output_dir=None):
"""Yield lists of directories to look for the history log in."""
if version_gte(self.get_image_version(), '4'):
hdfs_dir_name = 'history'
# on 4.0.0 (and possibly other versions before 4.3.0)
# history logs aren't on the filesystem. See #1253
dir_name = 'hadoop-mapreduce/history'
s3_dir_name = 'hadoop-mapreduce/history'
elif version_gte(self.get_image_version(), '3'):
# on the 3.x AMIs, the history log is on HDFS only
# (not even S3)
hdfs_dir_name = 'history'
dir_name = None
s3_dir_name = None
else:
# 2.x AMIs don't use YARN, so no point in checking HDFS
hdfs_dir_name = None
dir_name = 'hadoop/history'
s3_dir_name = 'jobs'
return self._stream_log_dirs(
'history log',
hdfs_dir_name=hdfs_dir_name,
dir_name=dir_name,
s3_dir_name=s3_dir_name)
def _stream_task_log_dirs(self, application_id=None, output_dir=None):
"""Get lists of directories to look for the task logs in."""
if version_gte(self.get_image_version(), '4'):
# denied access on some 4.x AMIs by the yarn user, see #1244
dir_name = 'hadoop-yarn/containers'
s3_dir_name = 'containers'
else:
dir_name = 'hadoop/userlogs'
s3_dir_name = 'task-attempts'
if application_id:
dir_name = posixpath.join(dir_name, application_id)
s3_dir_name = posixpath.join(s3_dir_name, application_id)
return self._stream_log_dirs(
'task logs',
dir_name=dir_name,
s3_dir_name=s3_dir_name,
ssh_to_workers=True) # TODO: does this make sense on YARN?
def _get_step_log_interpretation(self, log_interpretation, step_type):
"""Fetch and interpret the step log."""
step_id = log_interpretation.get('step_id')
if not self._read_logs():
return
if not step_id:
log.warning("Can't fetch step log; missing step ID")
return
if self._step_type_uses_spark(step_type):
# Spark also has a "controller" log4j log, but it doesn't
# contain errors or anything else we need
#
# the step log is unlikely to be very much help because
# Spark on EMR runs in cluster mode. See #2056
#
# there's generally only one log (unless the job has been running
# long enough for log rotation), so use partial=False
return _interpret_spark_logs(
self.fs, self._ls_step_stderr_logs(step_id=step_id),
partial=False)
else:
return (
_interpret_emr_step_syslog(
self.fs, self._ls_step_syslogs(step_id=step_id)) or
_interpret_emr_step_stderr(
self.fs, self._ls_step_stderr_logs(step_id=step_id))
)
# _ls_step_*() methods are just helpers for _get_step_log_interpretation,
# so not disabling them if self._read_logs() is false
def _ls_step_syslogs(self, step_id):
"""Yield step log matches, logging a message for each one."""
for match in _ls_emr_step_syslogs(
self.fs, self._stream_step_log_dirs(step_id=step_id),
step_id=step_id):
log.info(' Parsing step log: %s' % match['path'])
yield match
def _ls_step_stderr_logs(self, step_id):
"""Yield step log matches, logging a message for each one."""
for match in _ls_emr_step_stderr_logs(
self.fs, self._stream_step_log_dirs(step_id=step_id),
step_id=step_id):
log.info(' Parsing step log: %s' % match['path'])
yield match
def _stream_step_log_dirs(self, step_id):
"""Get lists of directories to look for the step log in."""
return self._stream_log_dirs(
'step log',
dir_name=posixpath.join('hadoop', 'steps', step_id),
s3_dir_name=posixpath.join('steps', step_id))
def _stream_log_dirs(self, log_desc, dir_name, s3_dir_name,
hdfs_dir_name=None,
ssh_to_workers=False):
"""Stream log dirs for any kind of log.
Our general strategy is first, if SSH is enabled, to SSH into the
master node (and possibly workers, if *ssh_to_workers* is set).
If this doesn't work, we have to look on S3. If the cluster is
TERMINATING, we first wait for it to terminate (since that
will trigger copying logs over).
"""
if not self._read_logs():
return
# used to fetch history logs off HDFS
if (hdfs_dir_name and
self.fs.can_handle_path(_DEFAULT_YARN_HDFS_LOG_DIR)):
hdfs_log_dir = posixpath.join(
_DEFAULT_YARN_HDFS_LOG_DIR, hdfs_dir_name)
log.info('Looking for %s in %s...' % (log_desc, hdfs_log_dir))
yield [hdfs_log_dir]
if dir_name and self.fs.can_handle_path('ssh:///'):
ssh_host = self._address_of_master()
if ssh_host:
hosts = [ssh_host]
host_desc = ssh_host
if ssh_to_workers:
try:
hosts.extend(self._ssh_worker_hosts())
host_desc += ' and task/core nodes'
except IOError:
log.warning('Could not get worker addresses for %s' %
ssh_host)
path = posixpath.join(_EMR_LOG_DIR, dir_name)
log.info('Looking for %s in %s on %s...' % (
log_desc, path, host_desc))
yield ['ssh://%s%s%s' % (
ssh_host, '!' + host if host != ssh_host else '',
path) for host in hosts]
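# the yielded URIs look like ssh://<master><path> for the master node
# itself, or ssh://<master>!<worker ip><path> for a worker reached
# through the master, e.g. (hypothetical addresses, and assuming
# _EMR_LOG_DIR is the usual /mnt/var/log):
#   ssh://ec2-203-0-113-1.compute-1.amazonaws.com!10.0.0.5/mnt/var/log/hadoop-yarn/containers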
# wait for logs to be on S3
self._wait_for_logs_on_s3()
s3_dir_name = s3_dir_name or dir_name
if s3_dir_name and self._s3_log_dir():
cloud_log_dir = posixpath.join(self._s3_log_dir(), s3_dir_name)
log.info('Looking for %s in %s...' % (log_desc, cloud_log_dir))
yield [cloud_log_dir]
def _ssh_worker_hosts(self):
"""Get the hostnames of all core and task nodes,
that are currently running, so we can SSH to them through the master
nodes and read their logs.
(This currently returns IP addresses rather than full hostnames
because they're shorter.)
"""
emr_client = self.make_emr_client()
instances = _boto3_paginate(
'Instances', emr_client, 'list_instances',
ClusterId=self._cluster_id,
InstanceGroupTypes=['CORE', 'TASK'],
InstanceStates=['RUNNING'])
hosts = []
for instance in instances:
hosts.append(instance['PrivateIpAddress'])
return hosts
def _wait_for_logs_on_s3(self):
"""If the cluster is already terminating, wait for it to terminate,
so that logs will be transferred to S3.
Don't print anything unless cluster is in the TERMINATING state.
"""
cluster = self._describe_cluster()
if cluster['Status']['State'] in (
'TERMINATED', 'TERMINATED_WITH_ERRORS'):
return # already terminated
if cluster['Status']['State'] != 'TERMINATING':
# going to need to wait for logs to get archived to S3
# "step_num" is just a unique ID for the step; using -1
# for master node setup script
if (self._master_node_setup_script_path and
self._mns_log_interpretation is None):
step_num = -1
else:
step_num = len(self._log_interpretations)
# already did this for this step
if step_num in self._waited_for_logs_on_s3:
return
try:
log.info('Waiting %d minutes for logs to transfer to S3...'
' (ctrl-c to skip)' % _S3_LOG_WAIT_MINUTES)
if not self.fs.can_handle_path('ssh:///'):
log.info(
'\n'
'To fetch logs immediately next time, set up SSH.'
' See:\n'
'https://pythonhosted.org/mrjob/guides'
'/emr-quickstart.html#configuring-ssh-credentials\n')
time.sleep(60 * _S3_LOG_WAIT_MINUTES)
except KeyboardInterrupt:
pass
# do this even if they ctrl-c'ed; don't make them do it
# for every log for this step
self._waited_for_logs_on_s3.add(step_num)
return
self._wait_for_cluster_to_terminate()
def counters(self):
# not using self._pick_counters() because we don't want to
# initiate a log fetch
return [_pick_counters(log_interpretation)
for log_interpretation in self._log_interpretations]
### Bootstrapping ###
def _bootstrap_python(self):
"""Return a (possibly empty) list of parsed commands (in the same
format as returned by parse_setup_cmd())."""
if PY2:
# Python 2 and pip are basically already installed everywhere
# (Okay, there's no pip on AMIs prior to 2.4.3, but there's no
# longer an easy way to get it now that apt-get is broken.)
return []
# if bootstrap_python is None, install it for all AMIs up to 4.6.0,
# and warn if it's an AMI before 3.7.0
if self._opts['bootstrap_python'] or (
self._opts['bootstrap_python'] is None and
not self._image_version_gte('4.6.0')):
# we really need at least AMI 3.7.0, but give it a shot anyway
if not self._image_version_gte('3.7.0'):
log.warning(
'bootstrapping Python 3 will probably not work on'
' AMIs prior to 3.7.0. For an alternative, see:'
' https://pythonhosted.org/mrjob/guides/emr-bootstrap'
'-cookbook.html#installing-python-from-source')
return [[
'sudo yum install -y python34 python34-devel python34-pip'
]]
else:
return []
def _should_bootstrap_spark(self):
"""Return *bootstrap_spark* option if set; otherwise return
true if our job has Spark steps."""
if self._opts['bootstrap_spark'] is None:
return self._has_spark_steps()
else:
return bool(self._opts['bootstrap_spark'])
def _applications(self, add_spark=True):
"""Returns applications (*applications* option) as a set. Adds
in ``Hadoop`` and ``Spark`` as needed."""
applications = set(self._opts['applications'])
# release_label implies 4.x AMI and later
if (add_spark and self._should_bootstrap_spark() and
self._opts['release_label']):
# EMR allows us to have both "spark" and "Spark" applications,
# which is probably not what we want
if not self._has_spark_application():
applications.add('Spark')
# patch in "Hadoop" unless applications is empty (e.g. 3.x AMIs)
if applications:
# don't add both "Hadoop" and "hadoop"
if not any(a.lower() == 'hadoop' for a in applications):
applications.add('Hadoop')
return applications
def _bootstrap_actions(self, add_spark=True):
"""Parse *bootstrap_actions* option into dictionaries with
keys *path*, *args*, adding Spark bootstrap action if needed.
(This doesn't handle the master bootstrap script.)
"""
actions = list(self._opts['bootstrap_actions'])
# no release_label implies AMIs prior to 4.x
if (add_spark and self._should_bootstrap_spark() and
not self._opts['release_label']):
# running this action twice apparently breaks Spark's
# ability to output to S3 (see #1367)
if not self._has_spark_install_bootstrap_action():
actions.append(_3_X_SPARK_BOOTSTRAP_ACTION)
results = []
for action in actions:
args = shlex_split(action)
if not args:
raise ValueError('bad bootstrap action: %r' % (action,))
results.append(dict(path=args[0], args=args[1:]))
return results
def _cp_to_local_cmd(self):
"""Command to copy files from the cloud to the local directory."""
if self._opts['release_label']:
# on the 4.x AMIs, hadoop isn't yet installed, so use AWS CLI
return 'aws s3 cp'
else:
# on the 2.x and 3.x AMIs, use hadoop
return 'hadoop fs -copyToLocal'
def _manifest_download_commands(self):
return [
('s3://*', 'aws s3 cp'),
('*://*', 'hadoop fs -copyToLocal'),
]
### master node setup script ###
def _create_master_node_setup_script_if_needed(self):
"""Helper for :py:meth:`_add_bootstrap_files_for_upload`.
If we need a master node setup script, write it into our local
temp directory and set self._master_node_setup_script_path.
"""
# already created
if self._master_node_setup_script_path:
return
# currently, the only thing this script does is upload files
if not self._master_node_setup_mgr.paths():
return
# create script
path = os.path.join(self._get_local_tmp_dir(), 'mns.sh')
contents = self._master_node_setup_script_content()
self._write_script(contents, path, 'master node setup script')
# the script itself doesn't need to be on the master node, just S3
self._master_node_setup_script_path = path
self._upload_mgr.add(path)
def _master_node_setup_script_content(self):
"""Create the contents of the master node setup script as an
array of strings.
(prepare self._master_node_setup_mgr first)
"""
# TODO: this is very similar to _master_bootstrap_script_content();
# merge common code
out = []
# shebang, etc.
for line in self._start_of_sh_script():
out.append(line)
out.append('')
# run commands in a block so we can redirect stdout to stderr
# (e.g. to catch errors from compileall). See #370
out.append('{')
# make working dir
working_dir = self._master_node_setup_working_dir()
out.append(' mkdir -p %s' % pipes.quote(working_dir))
out.append(' cd %s' % pipes.quote(working_dir))
out.append('')
for name, path in sorted(
self._master_node_setup_mgr.name_to_path('file').items()):
uri = self._upload_mgr.uri(path)
out.append(' %s %s %s' % (
self._cp_to_local_cmd(), pipes.quote(uri), pipes.quote(name)))
# imitate Hadoop Distributed Cache
out.append(' chmod u+rx %s' % pipes.quote(name))
# at some point we will probably run commands as well (see #1336)
out.append('} 1>&2') # stdout -> stderr for ease of error log parsing
return out
def _master_node_setup_working_dir(self):
"""Where to place files used by the master node setup script."""
return '/home/hadoop/%s' % self._job_key
def _script_runner_jar_uri(self):
return (
's3://%s.elasticmapreduce/libs/script-runner/script-runner.jar' %
self._opts['region'])
def _build_debugging_step(self):
if self._opts['release_label']:
jar = _4_X_COMMAND_RUNNER_JAR
args = ['state-pusher-script']
else:
jar = self._script_runner_jar_uri()
args = [self._debug_script_uri()]
return dict(
Name='Setup Hadoop Debugging',
HadoopJarStep=dict(Jar=jar, Args=args),
)
def _debug_script_uri(self):
return (
's3://%s.elasticmapreduce/libs/state-pusher/0.1/fetch' %
self._opts['region'])
### EMR JOB MANAGEMENT UTILS ###
def make_persistent_cluster(self):
if self._cluster_id:
raise ValueError(
'This runner is already associated with cluster ID %s' %
(self._cluster_id))
log.info('Creating persistent cluster to run several jobs in...')
self._add_bootstrap_files_for_upload(persistent=True)
self._upload_local_files()
# make sure we can see the files we copied to S3
self._wait_for_s3_eventual_consistency()
# don't allow user to call run()
self._ran_job = True
self._cluster_id = self._create_cluster(persistent=True)
return self._cluster_id
def get_cluster_id(self):
"""Get the ID of the cluster our job is running on, or ``None``."""
return self._cluster_id
def _yield_clusters_to_join(self, available_cluster_ids):
"""Get a list of IDs of pooled clusters that this runner can join,
sorted so that the ones with the greatest CPU capacity come first.
yields (cluster, when_cluster_described) so we can lock clusters that
we wish to join (*cluster* is the cluster description and
*when_cluster_described* is a unix timestamp).
"""
emr_client = self.make_emr_client()
for cluster_id in available_cluster_ids:
if not self._cluster_has_adequate_capacity(cluster_id):
continue
# check other things about the cluster that we can't hash
# (DescribeCluster)
#
# save cluster description so we can use it for locking
when_cluster_described = time.time()
cluster = emr_client.describe_cluster(
ClusterId=cluster_id)['Cluster']
if not self._cluster_description_matches(cluster):
continue
yield (cluster, when_cluster_described)
def _list_cluster_ids_for_pooling(self, created_after=None):
"""Call ListClusters, and collect cluster IDs relevant to pooling.
Optionally, only list clusters created after *created_after*.
Returns a dictionary with the following keys:
available: a list of IDs of clusters that we could join, based on their
state and name suffix (pool name and hash, mrjob version).
Sorted so that the cluster with the most CPU (based on
NormalizedInstanceHours) goes first
matching: a set of IDs of clusters that have the right name suffix but
may or may not be in the right state to join (a superset
of *available*)
in_pool: a set of IDs of clusters that are in the pool we want to join,
regardless of their state or pool hash (a superset of
*matching*)
max_created: the latest creation timestamp for *any* cluster listed
(so we can call this again to get stats on newly created
clusters only)
"""
# a map from cluster_id to cpu_capacity
available = {}
matching = set()
in_pool = set()
max_created = None
name_to_match = self._opts['pool_name']
suffix_to_match = self._cluster_name_pooling_suffix()
if self._opts['max_concurrent_steps'] > 1:
states_to_match = {'RUNNING', 'WAITING'}
else:
states_to_match = {'WAITING'}
emr_client = self.make_emr_client()
now = _boto3_now()
# you can't pass CreatedAfter=None to list_clusters()
list_cluster_kwargs = dict(ClusterStates=_ACTIVE_CLUSTER_STATES)
if created_after:
list_cluster_kwargs['CreatedAfter'] = created_after
log.debug('calling list_clusters(%s)' % ', '.join(
'%s=%r' % (k, v)
for k, v in sorted(list_cluster_kwargs.items())))
for cluster in _boto3_paginate(
'Clusters', emr_client, 'list_clusters',
**list_cluster_kwargs):
cluster_id = cluster['Id']
log.debug(cluster_id)
created = cluster['Status']['Timeline']['CreationDateTime']
if max_created is None or created > max_created:
max_created = created
name = _parse_cluster_name_suffix(cluster['Name']).get('pool_name')
if name != name_to_match:
continue
in_pool.add(cluster_id)
if not cluster['Name'].endswith(suffix_to_match):
continue
matching.add(cluster_id)
if cluster['Status']['State'] not in states_to_match:
continue
when_ready = cluster['Status']['Timeline'].get('ReadyDateTime')
if when_ready:
hours = max(ceil((now - when_ready).total_seconds() / 3600),
1.0)
cpu_capacity = cluster['NormalizedInstanceHours'] / hours
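# worked example (hypothetical numbers): a cluster that became ready
# 2.5 hours ago with NormalizedInstanceHours=24 gives
# hours = max(ceil(2.5), 1.0) = 3, so cpu_capacity = 24 / 3 = 8.0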
else:
# this probably won't happen, since we only inspect clusters
# in the WAITING state
cpu_capacity = 0
available[cluster_id] = cpu_capacity
# convert *available* from a dict to a sorted list
available = sorted(
available,
key=lambda c: available[c],
reverse=True)
return dict(
available=available,
in_pool=in_pool,
matching=matching,
max_created=max_created,
)
def _cluster_has_adequate_capacity(self, cluster_id):
"""Check if the cluster has an instance group/fleet configuration
that works as well or better.
This either calls ``ListInstanceFleets`` or ``ListInstanceGroups``,
as appropriate
"""
emr_client = self.make_emr_client()
if (self._opts['min_available_mb'] or
self._opts['min_available_virtual_cores']):
cluster = emr_client.describe_cluster(
ClusterId=cluster_id)['Cluster']
host = cluster['MasterPublicDnsName']
try:
log.debug(' querying clusterMetrics from %s' % host)
metrics = self._yrm_get('metrics', host=host)['clusterMetrics']
log.debug(' metrics: %s' %
json.dumps(metrics, sort_keys=True))
except IOError as ex:
log.info(' error while getting metrics for cluster %s: %s' %
(cluster_id, str(ex)))
return False
if metrics['availableMB'] < self._opts['min_available_mb']:
log.info(' too little memory')
return False
if (metrics['availableVirtualCores'] <
self._opts['min_available_virtual_cores']):
log.info(' too few virtual cores')
return False
return True
elif self._opts['instance_fleets']:
try:
fleets = list(_boto3_paginate(
'InstanceFleets', emr_client, 'list_instance_fleets',
ClusterId=cluster_id))
except botocore.exceptions.ClientError:
# this shouldn't usually happen because whether a cluster
# uses instance fleets is in the pool hash
log.debug(' cluster %s: does not use instance fleets' %
cluster_id)
return False
return _instance_fleets_satisfy(
fleets, self._opts['instance_fleets'])
else:
try:
groups = list(_boto3_paginate(
'InstanceGroups', emr_client, 'list_instance_groups',
ClusterId=cluster_id))
except botocore.exceptions.ClientError:
# this shouldn't usually happen because whether a cluster
# uses instance fleets is in the pool hash
log.debug(' cluster %s: does not use instance groups' %
cluster_id)
return False
return _instance_groups_satisfy(groups, self._instance_groups())
def _cluster_description_matches(self, cluster):
"""Do we want to join the cluster with the given description?"""
cluster_id = cluster['Id']
# skip if user specified a key pair and it doesn't match
if (self._opts['ec2_key_pair'] and
self._opts['ec2_key_pair'] !=
cluster['Ec2InstanceAttributes'].get('Ec2KeyName')):
log.debug(' cluster %s: ec2 key pair mismatch' % cluster_id)
return False
# only take persistent clusters
if cluster['AutoTerminate']:
log.debug(' cluster %s: not persistent' % cluster_id)
return False
# EBS root volume size
if self._opts['ebs_root_volume_gb']:
if 'EbsRootVolumeSize' not in cluster:
log.debug(' cluster %s: EBS root volume size not set' %
cluster_id)
return False
elif (cluster['EbsRootVolumeSize'] <
self._opts['ebs_root_volume_gb']):
log.debug(' cluster %s: EBS root volume size too small' %
cluster_id)
return False
else:
if 'EbsRootVolumeSize' in cluster:
log.debug(' cluster %s: uses non-default EBS root volume'
' size' % cluster_id)
return False
# subnet
subnet = cluster['Ec2InstanceAttributes'].get('Ec2SubnetId')
if isinstance(self._opts['subnet'], list):
matches = (subnet in self._opts['subnet'])
else:
# empty subnet is the same as no subnet. see #1931
matches = (subnet == (self._opts['subnet'] or None))
if not matches:
log.debug(' cluster %s: subnet mismatch' % cluster_id)
return False
# step concurrency
step_concurrency = cluster.get('StepConcurrencyLevel', 1)
if step_concurrency > self._opts['max_concurrent_steps']:
log.debug(' cluster %s: step concurrency level too high' %
cluster_id)
return False
return True
def _find_cluster(self):
"""Find a cluster that can host this runner. Prefer clusters with more
compute units. Break ties by choosing cluster with longest idle time.
Return ``None`` if no suitable clusters exist.
"""
emr_client = self.make_emr_client()
start = datetime.now()
wait_mins = self._opts['pool_wait_minutes']
timeout_mins = self._opts['pool_timeout_minutes']
pool_name = self._opts['pool_name']
max_in_pool = self._opts['max_clusters_in_pool']
# like sleep() but also raises PoolTimeoutException if we're going to
# sleep beyond the timeout
def sleep_or_time_out(seconds):
if (timeout_mins and (
datetime.now() + timedelta(seconds=seconds) >
start + timedelta(minutes=timeout_mins))):
raise PoolTimeoutException(
'Unable to join or create a cluster within %d minutes' %
timeout_mins)
time.sleep(seconds)
log.info('Attempting to find an available cluster...')
while True:
cluster_ids = self._list_cluster_ids_for_pooling()
for cluster, when_cluster_described in (
self._yield_clusters_to_join(cluster_ids['available'])):
cluster_id = cluster['Id']
step_concurrency_level = cluster['StepConcurrencyLevel']
log.info(' Attempting to join cluster %s' % cluster_id)
lock_acquired = _attempt_to_lock_cluster(
emr_client, cluster_id, self._job_key,
cluster=cluster,
when_cluster_described=when_cluster_described)
if lock_acquired:
return cluster_id, step_concurrency_level
keep_waiting = (
datetime.now() < start + timedelta(minutes=wait_mins))
# if we haven't exhausted pool_wait_minutes, and there are
# clusters we might eventually join, sleep and try again
if keep_waiting and cluster_ids['matching']:
log.info('No clusters in pool %r are available. Checking again'
' in %d seconds...' % (
pool_name, int(_POOLING_SLEEP_INTERVAL)))
sleep_or_time_out(_POOLING_SLEEP_INTERVAL)
continue
# implement max_clusters_in_pool
if max_in_pool:
num_in_pool = len(cluster_ids['in_pool'])
log.info(' %d cluster%s in pool %r (max. is %d)' % (
num_in_pool, _plural(num_in_pool), pool_name, max_in_pool))
if num_in_pool >= max_in_pool:
log.info('Checking again in %d seconds...' % (
_POOLING_SLEEP_INTERVAL))
sleep_or_time_out(_POOLING_SLEEP_INTERVAL)
continue
# to avoid race conditions, double-check the clusters in the pool
# if we need to satisfy max_clusters_in_pool or are trying to
# bypass pool_wait_minutes
if max_in_pool or (keep_waiting and not cluster_ids['matching']):
jitter_seconds = randint(0, self._opts['pool_jitter_seconds'])
log.info(' waiting %d seconds and double-checking for'
' newly created clusters...' % jitter_seconds)
sleep_or_time_out(jitter_seconds)
new_cluster_ids = self._list_cluster_ids_for_pooling(
created_after=cluster_ids['max_created'])
new_num_in_pool = len(
cluster_ids['in_pool'] | new_cluster_ids['in_pool'])
log.info(' %d cluster%s in pool' % (
new_num_in_pool, _plural(new_num_in_pool)))
if ((not max_in_pool or new_num_in_pool < max_in_pool) and
(not keep_waiting or not new_cluster_ids['matching'])):
# allow creating a new cluster
return None, None
log.info('Checking again in %d seconds...' % (
_POOLING_SLEEP_INTERVAL))
sleep_or_time_out(_POOLING_SLEEP_INTERVAL)
continue
# pool_wait_minutes is exhausted and max_clusters_in_pool is not
# set, so create a new cluster
return None, None
# (defensive programming, in case we break out of the loop)
return None, None
def _pool_hash_dict(self):
"""A dictionary of information that must be matched exactly to
join a pooled cluster (other than mrjob version and pool name).
The format of this dictionary may change between mrjob versions.
"""
# this can be expensive because we have to read every file used in
# bootstrapping, so cache it
if not self._pool_hash_dict_cached:
d = {}
# additional_emr_info
d['additional_emr_info'] = self._opts['additional_emr_info']
# applications
# (these are case-insensitive)
d['applications'] = sorted(a.lower() for a in self._applications())
# bootstrapping
# bootstrap_actions
d['bootstrap_actions'] = self._bootstrap_actions()
# bootstrap_file_md5sums
d['bootstrap_file_md5sums'] = {
name: self.fs.md5sum(path)
for name, path
in self._bootstrap_dir_mgr.name_to_path().items()
if path != self._mrjob_zip_path
}
# bootstrap_without_paths
# original path doesn't matter, just contents (above) and name
d['bootstrap_without_paths'] = [
[
dict(type=x['type'],
name=self._bootstrap_dir_mgr.name(**x))
if isinstance(x, dict) else x
for x in cmd
]
for cmd in self._bootstrap
]
# emr_configurations
d['emr_configurations'] = self._emr_configurations()
# image_id
d['image_id'] = self._opts['image_id']
# instance_collection_type
# no way to compare instance groups with instance fleets
# so make it part of the hash
d['instance_collection_type'] = (
'INSTANCE_FLEET' if self._opts['instance_fleets']
else 'INSTANCE_GROUP'
)
# release_label
# use e.g. emr-2.4.9 for 2.x/3.x AMIs, even though the API wouldn't
d['release_label'] = (self._opts['release_label'] or
'emr-' + self._opts['image_version'])
self._pool_hash_dict_cached = d
return self._pool_hash_dict_cached
def _pool_hash(self):
hash_dict = self._pool_hash_dict()
hash_json = json.dumps(hash_dict, sort_keys=True)
if not isinstance(hash_json, bytes):
hash_json = hash_json.encode('utf_8')
m = hashlib.md5()
m.update(hash_json)
return m.hexdigest()
def _cluster_name_pooling_suffix(self):
"""Extra info added to the cluster name, for pooling."""
if not self._opts['pool_clusters']:
return ''
else:
return _cluster_name_suffix(
self._pool_hash(), self._opts['pool_name'])
### EMR-specific Stuff ###
def make_emr_client(self):
"""Create a :py:mod:`boto3` EMR client.
:return: a :py:class:`botocore.client.EMR` wrapped in a
:py:class:`mrjob.retry.RetryWrapper`
"""
# ...which is then wrapped in bacon! Mmmmm!
if boto3 is None:
raise ImportError('You must install boto3 to connect to EMR')
raw_emr_client = boto3.client(
'emr',
aws_access_key_id=self._opts['aws_access_key_id'],
aws_secret_access_key=self._opts['aws_secret_access_key'],
aws_session_token=self._opts['aws_session_token'],
endpoint_url=_endpoint_url(self._opts['emr_endpoint']),
region_name=self._opts['region'],
)
# #1799: don't retry faster than EMR checks the API
return _wrap_aws_client(raw_emr_client,
min_backoff=self._opts['check_cluster_every'])
def _describe_cluster(self):
emr_client = self.make_emr_client()
return emr_client.describe_cluster(
ClusterId=self._cluster_id)['Cluster']
def get_hadoop_version(self):
return self._get_app_versions().get('hadoop')
def get_image_version(self):
"""Get the version of the AMI that our cluster is running, or ``None``.
"""
return self._get_cluster_info('image_version')
def _address_of_master(self):
"""Get the address of the master node so we can SSH to it"""
return self._get_cluster_info('master_public_dns')
def _master_private_ip(self):
"""Get the internal ("private") address of the master node, so we
can direct our SSH tunnel to it."""
return self._get_cluster_info('master_private_ip')
def _get_app_versions(self):
"""Returns a map from lowercase app name to version for our cluster.
For apps other than Hadoop, this only works for AMI 4.x and later.
"""
return self._get_cluster_info('app_versions')
def _get_collection_type(self):
"""Return the collection type of the cluster (either
``'INSTANCE_FLEET'`` or ``'INSTANCE_GROUP'``)."""
return self._get_cluster_info('collection_type')
def _get_cluster_info(self, key):
if not self._cluster_id:
return None
cache = self._cluster_to_cache[self._cluster_id]
if not cache.get(key):
if key == 'master_private_ip':
self._store_master_instance_info()
else:
self._store_cluster_info()
return cache.get(key)
def _store_cluster_info(self):
"""Describe our cluster, and cache image_version, hadoop_version,
and master_public_dns"""
if not self._cluster_id:
raise ValueError('cluster has not yet been created')
cache = self._cluster_to_cache[self._cluster_id]
cluster = self._describe_cluster()
# AMI version might be in RunningAMIVersion (2.x, 3.x)
# or ReleaseLabel (4.x)
cache['image_version'] = cluster.get('RunningAmiVersion')
if not cache['image_version']:
release_label = cluster.get('ReleaseLabel')
if release_label:
cache['image_version'] = release_label.lstrip('emr-')
cache['app_versions'] = dict(
(a['Name'].lower(), a.get('Version'))
for a in cluster['Applications'])
cache['collection_type'] = cluster.get(
'InstanceCollectionType', 'INSTANCE_GROUP')
if cluster['Status']['State'] in ('RUNNING', 'WAITING'):
cache['master_public_dns'] = cluster['MasterPublicDnsName']
def _store_master_instance_info(self):
"""List master instance for our cluster, and cache
master_private_ip."""
if not self._cluster_id:
raise ValueError('cluster has not yet been created')
cache = self._cluster_to_cache[self._cluster_id]
emr_client = self.make_emr_client()
instances = emr_client.list_instances(
ClusterId=self._cluster_id,
InstanceGroupTypes=['MASTER'])['Instances']
if not instances:
return
master = instances[0]
# can also get private DNS and public IP/DNS, but we don't use this
master_private_ip = master.get('PrivateIpAddress')
if master_private_ip: # may not have been assigned yet
cache['master_private_ip'] = master_private_ip
def make_ec2_client(self):
"""Create a :py:mod:`boto3` EC2 client.
:return: a :py:class:`botocore.client.EC2` wrapped in a
:py:class:`mrjob.retry.RetryWrapper`
"""
if boto3 is None:
raise ImportError('You must install boto3 to connect to EC2')
raw_ec2_client = boto3.client(
'ec2',
aws_access_key_id=self._opts['aws_access_key_id'],
aws_secret_access_key=self._opts['aws_secret_access_key'],
aws_session_token=self._opts['aws_session_token'],
endpoint_url=_endpoint_url(self._opts['ec2_endpoint']),
region_name=self._opts['region'],
)
return _wrap_aws_client(raw_ec2_client)
def make_iam_client(self):
"""Create a :py:mod:`boto3` IAM client.
:return: a :py:class:`botocore.client.IAM` wrapped in a
:py:class:`mrjob.retry.RetryWrapper`
"""
if boto3 is None:
raise ImportError('You must install boto3 to connect to IAM')
# special logic for setting IAM endpoint (which you don't usually
# want to do, because IAM is regionless).
endpoint_url = _endpoint_url(self._opts['iam_endpoint'])
if endpoint_url:
# keep boto3 from loading a nonsensical region name from configs
# (see https://github.com/boto/boto3/issues/985)
region_name = _DEFAULT_AWS_REGION
log.debug('creating IAM client to %s' % endpoint_url)
else:
region_name = None
log.debug('creating IAM client')
raw_iam_client = boto3.client(
'iam',
aws_access_key_id=self._opts['aws_access_key_id'],
aws_secret_access_key=self._opts['aws_secret_access_key'],
aws_session_token=self._opts['aws_session_token'],
endpoint_url=endpoint_url,
region_name=region_name,
)
return _wrap_aws_client(raw_iam_client)
# Spark
def _uses_spark(self):
"""Does this runner use Spark, based on steps, bootstrap actions,
and EMR applications? If so, we'll need more memory."""
return (self._has_spark_steps() or
self._has_spark_install_bootstrap_action() or
self._has_spark_application() or
self._opts['bootstrap_spark'])
def _has_spark_install_bootstrap_action(self):
"""Does it look like this runner has a spark bootstrap install
action set? (Anything ending in "/install-spark" counts.)"""
return any(ba['path'].endswith('/install-spark')
for ba in self._bootstrap_actions(add_spark=False))
def _has_spark_application(self):
"""Does this runner have "Spark" in its *applications* option?"""
return any(a.lower() == 'spark'
for a in self._applications(add_spark=False))
def _check_cluster_spark_support(self):
"""Issue a warning if our cluster doesn't support Spark.
This should only be called if you are going to run one or more
Spark steps.
"""
message = self._cluster_spark_support_warning()
if message:
log.warning(message)
def _cluster_spark_support_warning(self):
"""Helper for _check_cluster_spark_support()."""
image_version = self.get_image_version()
if not version_gte(image_version, _MIN_SPARK_AMI_VERSION):
suggested_version = (
_MIN_SPARK_AMI_VERSION if PY2 else _MIN_SPARK_PY3_AMI_VERSION)
return (' AMI version %s does not support Spark;\n'
' (try --image-version %s or later)' % (
image_version, suggested_version))
if not version_gte(image_version, _MIN_SPARK_PY3_AMI_VERSION):
if PY2:
# even though this version of Spark "works" with Python 2,
# it doesn't work well
return (' AMI version %s has an old version of Spark\n'
' and does not correctly determine when a Spark'
' job has failed\n'
' (try --image-version %s or later)' % (
image_version, _MIN_SPARK_PY3_AMI_VERSION))
else:
# this version of Spark doesn't support Python 3 at all!
return (' AMI version %s does not support Python 3 on Spark\n'
' (try --image-version %s or later)' % (
image_version, _MIN_SPARK_PY3_AMI_VERSION))
emr_client = self.make_emr_client()
too_small_msg = (' instance type %s is too small for Spark;'
' your job may stall forever')
if self._get_collection_type() == 'INSTANCE_FLEET':
fleets = list(_boto3_paginate(
'InstanceFleets', emr_client, 'list_instance_fleets',
ClusterId=self.get_cluster_id()))
for fleet in fleets:
# master doesn't matter if it's not running tasks
if fleet['InstanceFleetType'] == 'MASTER' and len(fleets) > 1:
continue
for spec in fleet['InstanceTypeSpecifications']:
mem = EC2_INSTANCE_TYPE_TO_MEMORY.get(spec['InstanceType'])
if mem and mem < _MIN_SPARK_INSTANCE_MEMORY:
return (too_small_msg % spec['InstanceType'])
else:
# instance groups
igs = list(_boto3_paginate(
'InstanceGroups', emr_client, 'list_instance_groups',
ClusterId=self.get_cluster_id()))
for ig in igs:
# master doesn't matter if it's not running tasks
if ig['InstanceGroupType'] == 'MASTER' and len(igs) > 1:
continue
mem = EC2_INSTANCE_TYPE_TO_MEMORY.get(ig['InstanceType'])
if mem and mem < _MIN_SPARK_INSTANCE_MEMORY:
return (too_small_msg % ig['InstanceType'])
return None
def _cmdenv(self):
env = super(EMRJobRunner, self)._cmdenv()
return combine_dicts(self._docker_cmdenv(), env)
def _emr_configurations(self):
# don't keep two configs with the same Classification (#2097)
return _deduplicate_emr_configurations(
self._docker_emr_configurations() +
self._opts['emr_configurations']
)
def _docker_image(self):
"""Special-case the "library" registry which is implied on Docker Hub
but needs to be specified explicitly on EMR."""
image = self._opts['docker_image']
if not image:
return None
elif '/' in image:
return image
else:
return 'library/' + image
def _docker_registry(self):
"""Infer the trusted docker registry from the docker image."""
image = self._docker_image()
if not image:
return None
else:
return image.split('/')[0]
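# e.g. (hypothetical images) 'ubuntu:latest' becomes
# 'library/ubuntu:latest', so its registry is 'library', whereas
# '123456789012.dkr.ecr.us-west-2.amazonaws.com/repo:tag' already
# contains a '/', so its registry is the ECR hostname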
def _docker_cmdenv(self):
image = self._docker_image()
if not image:
return {}
env = dict(
YARN_CONTAINER_RUNTIME_TYPE='docker',
YARN_CONTAINER_RUNTIME_DOCKER_IMAGE=image,
)
if self._opts['docker_client_config']:
env['YARN_CONTAINER_RUNTIME_DOCKER_CLIENT_CONFIG'] = (
self._opts['docker_client_config'])
if self._opts['docker_mounts']:
env['YARN_CONTAINER_RUNTIME_DOCKER_MOUNTS'] = ','.join(
self._opts['docker_mounts'])
return env
def _docker_emr_configurations(self):
registry = self._docker_registry()
if not registry:
return []
registries = ','.join(['local', registry])
return [
dict(
Classification='container-executor',
Configurations=[
dict(
Classification='docker',
Properties={
'docker.trusted.registries': registries,
'docker.privileged-containers.registries': (
registries),
},
),
],
Properties={},
),
]
def _yrm_get(self, path, host=None, port=None, timeout=None):
"""Use curl to perform an HTTP GET on the given path on the
YARN Resource Manager. Either return decoded JSON from the call,
or raise an IOError
*path* should not start with a '/'
More info on the YARN REST API can be found here:
https://hadoop.apache.org/docs/current/hadoop-yarn/
hadoop-yarn-site/ResourceManagerRest.html
"""
if host is None:
host = self._address_of_master()
if port is None:
port = _YARN_RESOURCE_MANAGER_PORT
if timeout is None:
timeout = _YARN_API_TIMEOUT
# using urljoin() to avoid a double / when joining host/port with path
yrm_url = urljoin(
'http://{}:{:d}'.format(host, port),
'{}/{}'.format(_YRM_BASE_PATH, path)
)
curl_args = [
'curl', # always available on EMR
'-fsS', # fail on HTTP errors, print errors only to stderr
'-m', str(timeout), # fail if the request takes longer than *timeout* seconds
yrm_url,
]
stdout, stderr = self.fs.ssh._ssh_run(host, curl_args)
return json.loads(to_unicode(stdout))
def _get_job_steps(emr_client, cluster_id, job_key):
"""Efficiently fetch steps for a particular mrjob run from the EMR API.
:param emr_client: a boto3 EMR client. See
:py:meth:`~mrjob.emr.EMRJobRunner.make_emr_client`
:param cluster_id: ID of EMR cluster to fetch steps from. See
:py:meth:`~mrjob.emr.EMRJobRunner.get_cluster_id`
:param job_key: Unique key for a mrjob run. See
:py:meth:`~mrjob.runner.MRJobRunner.get_job_key`
"""
steps = []
for step in _boto3_paginate('Steps', emr_client, 'list_steps',
ClusterId=cluster_id):
if step['Name'].startswith(job_key):
steps.append(step)
elif steps:
# all steps for job will be together, so stop
# when we find a non-job step
break
return list(reversed(steps))
def _get_reason(cluster_or_step):
"""Get state change reason message."""
# StateChangeReason is {} before the first state change
return cluster_or_step['Status']['StateChangeReason'].get('Message', '')
def _deduplicate_emr_configurations(emr_configurations):
"""Takes the value of the *emr_configurations* opt, and ensures that
later configs overwrite earlier ones with the same Classification.
Additionally, any configs that contain empty or unset Properties
and Configurations will be removed (this is a way of deleting
existing config dicts without replacing them).
You can assume that all config dicts have run through
_fix_configuration_opt()
"""
results = OrderedDict()
for c in emr_configurations:
results[c['Classification']] = c
return [c for c in results.values() if
c['Properties'] or c.get('Configurations')]
def _fix_configuration_opt(c):
"""Return copy of *c* with *Properties* is always set
(defaults to {}) and with *Configurations* is not set if empty.
Convert all values to strings.
Raise exception on more serious problems (extra fields, wrong data
type, etc).
This allows us to match configurations against the API, *and* catches bad
configurations before they result in cryptic API errors.
"""
if not isinstance(c, dict):
raise TypeError('configurations must be dicts, not %r' % (c,))
c = dict(c) # make a copy
# extra keys
extra_keys = (
set(c) - set(['Classification', 'Configurations', 'Properties']))
if extra_keys:
raise ValueError('configuration opt has extra keys: %s' % ', '.join(
sorted(extra_keys)))
# Classification
if 'Classification' not in c:
raise ValueError('configuration opt has no Classification')
if not isinstance(c['Classification'], string_types):
raise TypeError('Classification must be string')
# Properties
c.setdefault('Properties', {})
if not isinstance(c['Properties'], dict):
raise TypeError('Properties must be a dict')
c['Properties'] = dict(
(str(k), str(v)) for k, v in c['Properties'].items())
# sub-Configurations
if 'Configurations' in c:
if c['Configurations']:
if not isinstance(c['Configurations'], list):
raise TypeError('Configurations must be a list')
# recursively fix subconfigurations
c['Configurations'] = [
_fix_configuration_opt(sc) for sc in c['Configurations']]
else:
# don't keep empty configurations around
del c['Configurations']
return c
def _fix_subnet_opt(subnet):
"""Return either None, a string, or a list with at least two items."""
if subnet is None:
return None
if isinstance(subnet, string_types):
return subnet
subnet = list(subnet)
if len(subnet) == 1:
return subnet[0]
else:
return subnet
def _build_instance_group(role, instance_type, num_instances, bid_price):
"""Helper method for creating instance groups. For use when
creating a cluster using a list of InstanceGroups
- role is either 'MASTER', 'CORE', or 'TASK'.
- instance_type is an EC2 instance type
- num_instances is an int
- bid_price is a number, a string, or None. If None,
this instance group will use the ON_DEMAND market
instead of the SPOT market.
"""
if role not in _INSTANCE_ROLES:
raise ValueError('unknown instance role: %r' % (role,))
if not instance_type:
raise ValueError('instance_type must be set')
if not num_instances:
raise ValueError('num_instances must be a positive integer')
ig = dict(
InstanceCount=num_instances,
InstanceRole=role,
InstanceType=instance_type,
Market='ON_DEMAND',
Name=role.lower(), # just name the groups "core", "master", and "task"
)
if bid_price:
ig['Market'] = 'SPOT'
ig['BidPrice'] = str(bid_price) # must be a string
return ig
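# e.g. (hypothetical values) _build_instance_group('CORE', 'm5.xlarge', 3, '0.25')
# would return:
#   {'InstanceCount': 3, 'InstanceRole': 'CORE', 'InstanceType': 'm5.xlarge',
#    'Market': 'SPOT', 'Name': 'core', 'BidPrice': '0.25'}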
def _plural(n):
"""Utility for logging messages"""
if n == 1:
return ''
else:
return 's'