# -*- coding: utf-8 -*-
# Copyright 2009-2017 Yelp and Contributors
# Copyright 2018 Yelp
# Copyright 2019 Yelp and Contributors
# Copyright 2020 Affirm, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import hashlib
import json
import logging
import os
import os.path
import pipes
import posixpath
import re
import time
from collections import OrderedDict
from collections import defaultdict
from datetime import datetime
from datetime import timedelta
from math import ceil
from random import randint

try:
    import botocore.client
    import botocore.exceptions
    botocore  # quiet "redefinition of unused ..." warning from pyflakes
except ImportError:
    botocore = None

try:
    import boto3
    boto3  # quiet "redefinition of unused ..." warning from pyflakes
except ImportError:
    # don't require boto3; MRJobs don't actually need it when running
    # inside hadoop streaming
    boto3 = None


import mrjob
import mrjob.step
from mrjob.aws import _DEFAULT_AWS_REGION
from mrjob.aws import EC2_INSTANCE_TYPE_TO_MEMORY
from mrjob.aws import _boto3_now
from mrjob.aws import _boto3_paginate
from mrjob.aws import _wrap_aws_client
from mrjob.cloud import HadoopInTheCloudJobRunner
from mrjob.compat import map_version
from mrjob.compat import version_gte
from mrjob.conf import combine_dicts
from mrjob.fs.composite import CompositeFilesystem
from mrjob.fs.hadoop import HadoopFilesystem
from mrjob.fs.local import LocalFilesystem
from mrjob.fs.s3 import S3Filesystem
from mrjob.fs.s3 import _client_error_status
from mrjob.fs.s3 import _endpoint_url
from mrjob.fs.s3 import _get_bucket_region
from mrjob.fs.ssh import SSHFilesystem
from mrjob.hadoop import _DEFAULT_YARN_HDFS_LOG_DIR
from mrjob.iam import _FALLBACK_INSTANCE_PROFILE
from mrjob.iam import _FALLBACK_SERVICE_ROLE
from mrjob.iam import get_or_create_mrjob_instance_profile
from mrjob.iam import get_or_create_mrjob_service_role
from mrjob.logs.bootstrap import _check_for_nonzero_return_code
from mrjob.logs.bootstrap import _interpret_emr_bootstrap_stderr
from mrjob.logs.bootstrap import _ls_emr_bootstrap_stderr_logs
from mrjob.logs.counters import _pick_counters
from mrjob.logs.errors import _log_probable_cause_of_failure
from mrjob.logs.mixin import LogInterpretationMixin
from mrjob.logs.spark import _interpret_spark_logs
from mrjob.logs.step import _interpret_emr_step_stderr
from mrjob.logs.step import _interpret_emr_step_syslog
from mrjob.logs.step import _ls_emr_step_stderr_logs
from mrjob.logs.step import _ls_emr_step_syslogs
from mrjob.parse import is_s3_uri
from mrjob.parse import _parse_progress_from_job_tracker
from mrjob.parse import _parse_progress_from_resource_manager
from mrjob.pool import _attempt_to_lock_cluster
from mrjob.pool import _attempt_to_unlock_cluster
from mrjob.pool import _cluster_name_suffix
from mrjob.pool import _instance_fleets_satisfy
from mrjob.pool import _instance_groups_satisfy
from mrjob.pool import _parse_cluster_name_suffix
from mrjob.py2 import PY2
from mrjob.py2 import string_types
from mrjob.py2 import to_unicode
from mrjob.py2 import urljoin
from mrjob.py2 import urlopen
from mrjob.runner import _blank_out_conflicting_opts
from mrjob.setup import UploadDirManager
from mrjob.setup import WorkingDirManager
from mrjob.step import StepFailedException
from mrjob.step import _is_spark_step_type
from mrjob.util import shlex_split
from mrjob.util import strip_microseconds
from mrjob.util import random_identifier

log = logging.getLogger(__name__)

# how to set up the SSH tunnel for various AMI versions
_IMAGE_VERSION_TO_SSH_TUNNEL_CONFIG = {
    '2': dict(
        localhost=True,
        name='job tracker',
        path='/jobtracker.jsp',
        port=9100,
    ),
    '3': dict(
        localhost=False,
        name='resource manager',
        path='/cluster',
        port=9026,
    ),
    '4': dict(
        localhost=False,
        name='resource manager',
        path='/cluster',
        port=8088,
    ),
}
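
# Illustrative note (not from the original source): map_version() from
# mrjob.compat is assumed to pick the entry whose key is the highest
# version at or below the one given, so a 4.x or later image resolves to
# the '4' config above:
#
#   >>> map_version('5.28.0', _IMAGE_VERSION_TO_SSH_TUNNEL_CONFIG)
#   {'localhost': False, 'name': 'resource manager',
#    'path': '/cluster', 'port': 8088}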

# if we SSH into a node, default place to look for logs
_EMR_LOG_DIR = '/mnt/var/log'

# Prior to AMI 2.4.8/3.1.1, there was a limit of 256 steps total per cluster.
# We issue a warning for users who continue to use pooling on these
# very old AMIs
_IMAGE_SUPPORTS_POOLING = {
    '2': False,
    '2.4.8': True,
    '3': False,
    '3.1.1': True,
}
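
# Illustrative note (not from the original source): assuming the
# map_version() semantics sketched above, AMIs older than the fix map to
# False and 2.4.8+/3.1.1+ map to True:
#
#   >>> map_version('3.0.4', _IMAGE_SUPPORTS_POOLING)
#   False
#   >>> map_version('3.1.4', _IMAGE_SUPPORTS_POOLING)
#   True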

_MAX_SSH_RETRIES = 20

# ssh should fail right away if it can't bind a port
_WAIT_FOR_SSH_TO_FAIL = 1.0

# amount of time to wait between checks for available pooled clusters
_POOLING_SLEEP_INTERVAL = 30.01  # add .01 seconds so minutes aren't spot-on

# bootstrap action which automatically terminates idle clusters
_MAX_MINS_IDLE_BOOTSTRAP_ACTION_PATH = os.path.join(
    os.path.dirname(mrjob.__file__),
    'bootstrap',
    'terminate_idle_cluster_emr.sh')

# default AWS region to use for EMR. Using us-west-2 because it is the default
# for new (since October 10, 2012) accounts (see #1025)
_DEFAULT_EMR_REGION = 'us-west-2'

# default AMI to use on EMR. This may be updated with each version
_DEFAULT_IMAGE_VERSION = '6.0.0'

# first AMI version that we can't run bash -e on (see #1548)
_BAD_BASH_IMAGE_VERSION = '5.2.0'

# use this if bash -e works (/bin/sh is actually bash)
_GOOD_BASH_SH_BIN = ['/bin/sh', '-ex']

# use this if bash -e doesn't work
_BAD_BASH_SH_BIN = ['/bin/sh', '-x']

# Hadoop streaming jar on 1-3.x AMIs
_PRE_4_X_STREAMING_JAR = '/home/hadoop/contrib/streaming/hadoop-streaming.jar'

# intermediary jar used on 4.x AMIs
_4_X_COMMAND_RUNNER_JAR = 'command-runner.jar'

# path to spark-submit on 3.x AMIs. (On 4.x, it's just 'spark-submit')
_3_X_SPARK_SUBMIT = '/home/hadoop/spark/bin/spark-submit'

# bootstrap action to install Spark on 3.x AMIs (On 4.x+, we use
# Applications instead)
_3_X_SPARK_BOOTSTRAP_ACTION = (
    'file:///usr/share/aws/emr/install-spark/install-spark')

# first AMI version to support Spark
_MIN_SPARK_AMI_VERSION = '3.8.0'

# first AMI version with Spark that supports Python 3
_MIN_SPARK_PY3_AMI_VERSION = '4.0.0'

# first AMI version that allows steps to run concurrently
_MIN_STEP_CONCURRENCY_AMI_VERSION = '5.28.0'

# we have to wait this many minutes for logs to transfer to S3 (or wait
# for the cluster to terminate). Docs say logs are transferred every 5
# minutes, but I've seen it take longer on the 4.3.0 AMI. Probably it's
# 5 minutes plus time to copy the logs, or something like that.
_S3_LOG_WAIT_MINUTES = 10

# minimum amount of memory (in GB) to run spark jobs
#
# it's possible that we could get by with slightly less memory, but
# m1.medium (3.75 GB) definitely doesn't work.
_MIN_SPARK_INSTANCE_MEMORY = 7.5

# these are the only kinds of instance roles that exist
_INSTANCE_ROLES = ('MASTER', 'CORE', 'TASK')

# where to find the history log in HDFS
_YARN_HDFS_HISTORY_LOG_DIR = 'hdfs:///tmp/hadoop-yarn/staging/history'

# mildly flexible regex to detect cluster self-termination. Termination of
# non-master nodes won't shut down the cluster, so we don't need to match
# that.
_CLUSTER_SELF_TERMINATED_RE = re.compile(
    '^.*(node|instances) .* terminated.*$', re.I)
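
# Illustrative example (not from the original source): the regex matches
# state-change reasons shaped like the hypothetical message below:
#
#   >>> bool(_CLUSTER_SELF_TERMINATED_RE.match(
#   ...     'The master node was terminated by user'))
#   True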

# if this appears in an S3 object's "restore" field, the object
# is available to read even if it's Glacier-archived
_RESTORED_FROM_GLACIER = 'ongoing-request="false"'

# amount of time in seconds before we time out YARN API calls
_YARN_API_TIMEOUT = 20

# which port to connect to the YARN resource manager on
_YARN_RESOURCE_MANAGER_PORT = 8088

# base path for YARN resource manager
_YRM_BASE_PATH = '/ws/v1/cluster'

# all the cluster states other than terminating/terminated. We need this list
# because the ListClusters call can't filter out unwanted cluster states;
# it can only accept a whitelist of desired ones
#
# valid states are here:
# https://docs.aws.amazon.com/emr/latest/APIReference/API_ListClusters.html
_ACTIVE_CLUSTER_STATES = ['STARTING', 'BOOTSTRAPPING', 'RUNNING', 'WAITING']
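
# Illustrative sketch (not from the original source): this is how the
# whitelist would be passed to boto3's ListClusters call to see only
# active clusters:
#
#   emr_client = boto3.client('emr')
#   active = emr_client.list_clusters(
#       ClusterStates=_ACTIVE_CLUSTER_STATES)['Clusters']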


# used to bail out and retry when a pooled cluster self-terminates
class _PooledClusterSelfTerminatedException(Exception):
    pass


if PY2:
    # this was introduced in Python 3.3
    TimeoutError = OSError


class PoolTimeoutException(TimeoutError):
    pass


class EMRJobRunner(HadoopInTheCloudJobRunner, LogInterpretationMixin):
    """Runs an :py:class:`~mrjob.job.MRJob` on Amazon Elastic MapReduce.
    Invoked when you run your job with ``-r emr``.

    :py:class:`EMRJobRunner` runs your job in an EMR cluster, which is
    basically a temporary Hadoop cluster. Normally, it creates a cluster
    just for your job; it's also possible to run your job in a specific
    cluster by setting *cluster_id* or to automatically choose a
    waiting cluster, creating one if none exists, by setting
    *pool_clusters*.

    Input, support, and jar files can be either local or on S3; use
    ``s3://...`` URLs to refer to files on S3.

    This class has some useful utilities for talking directly to S3 and EMR,
    so you may find it useful to instantiate it without a script::

        from mrjob.emr import EMRJobRunner

        emr_client = EMRJobRunner().make_emr_client()
        clusters = emr_client.list_clusters()
        ...
    """

    alias = 'emr'

    OPT_NAMES = HadoopInTheCloudJobRunner.OPT_NAMES | {
        'add_steps_in_batch',
        'additional_emr_info',
        'applications',
        'aws_access_key_id',
        'aws_secret_access_key',
        'aws_session_token',
        'bootstrap_actions',
        'bootstrap_spark',
        'cloud_log_dir',
        'core_instance_bid_price',
        'docker_client_config',
        'docker_image',
        'docker_mounts',
        'ebs_root_volume_gb',
        'ec2_endpoint',
        'ec2_key_pair',
        'ec2_key_pair_file',
        'emr_action_on_failure',
        'emr_configurations',
        'emr_endpoint',
        'enable_emr_debugging',
        'hadoop_extra_args',
        'iam_endpoint',
        'iam_instance_profile',
        'iam_service_role',
        'instance_fleets',
        'instance_groups',
        'master_instance_bid_price',
        'max_clusters_in_pool',
        'max_concurrent_steps',
        'min_available_mb',
        'min_available_virtual_cores',
        'pool_clusters',
        'pool_jitter_seconds',
        'pool_name',
        'pool_timeout_minutes',
        'pool_wait_minutes',
        'release_label',
        's3_endpoint',
        'ssh_add_bin',
        'ssh_bin',
        'ssh_bind_ports',
        'ssh_tunnel',
        'ssh_tunnel_is_open',
        'subnet',
        'tags',
        'task_instance_bid_price',
    }

    # supports everything (so far)
    _STEP_TYPES = {
        'jar', 'spark', 'spark_jar', 'spark_script', 'streaming'}

    # everything that controls instance number, type, or price
    _INSTANCE_OPT_NAMES = {
        name for name in OPT_NAMES
        if 'instance' in name and 'iam' not in name
    }

    def __init__(self, **kwargs):
        """:py:class:`~mrjob.emr.EMRJobRunner` takes the same arguments as
        :py:class:`~mrjob.runner.MRJobRunner`, plus some additional options
        which can be defaulted in :ref:`mrjob.conf <mrjob.conf>`.

        *aws_access_key_id* and *aws_secret_access_key* are required if you
        haven't set them up already for boto3 (e.g. by setting the environment
        variables :envvar:`AWS_ACCESS_KEY_ID` and
        :envvar:`AWS_SECRET_ACCESS_KEY`)

        A lengthy list of additional options can be found in
        :doc:`guides/emr-opts.rst`.
        """
        super(EMRJobRunner, self).__init__(**kwargs)

        self._fix_s3_tmp_and_log_uri_opts()

        # use job key to make a unique tmp dir
        self._cloud_tmp_dir = self._opts['cloud_tmp_dir'] + self._job_key + '/'

        # pick/validate output dir
        if self._output_dir:
            self._output_dir = self._check_and_fix_s3_dir(self._output_dir)
        else:
            self._output_dir = self._cloud_tmp_dir + 'output/'

        # check AMI version
        if self._opts['image_version'].startswith('1.'):
            log.warning('1.x AMIs will not work because they use'
                        ' Python 2.5. Use a later AMI version or mrjob v0.4.2')
        elif not version_gte(self._opts['image_version'], '2.4.3'):
            log.warning("AMIs prior to 2.4.3 probably will not work because"
                        " they don't support Python 2.7.")
        elif not self._image_version_gte('5.7.0'):
            if self._opts['image_id']:
                log.warning('AMIs prior to 5.7.0 will probably not work'
                            ' with custom machine images')

        if self._opts['pool_clusters'] and not map_version(
                self._opts['image_version'], _IMAGE_SUPPORTS_POOLING):
            log.warning(
                "Cluster pooling is not fully supported on AMIs prior to"
                " 2.4.8/3.1.1 due to the limit on total number of steps")

        if self._opts['max_concurrent_steps'] < 1:
            raise ValueError('max_concurrent_steps must be at least 1')

        # manage local files that we want to upload to S3. We'll add them
        # to this manager just before we need them.
        s3_files_dir = self._cloud_tmp_dir + 'files/'
        self._upload_mgr = UploadDirManager(s3_files_dir)

        # master node setup script (handled later by
        # _add_master_node_setup_files_for_upload())
        self._master_node_setup_mgr = WorkingDirManager()
        self._master_node_setup_script_path = None

        # where our own logs ended up (we'll find this out once we run
        # the job)
        self._s3_log_dir_uri = None

        # did we create the cluster we're running on?
        self._created_cluster = False

        # did we acquire a lock on self._cluster_id? (used for pooling)
        self._locked_cluster = None

        # IDs of steps we have submitted to the cluster
        self._step_ids = []

        # we don't upload the ssh key to master until it's needed
        self._ssh_key_is_copied = False

        # map from cluster ID to a dictionary containing cached info about
        # that cluster. Includes the following keys:
        #
        # - image_version
        # - hadoop_version
        # - master_public_dns
        # - master_private_ip
        #
        # (we may do this for multiple cluster IDs if we join a pooled cluster
        # that self-terminates)
        self._cluster_to_cache = defaultdict(dict)

        # set of cluster IDs for which we logged the master node's public DNS
        self._logged_address_of_master = set()

        # List of dicts (one for each step) potentially containing
        # the keys 'history', 'step', and 'task'. These will also always
        # contain 'step_id' (the s-XXXXXXXX step ID on EMR).
        #
        # This will be filled by _wait_for_steps_to_complete()
        #
        # This might work better as a dictionary.
        self._log_interpretations = []

        # log interpretation for master node setup step (currently we don't
        # use this for anything; we just want to keep it out of
        # self._log_interpretations)
        self._mns_log_interpretation = None
        # set of step numbers (0-indexed) where we already waited
        # _S3_LOG_WAIT_MINUTES for logs to transfer to S3 (so we don't
        # do it twice)
        self._waited_for_logs_on_s3 = set()

        # info used to match clusters. Caches _pool_hash_dict().
        self._pool_hash_dict_cached = None

        # add_steps_in_batch and concurrent steps don't mix
        if (self._add_steps_in_batch() and
                self._opts['max_concurrent_steps'] > 1):
            log.warning('add_steps_in_batch will probably not work'
                        ' with max_concurrent_steps > 1')

        # min_available_* options require SSH
        if ((self._opts['min_available_mb'] or
                self._opts['min_available_virtual_cores']) and
                not (self._opts['ec2_key_pair'] and
                     self._opts['ec2_key_pair_file'])):
            raise ValueError('you must set up SSH (ec2_key_pair and'
                             ' ec2_key_pair_file) to use the'
                             ' min_available_* options')

    ### Options ###

    @classmethod
    def _default_opts(cls):
        return combine_dicts(
            super(EMRJobRunner, cls)._default_opts(),
            dict(
                bootstrap_python=None,
                check_cluster_every=30,
                cleanup_on_failure=['JOB'],
                cloud_fs_sync_secs=5.0,
                docker_client_config=None,
                docker_image=None,
                image_version=_DEFAULT_IMAGE_VERSION,
                max_concurrent_steps=1,
                min_available_mb=0,
                min_available_virtual_cores=0,
                num_core_instances=0,
                num_task_instances=0,
                pool_clusters=False,
                pool_name='default',
                pool_jitter_seconds=60,
                pool_wait_minutes=0,
                region=_DEFAULT_EMR_REGION,
            )
        )
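
    # Illustrative note (not from the original source): combine_dicts() is
    # assumed to give later dicts precedence, so any of these defaults can
    # be overridden by subclasses or by mrjob.conf, e.g.:
    #
    #   >>> combine_dicts(dict(check_cluster_every=30),
    #   ...               dict(check_cluster_every=5))
    #   {'check_cluster_every': 5}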

    def _combine_opts(self, opt_list):
        """Blank out overridden *instance_fleets* and *instance_groups*.

        Convert *image_version* of 4.x and later to *release_label*."""
        # blank out any instance_fleets/groups before the last config
        # where they are set
        opt_list = _blank_out_conflicting_opts(
            opt_list,
            ['instance_fleets', 'instance_groups'],
            self._INSTANCE_OPT_NAMES)

        # now combine opts, with instance_groups/fleets blanked out
        opts = super(EMRJobRunner, self)._combine_opts(opt_list)

        # set release_label based on image_version
        if (version_gte(opts['image_version'], '4') and
                not opts['release_label']):
            opts['release_label'] = 'emr-' + opts['image_version']

        # don't keep two confs with the same Classification (see #2097)
        opts['emr_configurations'] = _deduplicate_emr_configurations(
            opts['emr_configurations'])

        return opts
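
    # Illustrative example (not from the original source): for 4.x+ images,
    # the AMI version is translated into an EMR release label, e.g.
    # image_version '5.28.0' becomes release_label 'emr-5.28.0' (unless
    # release_label was set explicitly).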

    def _fix_opt(self, opt_key, opt_value, source):
        """Fix and check various EMR-specific options"""
        opt_value = super(EMRJobRunner, self)._fix_opt(
            opt_key, opt_value, source)

        # *_instance_bid_price
        if opt_key.endswith('_instance_bid_price'):
            if not opt_value:  # don't allow blank bid price
                return None

            try:
                if not float(opt_value):
                    return None
            except ValueError:  # maybe EMR allows non-floats?
                pass

            return str(opt_value)  # should be str, not a number

        # additional_emr_info
        elif opt_key == 'additional_emr_info' and not isinstance(
                opt_value, string_types):
            return json.dumps(opt_value)

        # emr_configurations
        elif opt_key == 'emr_configurations':
            return [_fix_configuration_opt(c) for c in opt_value]

        # region
        elif opt_key == 'region':
            # don't allow blank region
            return opt_value or _DEFAULT_EMR_REGION

        # subnet should be None, a string, or a multi-item list
        elif opt_key == 'subnet':
            return _fix_subnet_opt(opt_value)

        else:
            return opt_value
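
    # Illustrative examples (not from the original source) of the
    # bid-price fixing above: a numeric bid price is stringified, and a
    # falsy one is dropped:
    #
    #   runner._fix_opt('core_instance_bid_price', 0.25, '...')  # -> '0.25'
    #   runner._fix_opt('core_instance_bid_price', '', '...')    # -> None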

    def _obfuscate_opt(self, opt_key, opt_value):
        """Obfuscate AWS credentials."""
        # don't need to obfuscate empty values
        if not opt_value:
            return opt_value

        if opt_key in ('aws_secret_access_key', 'aws_session_token'):
            # don't expose any part of secret credentials
            return '...'
        elif opt_key == 'aws_access_key_id':
            if isinstance(opt_value, string_types):
                return '...' + opt_value[-4:]
            else:
                # don't expose aws_access_key_id if it was accidentally
                # put in a list or something
                return '...'
        else:
            return opt_value
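
    # Illustrative example (not from the original source): only the last
    # four characters of the access key ID survive obfuscation:
    #
    #   >>> runner._obfuscate_opt('aws_access_key_id', 'AKIAIOSFODNN7EXAMPLE')
    #   '...MPLE'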

    def _image_version_gte(self, version):
        """Check if the requested image version is greater than
        or equal to *version*. If the *release_label* opt is set,
        look at that instead.

        If you're checking the actual image version of a cluster, just
        use :py:func:`~mrjob.compat.version_gte` and
        :py:meth:`get_image_version`.
        """
        if self._opts['release_label']:
            return version_gte(
                self._opts['release_label'].lstrip('emr-'), version)
        else:
            return version_gte(self._opts['image_version'], version)
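
    # Illustrative note (not from the original source): lstrip('emr-')
    # strips *characters*, not a prefix, but it works here because release
    # labels look like 'emr-5.28.0' and version numbers start with a digit:
    #
    #   >>> 'emr-5.28.0'.lstrip('emr-')
    #   '5.28.0'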

    def _fix_s3_tmp_and_log_uri_opts(self):
        """Fill in cloud_tmp_dir and cloud_log_dir (in self._opts) if they
        aren't already set.

        Helper for __init__.
        """
        # set cloud_tmp_dir by checking for existing buckets
        if not self._opts['cloud_tmp_dir']:
            self._set_cloud_tmp_dir()
            log.info('Using %s as our temp dir on S3' %
                     self._opts['cloud_tmp_dir'])

        self._opts['cloud_tmp_dir'] = self._check_and_fix_s3_dir(
            self._opts['cloud_tmp_dir'])

        # set cloud_log_dir
        if self._opts['cloud_log_dir']:
            self._opts['cloud_log_dir'] = self._check_and_fix_s3_dir(
                self._opts['cloud_log_dir'])
        else:
            self._opts['cloud_log_dir'] = self._opts['cloud_tmp_dir'] + 'logs/'

    def _set_cloud_tmp_dir(self):
        """Helper for _fix_s3_tmp_and_log_uri_opts"""
        client = self.fs.s3.make_s3_client()

        for bucket_name in self.fs.s3.get_all_bucket_names():
            if not bucket_name.startswith('mrjob-'):
                continue

            bucket_region = _get_bucket_region(client, bucket_name)
            if bucket_region == self._opts['region']:
                # Regions are both specified and match
                log.debug("using existing temp bucket %s" % bucket_name)
                self._opts['cloud_tmp_dir'] = 's3://%s/tmp/' % bucket_name
                return

        # That may have all failed. If so, pick a name.
        bucket_name = 'mrjob-' + random_identifier()
        self._opts['cloud_tmp_dir'] = 's3://%s/tmp/' % bucket_name
        log.info('Auto-created temp S3 bucket %s' % bucket_name)
        self._wait_for_s3_eventual_consistency()

    def _s3_log_dir(self):
        """Get the URI of the log directory for this job's cluster."""
        if not self._s3_log_dir_uri:
            cluster = self._describe_cluster()
            log_uri = cluster.get('LogUri')
            if log_uri:
                self._s3_log_dir_uri = '%s%s/' % (
                    log_uri.replace('s3n://', 's3://'), self._cluster_id)

        return self._s3_log_dir_uri

    def _check_and_fix_s3_dir(self, s3_uri):
        """Helper for __init__"""
        if not is_s3_uri(s3_uri):
            raise ValueError('Invalid S3 URI: %r' % s3_uri)
        if not s3_uri.endswith('/'):
            s3_uri = s3_uri + '/'

        return s3_uri
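
    # Illustrative example (not from the original source): a trailing
    # slash is appended if missing:
    #
    #   >>> runner._check_and_fix_s3_dir('s3://mrjob-bucket/tmp')
    #   's3://mrjob-bucket/tmp/'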

    def _bash_is_bad(self):
        # hopefully, there will eventually be an image version
        # where this issue is fixed. See #1548
        return self._image_version_gte(_BAD_BASH_IMAGE_VERSION)

    def _default_sh_bin(self):
        if self._bash_is_bad():
            return _BAD_BASH_SH_BIN
        else:
            return _GOOD_BASH_SH_BIN

    def _sh_pre_commands(self):
        if self._bash_is_bad() and not self._opts['sh_bin']:
            return ['set -e']
        else:
            return []

    @property
    def fs(self):
        """:py:class:`~mrjob.fs.base.Filesystem` object for SSH, S3, and the
        local filesystem.
        """
        if self._fs is None:
            self._fs = CompositeFilesystem()

            if self._opts['ec2_key_pair_file']:
                self._fs.add_fs('ssh', SSHFilesystem(
                    ssh_bin=self._ssh_bin(),
                    ssh_add_bin=self._ssh_add_bin(),
                    ec2_key_pair_file=self._opts['ec2_key_pair_file']))

            self._fs.add_fs('s3', S3Filesystem(
                aws_access_key_id=self._opts['aws_access_key_id'],
                aws_secret_access_key=self._opts['aws_secret_access_key'],
                aws_session_token=self._opts['aws_session_token'],
                s3_endpoint=self._opts['s3_endpoint'],
                s3_region=self._opts['region'],
                part_size=self._upload_part_size()))

            if self._opts['ec2_key_pair_file']:
                # add hadoop fs after S3 because it tries to handle all URIs

                # we'll set hadoop_bin later, once the cluster is set up
                self._fs.add_fs('hadoop', HadoopFilesystem(hadoop_bin=[]))

            self._fs.add_fs('local', LocalFilesystem())

        return self._fs

    def _run(self):
        self._launch()
        self._finish_run()

    def _finish_run(self):
        while True:
            try:
                self._wait_for_steps_to_complete()
                break
            except _PooledClusterSelfTerminatedException:
                self._relaunch()

    def _prepare_for_launch(self):
        """Set up files needed for the job."""
        self._check_output_not_exists()
        self._create_setup_wrapper_scripts()
        self._add_bootstrap_files_for_upload()
        self._add_master_node_setup_files_for_upload()
        self._add_job_files_for_upload()
        self._upload_local_files()
        # make sure we can see the files we copied to S3
        self._wait_for_s3_eventual_consistency()

    def _launch(self):
        """Set up files and then launch our job on EMR."""
        self._prepare_for_launch()
        self._launch_emr_job()

    def _relaunch(self):
        # files are already in place; just start with a fresh cluster
        assert not self._opts['cluster_id']
        self._cluster_id = None
        self._created_cluster = False
        self._step_ids = []

        # old SSH tunnel isn't valid for this cluster (see #1549)
        if self._ssh_proc:
            self._kill_ssh_tunnel()

        # don't try to connect to HDFS on the old cluster
        if hasattr(self.fs, 'hadoop'):
            self.fs.hadoop.set_hadoop_bin([])

        self._launch_emr_job()

    def _check_input_path(self, path):
        """Add a custom check for S3 paths to ensure they're not in
        Glacier (which causes a cryptic error). See #1887."""
        # handle non-S3 paths the usual way
        if not is_s3_uri(path):
            super(EMRJobRunner, self)._check_input_path(path)
            return

        exists = False

        for uri, obj in self.fs.s3._ls(path):
            exists = True

            # we currently just look for 'ongoing-request="false"'
            # in the *restore* field and ignore the expiration date
            # (if the object has expired, the *restore* field won't be set).
            #
            # See #1887 for more discussion of checking expiration.
            if obj.storage_class == 'GLACIER' and not (
                    obj.restore and _RESTORED_FROM_GLACIER in obj.restore):
                raise IOError(
                    '%s is archived in Glacier and'
                    ' cannot be read as input!' % uri)

        if not exists:
            raise IOError(
                'Input path %s does not exist!' % (path,))

    def _check_output_not_exists(self):
        """Verify the output path does not already exist. This avoids
        provisioning a cluster only to have Hadoop refuse to launch.
        """
        try:
            if self.fs.exists(self._output_dir):
                raise IOError(
                    'Output path %s already exists!' % (self._output_dir,))
        except botocore.exceptions.ClientError:
            pass

    def _add_bootstrap_files_for_upload(self, persistent=False):
        """Add files needed by the bootstrap script to self._upload_mgr.

        Create the master bootstrap script if necessary.

        persistent -- set by make_persistent_cluster()
        """
        # all other files needed by the script are already in
        # _bootstrap_dir_mgr
        for path in self._bootstrap_dir_mgr.paths():
            self._upload_mgr.add(path)

        # now that we know where the above files live, we can create
        # the master bootstrap script
        self._create_master_bootstrap_script_if_needed()
        if self._master_bootstrap_script_path:
            self._upload_mgr.add(self._master_bootstrap_script_path)

        # make sure bootstrap action scripts are on S3
        for bootstrap_action in self._bootstrap_actions():
            self._upload_mgr.add(bootstrap_action['path'])

        # Add max-mins-idle script if we need it
        if persistent or self._opts['pool_clusters']:
            self._upload_mgr.add(_MAX_MINS_IDLE_BOOTSTRAP_ACTION_PATH)
    def _add_master_node_setup_files_for_upload(self):
        """Add files necessary for the master node setup script to
        self._master_node_setup_mgr() and self._upload_mgr().

        Create the master node setup script if necessary.
        """
        # currently, only used by libjars; see #1336 for how we might open
        # this up more generally
        for path in self._opts['libjars']:
            # passthrough for libjars already on EMR
            if path.startswith('file:///'):
                continue

            self._master_node_setup_mgr.add('file', path)
            self._upload_mgr.add(path)

        self._create_master_node_setup_script_if_needed()
        if self._master_node_setup_script_path:
            self._upload_mgr.add(self._master_node_setup_script_path)

    def _add_job_files_for_upload(self):
        """Add files needed for running the job (setup and input)
        to self._upload_mgr."""
        for path in self._py_files():
            self._upload_mgr.add(path)

        if self._opts['hadoop_streaming_jar']:
            self._upload_mgr.add(self._opts['hadoop_streaming_jar'])

        # upload JARs and (Python) scripts run by steps
        for step in self._get_steps():
            for key in 'jar', 'script':
                if step.get(key):
                    self._upload_mgr.add(step[key])

    def _ssh_add_bin(self):
        # the args of the ssh-add binary
        return self._opts['ssh_add_bin'] or ['ssh-add']

    def _ssh_bin(self):
        # the args of the ssh binary
        return self._opts['ssh_bin'] or ['ssh']

    def _set_up_ssh_tunnel_and_hdfs(self):
        if hasattr(self.fs, 'hadoop'):
            self.fs.hadoop.set_hadoop_bin(self._ssh_hadoop_bin())
        self._set_up_ssh_tunnel()
    def _ssh_tunnel_config(self):
        """Look up AMI version, and return a dict with the following keys:

        localhost: whether to tunnel to localhost (2.x AMIs) or the
                   master node's private IP
        name: "job tracker" or "resource manager"
        path: path to start page of job tracker/resource manager
        port: port job tracker/resource manager is running on.
        """
        return map_version(self.get_image_version(),
                           _IMAGE_VERSION_TO_SSH_TUNNEL_CONFIG)

    def _job_tracker_host(self):
        """The host of the job tracker/resource manager, from the master node.
        """
        tunnel_config = self._ssh_tunnel_config()

        if tunnel_config['localhost']:
            # Issue #1311: on the 2.x AMIs, we want to tunnel to the job
            # tracker on localhost; otherwise it won't
            # work on some VPC setups.
            return 'localhost'
        else:
            # Issue #1397: on the 3.x and 4.x AMIs we want to tunnel to the
            # resource manager on the master node's *internal* IP; otherwise
            # it won't work on some VPC setups
            return self._master_private_ip()

    def _ssh_tunnel_args(self, bind_port):
        for opt_name in ('ec2_key_pair', 'ec2_key_pair_file',
                         'ssh_bind_ports'):
            if not self._opts[opt_name]:
                log.warning(
                    " You must set %s in order to set up the SSH tunnel!"
                    % opt_name)
                self._give_up_on_ssh_tunnel = True
                return

        host = self._address_of_master()
        if not host:
            return

        return self._ssh_bin() + [
            '-o', 'VerifyHostKeyDNS=no',
            '-o', 'StrictHostKeyChecking=no',
            '-o', 'ExitOnForwardFailure=yes',
            '-o', 'UserKnownHostsFile=%s' % os.devnull,
        ] + self._ssh_tunnel_opts(bind_port) + [
            '-i', self._opts['ec2_key_pair_file'],
            'hadoop@%s' % host,
        ]

    def _ssh_hadoop_bin(self):
        if not self._opts['ec2_key_pair_file']:
            return []

        host = self._address_of_master()
        if not host:
            return []

        return self._ssh_bin() + [
            '-o', 'VerifyHostKeyDNS=no',
            '-o', 'StrictHostKeyChecking=no',
            '-o', 'ExitOnForwardFailure=yes',
            '-o', 'UserKnownHostsFile=%s' % os.devnull,
            '-i', self._opts['ec2_key_pair_file'],
            '-q',  # don't care about SSH warnings, we just want hadoop
            'hadoop@%s' % host,
            'hadoop',
        ]

    def _job_tracker_url(self):
        """Not actually used to set up the SSH tunnel; used to run curl
        over SSH to fetch from the job tracker directly."""
        tunnel_config = self._ssh_tunnel_config()

        return 'http://%s:%d%s' % (
            self._job_tracker_host(),
            tunnel_config['port'],
            tunnel_config['path'])
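
    # Illustrative example (not from the original source): on a 4.x+ AMI
    # with master private IP 10.0.0.1, this builds:
    #
    #   'http://10.0.0.1:8088/cluster'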

    ### Running the job ###

    def cleanup(self, mode=None):
        super(EMRJobRunner, self).cleanup(mode=mode)

        # always stop our SSH tunnel if it's still running
        self._kill_ssh_tunnel()

        # stop the cluster if it belongs to us (it may have stopped on its
        # own already, but that's fine)
        # don't stop it if it was created due to --pool because the user
        # probably wants to use it again
        if self._cluster_id and not self._opts['cluster_id'] \
                and not self._opts['pool_clusters']:
            log.info('Terminating cluster: %s' % self._cluster_id)
            try:
                self.make_emr_client().terminate_job_flows(
                    JobFlowIds=[self._cluster_id]
                )
            except Exception as e:
                log.exception(e)

        # TODO: otherwise, cancel any steps we submitted (#1570)

    def _cleanup_cloud_tmp(self):
        # delete all the files we created on S3
        if self._cloud_tmp_dir:
            try:
                log.info('Removing s3 temp directory %s...' %
                         self._cloud_tmp_dir)
                self.fs.rm(self._cloud_tmp_dir)
                self._cloud_tmp_dir = None
            except Exception as e:
                log.exception(e)

    def _cleanup_logs(self):
        super(EMRJobRunner, self)._cleanup_logs()

        # delete the log files, if it's a cluster we created (the logs
        # belong to the cluster)
        if self._s3_log_dir() and not self._opts['cluster_id'] \
                and not self._opts['pool_clusters']:
            try:
                log.info('Removing log files in %s...' % self._s3_log_dir())
                self.fs.rm(self._s3_log_dir())
            except Exception as e:
                log.exception(e)

    def _cleanup_cluster(self):
        if not self._cluster_id:
            # If we don't have a cluster, then we can't terminate it.
            return

        emr_client = self.make_emr_client()
        try:
            log.info("Attempting to terminate cluster")
            emr_client.terminate_job_flows(
                JobFlowIds=[self._cluster_id]
            )
        except Exception as e:
            # Something happened with boto3 and the user should know.
            log.exception(e)
            return
        log.info('Cluster %s successfully terminated' % self._cluster_id)

    def _wait_for_s3_eventual_consistency(self):
        """Sleep for a little while, to give S3 a chance to sync up.
        """
        log.debug('Waiting %.1fs for S3 eventual consistency...' %
                  self._opts['cloud_fs_sync_secs'])
        time.sleep(self._opts['cloud_fs_sync_secs'])

    def _wait_for_cluster_to_terminate(self, cluster=None):
        if not cluster:
            cluster = self._describe_cluster()

        log.info('Waiting for cluster (%s) to terminate...' %
                 cluster['Id'])

        if (cluster['Status']['State'] == 'WAITING' and
                cluster['AutoTerminate']):
            raise Exception('Operation requires cluster to terminate, but'
                            ' it may never do so.')

        while True:
            log.info(' %s' % cluster['Status']['State'])

            if cluster['Status']['State'] in (
                    'TERMINATED', 'TERMINATED_WITH_ERRORS'):
                return

            time.sleep(self._opts['check_cluster_every'])
            cluster = self._describe_cluster()

    # instance types

    def _instance_type(self, role):
        """What instance type should we use for the given role?
        (one of 'MASTER', 'CORE', 'TASK')"""
        if role not in _INSTANCE_ROLES:
            raise ValueError

        # explicitly set
        if self._opts[role.lower() + '_instance_type']:
            return self._opts[role.lower() + '_instance_type']

        elif self._instance_is_worker(role):
            # using *instance_type* here is defensive programming;
            # if set, it should have already been popped into the worker
            # instance type option(s) by _fix_instance_opts() above
            return self._opts['instance_type'] or self._default_instance_type()
        else:
            return self._default_instance_type()

    def _default_instance_type(self):
        """Default instance type if not set by the user."""
        # m5.xlarge is available in all regions, but only works on AMI
        # 5.13.0 or later. See #2098.
        if self._image_version_gte('5.13.0'):
            return 'm5.xlarge'
        else:
            return 'm4.large'

    def _instance_is_worker(self, role):
        """Do instances of the given role run tasks? True for non-master
        instances, and for the master instance when it's the only instance
        (a single-node cluster)."""
        if role not in _INSTANCE_ROLES:
            raise ValueError

        return (role != 'MASTER' or
                sum(self._num_instances(role)
                    for role in _INSTANCE_ROLES) == 1)

    def _num_instances(self, role):
        """How many instances of the given role do we want?"""
        if role not in _INSTANCE_ROLES:
            raise ValueError

        if role == 'MASTER':
            return 1  # there can be only one
        else:
            return self._opts['num_' + role.lower() + '_instances']

    def _instance_bid_price(self, role):
        """What's the bid price for the given role (if any)?"""
        if role not in _INSTANCE_ROLES:
            raise ValueError

        return self._opts[role.lower() + '_instance_bid_price']

    def _instance_groups(self):
        """Which instance groups do we want to request?

        Returns the value of the ``InstanceGroups`` parameter
        passed to the EMR API.
        """
        if self._opts['instance_groups']:
            return self._opts['instance_groups']

        return [
            _build_instance_group(
                role=role,
                instance_type=self._instance_type(role),
                num_instances=self._num_instances(role),
                bid_price=self._instance_bid_price(role),
            )
            for role in _INSTANCE_ROLES
            if self._num_instances(role)
        ]
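
    # Illustrative sketch (not from the original source): the EMR API's
    # InstanceGroups parameter is a list of dicts roughly like this (the
    # exact output of _build_instance_group() may differ):
    #
    #   [{'InstanceRole': 'MASTER', 'InstanceType': 'm5.xlarge',
    #     'InstanceCount': 1, 'Market': 'ON_DEMAND'},
    #    {'InstanceRole': 'CORE', 'InstanceType': 'm5.xlarge',
    #     'InstanceCount': 4, 'Market': 'SPOT', 'BidPrice': '0.25'}]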

    def _create_cluster(self, persistent=False):
        """Create an empty cluster on EMR, and return its ID.

        If the ``tags`` option is set, also tags the cluster (which
        is a separate API call).

        persistent -- if this is true, create the cluster with the keep_alive
        option, indicating the job will have to be manually terminated.
        """
        log.debug('Creating Elastic MapReduce cluster')
        emr_client = self.make_emr_client()

        kwargs = self._cluster_kwargs(persistent)
        log.debug('Calling run_job_flow(%s)' % (
            ', '.join('%s=%r' % (k, v)
                      for k, v in sorted(kwargs.items()))))
        cluster_id = emr_client.run_job_flow(**kwargs)['JobFlowId']

        log.info('Created new cluster %s' % cluster_id)

        # set EMR tags for the cluster
        tags = dict(self._opts['tags'])

        # patch in version
        tags['__mrjob_version'] = mrjob.__version__

        # patch in cluster label and owner
        tags['__mrjob_label'] = self._label()
        tags['__mrjob_owner'] = self._owner()

        # add pooling tags
        if self._opts['pool_clusters']:
            tags['__mrjob_pool_hash'] = self._pool_hash()
            tags['__mrjob_pool_name'] = self._opts['pool_name']

        self._add_tags(tags, cluster_id)

        return cluster_id

    def _add_tags(self, tags, cluster_id):
        """Add tags in the dict *tags* to cluster *cluster_id*. Do nothing
        if *tags* is empty or ``None``"""
        if not tags:
            return

        tags_items = sorted(tags.items())

        self.make_emr_client().add_tags(
            ResourceId=cluster_id,
            Tags=[dict(Key=k, Value=v) for k, v in tags_items])

        log.info('Added EMR tags to cluster %s: %s' % (
            cluster_id,
            ', '.join('%s=%s' % (tag, value) for tag, value in tags_items)))

    # TODO: could break this into sub-methods for clarity
    def _cluster_kwargs(self, persistent=False):
        """Build kwargs for emr_client.run_job_flow()"""
        kwargs = {}

        kwargs['Name'] = self._job_key + self._cluster_name_pooling_suffix()

        kwargs['LogUri'] = self._opts['cloud_log_dir']

        if self._opts['release_label']:
            kwargs['ReleaseLabel'] = self._opts['release_label']
        else:
            kwargs['AmiVersion'] = self._opts['image_version']

        if self._opts['image_id']:
            kwargs['CustomAmiId'] = self._opts['image_id']

        # capitalizing Instances because it's just an API parameter
        kwargs['Instances'] = Instances = {}

        if self._opts['zone']:
            Instances['Placement'] = dict(AvailabilityZone=self._opts['zone'])

        if self._opts['instance_fleets']:
            Instances['InstanceFleets'] = self._opts['instance_fleets']
        else:
            Instances['InstanceGroups'] = self._instance_groups()

        # EBS root volume size
        if self._opts['ebs_root_volume_gb']:
            kwargs['EbsRootVolumeSize'] = self._opts['ebs_root_volume_gb']

        # bootstrap actions
        kwargs['BootstrapActions'] = BootstrapActions = []

        for i, bootstrap_action in enumerate(self._bootstrap_actions()):
            uri = self._upload_mgr.uri(bootstrap_action['path'])
            BootstrapActions.append(dict(
                Name=('action %d' % i),
                ScriptBootstrapAction=dict(
                    Path=uri,
                    Args=bootstrap_action['args'])))

        if self._master_bootstrap_script_path:
            uri = self._upload_mgr.uri(self._master_bootstrap_script_path)

            BootstrapActions.append(dict(
                Name='master',
                ScriptBootstrapAction=dict(
                    Path=uri,
                    Args=[])))

        if persistent or self._opts['pool_clusters']:
            Instances['KeepJobFlowAliveWhenNoSteps'] = True

            # use idle termination script on persistent clusters
            # add it last, so that we don't count bootstrapping as idle time
            uri = self._upload_mgr.uri(
                _MAX_MINS_IDLE_BOOTSTRAP_ACTION_PATH)

            # script takes args in (integer) seconds
            ba_args = [str(int(self._opts['max_mins_idle'] * 60))]
            BootstrapActions.append(dict(
                Name='idle timeout',
                ScriptBootstrapAction=dict(
                    Path=uri,
                    Args=ba_args)))

        if self._opts['ec2_key_pair']:
            Instances['Ec2KeyName'] = self._opts['ec2_key_pair']

        kwargs['Steps'] = Steps = []

        kwargs['StepConcurrencyLevel'] = self._opts['max_concurrent_steps']

        if self._opts['enable_emr_debugging']:
            # other steps are added separately
            Steps.append(self._build_debugging_step())

        if self._opts['additional_emr_info']:
            kwargs['AdditionalInfo'] = self._opts['additional_emr_info']

        kwargs['VisibleToAllUsers'] = True

        kwargs['JobFlowRole'] = self._instance_profile()
        kwargs['ServiceRole'] = self._service_role()

        applications = self._applications()
        if applications:
            kwargs['Applications'] = [
                dict(Name=a) for a in sorted(applications)]

        emr_configurations = self._emr_configurations()
        if emr_configurations:
            kwargs['Configurations'] = emr_configurations

        if self._opts['subnet']:
            # handle lists of subnets (for instance fleets)
            if isinstance(self._opts['subnet'], list):
                Instances['Ec2SubnetIds'] = self._opts['subnet']
            else:
                Instances['Ec2SubnetId'] = self._opts['subnet']

        return self._add_extra_cluster_params(kwargs)
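
    # Illustrative sketch (not from the original source): a trimmed-down
    # version of what run_job_flow() might receive from the method above:
    #
    #   {'Name': 'mr_my_job.user.20200101.000000.000000',
    #    'ReleaseLabel': 'emr-6.0.0',
    #    'LogUri': 's3://mrjob-.../tmp/logs/',
    #    'Instances': {'InstanceGroups': [...]},
    #    'BootstrapActions': [...],
    #    'Steps': [],
    #    'StepConcurrencyLevel': 1,
    #    'VisibleToAllUsers': True,
    #    'JobFlowRole': ...,
    #    'ServiceRole': ...}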

    def _instance_profile(self):
        try:
            return (self._opts['iam_instance_profile'] or
                    get_or_create_mrjob_instance_profile(
                        self.make_iam_client()))
        except botocore.exceptions.ClientError as ex:
            if _client_error_status(ex) != 403:
                raise
            log.warning(
                "Can't access IAM API, trying default instance profile: %s" %
                _FALLBACK_INSTANCE_PROFILE)
            return _FALLBACK_INSTANCE_PROFILE

    def _service_role(self):
        try:
            return (self._opts['iam_service_role'] or
                    get_or_create_mrjob_service_role(self.make_iam_client()))
        except botocore.exceptions.ClientError as ex:
            if _client_error_status(ex) != 403:
                raise
            log.warning(
                "Can't access IAM API, trying default service role: %s" %
                _FALLBACK_SERVICE_ROLE)
            return _FALLBACK_SERVICE_ROLE

    def _action_on_failure(self):
        # don't terminate other people's clusters
        if (self._opts['emr_action_on_failure']):
            return self._opts['emr_action_on_failure']
        elif not self._add_steps_in_batch():
            # clusters with concurrent steps don't allow CANCEL_AND_WAIT
            return 'CONTINUE'
        elif (self._opts['cluster_id'] or
                self._opts['pool_clusters']):
            return 'CANCEL_AND_WAIT'
        else:
            return 'TERMINATE_CLUSTER'

    def _add_steps_in_batch(self):
        if self._opts['add_steps_in_batch'] is None:
            # by default, add steps in batch only when concurrent steps
            # are not possible
            return not self._image_version_gte('5.28.0')
        else:
            return self._opts['add_steps_in_batch']

    def _steps_to_submit(self):
        """Return a list of step data structures to pass to ``boto3``"""
        # quick, add the other steps before the job spins up and
        # then shuts itself down! (in practice that won't happen
        # for several minutes)
        steps = []

        if self._master_node_setup_script_path:
            steps.append(self._build_master_node_setup_step())

        for n in range(self._num_steps()):
            steps.append(self._build_step(n))

        return steps

    def _build_step(self, step_num):
        step = self._get_step(step_num)

        if step['type'] == 'streaming':
            method = self._streaming_step_hadoop_jar_step
        elif step['type'] == 'jar':
            method = self._jar_step_hadoop_jar_step
        elif _is_spark_step_type(step['type']):
            method = self._spark_step_hadoop_jar_step
        else:
            raise ValueError('Bad step type: %r' % (step['type'],))

        hadoop_jar_step = method(step_num)

        return dict(
            ActionOnFailure=self._action_on_failure(),
            HadoopJarStep=hadoop_jar_step,
            Name=self._step_name(step_num),
        )

    def _streaming_step_hadoop_jar_step(self, step_num):
        jar, step_arg_prefix = self._get_streaming_jar_and_step_arg_prefix()

        args = (step_arg_prefix +
                self._hadoop_streaming_jar_args(step_num))

        return dict(Jar=jar, Args=args)

    def _jar_step_hadoop_jar_step(self, step_num):
        step = self._get_step(step_num)

        jar = self._upload_uri_or_remote_path(step['jar'])

        args = (
            self._interpolate_jar_step_args(step['args'], step_num))

        hadoop_jar_step = dict(Jar=jar, Args=args)

        if step.get('main_class'):
            hadoop_jar_step['MainClass'] = step['main_class']

        return hadoop_jar_step

    def _spark_step_hadoop_jar_step(self, step_num):
        return dict(
            Jar=self._spark_jar(),
            Args=self._args_for_spark_step(step_num))

    def _interpolate_spark_script_path(self, path):
        if path in self._working_dir_mgr.paths():
            return self._dest_in_wd_mirror(
                path, self._working_dir_mgr.name('file', path)) or path
        else:
            return self._upload_mgr.uri(path)

    def _find_spark_submit_bin(self):
        if version_gte(self.get_image_version(), '4'):
            return ['spark-submit']
        else:
            return [_3_X_SPARK_SUBMIT]

    def _spark_master(self):
        # hard-coded for EMR
        return 'yarn'

    def _spark_deploy_mode(self):
        # hard-coded for EMR; otherwise it can't access S3
        return 'cluster'

    def _spark_jar(self):
        if version_gte(self.get_image_version(), '4'):
            return _4_X_COMMAND_RUNNER_JAR
        else:
            return self._script_runner_jar_uri()

    def _step_name(self, step_num):
        """Return something like: ``'mr_your_job: Step X of Y'``"""
        return '%s: Step %d of %d' % (
            self._job_key, step_num + 1, self._num_steps())

    def _upload_uri_or_remote_path(self, path):
        """Return where *path* will be uploaded, or, if it starts with
        ``'file:///'``, a local path."""
        if path.startswith('file:///'):
            return path[7:]  # keep leading slash
        else:
            return self._upload_mgr.uri(path)
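
    # Illustrative example (not from the original source): slicing off the
    # first 7 characters of a file:// URI keeps the leading slash:
    #
    #   >>> 'file:///usr/lib/hadoop/some.jar'[7:]
    #   '/usr/lib/hadoop/some.jar'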

    def _build_master_node_setup_step(self):
        name = '%s: Master node setup' % self._job_key
        jar = self._script_runner_jar_uri()
        step_args = [self._upload_mgr.uri(self._master_node_setup_script_path)]

        return dict(
            Name=name,
            ActionOnFailure=self._action_on_failure(),
            HadoopJarStep=dict(
                Jar=jar,
                Args=step_args,
            )
        )

    def _libjar_paths(self):
        results = []

        # libjars should be in the working dir of the master node setup
        # script path, unless they refer to paths directly (file:///)
        for path in self._opts['libjars']:
            if path.startswith('file:///'):
                results.append(path[7:])  # keep leading slash
            else:
                results.append(posixpath.join(
                    self._master_node_setup_working_dir(),
                    self._master_node_setup_mgr.name('file', path)))

        return results

    def _get_streaming_jar_and_step_arg_prefix(self):
        if self._opts['hadoop_streaming_jar']:
            if self._opts['hadoop_streaming_jar'].startswith('file://'):
                # special case: jar is already on EMR
                # relative paths are OK (though maybe not useful)
                return self._opts['hadoop_streaming_jar'][7:], []
            else:
                return self._upload_mgr.uri(
                    self._opts['hadoop_streaming_jar']), []
        elif version_gte(self.get_image_version(), '4'):
            # 4.x AMIs use an intermediary jar
            return _4_X_COMMAND_RUNNER_JAR, ['hadoop-streaming']
        else:
            # 2.x and 3.x AMIs just use a regular old streaming jar
            return _PRE_4_X_STREAMING_JAR, []
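
    # Illustrative examples (not from the original source):
    #
    #   4.x+ AMI: ('command-runner.jar', ['hadoop-streaming'])
    #   3.x AMI:  ('/home/hadoop/contrib/streaming/hadoop-streaming.jar', [])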

    def _launch_emr_job(self):
        """Find a cluster to run on (creating one if necessary), set
        self._cluster_id, and submit our first step(s).
        """
        # step concurrency level of a cluster we added steps to, used
        # for locking
        step_concurrency_level = None

        # try to find a cluster from the pool. basically auto-fill
        # 'cluster_id' if possible and then follow normal behavior.
        if (self._opts['pool_clusters'] and not self._cluster_id):
            cluster_id, step_concurrency_level = self._find_cluster()
            if cluster_id:
                self._cluster_id = cluster_id
                self._locked_cluster = True

        # create a cluster if we're not already using an existing one
        if not self._cluster_id:
            self._cluster_id = self._create_cluster()
            self._created_cluster = True
        else:
            log.info('Adding our job to existing cluster %s' %
                     self._cluster_id)
        self._log_address_of_master_once()

        # now that we know which cluster it is, check for Spark support
        if self._has_spark_steps():
            self._check_cluster_spark_support()

        # define our steps
        steps = self._steps_to_submit()

        if self._add_steps_in_batch():
            self._add_steps_to_cluster(steps)
        else:
            # later steps will be added one at a time
            self._add_steps_to_cluster(steps[:1])

        # if we locked a cluster with concurrent steps, we can release
        # the lock immediately
        if step_concurrency_level and step_concurrency_level > 1:
            self._release_cluster_lock()

        # learn about how fast the cluster state switches
        cluster = self._describe_cluster()
        log.debug('Cluster has state %s' % cluster['Status']['State'])

        # SSH FS uses sudo if we're on AMI 4.3.0+ (see #1244)
        if hasattr(self.fs, 'ssh') and version_gte(
                self.get_image_version(), '4.3.0'):
            self.fs.ssh.use_sudo_over_ssh()

    def _release_cluster_lock(self):
        if not self._locked_cluster:
            return

        emr_client = self.make_emr_client()

        log.info(' releasing cluster lock')
        # this can fail, but usually it's because the cluster
        # started terminating, so only try releasing the lock once
        _attempt_to_unlock_cluster(emr_client, self._cluster_id)
        self._locked_cluster = False

    def _add_steps_to_cluster(self, steps):
        """Add steps (from _steps_to_submit()) to our cluster and append
        their IDs to self._step_ids"""
        emr_client = self.make_emr_client()

        steps_kwargs = dict(JobFlowId=self._cluster_id, Steps=steps)
        log.debug('Calling add_job_flow_steps(%s)' % ','.join(
            ('%s=%r' % (k, v)) for k, v in steps_kwargs.items()))
        step_ids = emr_client.add_job_flow_steps(**steps_kwargs)['StepIds']
        self._step_ids.extend(step_ids)

    def get_job_steps(self):
        """Fetch the steps submitted by this runner from the EMR API.

        .. deprecated:: 0.7.4

        .. versionadded:: 0.6.1
        """
        log.warning(
            'get_job_steps() is deprecated and will be removed in v0.8.0')

        return _get_job_steps(
            self.make_emr_client(), self.get_cluster_id(), self.get_job_key())

    def _wait_for_steps_to_complete(self):
        """Wait for every step of the job to complete, one by one."""
        steps = self._steps_to_submit()

        # clear out log interpretations if they were filled somehow
        self._log_interpretations = []
        self._mns_log_interpretation = None

        # open SSH tunnel if cluster is already ready
        # (this happens with pooling). See #1115
        cluster = self._describe_cluster()
        if cluster['Status']['State'] in ('RUNNING', 'WAITING'):
            self._set_up_ssh_tunnel_and_hdfs()

        for i, step in enumerate(steps):
            # if our step isn't already submitted, submit it
            if len(self._step_ids) <= i:
                self._add_steps_to_cluster(
                    steps[len(self._step_ids):i + 1])

            step_id = self._step_ids[i]
            step_name = step['Name'].split(': ')[-1]

            # the master node setup script is treated as step -1
            if self._master_node_setup_script_path:
                step_num = i - 1
            else:
                step_num = i

            log.info('Waiting for %s (%s) to complete...' %
                     (step_name, step_id))

            self._wait_for_step_to_complete(step_id, step_num)

    def _wait_for_step_to_complete(self, step_id, step_num=None):
        """Helper for _wait_for_steps_to_complete(). Wait for the
        step with the given ID to complete, and fetch counters.
        If it fails, attempt to diagnose the error, and raise an
        exception.

        :param step_id: the s-XXXXXXX step ID on EMR
        :param step_num: which step this is out of the steps
                         belonging to our job (0-indexed). Master node
                         setup script, if there is one, is step -1

        This also adds an item to self._log_interpretations or sets
        self._mns_log_interpretation
        """
        log_interpretation = dict(step_id=step_id)

        # suppress warnings about missing job ID for script-runner.jar
        if step_num == -1:
            log_interpretation['no_job'] = True
            self._mns_log_interpretation = log_interpretation
        else:
            self._log_interpretations.append(log_interpretation)

        emr_client = self.make_emr_client()

        while True:
            # don't antagonize EMR's throttling
            log.debug('Waiting %.1f seconds...' %
                      self._opts['check_cluster_every'])
            time.sleep(self._opts['check_cluster_every'])

            # log address of the master node once if we have it
            self._log_address_of_master_once()

            step = emr_client.describe_step(
                ClusterId=self._cluster_id, StepId=step_id)['Step']

            if step['Status']['State'] == 'PENDING':
                cluster = self._describe_cluster()

                reason = _get_reason(cluster)
                reason_desc = (': %s' % reason) if reason else ''

                # we can open the ssh tunnel if cluster is ready (see #1115)
                if cluster['Status']['State'] in ('RUNNING', 'WAITING'):
                    self._set_up_ssh_tunnel_and_hdfs()

                log.info(' PENDING (cluster is %s%s)' % (
                    cluster['Status']['State'], reason_desc))
                continue

            elif step['Status']['State'] == 'RUNNING':

                time_running_desc = ''

                start = step['Status']['Timeline'].get('StartDateTime')
                if start:
                    time_running_desc = ' for %s' % strip_microseconds(
                        _boto3_now() - start)

                # now is the time to tunnel, if we haven't already
                self._set_up_ssh_tunnel_and_hdfs()
                log.info(' RUNNING%s' % time_running_desc)

                # don't log progress for master node setup step, because
                # it doesn't appear in job tracker
                if step_num >= 0:
                    self._log_step_progress()

                # it's safe to clean up our lock, cluster isn't WAITING
                self._release_cluster_lock()

                continue

            # we're done, will return at the end of this
            elif step['Status']['State'] == 'COMPLETED':
                log.info(' COMPLETED')
                # will fetch counters, below, and then return
            else:
                # step has failed somehow. *reason* seems to only be set
                # when job is cancelled (e.g. 'Job terminated')
                reason = _get_reason(step)
                reason_desc = (' (%s)' % reason) if reason else ''

                log.info(' %s%s' % (
                    step['Status']['State'], reason_desc))

                # print cluster status; this might give more context
                # on why the step didn't succeed
                cluster = self._describe_cluster()
                reason = _get_reason(cluster)
                reason_desc = (': %s' % reason) if reason else ''
                log.info('Cluster %s %s %s%s' % (
                    cluster['Id'],
                    'was' if 'ED' in cluster['Status']['State'] else 'is',
                    cluster['Status']['State'],
                    reason_desc))

                if cluster['Status']['State'] in (
                        'TERMINATING', 'TERMINATED',
                        'TERMINATED_WITH_ERRORS'):
                    # was it caused by a pooled cluster self-terminating?
                    # (if so, raise _PooledClusterSelfTerminatedException)
                    self._check_for_pooled_cluster_self_termination(
                        cluster, step)
                    # was it caused by IAM roles?
                    self._check_for_missing_default_iam_roles(cluster)
                    # was it because a bootstrap action failed?
                    self._check_for_failed_bootstrap_action(cluster)

            # spark steps require different log parsing. The master node
            # setup script is a JAR step (albeit one that never produces
            # counters)
            step_type = (
                self._get_step(step_num)['type'] if step_num >= 0 else 'jar')

            # step is done (either COMPLETED, FAILED, or INTERRUPTED), so
            # try to fetch counters (except for master node setup and
            # Spark, which have no counters)
            if step['Status']['State'] != 'CANCELLED' and step_num >= 0:
                self._log_counters(log_interpretation, step_num)

            if step['Status']['State'] == 'COMPLETED':
                return

            if step['Status']['State'] == 'FAILED':
                error = self._pick_error(log_interpretation, step_type)
                if error:
                    _log_probable_cause_of_failure(log, error)

            raise StepFailedException(
                step_num=step_num, num_steps=self._num_steps(),
                # "Step 0 of ... failed" looks weird
                step_desc=(
                    'Master node setup step' if step_num == -1 else None))
|
|
|
|
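    # Note on the polling loop above (explanatory comment, not from the
    # original source): an EMR step moves PENDING -> RUNNING -> one of
    # COMPLETED / FAILED / CANCELLED / INTERRUPTED. Only COMPLETED returns
    # normally; every other terminal state ends in StepFailedException
    # (or _PooledClusterSelfTerminatedException when a pooled cluster shut
    # itself down underneath us).
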
    def _log_address_of_master_once(self):
        """Log the master node's public DNS, if we haven't already"""
        # Some users like to SSH in manually. See #2007
        if not self._cluster_id:
            return

        if self._cluster_id in self._logged_address_of_master:
            return

        master_dns = self._address_of_master()

        if not master_dns:
            return

        log.info(' master node is %s' % master_dns)
        self._logged_address_of_master.add(self._cluster_id)

    def _log_step_progress(self):
        """Tunnel to the job tracker/resource manager and log the
        progress of the current step.

        (This takes no arguments; we just assume the most recent running
        job is ours, which should be correct for EMR.)
        """
        progress_html = (self._progress_html_from_tunnel() or
                         self._progress_html_over_ssh())
        if not progress_html:
            return

        tunnel_config = self._ssh_tunnel_config()

        if tunnel_config['name'] == 'job tracker':
            map_progress, reduce_progress = (
                _parse_progress_from_job_tracker(progress_html))
            if map_progress is not None:
                log.info(' map %3d%% reduce %3d%%' % (
                    map_progress, reduce_progress))
        else:
            progress = _parse_progress_from_resource_manager(
                progress_html)
            if progress is not None:
                log.info(' %5.1f%% complete' % progress)

    def _progress_html_from_tunnel(self):
        """Fetch progress by calling :py:func:`urlopen` on our ssh tunnel, or
        return ``None``."""
        if not self._ssh_tunnel_url:
            return None

        tunnel_config = self._ssh_tunnel_config()
        log.debug(' Fetching progress from %s at %s' % (
            tunnel_config['name'], self._ssh_tunnel_url))

        tunnel_handle = None
        try:
            tunnel_handle = urlopen(self._ssh_tunnel_url)
            return tunnel_handle.read()
        except Exception as e:
            log.debug(' failed: %s' % str(e))
            return None
        finally:
            if tunnel_handle:
                tunnel_handle.close()

    def _progress_html_over_ssh(self):
        """Fetch progress by running :command:`curl` over SSH, or return
        ``None``"""
        host = self._address_of_master()

        if not self._opts['ec2_key_pair_file']:
            return None

        if not host:
            return None

        tunnel_config = self._ssh_tunnel_config()
        remote_url = self._job_tracker_url()

        log.debug(' Fetching progress from %s over SSH' % (
            tunnel_config['name']))

        try:
            stdout, _ = self.fs.ssh._ssh_run(host, ['curl', remote_url])
            return stdout
        except Exception as e:
            log.debug(' failed: %s' % str(e))

        return None

    def _check_for_pooled_cluster_self_termination(self, cluster, step):
        """If failure could have been due to a pooled cluster
        self-terminating, raise _PooledClusterSelfTerminatedException"""
        # this check might not even be relevant
        if not self._opts['pool_clusters']:
            return

        if self._opts['cluster_id']:
            return

        # if a cluster we created self-terminated, something is wrong with
        # the way self-termination is set up (e.g. very low idle time)
        if self._created_cluster:
            return

        # don't check for max_mins_idle because it's possible to
        # join a self-terminating cluster without having max_mins_idle set
        # on this runner (pooling only cares about the master bootstrap
        # script, not other bootstrap actions)

        # our step should be CANCELLED (not failed)
        if step['Status']['State'] != 'CANCELLED':
            return

        # we *could* check if the step had a chance to start by checking if
        # step.status.timeline.startdatetime is set. This shouldn't happen in
        # practice, and if it did, we'd still be fine as long as the script
        # didn't write data to the output dir, so it's not worth the extra
        # code.

        # cluster should have stopped because master node failed
        # could also check for
        # cluster.status.statechangereason.code == 'INSTANCE_FAILURE'
        if not _CLUSTER_SELF_TERMINATED_RE.match(_get_reason(cluster)):
            return

        log.info('Pooled cluster self-terminated, trying again...')
        raise _PooledClusterSelfTerminatedException

    def _check_for_missing_default_iam_roles(self, cluster):
        """If cluster couldn't start due to missing IAM roles, tell
        user what to do."""
        if not cluster:
            cluster = self._describe_cluster()

        reason = _get_reason(cluster)
        if any(reason.endswith('/%s is invalid' % role)
               for role in (_FALLBACK_INSTANCE_PROFILE,
                            _FALLBACK_SERVICE_ROLE)):
            log.warning(
                '\n'
                'Ask your admin to create the default EMR roles'
                ' by following:\n\n'
                ' http://docs.aws.amazon.com/ElasticMapReduce/latest'
                '/DeveloperGuide/emr-iam-roles-creatingroles.html\n')

    def _default_step_output_dir(self):
        # put intermediate data in HDFS
        return 'hdfs:///tmp/mrjob/%s/step-output' % self._job_key

    ### LOG PARSING (implementation of LogInterpretationMixin) ###

    def _check_for_failed_bootstrap_action(self, cluster):
        """If our bootstrap actions failed, parse the stderr to find
        out why."""
        reason = _get_reason(cluster)
        action_num_and_node_id = _check_for_nonzero_return_code(reason)
        if not action_num_and_node_id:
            return

        if not self._read_logs():
            return

        # this doesn't really correspond to a step, so
        # don't bother storing it in self._log_interpretations
        bootstrap_interpretation = _interpret_emr_bootstrap_stderr(
            self.fs, self._ls_bootstrap_stderr_logs(**action_num_and_node_id))

        # should be 0 or 1 errors, since we're checking a single stderr file
        if bootstrap_interpretation.get('errors'):
            error = bootstrap_interpretation['errors'][0]
            _log_probable_cause_of_failure(log, error)

    def _ls_bootstrap_stderr_logs(self, action_num=None, node_id=None):
        """_ls_bootstrap_stderr_logs(), with logging for each log we parse."""
        if not self._read_logs():
            return

        for match in _ls_emr_bootstrap_stderr_logs(
                self.fs,
                self._stream_bootstrap_log_dirs(
                    action_num=action_num, node_id=node_id),
                action_num=action_num,
                node_id=node_id):
            log.info(' Parsing bootstrap stderr log: %s' % match['path'])
            yield match

    def _stream_bootstrap_log_dirs(self, action_num=None, node_id=None):
        """Stream a single directory on S3 containing the relevant bootstrap
        stderr. Optionally, use *action_num* and *node_id* to narrow it down
        further.
        """
        if action_num is None or node_id is None:
            s3_dir_name = 'node'
        else:
            s3_dir_name = posixpath.join(
                'node', node_id, 'bootstrap-actions', str(action_num + 1))

        # dir_name=None means don't try to SSH in.
        #
        # TODO: If the failure is on the master node, we could just look in
        # /mnt/var/log/bootstrap-actions. However, if it's on a worker node,
        # we'd have to look up its internal IP using the ListInstances
        # API call. This *would* be a bit faster though. See #1346.
        return self._stream_log_dirs(
            'bootstrap logs',
            dir_name=None,  # don't SSH in
            s3_dir_name=s3_dir_name)

    def _stream_history_log_dirs(self, output_dir=None):
        """Yield lists of directories to look for the history log in."""

        if version_gte(self.get_image_version(), '4'):
            hdfs_dir_name = 'history'
            # on 4.0.0 (and possibly other versions before 4.3.0)
            # history logs aren't on the filesystem. See #1253
            dir_name = 'hadoop-mapreduce/history'
            s3_dir_name = 'hadoop-mapreduce/history'
        elif version_gte(self.get_image_version(), '3'):
            # on the 3.x AMIs, the history log is on HDFS only
            # (not even S3)
            hdfs_dir_name = 'history'
            dir_name = None
            s3_dir_name = None
        else:
            # 2.x AMIs don't use YARN, so no point in checking HDFS
            hdfs_dir_name = None
            dir_name = 'hadoop/history'
            s3_dir_name = 'jobs'

        return self._stream_log_dirs(
            'history log',
            hdfs_dir_name=hdfs_dir_name,
            dir_name=dir_name,
            s3_dir_name=s3_dir_name)

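    # Explanatory summary (not from the original source) of the AMI
    # branches above:
    #
    #   AMI    HDFS dir   local (SSH) dir            S3 dir
    #   4.x+   history    hadoop-mapreduce/history   hadoop-mapreduce/history
    #   3.x    history    (none)                     (none)
    #   2.x    (none)     hadoop/history             jobs
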
    def _stream_task_log_dirs(self, application_id=None, output_dir=None):
        """Get lists of directories to look for the task logs in."""
        if version_gte(self.get_image_version(), '4'):
            # denied access on some 4.x AMIs by the yarn user, see #1244
            dir_name = 'hadoop-yarn/containers'
            s3_dir_name = 'containers'
        else:
            dir_name = 'hadoop/userlogs'
            s3_dir_name = 'task-attempts'

        if application_id:
            dir_name = posixpath.join(dir_name, application_id)
            s3_dir_name = posixpath.join(s3_dir_name, application_id)

        return self._stream_log_dirs(
            'task logs',
            dir_name=dir_name,
            s3_dir_name=s3_dir_name,
            ssh_to_workers=True)  # TODO: does this make sense on YARN?

    def _get_step_log_interpretation(self, log_interpretation, step_type):
        """Fetch and interpret the step log."""
        step_id = log_interpretation.get('step_id')

        if not self._read_logs():
            return

        if not step_id:
            log.warning("Can't fetch step log; missing step ID")
            return

        if self._step_type_uses_spark(step_type):
            # Spark also has a "controller" log4j log, but it doesn't
            # contain errors or anything else we need
            #
            # the step log is unlikely to be very much help because
            # Spark on EMR runs in cluster mode. See #2056
            #
            # there's generally only one log (unless the job has been running
            # long enough for log rotation), so use partial=False
            return _interpret_spark_logs(
                self.fs, self._ls_step_stderr_logs(step_id=step_id),
                partial=False)
        else:
            return (
                _interpret_emr_step_syslog(
                    self.fs, self._ls_step_syslogs(step_id=step_id)) or
                _interpret_emr_step_stderr(
                    self.fs, self._ls_step_stderr_logs(step_id=step_id))
            )

    # _ls_step_*() methods are just helpers for _get_step_log_interpretation,
    # so not disabling them if self._read_logs() is false

    def _ls_step_syslogs(self, step_id):
        """Yield step log matches, logging a message for each one."""
        for match in _ls_emr_step_syslogs(
                self.fs, self._stream_step_log_dirs(step_id=step_id),
                step_id=step_id):
            log.info(' Parsing step log: %s' % match['path'])
            yield match

    def _ls_step_stderr_logs(self, step_id):
        """Yield step log matches, logging a message for each one."""
        for match in _ls_emr_step_stderr_logs(
                self.fs, self._stream_step_log_dirs(step_id=step_id),
                step_id=step_id):
            log.info(' Parsing step log: %s' % match['path'])
            yield match

    def _stream_step_log_dirs(self, step_id):
        """Get lists of directories to look for the step log in."""
        return self._stream_log_dirs(
            'step log',
            dir_name=posixpath.join('hadoop', 'steps', step_id),
            s3_dir_name=posixpath.join('steps', step_id))

    def _stream_log_dirs(self, log_desc, dir_name, s3_dir_name,
                         hdfs_dir_name=None,
                         ssh_to_workers=False):
        """Stream log dirs for any kind of log.

        Our general strategy is first, if SSH is enabled, to SSH into the
        master node (and possibly workers, if *ssh_to_workers* is set).

        If this doesn't work, we have to look on S3. If the cluster is
        TERMINATING, we first wait for it to terminate (since that
        will trigger copying logs over).
        """
        if not self._read_logs():
            return

        # used to fetch history logs off HDFS
        if (hdfs_dir_name and
                self.fs.can_handle_path(_DEFAULT_YARN_HDFS_LOG_DIR)):

            hdfs_log_dir = posixpath.join(
                _DEFAULT_YARN_HDFS_LOG_DIR, hdfs_dir_name)

            log.info('Looking for %s in %s...' % (log_desc, hdfs_log_dir))
            yield [hdfs_log_dir]

        if dir_name and self.fs.can_handle_path('ssh:///'):
            ssh_host = self._address_of_master()
            if ssh_host:
                hosts = [ssh_host]
                host_desc = ssh_host
                if ssh_to_workers:
                    try:
                        hosts.extend(self._ssh_worker_hosts())
                        host_desc += ' and task/core nodes'
                    except IOError:
                        log.warning('Could not get worker addresses for %s' %
                                    ssh_host)

                path = posixpath.join(_EMR_LOG_DIR, dir_name)
                log.info('Looking for %s in %s on %s...' % (
                    log_desc, path, host_desc))
                yield ['ssh://%s%s%s' % (
                    ssh_host, '!' + host if host != ssh_host else '',
                    path) for host in hosts]

        # wait for logs to be on S3
        self._wait_for_logs_on_s3()

        s3_dir_name = s3_dir_name or dir_name

        if s3_dir_name and self._s3_log_dir():
            cloud_log_dir = posixpath.join(self._s3_log_dir(), s3_dir_name)
            log.info('Looking for %s in %s...' % (log_desc, cloud_log_dir))
            yield [cloud_log_dir]

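    # Illustrative sketch (paths and IDs are hypothetical): for a step log,
    # the generator above may yield up to three lists of URIs, in order:
    #
    #   ['hdfs://.../history']                             # HDFS, if usable
    #   ['ssh://master-dns/mnt/var/log/hadoop/steps/s-X']  # then over SSH
    #   ['s3://bucket/logs/j-CLUSTERID/steps/s-X']         # finally S3
    #
    # so callers can try the cheapest source first.
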
    def _ssh_worker_hosts(self):
        """Get the hostnames of all currently running core and task nodes,
        so we can SSH to them through the master node and read their logs.

        (This currently returns IP addresses rather than full hostnames
        because they're shorter.)
        """
        emr_client = self.make_emr_client()

        instances = _boto3_paginate(
            'Instances', emr_client, 'list_instances',
            ClusterId=self._cluster_id,
            InstanceGroupTypes=['CORE', 'TASK'],
            InstanceStates=['RUNNING'])

        hosts = []

        for instance in instances:
            hosts.append(instance['PrivateIpAddress'])

        return hosts

    def _wait_for_logs_on_s3(self):
        """If the cluster is already terminating, wait for it to terminate,
        so that logs will be transferred to S3.

        Don't print anything unless cluster is in the TERMINATING state.
        """
        cluster = self._describe_cluster()

        if cluster['Status']['State'] in (
                'TERMINATED', 'TERMINATED_WITH_ERRORS'):
            return  # already terminated

        if cluster['Status']['State'] != 'TERMINATING':
            # going to need to wait for logs to get archived to S3

            # "step_num" is just a unique ID for the step; using -1
            # for master node setup script
            if (self._master_node_setup_script_path and
                    self._mns_log_interpretation is None):
                step_num = -1
            else:
                step_num = len(self._log_interpretations)

            # already did this for this step
            if step_num in self._waited_for_logs_on_s3:
                return

            try:
                log.info('Waiting %d minutes for logs to transfer to S3...'
                         ' (ctrl-c to skip)' % _S3_LOG_WAIT_MINUTES)

                if not self.fs.can_handle_path('ssh:///'):
                    log.info(
                        '\n'
                        'To fetch logs immediately next time, set up SSH.'
                        ' See:\n'
                        'https://pythonhosted.org/mrjob/guides'
                        '/emr-quickstart.html#configuring-ssh-credentials\n')

                time.sleep(60 * _S3_LOG_WAIT_MINUTES)
            except KeyboardInterrupt:
                pass

            # do this even if they ctrl-c'ed; don't make them do it
            # for every log for this step
            self._waited_for_logs_on_s3.add(step_num)
            return

        self._wait_for_cluster_to_terminate()

    def counters(self):
        # not using self._pick_counters() because we don't want to
        # initiate a log fetch
        return [_pick_counters(log_interpretation)
                for log_interpretation in self._log_interpretations]

    ### Bootstrapping ###

    def _bootstrap_python(self):
        """Return a (possibly empty) list of parsed commands (in the same
        format as returned by parse_setup_cmd())"""

        if PY2:
            # Python 2 and pip are basically already installed everywhere
            # (Okay, there's no pip on AMIs prior to 2.4.3, but there's no
            # longer an easy way to get it now that apt-get is broken.)
            return []

        # if bootstrap_python is None, install it for all AMIs up to 4.6.0,
        # and warn if it's an AMI before 3.7.0
        if self._opts['bootstrap_python'] or (
                self._opts['bootstrap_python'] is None and
                not self._image_version_gte('4.6.0')):

            # we need at least AMI 3.7.0. But give it a shot
            if not self._image_version_gte('3.7.0'):
                log.warning(
                    'bootstrapping Python 3 will probably not work on'
                    ' AMIs prior to 3.7.0. For an alternative, see:'
                    ' https://pythonhosted.org/mrjob/guides/emr-bootstrap'
                    '-cookbook.html#installing-python-from-source')

            return [[
                'sudo yum install -y python34 python34-devel python34-pip'
            ]]
        else:
            return []

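    # Explanatory note (not from the original source): with the default
    # bootstrap_python=None, the yum command above only runs on Python 3
    # when the AMI predates 4.6.0; bootstrap_python=True forces it, and on
    # pre-3.7.0 AMIs it will likely fail, hence the warning.
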
    def _should_bootstrap_spark(self):
        """Return *bootstrap_spark* option if set; otherwise return
        true if our job has Spark steps."""
        if self._opts['bootstrap_spark'] is None:
            return self._has_spark_steps()
        else:
            return bool(self._opts['bootstrap_spark'])

    def _applications(self, add_spark=True):
        """Returns applications (*applications* option) as a set. Adds
        in ``Hadoop`` and ``Spark`` as needed."""
        applications = set(self._opts['applications'])

        # release_label implies 4.x AMI and later
        if (add_spark and self._should_bootstrap_spark() and
                self._opts['release_label']):
            # EMR allows us to have both "spark" and "Spark" applications,
            # which is probably not what we want
            if not self._has_spark_application():
                applications.add('Spark')

        # patch in "Hadoop" unless applications is empty (e.g. 3.x AMIs)
        if applications:
            # don't add both "Hadoop" and "hadoop"
            if not any(a.lower() == 'hadoop' for a in applications):
                applications.add('Hadoop')

        return applications

    def _bootstrap_actions(self, add_spark=True):
        """Parse *bootstrap_actions* option into dictionaries with
        keys *path*, *args*, adding Spark bootstrap action if needed.

        (This doesn't handle the master bootstrap script.)
        """
        actions = list(self._opts['bootstrap_actions'])

        # no release_label implies AMIs prior to 4.x
        if (add_spark and self._should_bootstrap_spark() and
                not self._opts['release_label']):

            # running this action twice apparently breaks Spark's
            # ability to output to S3 (see #1367)
            if not self._has_spark_install_bootstrap_action():
                actions.append(_3_X_SPARK_BOOTSTRAP_ACTION)

        results = []
        for action in actions:
            args = shlex_split(action)
            if not args:
                raise ValueError('bad bootstrap action: %r' % (action,))

            results.append(dict(path=args[0], args=args[1:]))

        return results

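    # Illustrative example (the path is hypothetical): a bootstrap_actions
    # entry like
    #     's3://my-bucket/install.sh --flag value'
    # is shlex-split into
    #     dict(path='s3://my-bucket/install.sh', args=['--flag', 'value'])
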
    def _cp_to_local_cmd(self):
        """Command to copy files from the cloud to the local directory."""
        if self._opts['release_label']:
            # on the 4.x AMIs, hadoop isn't yet installed, so use AWS CLI
            return 'aws s3 cp'
        else:
            # on the 2.x and 3.x AMIs, use hadoop
            return 'hadoop fs -copyToLocal'

    def _manifest_download_commands(self):
        return [
            ('s3://*', 'aws s3 cp'),
            ('*://*', 'hadoop fs -copyToLocal'),
        ]

    ### master node setup script ###

    def _create_master_node_setup_script_if_needed(self):
        """Helper for :py:meth:`_add_bootstrap_files_for_upload`.

        If we need a master node setup script, write it into our local
        temp directory and set self._master_node_setup_script_path.
        """
        # already created
        if self._master_node_setup_script_path:
            return

        # currently, the only thing this script does is upload files
        if not self._master_node_setup_mgr.paths():
            return

        # create script
        path = os.path.join(self._get_local_tmp_dir(), 'mns.sh')
        contents = self._master_node_setup_script_content()

        self._write_script(contents, path, 'master node setup script')

        # the script itself doesn't need to be on the master node, just S3
        self._master_node_setup_script_path = path
        self._upload_mgr.add(path)

    def _master_node_setup_script_content(self):
        """Create the contents of the master node setup script as a
        list of strings.

        (prepare self._master_node_setup_mgr first)
        """
        # TODO: this is very similar to _master_bootstrap_script_content();
        # merge common code
        out = []

        # shebang, etc.
        for line in self._start_of_sh_script():
            out.append(line)
        out.append('')

        # run commands in a block so we can redirect stdout to stderr
        # (e.g. to catch errors from compileall). See #370
        out.append('{')

        # make working dir
        working_dir = self._master_node_setup_working_dir()
        out.append(' mkdir -p %s' % pipes.quote(working_dir))
        out.append(' cd %s' % pipes.quote(working_dir))
        out.append('')

        for name, path in sorted(
                self._master_node_setup_mgr.name_to_path('file').items()):
            uri = self._upload_mgr.uri(path)
            out.append(' %s %s %s' % (
                self._cp_to_local_cmd(), pipes.quote(uri), pipes.quote(name)))
            # imitate Hadoop Distributed Cache
            out.append(' chmod u+rx %s' % pipes.quote(name))

        # at some point we will probably run commands as well (see #1336)

        out.append('} 1>&2')  # stdout -> stderr for ease of error log parsing

        return out

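    # Illustrative sketch of the generated script (bucket, job key, and the
    # shebang from _start_of_sh_script() are hypothetical):
    #
    #   #!/bin/sh -ex
    #
    #   {
    #    mkdir -p /home/hadoop/mr_my_job.user.20200101.000000.000000
    #    cd /home/hadoop/mr_my_job.user.20200101.000000.000000
    #    aws s3 cp s3://my-bucket/tmp/data.db data.db
    #    chmod u+rx data.db
    #   } 1>&2
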
    def _master_node_setup_working_dir(self):
        """Where to place files used by the master node setup script."""
        return '/home/hadoop/%s' % self._job_key

    def _script_runner_jar_uri(self):
        return (
            's3://%s.elasticmapreduce/libs/script-runner/script-runner.jar' %
            self._opts['region'])

    def _build_debugging_step(self):
        if self._opts['release_label']:
            jar = _4_X_COMMAND_RUNNER_JAR
            args = ['state-pusher-script']
        else:
            jar = self._script_runner_jar_uri()
            # Args must be a list; script-runner.jar takes the URI of the
            # script to run as its argument
            args = [self._debug_script_uri()]

        return dict(
            Name='Setup Hadoop Debugging',
            HadoopJarStep=dict(Jar=jar, Args=args),
        )

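    # Illustrative result on a 4.x+ (release label) cluster, assuming
    # _4_X_COMMAND_RUNNER_JAR names EMR's command-runner jar:
    #
    #   dict(Name='Setup Hadoop Debugging',
    #        HadoopJarStep=dict(Jar='command-runner.jar',
    #                           Args=['state-pusher-script']))
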
    def _debug_script_uri(self):
        return (
            's3://%s.elasticmapreduce/libs/state-pusher/0.1/fetch' %
            self._opts['region'])

    ### EMR JOB MANAGEMENT UTILS ###

    def make_persistent_cluster(self):
        if self._cluster_id:
            raise ValueError(
                'This runner is already associated with cluster ID %s' %
                (self._cluster_id))

        log.info('Creating persistent cluster to run several jobs in...')

        self._add_bootstrap_files_for_upload(persistent=True)
        self._upload_local_files()

        # make sure we can see the files we copied to S3
        self._wait_for_s3_eventual_consistency()

        # don't allow user to call run()
        self._ran_job = True

        self._cluster_id = self._create_cluster(persistent=True)

        return self._cluster_id

    def get_cluster_id(self):
        """Get the ID of the cluster our job is running on, or ``None``."""
        return self._cluster_id

    def _yield_clusters_to_join(self, available_cluster_ids):
        """Yield pooled clusters that this runner can join, in the order
        given by *available_cluster_ids* (greatest CPU capacity first).

        Yields (cluster, when_cluster_described) so we can lock clusters
        that we wish to join (*cluster* is the cluster description and
        *when_cluster_described* is a unix timestamp).
        """
        emr_client = self.make_emr_client()

        for cluster_id in available_cluster_ids:
            if not self._cluster_has_adequate_capacity(cluster_id):
                continue

            # check other things about the cluster that we can't hash
            # (DescribeCluster)
            #
            # save cluster description so we can use it for locking
            when_cluster_described = time.time()
            cluster = emr_client.describe_cluster(
                ClusterId=cluster_id)['Cluster']

            if not self._cluster_description_matches(cluster):
                continue

            yield (cluster, when_cluster_described)

    def _list_cluster_ids_for_pooling(self, created_after=None):
        """Call ListClusters, and collect cluster IDs relevant to pooling.

        Optionally, only list clusters created after *created_after*.

        Returns a dictionary with the following keys:

        available: a list of IDs of clusters that we could join, based on
            their state and name suffix (pool name and hash, mrjob version).
            Sorted so that the cluster with the most CPU (based on
            NormalizedInstanceHours) goes first
        matching: a set of IDs of clusters that have the right name suffix
            but may or may not be in the right state to join (a superset
            of *available*)
        in_pool: a set of IDs of clusters that are in the pool we want to
            join, regardless of their state or pool hash (a superset of
            *matching*)
        max_created: the latest creation timestamp for *any* cluster listed
            (so we can call this again to get stats on newly created
            clusters only)
        """
        # a map from cluster_id to cpu_capacity
        available = {}
        matching = set()
        in_pool = set()
        max_created = None

        name_to_match = self._opts['pool_name']
        suffix_to_match = self._cluster_name_pooling_suffix()

        if self._opts['max_concurrent_steps'] > 1:
            states_to_match = {'RUNNING', 'WAITING'}
        else:
            states_to_match = {'WAITING'}

        emr_client = self.make_emr_client()
        now = _boto3_now()

        # you can't pass CreatedAfter=None to list_clusters()
        list_cluster_kwargs = dict(ClusterStates=_ACTIVE_CLUSTER_STATES)
        if created_after:
            list_cluster_kwargs['CreatedAfter'] = created_after

        log.debug('calling list_clusters(%s)' % ', '.join(
            '%s=%r' % (k, v)
            for k, v in sorted(list_cluster_kwargs.items())))

        for cluster in _boto3_paginate(
                'Clusters', emr_client, 'list_clusters',
                **list_cluster_kwargs):

            cluster_id = cluster['Id']
            log.debug(cluster_id)

            created = cluster['Status']['Timeline']['CreationDateTime']

            if max_created is None or created > max_created:
                max_created = created

            name = _parse_cluster_name_suffix(cluster['Name']).get('pool_name')

            if name != name_to_match:
                continue

            in_pool.add(cluster_id)

            if not cluster['Name'].endswith(suffix_to_match):
                continue

            matching.add(cluster_id)

            if cluster['Status']['State'] not in states_to_match:
                continue

            when_ready = cluster['Status']['Timeline'].get('ReadyDateTime')

            if when_ready:
                hours = max(ceil((now - when_ready).total_seconds() / 3600),
                            1.0)
                cpu_capacity = cluster['NormalizedInstanceHours'] / hours
            else:
                # this probably won't happen, since we only inspect clusters
                # in the WAITING state
                cpu_capacity = 0

            available[cluster_id] = cpu_capacity

        # convert *available* from a dict to a sorted list
        available = sorted(
            available,
            key=lambda c: available[c],
            reverse=True)

        return dict(
            available=available,
            in_pool=in_pool,
            matching=matching,
            max_created=max_created,
        )

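    # Explanatory note (not from the original source): cpu_capacity above
    # is a rough NormalizedInstanceHours-per-hour-of-readiness score, e.g.
    # a cluster that accrued 64 normalized hours over 4 hours of readiness
    # scores 16. It only needs to be good enough to rank candidate clusters.
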
    def _cluster_has_adequate_capacity(self, cluster_id):
        """Check if the cluster has an instance group/fleet configuration
        that works as well or better.

        This either calls ``ListInstanceFleets`` or ``ListInstanceGroups``,
        as appropriate
        """
        emr_client = self.make_emr_client()

        if (self._opts['min_available_mb'] or
                self._opts['min_available_virtual_cores']):
            cluster = emr_client.describe_cluster(
                ClusterId=cluster_id)['Cluster']

            host = cluster['MasterPublicDnsName']
            try:
                log.debug(' querying clusterMetrics from %s' % host)
                metrics = self._yrm_get('metrics', host=host)['clusterMetrics']
                log.debug(' metrics: %s' %
                          json.dumps(metrics, sort_keys=True))
            except IOError as ex:
                log.info(' error while getting metrics for cluster %s: %s' %
                         (cluster_id, str(ex)))
                return False

            if metrics['availableMB'] < self._opts['min_available_mb']:
                log.info(' too little memory')
                return False

            if (metrics['availableVirtualCores'] <
                    self._opts['min_available_virtual_cores']):
                log.info(' too few virtual cores')
                return False

            return True
        elif self._opts['instance_fleets']:
            try:
                fleets = list(_boto3_paginate(
                    'InstanceFleets', emr_client, 'list_instance_fleets',
                    ClusterId=cluster_id))
            except botocore.exceptions.ClientError:
                # this shouldn't usually happen because whether a cluster
                # uses instance fleets is in the pool hash
                log.debug(' cluster %s: does not use instance fleets' %
                          cluster_id)
                return False

            return _instance_fleets_satisfy(
                fleets, self._opts['instance_fleets'])
        else:
            try:
                groups = list(_boto3_paginate(
                    'InstanceGroups', emr_client, 'list_instance_groups',
                    ClusterId=cluster_id))
            except botocore.exceptions.ClientError:
                # this shouldn't usually happen because whether a cluster
                # uses instance fleets is in the pool hash
                log.debug(' cluster %s: does not use instance groups' %
                          cluster_id)
                return False

            return _instance_groups_satisfy(groups, self._instance_groups())

    def _cluster_description_matches(self, cluster):
        """Do we want to join the cluster with the given description?"""
        cluster_id = cluster['Id']

        # skip if user specified a key pair and it doesn't match
        if (self._opts['ec2_key_pair'] and
                self._opts['ec2_key_pair'] !=
                cluster['Ec2InstanceAttributes'].get('Ec2KeyName')):
            log.debug(' cluster %s: ec2 key pair mismatch' % cluster_id)
            return False

        # only take persistent clusters
        if cluster['AutoTerminate']:
            log.debug(' cluster %s: not persistent' % cluster_id)
            return False

        # EBS root volume size
        if self._opts['ebs_root_volume_gb']:
            if 'EbsRootVolumeSize' not in cluster:
                log.debug(' cluster %s: EBS root volume size not set' %
                          cluster_id)
                return False
            elif (cluster['EbsRootVolumeSize'] <
                    self._opts['ebs_root_volume_gb']):
                log.debug(' cluster %s: EBS root volume size too small' %
                          cluster_id)
                return False
        else:
            if 'EbsRootVolumeSize' in cluster:
                log.debug(' cluster %s: uses non-default EBS root volume'
                          ' size' % cluster_id)
                return False

        # subnet
        subnet = cluster['Ec2InstanceAttributes'].get('Ec2SubnetId')
        if isinstance(self._opts['subnet'], list):
            matches = (subnet in self._opts['subnet'])
        else:
            # empty subnet is the same as no subnet. see #1931
            matches = (subnet == (self._opts['subnet'] or None))

        if not matches:
            log.debug(' cluster %s: subnet mismatch' % cluster_id)
            return False

        # step concurrency
        step_concurrency = cluster.get('StepConcurrencyLevel', 1)
        if step_concurrency > self._opts['max_concurrent_steps']:
            log.debug(' cluster %s: step concurrency level too high' %
                      cluster_id)
            return False

        return True

    def _find_cluster(self):
        """Find a cluster that can host this runner. Prefer clusters with
        more compute units. Break ties by choosing cluster with longest
        idle time. Return ``(cluster_id, step_concurrency_level)``, or
        ``(None, None)`` if no suitable clusters exist.
        """
        emr_client = self.make_emr_client()

        start = datetime.now()
        wait_mins = self._opts['pool_wait_minutes']
        timeout_mins = self._opts['pool_timeout_minutes']
        pool_name = self._opts['pool_name']
        max_in_pool = self._opts['max_clusters_in_pool']

        # like sleep() but also raises PoolTimeoutException if we're going to
        # sleep beyond the timeout
        def sleep_or_time_out(seconds):
            if (timeout_mins and (
                    datetime.now() + timedelta(seconds=seconds) >
                    start + timedelta(minutes=timeout_mins))):
                raise PoolTimeoutException(
                    'Unable to join or create a cluster within %d minutes' %
                    timeout_mins)

            time.sleep(seconds)

        log.info('Attempting to find an available cluster...')
        while True:
            cluster_ids = self._list_cluster_ids_for_pooling()

            for cluster, when_cluster_described in (
                    self._yield_clusters_to_join(cluster_ids['available'])):
                cluster_id = cluster['Id']
                step_concurrency_level = cluster['StepConcurrencyLevel']

                log.info(' Attempting to join cluster %s' % cluster_id)
                lock_acquired = _attempt_to_lock_cluster(
                    emr_client, cluster_id, self._job_key,
                    cluster=cluster,
                    when_cluster_described=when_cluster_described)

                if lock_acquired:
                    return cluster_id, step_concurrency_level

            keep_waiting = (
                datetime.now() < start + timedelta(minutes=wait_mins))

            # if we haven't exhausted pool_wait_minutes, and there are
            # clusters we might eventually join, sleep and try again
            if keep_waiting and cluster_ids['matching']:
                log.info('No clusters in pool %r are available. Checking'
                         ' again in %d seconds...' % (
                             pool_name, int(_POOLING_SLEEP_INTERVAL)))
                sleep_or_time_out(_POOLING_SLEEP_INTERVAL)
                continue

            # implement max_clusters_in_pool
            if max_in_pool:
                num_in_pool = len(cluster_ids['in_pool'])

                log.info(' %d cluster%s in pool %r (max. is %d)' % (
                    num_in_pool, _plural(num_in_pool), pool_name,
                    max_in_pool))

                if num_in_pool >= max_in_pool:
                    log.info('Checking again in %d seconds...' % (
                        _POOLING_SLEEP_INTERVAL))
                    sleep_or_time_out(_POOLING_SLEEP_INTERVAL)
                    continue

            # to avoid race conditions, double-check the clusters in the pool
            # if we need to satisfy max_clusters_in_pool or are trying to
            # bypass pool_wait_minutes
            if max_in_pool or (keep_waiting and not cluster_ids['matching']):
                jitter_seconds = randint(0, self._opts['pool_jitter_seconds'])

                log.info(' waiting %d seconds and double-checking for'
                         ' newly created clusters...' % jitter_seconds)
                sleep_or_time_out(jitter_seconds)

                new_cluster_ids = self._list_cluster_ids_for_pooling(
                    created_after=cluster_ids['max_created'])

                new_num_in_pool = len(
                    cluster_ids['in_pool'] | new_cluster_ids['in_pool'])

                log.info(' %d cluster%s in pool' % (
                    new_num_in_pool, _plural(new_num_in_pool)))

                if ((not max_in_pool or new_num_in_pool < max_in_pool) and
                        (not keep_waiting or not new_cluster_ids['matching'])):

                    # allow creating a new cluster
                    return None, None

                log.info('Checking again in %d seconds...' % (
                    _POOLING_SLEEP_INTERVAL))
                sleep_or_time_out(_POOLING_SLEEP_INTERVAL)

                continue

            # pool_wait_minutes is exhausted and max_clusters_in_pool is not
            # set, so create a new cluster
            return None, None

        # (defensive programming, in case we break out of the loop)
        return None, None

    def _pool_hash_dict(self):
        """A dictionary of information that must be matched exactly to
        join a pooled cluster (other than mrjob version and pool name).

        The format of this dictionary may change between mrjob versions.
        """
        # this can be expensive because we have to read every file used in
        # bootstrapping, so cache it
        if not self._pool_hash_dict_cached:
            d = {}

            # additional_emr_info
            d['additional_emr_info'] = self._opts['additional_emr_info']

            # applications
            # (these are case-insensitive)
            d['applications'] = sorted(a.lower() for a in self._applications())

            # bootstrapping

            # bootstrap_actions
            d['bootstrap_actions'] = self._bootstrap_actions()

            # bootstrap_file_md5sums
            d['bootstrap_file_md5sums'] = {
                name: self.fs.md5sum(path)
                for name, path
                in self._bootstrap_dir_mgr.name_to_path().items()
                if path != self._mrjob_zip_path
            }

            # bootstrap_without_paths
            # original path doesn't matter, just contents (above) and name
            d['bootstrap_without_paths'] = [
                [
                    dict(type=x['type'],
                         name=self._bootstrap_dir_mgr.name(**x))
                    if isinstance(x, dict) else x
                    for x in cmd
                ]
                for cmd in self._bootstrap
            ]

            # emr_configurations
            d['emr_configurations'] = self._emr_configurations()

            # image_id
            d['image_id'] = self._opts['image_id']

            # instance_collection_type
            # no way to compare instance groups with instance fleets
            # so make it part of the hash
            d['instance_collection_type'] = (
                'INSTANCE_FLEET' if self._opts['instance_fleets']
                else 'INSTANCE_GROUP'
            )

            # release_label
            # use e.g. emr-2.4.9 for 2.x/3.x AMIs, even though the API
            # wouldn't accept it
            d['release_label'] = (self._opts['release_label'] or
                                  'emr-' + self._opts['image_version'])

            self._pool_hash_dict_cached = d

        return self._pool_hash_dict_cached

    def _pool_hash(self):
        hash_dict = self._pool_hash_dict()

        hash_json = json.dumps(hash_dict, sort_keys=True)
        if not isinstance(hash_json, bytes):
            hash_json = hash_json.encode('utf_8')

        m = hashlib.md5()
        m.update(hash_json)
        return m.hexdigest()

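    # Explanatory sketch (equivalent to the method above): the pool hash is
    # just the md5 of the hash dict serialized deterministically, i.e.
    #
    #   hashlib.md5(
    #       json.dumps(d, sort_keys=True).encode('utf_8')).hexdigest()
    #
    # so two runners compute the same hash iff they agree on every field
    # of _pool_hash_dict().
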
    def _cluster_name_pooling_suffix(self):
        """Extra info added to the cluster name, for pooling."""
        if not self._opts['pool_clusters']:
            return ''
        else:
            return _cluster_name_suffix(
                self._pool_hash(), self._opts['pool_name'])

    ### EMR-specific Stuff ###

    def make_emr_client(self):
        """Create a :py:mod:`boto3` EMR client.

        :return: a :py:class:`botocore.client.EMR` wrapped in a
                 :py:class:`mrjob.retry.RetryWrapper`
        """
        # ...which is then wrapped in bacon! Mmmmm!
        if boto3 is None:
            raise ImportError('You must install boto3 to connect to EMR')

        raw_emr_client = boto3.client(
            'emr',
            aws_access_key_id=self._opts['aws_access_key_id'],
            aws_secret_access_key=self._opts['aws_secret_access_key'],
            aws_session_token=self._opts['aws_session_token'],
            endpoint_url=_endpoint_url(self._opts['emr_endpoint']),
            region_name=self._opts['region'],
        )

        # #1799: don't retry faster than EMR checks the API
        return _wrap_aws_client(raw_emr_client,
                                min_backoff=self._opts['check_cluster_every'])

    def _describe_cluster(self):
        emr_client = self.make_emr_client()
        return emr_client.describe_cluster(
            ClusterId=self._cluster_id)['Cluster']

    def get_hadoop_version(self):
        return self._get_app_versions().get('hadoop')

    def get_image_version(self):
        """Get the version of the AMI that our cluster is running, or
        ``None``."""
        return self._get_cluster_info('image_version')

    def _address_of_master(self):
        """Get the address of the master node so we can SSH to it"""
        return self._get_cluster_info('master_public_dns')

    def _master_private_ip(self):
        """Get the internal ("private") address of the master node, so we
        can direct our SSH tunnel to it."""
        return self._get_cluster_info('master_private_ip')

    def _get_app_versions(self):
        """Returns a map from lowercase app name to version for our cluster.

        For apps other than Hadoop, this only works for AMI 4.x and later.
        """
        return self._get_cluster_info('app_versions')

    def _get_collection_type(self):
        """Return the collection type of the cluster (either
        ``'INSTANCE_FLEET'`` or ``'INSTANCE_GROUP'``)."""
        return self._get_cluster_info('collection_type')

    def _get_cluster_info(self, key):
        if not self._cluster_id:
            return None

        cache = self._cluster_to_cache[self._cluster_id]

        if not cache.get(key):
            if key == 'master_private_ip':
                self._store_master_instance_info()
            else:
                self._store_cluster_info()

        return cache.get(key)

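    # Explanatory note (not from the original source): _cluster_to_cache is
    # a per-cluster memo. A miss on 'master_private_ip' triggers
    # ListInstances; any other miss triggers DescribeCluster. Falsy values
    # are re-fetched on the next call, e.g. master_public_dns only sticks
    # once the cluster reaches RUNNING or WAITING.
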
    def _store_cluster_info(self):
        """Describe our cluster, and cache image_version, hadoop_version,
        and master_public_dns"""
        if not self._cluster_id:
            raise ValueError('cluster has not yet been created')

        cache = self._cluster_to_cache[self._cluster_id]

        cluster = self._describe_cluster()

        # AMI version might be in RunningAmiVersion (2.x, 3.x)
        # or ReleaseLabel (4.x)
        cache['image_version'] = cluster.get('RunningAmiVersion')
        if not cache['image_version']:
            release_label = cluster.get('ReleaseLabel')
            if release_label:
                # versions never start with 'e', 'm', 'r', or '-', so
                # lstrip() safely removes just the 'emr-' prefix
                cache['image_version'] = release_label.lstrip('emr-')

        cache['app_versions'] = dict(
            (a['Name'].lower(), a.get('Version'))
            for a in cluster['Applications'])

        cache['collection_type'] = cluster.get(
            'InstanceCollectionType', 'INSTANCE_GROUP')

        if cluster['Status']['State'] in ('RUNNING', 'WAITING'):
            cache['master_public_dns'] = cluster['MasterPublicDnsName']

    def _store_master_instance_info(self):
        """List master instance for our cluster, and cache
        master_private_ip."""
        if not self._cluster_id:
            raise ValueError('cluster has not yet been created')

        cache = self._cluster_to_cache[self._cluster_id]

        emr_client = self.make_emr_client()

        instances = emr_client.list_instances(
            ClusterId=self._cluster_id,
            InstanceGroupTypes=['MASTER'])['Instances']

        if not instances:
            return

        master = instances[0]

        # can also get private DNS and public IP/DNS, but we don't use this
        master_private_ip = master.get('PrivateIpAddress')
        if master_private_ip:  # may not have been assigned yet
            cache['master_private_ip'] = master_private_ip

    def make_ec2_client(self):
        """Create a :py:mod:`boto3` EC2 client.

        :return: a :py:class:`botocore.client.EC2` wrapped in a
                 :py:class:`mrjob.retry.RetryWrapper`
        """
        if boto3 is None:
            raise ImportError('You must install boto3 to connect to EC2')

        raw_ec2_client = boto3.client(
            'ec2',
            aws_access_key_id=self._opts['aws_access_key_id'],
            aws_secret_access_key=self._opts['aws_secret_access_key'],
            aws_session_token=self._opts['aws_session_token'],
            endpoint_url=_endpoint_url(self._opts['ec2_endpoint']),
            region_name=self._opts['region'],
        )

        return _wrap_aws_client(raw_ec2_client)

    def make_iam_client(self):
        """Create a :py:mod:`boto3` IAM client.

        :return: a :py:class:`botocore.client.IAM` wrapped in a
                 :py:class:`mrjob.retry.RetryWrapper`
        """
        if boto3 is None:
            raise ImportError('You must install boto3 to connect to IAM')

        # special logic for setting IAM endpoint (which you don't usually
        # want to do, because IAM is regionless).
        endpoint_url = _endpoint_url(self._opts['iam_endpoint'])
        if endpoint_url:
            # keep boto3 from loading a nonsensical region name from configs
            # (see https://github.com/boto/boto3/issues/985)
            region_name = _DEFAULT_AWS_REGION
            log.debug('creating IAM client to %s' % endpoint_url)
        else:
            region_name = None
            log.debug('creating IAM client')

        raw_iam_client = boto3.client(
            'iam',
            aws_access_key_id=self._opts['aws_access_key_id'],
            aws_secret_access_key=self._opts['aws_secret_access_key'],
            aws_session_token=self._opts['aws_session_token'],
            endpoint_url=endpoint_url,
            region_name=region_name,
        )

        return _wrap_aws_client(raw_iam_client)

    # Spark

    def _uses_spark(self):
        """Does this runner use Spark, based on steps, bootstrap actions,
        and EMR applications? If so, we'll need more memory."""
        return (self._has_spark_steps() or
                self._has_spark_install_bootstrap_action() or
                self._has_spark_application() or
                self._opts['bootstrap_spark'])

    def _has_spark_install_bootstrap_action(self):
        """Does it look like this runner has a spark bootstrap install
        action set? (Anything ending in "/install-spark" counts.)"""
        return any(ba['path'].endswith('/install-spark')
                   for ba in self._bootstrap_actions(add_spark=False))

    def _has_spark_application(self):
        """Does this runner have "Spark" in its *applications* option?"""
        return any(a.lower() == 'spark'
                   for a in self._applications(add_spark=False))

    def _check_cluster_spark_support(self):
        """Issue a warning if our cluster doesn't support Spark.

        This should only be called if you are going to run one or more
        Spark steps.
        """
        message = self._cluster_spark_support_warning()
        if message:
            log.warning(message)

    def _cluster_spark_support_warning(self):
        """Helper for _check_cluster_spark_support()."""
        image_version = self.get_image_version()

        if not version_gte(image_version, _MIN_SPARK_AMI_VERSION):
            suggested_version = (
                _MIN_SPARK_AMI_VERSION if PY2 else _MIN_SPARK_PY3_AMI_VERSION)
            return (' AMI version %s does not support Spark;\n'
                    ' (try --image-version %s or later)' % (
                        image_version, suggested_version))

        if not version_gte(image_version, _MIN_SPARK_PY3_AMI_VERSION):
            if PY2:
                # even though this version of Spark "works" with Python 2,
                # it doesn't work well
                return (' AMI version %s has an old version of Spark\n'
                        ' and does not correctly determine when a Spark'
                        ' job has failed\n'
                        ' (try --image-version %s or later)' % (
                            image_version, _MIN_SPARK_PY3_AMI_VERSION))
            else:
                # this version of Spark doesn't support Python 3 at all!
                return (' AMI version %s does not support Python 3 on Spark\n'
                        ' (try --image-version %s or later)' % (
                            image_version, _MIN_SPARK_PY3_AMI_VERSION))

        emr_client = self.make_emr_client()

        too_small_msg = (' instance type %s is too small for Spark;'
                         ' your job may stall forever')

        if self._get_collection_type() == 'INSTANCE_FLEET':
            fleets = list(_boto3_paginate(
                'InstanceFleets', emr_client, 'list_instance_fleets',
                ClusterId=self.get_cluster_id()))

            for fleet in fleets:
                # master doesn't matter if it's not running tasks
                if fleet['InstanceFleetType'] == 'MASTER' and len(fleets) > 1:
                    continue

                for spec in fleet['InstanceTypeSpecifications']:
                    mem = EC2_INSTANCE_TYPE_TO_MEMORY.get(spec['InstanceType'])
                    if mem and mem < _MIN_SPARK_INSTANCE_MEMORY:
                        return (too_small_msg % spec['InstanceType'])
        else:
            # instance groups
            igs = list(_boto3_paginate(
                'InstanceGroups', emr_client, 'list_instance_groups',
                ClusterId=self.get_cluster_id()))

            for ig in igs:
                # master doesn't matter if it's not running tasks
                if ig['InstanceGroupType'] == 'MASTER' and len(igs) > 1:
                    continue

                mem = EC2_INSTANCE_TYPE_TO_MEMORY.get(ig['InstanceType'])
                if mem and mem < _MIN_SPARK_INSTANCE_MEMORY:
                    return (too_small_msg % ig['InstanceType'])

        return None

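    # Explanatory note (not from the original source): the memory check
    # above is static; it looks each candidate instance type up in
    # EC2_INSTANCE_TYPE_TO_MEMORY rather than querying the cluster, so
    # instance types missing from that table simply pass the check.
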
    def _cmdenv(self):
        env = super(EMRJobRunner, self)._cmdenv()

        return combine_dicts(self._docker_cmdenv(), env)

    def _emr_configurations(self):
        # don't keep two configs with the same Classification (#2097)
        return _deduplicate_emr_configurations(
            self._docker_emr_configurations() +
            self._opts['emr_configurations']
        )

    def _docker_image(self):
        """Special-case the "library" registry which is implied on Docker Hub
        but needs to be specified explicitly on EMR."""
        image = self._opts['docker_image']

        if not image:
            return None
        elif '/' in image:
            return image
        else:
            return 'library/' + image

    def _docker_registry(self):
        """Infer the trusted docker registry from the docker image."""
        image = self._docker_image()

        if not image:
            return None
        else:
            return image.split('/')[0]

    def _docker_cmdenv(self):
        image = self._docker_image()

        if not image:
            return {}

        env = dict(
            YARN_CONTAINER_RUNTIME_TYPE='docker',
            YARN_CONTAINER_RUNTIME_DOCKER_IMAGE=image,
        )

        if self._opts['docker_client_config']:
            env['YARN_CONTAINER_RUNTIME_DOCKER_CLIENT_CONFIG'] = (
                self._opts['docker_client_config'])

        if self._opts['docker_mounts']:
            env['YARN_CONTAINER_RUNTIME_DOCKER_MOUNTS'] = ','.join(
                self._opts['docker_mounts'])

        return env

    def _docker_emr_configurations(self):
        registry = self._docker_registry()

        if not registry:
            return []

        registries = ','.join(['local', registry])

        return [
            dict(
                Classification='container-executor',
                Configurations=[
                    dict(
                        Classification='docker',
                        Properties={
                            'docker.trusted.registries': registries,
                            'docker.privileged-containers.registries': (
                                registries),
                        },
                    ),
                ],
                Properties={},
            ),
        ]

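    # Illustrative example (the image name is hypothetical): with
    # docker_image='alpine', _docker_image() returns 'library/alpine',
    # _docker_registry() returns 'library', and the config above sets both
    # docker.trusted.registries and docker.privileged-containers.registries
    # to 'local,library'.
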
    def _yrm_get(self, path, host=None, port=None, timeout=None):
        """Use curl to perform an HTTP GET on the given path on the
        YARN Resource Manager. Either return decoded JSON from the call,
        or raise an IOError

        *path* should not start with a '/'

        More info on the YARN REST API can be found here:

        https://hadoop.apache.org/docs/current/hadoop-yarn/
        hadoop-yarn-site/ResourceManagerRest.html
        """
        if host is None:
            host = self._address_of_master()

        if port is None:
            port = _YARN_RESOURCE_MANAGER_PORT

        if timeout is None:
            timeout = _YARN_API_TIMEOUT

        # using urljoin() to avoid a double / when joining host/port with path
        yrm_url = urljoin(
            'http://{}:{:d}'.format(host, port),
            '{}/{}'.format(_YRM_BASE_PATH, path)
        )

        curl_args = [
            'curl',  # always available on EMR
            '-fsS',  # fail on HTTP errors, print errors only to stderr
            '-m', str(timeout),  # give up after *timeout* seconds
            yrm_url,
        ]

        stdout, stderr = self.fs.ssh._ssh_run(host, curl_args)

        return json.loads(to_unicode(stdout))


def _get_job_steps(emr_client, cluster_id, job_key):
    """Efficiently fetch steps for a particular mrjob run from the EMR API.

    :param emr_client: a boto3 EMR client. See
                       :py:meth:`~mrjob.emr.EMRJobRunner.make_emr_client`
    :param cluster_id: ID of EMR cluster to fetch steps from. See
                       :py:meth:`~mrjob.emr.EMRJobRunner.get_cluster_id`
    :param job_key: Unique key for a mrjob run. See
                    :py:meth:`~mrjob.runner.MRJobRunner.get_job_key`
    """
    steps = []

    for step in _boto3_paginate('Steps', emr_client, 'list_steps',
                                ClusterId=cluster_id):
        if step['Name'].startswith(job_key):
            steps.append(step)
        elif steps:
            # all steps for job will be together, so stop
            # when we find a non-job step
            break

    # the API lists the most recent steps first; reverse them to get the
    # order the steps actually run in
    return list(reversed(steps))


def _get_reason(cluster_or_step):
    """Get state change reason message."""
    # StateChangeReason is {} before the first state change
    return cluster_or_step['Status']['StateChangeReason'].get('Message', '')


def _deduplicate_emr_configurations(emr_configurations):
    """Takes the value of the *emr_configurations* opt, and ensures that
    later configs overwrite earlier ones with the same Classification.

    Additionally, any configs that contain empty or unset Properties
    and Configurations will be removed (this is a way of deleting
    existing config dicts without replacing them).

    You can assume that all config dicts have run through
    _fix_configuration_opt()
    """
    results = OrderedDict()

    for c in emr_configurations:
        results[c['Classification']] = c

    return [c for c in results.values() if
            c['Properties'] or c.get('Configurations')]


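# Illustrative example: given two configs with Classification 'spark-env',
# only the later one survives; passing
#     dict(Classification='spark-env', Properties={})
# (no Properties, no Configurations) deletes the earlier config outright.
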
def _fix_configuration_opt(c):
    """Return a copy of *c* with *Properties* always set (defaulting to
    ``{}``) and with *Configurations* unset if empty.
    Convert all values to strings.

    Raise exception on more serious problems (extra fields, wrong data
    type, etc).

    This allows us to match configurations against the API, *and* catches bad
    configurations before they result in cryptic API errors.
    """
    if not isinstance(c, dict):
        raise TypeError('configurations must be dicts, not %r' % (c,))

    c = dict(c)  # make a copy

    # extra keys
    extra_keys = (
        set(c) - set(['Classification', 'Configurations', 'Properties']))
    if extra_keys:
        raise ValueError('configuration opt has extra keys: %s' % ', '.join(
            sorted(extra_keys)))

    # Classification
    if 'Classification' not in c:
        raise ValueError('configuration opt has no Classification')

    if not isinstance(c['Classification'], string_types):
        raise TypeError('Classification must be string')

    # Properties
    c.setdefault('Properties', {})
    if not isinstance(c['Properties'], dict):
        raise TypeError('Properties must be a dict')

    c['Properties'] = dict(
        (str(k), str(v)) for k, v in c['Properties'].items())

    # sub-Configurations
    if 'Configurations' in c:
        if c['Configurations']:
            if not isinstance(c['Configurations'], list):
                raise TypeError('Configurations must be a list')
            # recursively fix subconfigurations
            c['Configurations'] = [
                _fix_configuration_opt(sc) for sc in c['Configurations']]
        else:
            # don't keep empty configurations around
            del c['Configurations']

    return c


def _fix_subnet_opt(subnet):
    """Return either None, a string, or a list with at least two items."""
    if subnet is None:
        return None

    if isinstance(subnet, string_types):
        return subnet

    subnet = list(subnet)
    if len(subnet) == 1:
        return subnet[0]
    else:
        return subnet


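# Illustrative examples (subnet IDs are hypothetical):
#
#   _fix_subnet_opt(None)                      -> None
#   _fix_subnet_opt('subnet-a')                -> 'subnet-a'
#   _fix_subnet_opt(['subnet-a'])              -> 'subnet-a'
#   _fix_subnet_opt(('subnet-a', 'subnet-b'))  -> ['subnet-a', 'subnet-b']
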
def _build_instance_group(role, instance_type, num_instances, bid_price):
    """Helper method for creating instance groups. For use when
    creating a cluster using a list of InstanceGroups

    - role is either 'MASTER', 'CORE', or 'TASK'.
    - instance_type is an EC2 instance type
    - num_instances is an int
    - bid_price is a number, a string, or None. If None,
      this instance group will use the ON_DEMAND market
      instead of the SPOT market.
    """
    if role not in _INSTANCE_ROLES:
        raise ValueError('unknown instance role: %r' % (role,))

    if not instance_type:
        raise ValueError('missing instance type')

    if not num_instances:
        raise ValueError('num_instances must be a positive int')

    ig = dict(
        InstanceCount=num_instances,
        InstanceRole=role,
        InstanceType=instance_type,
        Market='ON_DEMAND',
        Name=role.lower(),  # just name the groups "core", "master", "task"
    )

    if bid_price:
        ig['Market'] = 'SPOT'
        ig['BidPrice'] = str(bid_price)  # must be a string

    return ig


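# Illustrative usage (instance type and bid are hypothetical):
#
#   _build_instance_group('CORE', 'm5.xlarge', 4, '0.10')
#
# yields a 4-instance SPOT group named 'core'; passing bid_price=None
# keeps the default ON_DEMAND market.
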
def _plural(n):
    """Utility for logging messages"""
    if n == 1:
        return ''
    else:
        return 's'
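
# Illustrative example: '%d cluster%s' % (n, _plural(n)) renders as
# '1 cluster' or '3 clusters'.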