# Copyright 2012 Yelp and Contributors
# Copyright 2013 Lyft
# Copyright 2014 Brett Gibson
# Copyright 2015-2019 Yelp
# Copyright 2020 Affirm, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Utilities related to cluster pooling. This code used to be in mrjob.emr.

In theory, this module might support pooling in general, but so far, there's
only a need for pooling on EMR.
"""
import time
from collections import defaultdict
from logging import getLogger

try:
    from botocore.exceptions import ClientError
except ImportError:
    ClientError = Exception

import mrjob
from mrjob.aws import EC2_INSTANCE_TYPE_TO_COMPUTE_UNITS
from mrjob.aws import EC2_INSTANCE_TYPE_TO_MEMORY
from mrjob.aws import _boto3_paginate
from mrjob.py2 import integer_types
from mrjob.py2 import string_types

log = getLogger(__name__)

# we check the type and contents of requested fleets/groups because they
# are user-specified and may not have the correct format. Currently, we
# simply return no match, since either boto3 or the EMR AMI will catch
# the error when EMRJobRunner attempts to create a new cluster. See #1696


### tagging pooled clusters ###

def _pool_tags(hash, name):
    """Return a dict with "hidden" tags to add to the given cluster."""
    return dict(__mrjob_pool_hash=hash, __mrjob_pool_name=name)


def _extract_tags(cluster):
    """Pull the tags from a cluster, as a dict."""
    return {t['Key']: t['Value'] for t in cluster.get('Tags') or []}


def _pool_name(cluster):
    tags = _extract_tags(cluster)
    return tags.get('__mrjob_pool_name')
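
# Illustrative sketch (editorial example, not part of mrjob; the tag values
# are hypothetical): a pooled cluster's ``DescribeCluster`` response might
# carry tags like the following, which the helpers above produce and read:
#
#   cluster = {'Tags': [{'Key': '__mrjob_pool_hash', 'Value': '0f1e2d3c'},
#                       {'Key': '__mrjob_pool_name', 'Value': 'default'}]}
#   _extract_tags(cluster)  # -> {'__mrjob_pool_hash': '0f1e2d3c',
#                           #     '__mrjob_pool_name': 'default'}
#   _pool_name(cluster)     # -> 'default'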


### putting pooling information in the name of a cluster ###

# this may change between versions of mrjob

def _cluster_name_suffix(hash, name):
    fields = [mrjob.__version__, name, hash]
    return ' pooling:%s' % ','.join(fields)


def _parse_cluster_name_suffix(cluster_name):
    """Return a dictionary possibly containing the keys:

    mrjob_version: version of mrjob that created this cluster
    pool_hash: hash representing bootstrap setup etc.
    pool_name: name of the cluster pool

    If the cluster is not pooled or we can't parse its pooling suffix,
    return ``{}``.
    """
    # return version, hash, and name from cluster pool suffix

    i = cluster_name.find(' pooling:')
    if i == -1:
        return {}

    suffix = cluster_name[i + len(' pooling:'):]

    parts = suffix.split(',', 3)

    if len(parts) == 3:
        return dict(
            mrjob_version=parts[0],
            pool_name=parts[1],
            pool_hash=parts[2],
        )
    else:
        return {}
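
# Illustrative round trip (editorial example; the hash and pool name below
# are hypothetical):
#
#   name = 'mr_wc.user.20200101' + _cluster_name_suffix('0f1e2d', 'default')
#   _parse_cluster_name_suffix(name)
#   # -> {'mrjob_version': mrjob.__version__,
#   #     'pool_name': 'default',
#   #     'pool_hash': '0f1e2d'}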


### instance groups ###

def _instance_groups_satisfy(actual_igs, requested_igs):
    """Do the actual instance groups from a cluster satisfy the requested
    ones, for the purpose of pooling?
    """
    # the format of *requested_igs* is here:
    # http://docs.aws.amazon.com/ElasticMapReduce/latest/API/API_InstanceGroup.html  # noqa
    # and the format of *actual_igs* is here:
    # http://docs.aws.amazon.com/ElasticMapReduce/latest/API/API_ListInstanceGroups.html  # noqa

    # verify format of requested_igs
    if not (isinstance(requested_igs, (list, tuple)) and
            all(isinstance(req_ig, dict) and 'InstanceRole' in req_ig
                for req_ig in requested_igs)):
        log.debug(' bad instance_groups config')
        return False

    # a is a map from role to actual instance groups
    a = defaultdict(list)
    for ig in actual_igs:
        a[ig['InstanceGroupType']].append(ig)

    # r is a map from role to request (should be only one per role)
    r = {req.get('InstanceRole'): req for req in requested_igs}

    # update request to account for extra instance groups
    # see #1630 for what we do when roles don't match
    if set(a) - set(r):
        r = _add_missing_roles_to_request(set(a) - set(r), r,
                                          ['InstanceCount'])

    if set(a) != set(r):
        log.debug(' missing instance group roles')
        return False

    for role in r:
        if not _igs_for_same_role_satisfy(a[role], r[role]):
            return False

    return True
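
# Illustrative sketch (editorial example; instance types and counts are
# hypothetical): a minimal *requested_igs*, in the ``RunJobFlow`` format,
# that this function can match against a cluster's actual groups:
#
#   requested_igs = [
#       dict(InstanceRole='MASTER', InstanceType='m5.xlarge',
#            InstanceCount=1),
#       dict(InstanceRole='CORE', InstanceType='m5.xlarge',
#            InstanceCount=4),
#   ]
#   # a cluster whose MASTER and CORE groups meet or beat these specs
#   # (and whose extra TASK group, if any, passes the relaxed check added
#   # by _add_missing_roles_to_request()) satisfies the request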


def _igs_for_same_role_satisfy(actual_igs, requested_ig):
    """Does the *actual* list of instance groups satisfy the *requested*
    one?
    """
    # bid price/on-demand
    if not all(_ig_satisfies_bid_price(ig, requested_ig)
               for ig in actual_igs):
        return False

    # memory
    if not all(_ig_satisfies_mem(ig, requested_ig) for ig in actual_igs):
        return False

    # EBS volumes
    if not all(_ebs_satisfies(ig, requested_ig) for ig in actual_igs):
        return False

    # CPU (compares total compute units, or instance counts for
    # unknown instance types)
    return _igs_satisfy_cpu(actual_igs, requested_ig)


def _ig_satisfies_bid_price(actual_ig, requested_ig):
    """Does the actual instance group definition satisfy the bid price
    (or lack thereof) of the requested instance group?
    """
    # _instance_groups_satisfy() already verified *requested_ig* is a dict

    # on-demand instances satisfy every bid price
    if actual_ig['Market'] == 'ON_DEMAND':
        return True

    if requested_ig.get('Market', 'ON_DEMAND') == 'ON_DEMAND':
        log.debug(' spot instance, requested on-demand')
        return False

    if actual_ig['BidPrice'] == requested_ig.get('BidPrice'):
        return True

    try:
        if (float(actual_ig['BidPrice']) >=
                float(requested_ig.get('BidPrice'))):
            return True
        else:
            # low bid prices mean cluster is more likely to be
            # yanked away
            log.debug(' bid price too low')
            return False
    except (TypeError, ValueError):
        # TypeError covers a missing requested bid price (None)
        log.debug(' non-float bid price')
        return False
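
# Illustrative cases (editorial example; prices are hypothetical):
#
#   actual = {'Market': 'ON_DEMAND'}
#   _ig_satisfies_bid_price(actual, dict(Market='SPOT', BidPrice='0.25'))
#   # -> True (on-demand satisfies any bid price)
#
#   actual = {'Market': 'SPOT', 'BidPrice': '0.10'}
#   _ig_satisfies_bid_price(actual, dict(Market='SPOT', BidPrice='0.25'))
#   # -> False (bid too low; the cluster is more likely to be yanked away)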


def _ig_satisfies_mem(actual_ig, requested_ig):
    """Does the actual instance group satisfy the memory requirements of
    the requested instance group?"""
    actual_type = actual_ig['InstanceType']
    requested_type = requested_ig.get('InstanceType')

    # this works even for unknown instance types
    if actual_type == requested_type:
        return True

    try:
        if (EC2_INSTANCE_TYPE_TO_MEMORY[actual_type] >=
                EC2_INSTANCE_TYPE_TO_MEMORY[requested_type]):
            return True
        else:
            log.debug(' too little memory')
            return False
    except KeyError:
        log.debug(' unknown instance type')
        return False


def _igs_satisfy_cpu(actual_igs, requested_ig):
    """Does the list of actual instance groups satisfy the CPU requirements
    of the requested instance group?
    """
    requested_type = requested_ig.get('InstanceType')
    num_requested = requested_ig.get('InstanceCount')

    if not isinstance(num_requested, integer_types):
        return False

    # count number of compute units (cu)
    if requested_type in EC2_INSTANCE_TYPE_TO_COMPUTE_UNITS:
        requested_cu = (
            num_requested * EC2_INSTANCE_TYPE_TO_COMPUTE_UNITS[requested_type])

        # don't require instances to be running; we'd be worse off if
        # we started our own cluster from scratch. (This can happen if
        # the previous job finished while some task instances were
        # still being provisioned.)
        actual_cu = sum(
            ig['RequestedInstanceCount'] *
            EC2_INSTANCE_TYPE_TO_COMPUTE_UNITS.get(ig['InstanceType'], 0.0)
            for ig in actual_igs)
    else:
        # unknown instance type, just count # of matching instances
        requested_cu = num_requested
        actual_cu = sum(ig['RequestedInstanceCount'] for ig in actual_igs
                        if ig['InstanceType'] == requested_type)

    if actual_cu >= requested_cu:
        return True
    else:
        log.debug(' not enough compute units')
        return False
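
# Worked example (editorial; the unit values are hypothetical): if each
# 'm5.xlarge' were worth 8 compute units in
# EC2_INSTANCE_TYPE_TO_COMPUTE_UNITS, a request for 4 such instances needs
# 4 * 8 = 32 units. A cluster with 3 'm5.2xlarge' instances at 16 units
# each supplies 3 * 16 = 48 >= 32, so it satisfies the request even though
# the instance types differ.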


### instance fleets ###

def _instance_fleets_satisfy(actual_fleets, req_fleets):
    """Do the actual instance fleets from a cluster satisfy the requested
    ones, for the purpose of pooling?
    """
    # verify format of req_fleets
    if not (isinstance(req_fleets, (list, tuple)) and
            all(isinstance(req_ft, dict) and 'InstanceFleetType' in req_ft
                for req_ft in req_fleets)):
        log.debug(' bad instance_fleets config')
        return False

    # a is a map from role to actual instance fleet
    # (unlike with groups, there can never be more than one fleet per role)
    a = {f['InstanceFleetType']: f for f in actual_fleets}

    # r is a map from role to request (should be only one per role)
    r = {f['InstanceFleetType']: f for f in req_fleets}

    # update request to account for extra instance fleets
    # see #1630 for what we do when roles don't match
    if set(a) - set(r):
        r = _add_missing_roles_to_request(
            set(a) - set(r), r,
            ['TargetOnDemandCapacity', 'TargetSpotCapacity'])

    if set(a) != set(r):
        log.debug(' missing instance fleet roles')
        return False

    for role in r:
        if not _fleet_for_same_role_satisfies(a[role], r[role]):
            return False

    return True


def _fleet_for_same_role_satisfies(actual_fleet, req_fleet):
    """Does the *actual* instance fleet satisfy the *requested* one?"""
    # match up instance types
    actual_specs = {spec['InstanceType']: spec
                    for spec in actual_fleet['InstanceTypeSpecifications']}
    try:
        req_specs = {spec['InstanceType']: spec
                     for spec in req_fleet['InstanceTypeConfigs']}
    except (TypeError, KeyError):
        return False

    if set(actual_specs) - set(req_specs):
        log.debug(' fleet may include wrong instance types')
        return False

    if not all(_fleet_spec_satisfies(actual_specs[t], req_specs[t])
               for t in actual_specs):
        return False

    # capacity
    actual_on_demand = actual_fleet.get('ProvisionedOnDemandCapacity', 0)
    req_on_demand = req_fleet.get('TargetOnDemandCapacity', 0)

    if not isinstance(req_on_demand, integer_types):
        return False

    if req_on_demand > actual_on_demand:
        log.debug(' not enough on-demand capacity')
        return False

    actual_spot = actual_fleet.get('ProvisionedSpotCapacity', 0)
    req_spot = req_fleet.get('TargetSpotCapacity', 0)

    if not isinstance(req_spot, integer_types):
        return False

    # allow extra on-demand instances to serve as spot instances
    if req_spot > actual_spot + (actual_on_demand - req_on_demand):
        log.debug(' not enough spot capacity')
        return False

    # handle TERMINATE_CLUSTER timeout action. This really doesn't play
    # well with pooling anyhow
    if _get_timeout_action(actual_fleet) == 'TERMINATE_CLUSTER':
        if _get_timeout_action(req_fleet) != 'TERMINATE_CLUSTER':
            log.debug(' self-terminating fleet not requested')
            return False

        if (_get_timeout_duration(actual_fleet) <
                _get_timeout_duration(req_fleet)):
            log.debug(' fleet may self-terminate prematurely')
            return False

    return True
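
# Worked example (editorial; capacities are hypothetical): with
# ProvisionedOnDemandCapacity=10 and ProvisionedSpotCapacity=5, a request
# for TargetOnDemandCapacity=6 and TargetSpotCapacity=8 is satisfied: the
# 10 - 6 = 4 surplus on-demand units count toward spot capacity, and
# 8 <= 5 + 4.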


def _get_timeout_action(fleet):
    return fleet.get(
        'LaunchSpecifications', {}).get(
            'SpotSpecification', {}).get(
                'TimeoutAction')


def _get_timeout_duration(fleet):
    return fleet.get(
        'LaunchSpecifications', {}).get(
            'SpotSpecification', {}).get(
                'TimeoutDurationMinutes', 0.0)


def _fleet_spec_satisfies(actual_spec, req_spec):
    """Make sure the specification for the given instance type is as
    good or better than the requested spec.

    Specs must have the same weight, but "better" EBS configurations are
    accepted.

    Bid price must either be at least as high as requested, or the
    *actual* bid price must be the same as on-demand.
    """
    if (actual_spec.get('WeightedCapacity', 1) !=
            req_spec.get('WeightedCapacity', 1)):
        log.debug(' different weighted capacity for same instance type')
        return False

    if not _ebs_satisfies(actual_spec, req_spec):
        return False

    # bid price is the max, don't worry about it
    if actual_spec.get('BidPriceAsPercentageOfOnDemandPrice', 100) >= 100:
        return True

    # absolute bid price
    req_bid_price = req_spec.get('BidPrice')
    if req_bid_price is not None:
        actual_bid_price = actual_spec.get('BidPrice')
        if actual_bid_price is None:
            log.debug(' no bid price specified')
            return False

        try:
            if not float(actual_bid_price) >= float(req_bid_price):
                log.debug(' bid price too low')
                return False
        except (TypeError, ValueError):
            log.debug(' non-numeric bid price')
            return False

    # relative bid price
    req_bid_percent = req_spec.get('BidPriceAsPercentageOfOnDemandPrice')
    if req_bid_percent is not None and not isinstance(
            req_bid_percent, (integer_types, float)):
        return False

    if req_bid_percent:
        actual_bid_percent = actual_spec.get(
            'BidPriceAsPercentageOfOnDemandPrice')
        if actual_bid_percent is None:
            log.debug(' no bid price as % of on-demand price')
            return False

        if req_bid_percent > actual_bid_percent:
            log.debug(' bid price as % of on-demand price too low')
            return False

    return True
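
# Illustrative case (editorial example; fields are hypothetical): an actual
# spec with no BidPriceAsPercentageOfOnDemandPrice is treated as bidding
# 100% of the on-demand price, which satisfies any request:
#
#   _fleet_spec_satisfies(
#       dict(WeightedCapacity=1),
#       dict(WeightedCapacity=1, BidPriceAsPercentageOfOnDemandPrice=50))
#   # -> True (defaults to 100 >= 100, so bid price checks are skipped)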


### common code for matching instance (groups or fleets) ###


def _add_missing_roles_to_request(
        missing_roles, role_to_req, req_count_fields):
    """Helper for :py:func:`_instance_groups_satisfy` and
    :py:func:`_instance_fleets_satisfy`. Add requests for
    *missing_roles* to *role_to_req* so that we have a better chance of
    matching the cluster's actual instance groups."""
    # see #1630 for discussion

    # don't worry about modifying *role_to_req*; this is
    # a helper func

    if 'CORE' in missing_roles and list(role_to_req) == ['MASTER']:
        # both core and master have to satisfy master-only request
        role_to_req['CORE'] = role_to_req['MASTER']

    if 'TASK' in missing_roles and 'CORE' in role_to_req:
        # make sure tasks won't crash on the task instances,
        # but don't require the same amount of CPU
        role_to_req['TASK'] = dict(role_to_req['CORE'])
        for req_count_field in req_count_fields:
            role_to_req['TASK'][req_count_field] = 0

    return role_to_req
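
# Illustrative sketch (editorial example; field values are hypothetical):
# a cluster with MASTER, CORE and TASK groups matched against a
# MASTER+CORE request:
#
#   role_to_req = {
#       'MASTER': dict(InstanceRole='MASTER', InstanceCount=1),
#       'CORE': dict(InstanceRole='CORE', InstanceCount=4),
#   }
#   _add_missing_roles_to_request({'TASK'}, role_to_req, ['InstanceCount'])
#   # now role_to_req['TASK'] is a copy of the CORE request with
#   # InstanceCount zeroed out, so any number of matching TASK instances
#   # is acceptable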


def _ebs_satisfies(actual, request):
    """Does *actual* have EBS volumes that satisfy *request*?

    *actual* is either an instance group from ``ListInstanceGroups``
    or an instance fleet spec from ``ListInstanceFleets`` (format
    is the same).

    *request* is either the ``InstanceGroups`` or ``InstanceFleets``
    param to ``RunJobFlow``.

    If *request* doesn't have an EBS configuration, we return
    True.

    If *request* requests EBS optimization, *actual* should provide it.

    Finally, *actual* should have the same or better block devices
    as those in *request* (same volume type, at least as much IOPS
    and volume size).
    """
    req_ebs_config = request.get('EbsConfiguration')

    if not req_ebs_config:
        return True

    if (req_ebs_config.get('EbsOptimized') and
            not actual.get('EbsOptimized')):
        log.debug(' need EBS-optimized instances')
        return False

    req_device_configs = req_ebs_config.get('EbsBlockDeviceConfigs')

    if not req_device_configs:
        return True

    if not (isinstance(req_device_configs, (list, tuple)) and
            all(isinstance(rdc, dict) for rdc in req_device_configs)):
        return False

    req_volumes = []

    for req_device_config in req_device_configs:
        volume = req_device_config['VolumeSpecification']
        num_volumes = req_device_config.get('VolumesPerInstance', 1)

        req_volumes.extend([volume] * num_volumes)

    actual_volumes = [
        bd.get('VolumeSpecification', {})
        for bd in actual.get('EbsBlockDevices', [])]

    return _ebs_volumes_satisfy(actual_volumes, req_volumes)


def _ebs_volumes_satisfy(actual_volumes, req_volumes):
    """Does the given list of actual EBS volumes satisfy the given request?

    Just compare them one by one (we want each actual device to be
    bigger/faster; just having the same amount of capacity or iops
    isn't enough).
    """
    if not isinstance(req_volumes, (list, tuple)):
        return False

    if len(req_volumes) > len(actual_volumes):
        log.debug(' more EBS volumes requested than available')
        return False

    return all(_ebs_volume_satisfies(a, r)
               for a, r in zip(actual_volumes, req_volumes))


def _ebs_volume_satisfies(actual_volume, req_volume):
    """Does the given actual EBS volume satisfy the given request?"""
    if not isinstance(req_volume, dict):
        return False

    if req_volume.get('VolumeType') != actual_volume.get('VolumeType'):
        log.debug(' wrong EBS volume type')
        return False

    if not req_volume.get('SizeInGB', 0) <= actual_volume.get('SizeInGB', 0):
        log.debug(' EBS volume too small')
        return False

    # Iops isn't really "optional"; it has to be set if volume type is
    # io1 and not set otherwise
    if not (req_volume.get('Iops', 0) <= actual_volume.get('Iops', 0)):
        log.debug(' EBS volume too slow')
        return False

    return True
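
# Illustrative cases (editorial example; sizes are hypothetical):
#
#   actual = dict(VolumeType='gp2', SizeInGB=200)
#   _ebs_volume_satisfies(actual, dict(VolumeType='gp2', SizeInGB=100))
#   # -> True (a bigger volume of the same type is fine)
#   _ebs_volume_satisfies(actual, dict(VolumeType='st1', SizeInGB=100))
#   # -> False (volume type must match exactly)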


### locking pooled clusters ###

# Locking ensures that two jobs don't add their steps to the same cluster at
# the same time

# after 60 seconds, a lock is considered released
_CLUSTER_LOCK_SECS = 60.0

# describe the cluster and add our tag within the first 5 seconds
_ADD_TAG_BEFORE = 5.0

# then wait 10 seconds before checking if our tag is still there
_WAIT_AFTER_ADD_TAG = 10.0

# make sure we have at least 40 seconds left to add steps and have them
# start running, before the lock expires
_CHECK_TAG_BEFORE = 20.0

# tag key used for locking pooled clusters
_POOL_LOCK_KEY = '__mrjob_pool_lock'
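
# Timeline sketch (editorial summary of the constants above; *t* is seconds
# since the cluster was described):
#
#   t = 0    DescribeCluster
#   t < 5    must have added our lock tag (_ADD_TAG_BEFORE)
#   t + 10   re-describe the cluster (_WAIT_AFTER_ADD_TAG)
#   t < 20   must have confirmed the tag is ours (_CHECK_TAG_BEFORE)
#   t = 60   lock expires (_CLUSTER_LOCK_SECS), leaving at least
#            60 - 20 = 40 seconds to submit steps and have them start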


def _make_cluster_lock(job_key, expiry):
    """Return the contents of a tag used to lock a cluster.

    *expiry* is the unix timestamp for when the lock is no longer valid"""
    return '%s %.6f' % (job_key, expiry)


def _parse_cluster_lock(lock):
    """Return (job_key, expiry) or raise ValueError.

    Raises TypeError if *lock* is not a string.
    """
    if not isinstance(lock, string_types):
        raise TypeError

    job_key, expiry_str = lock.split(' ')

    # float() raises ValueError on a malformed expiry
    expiry = float(expiry_str)

    return job_key, expiry
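
# Illustrative round trip (editorial example; the job key and timestamp are
# hypothetical):
#
#   lock = _make_cluster_lock('mr_wc.user.20200101.000000', 1577836860.0)
#   # -> 'mr_wc.user.20200101.000000 1577836860.000000'
#   _parse_cluster_lock(lock)
#   # -> ('mr_wc.user.20200101.000000', 1577836860.0)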


def _get_cluster_lock(cluster):
    return _extract_tags(cluster).get(_POOL_LOCK_KEY)


def _attempt_to_lock_cluster(
        emr_client, cluster_id, job_key,
        cluster=None, when_cluster_described=None):
    """Attempt to lock the given pooled cluster using EMR tags.

    You may optionally include *cluster* (a cluster description) and
    *when_cluster_described*, to save an API call to ``DescribeCluster``.

    If the cluster's ``StepConcurrencyLevel`` is 1, locking considers the
    cluster available if it's in the WAITING state. This means we should not
    release our lock until our step(s) have started running, which can take
    several seconds.

    Otherwise, steps can run concurrently, so locking
    considers the cluster available if it's in the WAITING or RUNNING state.
    Additionally, it makes a ``ListSteps`` API call to verify that the cluster
    doesn't already have as many active steps as it can run simultaneously.
    Because other jobs looking to join the cluster will also count steps,
    we can release our lock as soon as we add our steps.
    """
    log.debug('Attempting to lock cluster %s for %.1f seconds' % (
        cluster_id, _CLUSTER_LOCK_SECS))

    if cluster is None:
        cluster = emr_client.describe_cluster(ClusterId=cluster_id)['Cluster']

    if when_cluster_described is None:
        start = time.time()
    else:
        start = when_cluster_described

    if cluster['StepConcurrencyLevel'] == 1:
        step_accepting_states = ['WAITING']
    else:
        step_accepting_states = ['RUNNING', 'WAITING']

    # check if there is a non-expired lock
    state = cluster['Status']['State']

    if state not in step_accepting_states:
        # this could happen if the cluster were TERMINATING, for example
        log.info(' cluster is not accepting steps, state is %s' % state)
        return False

    lock = _get_cluster_lock(cluster)

    if lock:
        expiry = None
        try:
            their_job_key, expiry = _parse_cluster_lock(lock)
        except ValueError:
            log.info(' ignoring invalid pool lock: %s' % lock)

        if expiry and expiry > start:
            log.info(' locked by %s for %.1f seconds' % (
                their_job_key, expiry - start))
            return False

    # add our lock
    our_lock = _make_cluster_lock(job_key, start + _CLUSTER_LOCK_SECS)

    log.debug(' adding tag to cluster %s:' % cluster_id)
    log.debug(' %s=%s' % (_POOL_LOCK_KEY, our_lock))
    emr_client.add_tags(
        ResourceId=cluster_id,
        Tags=[dict(Key=_POOL_LOCK_KEY, Value=our_lock)]
    )

    if time.time() - start > _ADD_TAG_BEFORE:
        log.info(' took too long to tag cluster with lock')
        return False

    # wait, then check if our lock is still there
    log.info(" waiting %.1f seconds to ensure lock wasn't overwritten" %
             _WAIT_AFTER_ADD_TAG)
    time.sleep(_WAIT_AFTER_ADD_TAG)

    # check if our lock is still there
    cluster = emr_client.describe_cluster(ClusterId=cluster_id)['Cluster']

    state = cluster['Status']['State']

    if state not in step_accepting_states:
        # this could happen if the cluster were TERMINATING, for example
        log.info(' cluster is not accepting steps, state is %s' % state)
        return False

    if cluster['StepConcurrencyLevel'] > 1:
        # is cluster already full of steps?
        num_active_steps = len(list(_boto3_paginate(
            'Steps', emr_client, 'list_steps',
            ClusterId=cluster_id,
            StepStates=['PENDING', 'RUNNING'])))

        if num_active_steps >= cluster['StepConcurrencyLevel']:
            log.info(
                ' cluster already has %d active steps' % num_active_steps)
            return False

    lock = _get_cluster_lock(cluster)

    if lock is None:
        log.info(' lock was removed')
        return False
    elif lock != our_lock:
        their_job_desc = 'other job'
        try:
            their_job_desc, expiry = _parse_cluster_lock(lock)
        except ValueError:
            pass

        log.info(' lock was overwritten by %s' % their_job_desc)
        return False

    # make sure we have enough time to add steps and have them run
    # before the lock expires
    if time.time() > start + _CHECK_TAG_BEFORE:
        log.info(' took too long to check for lock')
        return False

    log.info(' lock acquired')
    return True
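
# Usage sketch (editorial example; *emr_client* is a boto3 EMR client and
# the cluster ID is hypothetical):
#
#   if _attempt_to_lock_cluster(emr_client, 'j-CLUSTERID', job_key):
#       # ... submit steps with AddJobFlowSteps ...
#       # once our steps are running (or merely added, if the cluster
#       # runs steps concurrently), release the lock:
#       _attempt_to_unlock_cluster(emr_client, 'j-CLUSTERID')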


def _attempt_to_unlock_cluster(emr_client, cluster_id):
    """Release our lock on the given pooled cluster. Only do this if you know
    the cluster is currently running steps (so other jobs won't try to
    join the cluster).

    Returns True if successful, False if not (usually, this means the
    cluster terminated). Cluster locks eventually release themselves,
    so if releasing a lock fails for whatever reason, it's not worth
    trying again.

    Locks expire after a minute anyway (which is less time than it takes to
    run most jobs), so this is mostly useful for preventing problems
    due to clock skew. Also makes unit testing more straightforward.
    """
    try:
        emr_client.remove_tags(
            ResourceId=cluster_id, TagKeys=[_POOL_LOCK_KEY])
        return True
    except ClientError as ex:
        log.debug('removing tags failed: %r' % ex)
        return False