GildedRose-Refactoring-Kata/.venv/lib/python3.12/site-packages/mrjob/logs/bootstrap.py
2025-06-22 13:36:01 +05:30

150 lines
4.9 KiB
Python

# Copyright 2015-2016 Yelp
# Copyright 2017 Yelp
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Parse logs from EMR bootstrap actions (and, eventually, Dataproc
initialization actions)."""
import re
from .task import _parse_task_stderr
from .wrap import _cat_log_lines
from .wrap import _ls_logs
# match cause of failure when there's a problem with bootstrap script. Example:
#
# On the master instance (i-96c21a39), bootstrap action 1 returned a non-zero
# return code
#
# On 2 slave instances (including i-105af6bf and i-b659f519), bootstrap action
# 1 returned a non-zero return code
#
# (EMR doesn't seem to return errors that include both master and slave
# instances)
_BOOTSTRAP_NONZERO_RETURN_CODE_RE = re.compile(
r'^.*\(.*?(?P<node_id>i-[0-9a-f]+).*\)'
r'.*bootstrap action (?P<action_num>\d+)'
r'.*non-zero return code'
r'.*$')
# match a path like:
# <s3_log_dir>/node/i-96c21a39/bootstrap-actions/1/stderr.gz
#
# (may or may not actually be gzipped)
_EMR_BOOTSTRAP_STDERR_PATH_RE = re.compile(
r'^(?P<prefix>.*?/)'
r'node/'
r'(?P<node_id>i-[0-9a-f]+)/'
r'bootstrap-actions/'
r'(?P<action_num>\d+)/'
r'stderr(?P<suffix>\.\w+)?')
def _check_for_nonzero_return_code(reason):
"""Given a reason for cluster termination, check if it's because
a bootstrap action terminated with an error.
If it is, return a dictionary with the keys action_num (0-indexed
bootstrap action number) and node_id (a string). Otherwise return None.
"""
m = _BOOTSTRAP_NONZERO_RETURN_CODE_RE.match(reason)
if m:
return _extract_action_num_and_node_id(m)
else:
return None
def _ls_emr_bootstrap_stderr_logs(
fs, log_dir_stream, action_num=None, node_id=None):
"""Find all stderr from bootstrap actions in the given dir. Sort
so the most recent one comes first, using node ID as a tiebreaker.
(In practice, we look at a single a action on a single node anyway.)
"""
matches = _ls_logs(fs, log_dir_stream, _match_emr_bootstrap_stderr_path,
action_num=None, node_id=None)
return sorted(matches, key=lambda m: (-m['action_num'], m['node_id']))
def _match_emr_bootstrap_stderr_path(path, node_id=None, action_num=None):
"""If *path* corresponds to a bootstrap stderr file, return a dict
with the keys *action_num* (an 0-indexed int) and *node_id*. Otherwise
return None.
Optionally, filter by *action_num* and *node_id*.
"""
m = _EMR_BOOTSTRAP_STDERR_PATH_RE.match(path)
if not m:
return
result = _extract_action_num_and_node_id(m)
if action_num is not None and action_num != result['action_num']:
return None
if node_id is not None and node_id != result['node_id']:
return None
return result
# This strategy assumes we can ask the EMR API which node(s) the error
# occurred on. This is true even after the cluster has shut down, so it's
# a pretty reasonable assumption, even for after-the-fact log parsing.
#
# If we *had* to figure out from logs alone whether a node had an error,
# we'd want to first check the controller file for a line like:
#
# 2016-07-07T23:26:49.565Z ERROR Execution failed with code '1'
#
# and then look in the corresponding stderr file, much like how we handle
# task logs. This seems like overkill at the moment.
def _interpret_emr_bootstrap_stderr(fs, matches, partial=True):
"""Extract errors from bootstrap stderr.
If *partial* is true, stop when we find the first match.
(In practice, we usually target a single file anyway.)
"""
result = {}
for match in matches:
stderr_path = match['path']
task_error = _parse_task_stderr(_cat_log_lines(fs, stderr_path))
if task_error:
task_error = dict(task_error) # make a copy
task_error['path'] = stderr_path
error = dict(
action_num=match['action_num'],
node_id=match['node_id'],
task_error=task_error)
result.setdefault('errors', [])
result['errors'].append(error)
if partial:
result['partial'] = True
break
return result
def _extract_action_num_and_node_id(m):
"""Helper method: Extract *action_num* and *node_id* from the given regex
match. Convert *action_num* to a 0-indexed integer."""
return dict(
action_num=(int(m.group('action_num')) - 1),
node_id=m.group('node_id'),
)