GildedRose-Refactoring-Kata/.venv/lib/python3.12/site-packages/mrjob/logs/errors.py

# -*- coding: utf-8 -*-
# Copyright 2015-2018 Yelp
# Copyright 2019 Yelp
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Merging errors, picking the best one, and displaying it."""
import json

from mrjob.util import unique

from .ids import _time_sort_key


_PYTHON_TRACEBACK_HEADER = 'Traceback (most recent call last):'
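

# Illustrative shape of the error dicts this module works with (not a formal
# schema; just the keys read below, all of them optional):
#
#   {
#       'attempt_id': 'attempt_...',       # Hadoop task attempt ID
#       'container_id': 'container_...',   # YARN container ID
#       'hadoop_error': {'message': '...', 'path': '...',
#                        'start_line': 0, 'num_lines': 1},
#       'task_error': {'message': '...', 'path': '...'},
#       'spark_error': {'message': '...', 'path': '...', 'num_lines': 1},
#       'split': {'path': '...', 'start_line': 0, 'num_lines': 1},
#   }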


def _pick_error(log_interpretation):
    """Pick most recent error from a dictionary possibly containing
    step, history, and task interpretations. Returns None if there
    are no errors.
    """
    errors = _extract_errors(log_interpretation)

    # no point in merging spark errors, which may not be tied to a container
    # because they're not even necessarily on Hadoop
    spark_errors = _pick_spark_errors(errors)
    if spark_errors:
        return spark_errors[0]

    # otherwise, merge hadoop/task errors and pick the most recent one
    attempt_to_container_id = log_interpretation.get('history', {}).get(
        'attempt_to_container_id', {})

    merged_errors = _merge_and_sort_errors(errors, attempt_to_container_id)
    if merged_errors:
        return merged_errors[0]

    return None
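

# Illustrative call (hypothetical data): given
#
#   log_interpretation = {
#       'task': {'errors': [{'attempt_id': 'attempt_...',
#                            'task_error': {'message': '...'}}]},
#       'history': {'attempt_to_container_id': {}},
#   }
#
# _pick_error(log_interpretation) returns that single merged error dict;
# it returns None only when no interpretation contains errors.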


def _extract_errors(log_interpretation):
    """Extract all errors from *log_interpretation*, in no particular
    order."""
    errors = []

    for log_type in ('step', 'history', 'task'):
        errors.extend(
            log_interpretation.get(log_type, {}).get('errors') or ())

    return errors


def _pick_spark_errors(errors):
    """Return the Spark errors from *errors*, sorted so the most useful
    one comes first: prefer errors whose message contains a Python
    traceback, then multi-line errors, then shorter messages over
    longer ones.
    """
    def sort_key(error):
        spark_error = error['spark_error']

        msg = spark_error.get('message') or ''
        num_lines = spark_error.get('num_lines') or 1

        return (
            _PYTHON_TRACEBACK_HEADER in msg,
            num_lines > 1,
            -num_lines,
        )

    return sorted(
        (e for e in errors if e.get('spark_error')),
        key=sort_key, reverse=True)
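

# For example (hypothetical errors): a short Spark error whose message
# contains a Python traceback sorts ahead of a long Java-only stack trace,
# so _pick_error() will surface the Python traceback.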


def _pick_error_attempt_ids(log_interpretation):
    """Pick error attempt IDs from step and history logs, so we know which
    task logs to look at (most relevant first)"""
    errors = _extract_errors(log_interpretation)

    attempt_to_container_id = log_interpretation.get('history', {}).get(
        'attempt_to_container_id', {})

    errors = _merge_and_sort_errors(errors, attempt_to_container_id)

    errors.sort(key=_is_probably_task_error, reverse=True)

    return list(unique(
        error['attempt_id'] for error in errors
        if error.get('attempt_id')))
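

# For example (hypothetical IDs): if both a history error and a task error
# refer to attempt 'attempt_..._m_000001_3', that ID appears only once in
# the result, and attempt IDs from probable task errors ('subprocess failed')
# come first.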


def _is_probably_task_error(error):
    """Used to identify task errors."""
    return ('subprocess failed' in
            error.get('hadoop_error', {}).get('message', ''))
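

# For example (hypothetical messages):
#
#   _is_probably_task_error(
#       {'hadoop_error': {'message': '... subprocess failed with code 1'}})
#   # -> True
#
#   _is_probably_task_error({'hadoop_error': {'message': 'Container killed'}})
#   # -> False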


def _merge_and_sort_errors(errors, attempt_to_container_id=None):
    """Merge errors that refer to the same container (directly, or via
    *attempt_to_container_id*), and return them sorted by recency, most
    relevant first.
    """
    attempt_to_container_id = attempt_to_container_id or {}

    key_to_error = {}

    for error in errors:
        # merge by container_id if we know it
        container_id = error.get('container_id') or (
            attempt_to_container_id.get(error.get('attempt_id')))

        if container_id:
            key = ('container_id', container_id)
        else:
            key = ('time_key', _time_sort_key(error))

        key_to_error.setdefault(key, {})

        # assume redundant fields will match
        key_to_error[key].update(error)

    # wrap sort key to prioritize task errors. See #1429
    def sort_key(key_and_error):
        key, error = key_and_error

        # key[0] is 'container_id' or 'time_key'
        return (bool(error.get('task_error')),
                key[0] == 'container_id',
                key[1:])

    return [error for _, error in
            sorted(key_to_error.items(), key=sort_key, reverse=True)]
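

# Merging example (hypothetical IDs): if a history error carries
# {'attempt_id': 'attempt_X'}, a task error carries
# {'container_id': 'container_Y'}, and *attempt_to_container_id* maps
# 'attempt_X' to 'container_Y', the two dicts are update()d into a single
# error keyed by ('container_id', 'container_Y').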


def _log_probable_cause_of_failure(log, error):
    """Log a "probable cause of failure" message."""
    log.error('\nProbable cause of failure:\n\n%s\n\n' % _format_error(error))


def _format_error(error):
    """Return string to log/print explaining the given error."""
    # it's just sad if we error while trying to explain an error
    try:
        return _format_error_helper(error)
    except:
        return json.dumps(error, indent=2, sort_keys=True)


def _format_error_helper(error):
    """Return string to log/print explaining the given error."""
    result = ''

    spark_error = error.get('spark_error')

    if spark_error:
        spark_error = _trim_spark_error(spark_error)

        result += spark_error.get('message', '')

        if spark_error.get('path'):
            result += '\n\n(from %s)' % _describe_source(spark_error)

        # spark errors typically include both java and Python stacktraces,
        # so it's not helpful to print hadoop/task errors (and there probably
        # wouldn't be any)
    else:
        hadoop_error = error.get('hadoop_error')

        if hadoop_error:
            result += hadoop_error.get('message', '')

            if hadoop_error.get('path'):
                result += '\n\n(from %s)' % _describe_source(hadoop_error)

        # for practical purposes, there's always a Java error with a message,
        # so don't worry too much about spacing.
        task_error = error.get('task_error')
        if task_error:
            if hadoop_error:
                result += '\n\ncaused by:\n\n%s' % (
                    task_error.get('message', ''))
            else:
                result += task_error.get('message', '')

            if task_error.get('path'):
                result += '\n\n(from %s)' % _describe_source(task_error)

    split = error.get('split')
    if split and split.get('path'):
        result += '\n\nwhile reading input from %s' % _describe_source(split)

    return result
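

# Typical output of _format_error() (hypothetical messages/paths) for a
# Hadoop error with a matching task error and input split:
#
#   <hadoop_error message>
#
#   (from <hadoop_error path>)
#
#   caused by:
#
#   <task_error message>
#
#   (from <task_error path>)
#
#   while reading input from <split path>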


def _describe_source(d):
    """return either '<path>' or 'line N of <path>' or 'lines M-N of <path>'.
    """
    path = d.get('path') or ''

    if 'num_lines' in d and 'start_line' in d:
        if d['num_lines'] == 1:
            return 'line %d of %s' % (d['start_line'] + 1, path)
        else:
            return 'lines %d-%d of %s' % (
                d['start_line'] + 1, d['start_line'] + d['num_lines'], path)
    else:
        return path
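

# For example (hypothetical path):
#
#   _describe_source({'path': 'hdfs:///log', 'start_line': 0, 'num_lines': 1})
#   # -> 'line 1 of hdfs:///log'
#
#   _describe_source({'path': 'hdfs:///log', 'start_line': 9, 'num_lines': 3})
#   # -> 'lines 10-12 of hdfs:///log'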


def _trim_spark_error(spark_error):
    """If *spark_error* contains a stack trace followed by a blank line,
    trim off the blank line and everything that follows."""
    spark_error = dict(spark_error)

    parts = spark_error['message'].split('\n\n')

    if len(parts) > 1 and _PYTHON_TRACEBACK_HEADER in parts[0]:
        spark_error['message'] = parts[0]
        spark_error['num_lines'] = len(parts[0].split('\n')) + 1

    return spark_error
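

# For example (hypothetical message): a Spark error whose message is a Python
# traceback followed by a blank line and then a Java stack trace gets trimmed
# to just the Python traceback, with num_lines adjusted to cover the trimmed
# message plus the trailing blank line.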