# -*- coding: utf-8 -*-
# Copyright 2015-2017 Yelp
# Copyright 2018 Yelp
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Code for parsing the history file, which contains counters and error
messages for each task."""

import json
import re
from logging import getLogger

from mrjob.py2 import integer_types
from mrjob.py2 import string_types

from .counters import _sum_counters
from .ids import _add_implied_task_id
from .wrap import _cat_log_lines
from .wrap import _ls_logs

log = getLogger(__name__)

# what the path of a job history file (which contains counters and errors)
# looks like on either YARN or pre-YARN. YARN uses - instead of _ to
# separate fields. This should work for non-streaming jars as well.
_HISTORY_LOG_PATH_RE = re.compile(
r'^(?P<prefix>.*?/)'
r'(?P<job_id>job_\d+_\d{4})'
r'[_-]\d+[_-]hadoop[_-](?P<suffix>\S*)$')
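
# For example (illustrative, made-up paths), a pre-YARN history path looks
# like:
#
#   <dir>/job_201512311928_0001_1451590341378_hadoop_streamjob8025762403845318969.jar
#
# and a YARN one (note the "-" separators and the .jhist extension):
#
#   <dir>/job_1451592123989_0001-1451592605470-hadoop-streamjob1234.jar-1451592631082.jhist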

# escape sequence in pre-YARN history files. Characters inside COUNTERS
# fields are double-escaped
_PRE_YARN_HISTORY_ESCAPE_RE = re.compile(r'\\(.)')

# capture key-value pairs like JOBNAME="streamjob8025762403845318969\.jar"
_PRE_YARN_HISTORY_KEY_PAIR = re.compile(
r'(?P<key>\w+)="(?P<escaped_value>(\\.|[^"\\])*)"', re.MULTILINE)

# an entire line in a pre-YARN history file
_PRE_YARN_HISTORY_RECORD = re.compile(
r'^(?P<type>\w+)'
r'(?P<key_pairs>( ' + _PRE_YARN_HISTORY_KEY_PAIR.pattern + ')*)'
r' \.$', re.MULTILINE)

# capture one group of counters
# this looks like: {(group_id)(group_name)[counter][counter]...}
_PRE_YARN_COUNTER_GROUP_RE = re.compile(
r'{\('
r'(?P<group_id>(\\.|[^)}\\])*)'
r'\)\('
r'(?P<group_name>(\\.|[^)}\\])*)'
r'\)'
r'(?P<counter_list_str>\[(\\.|[^}\\])*\])'
r'}')

# parse a single counter from a counter group (counter_list_str above)
# this looks like: [(counter_id)(counter_name)(amount)]
_PRE_YARN_COUNTER_RE = re.compile(
r'\[\('
r'(?P<counter_id>(\\.|[^)\\])*)'
r'\)\('
r'(?P<counter_name>(\\.|[^)\\])*)'
r'\)\('
r'(?P<amount>\d+)'
r'\)\]')
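
# For example (illustrative; wrapped here for readability), one escaped
# counter group inside a COUNTERS field looks like:
#
#   {(org\.apache\.hadoop\.mapred\.JobInProgress$Counter)(Job Counters )
#    [(TOTAL_LAUNCHED_MAPS)(Launched map tasks)(4)]
#    [(TOTAL_LAUNCHED_REDUCES)(Launched reduce tasks)(1)]}

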
def _ls_history_logs(fs, log_dir_stream, job_id=None):
"""Yield matching files, optionally filtering by *job_id*. Yields dicts
with the keys:
job_id: job_id in path (must match *job_id* if set)
path: path/URI of log file
yarn: true if this is a YARN log file
*log_dir_stream* is a sequence of lists of log dirs. For each list, we'll
look in all directories, and if we find any logs, we'll stop. (The
assumption is that subsequent lists of log dirs would have copies
of the same logs, just in a different location.
"""
return _ls_logs(fs, log_dir_stream, _match_history_log_path,
job_id=job_id)
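
# A minimal usage sketch for _ls_history_logs() (hypothetical directories;
# fs is whatever filesystem object the caller already has). Each inner
# list of log dirs is tried in turn until one yields matches:
#
#   matches = _ls_history_logs(
#       fs,
#       [['hdfs:///tmp/hadoop-yarn/staging/history/done'],
#        ['hdfs:///mapred/history/done']],
#       job_id='job_1451592123989_0001')

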
def _match_history_log_path(path, job_id=None):
"""Yield paths/uris of all job history files in the given directories,
optionally filtering by *job_id*.
"""
m = _HISTORY_LOG_PATH_RE.match(path)
if not m:
return None
if not (job_id is None or m.group('job_id') == job_id):
return None
# TODO: couldn't manage to include .jhist in regex; an optional
# group has less priority than a non-greedy match, apparently
return dict(job_id=m.group('job_id'), yarn='.jhist' in m.group('suffix'))
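
# For example (illustrative, made-up path), _match_history_log_path()
# reports yarn=True for a path with the .jhist extension:
#
#   >>> _match_history_log_path(
#   ...     '/history/done/job_1451592123989_0001-1451592605470-hadoop-'
#   ...     'QuasiMonteCarlo-1451592605470.jhist')
#   {'job_id': 'job_1451592123989_0001', 'yarn': True}

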
def _interpret_history_log(fs, matches):
"""Extract counters and errors from history log.
Matches is a list of dicts with the keys *job_id* and *yarn*
(see :py:func:`_ls_history_logs()`)
We expect *matches* to contain at most one match; further matches
will be ignored.
Returns a dictionary with the keys *counters* and *errors*.
"""
# we expect to go through this for loop 0 or 1 times
for match in matches:
path = match['path']
if match['yarn']:
            result = _parse_yarn_history_log(_cat_log_lines(fs, path))
else:
result = _parse_pre_yarn_history_log(_cat_log_lines(fs, path))
# patch path, task_id, etc. into errors
for error in result.get('errors') or ():
if 'hadoop_error' in error:
error['hadoop_error']['path'] = path
_add_implied_task_id(error)
return result
return {}
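
# The result of _interpret_history_log() looks roughly like this
# (illustrative, with made-up values):
#
#   {'counters': {'File System Counters':
#                     {'HDFS: Number of bytes read': 300}},
#    'errors': [{'hadoop_error': {'message': '...',
#                                 'path': 'hdfs://.../....jhist',
#                                 'start_line': 23,
#                                 'num_lines': 1},
#                'attempt_id': 'attempt_1451592123989_0001_m_000000_0',
#                'task_id': 'task_1451592123989_0001_m_000000'}]}

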
def _parse_yarn_history_log(lines):
"""Collect useful info from a YARN history file, dealing gracefully
with unexpected data structures.
This returns a dictionary which may contain the following keys:
attempt_to_container_id: map from attempt_id to container_id (used
to find task logs corresponding to failed attempts)
counters: map from group to counter to amount. If job failed, we sum
counters for succesful tasks
errors: a list of dictionaries with the keys:
hadoop_error:
message: lines of error, as as string
start_line: first line of log containing the error (0-indexed)
num_lines: # of lines of log containing the error
task_id: ID of task with this error
attempt_id: ID of task attempt with this error
"""
result = {}
task_to_counters = {} # used for successful tasks in failed jobs
for line_num, line in enumerate(lines):
        # skip blank lines and the "Avro-Json" header
        if not line.startswith('{'):
            continue

        try:
            record = json.loads(line)
        except ValueError:
            # not valid JSON; ignore this line
            continue
record_type = record.get('type')
if not isinstance(record_type, string_types):
continue
# extract events. Looks like there's just one per record
event_record = record.get('event')
if not isinstance(event_record, dict):
continue
        events = [e for e in event_record.values()
                  if isinstance(e, dict)]
# update container_id -> attempt_id mapping
for event in events:
if 'attemptId' in event and 'containerId' in event:
result.setdefault('attempt_to_container_id', {})
result['attempt_to_container_id'][
event['attemptId']] = event['containerId']
if record_type.endswith('_ATTEMPT_FAILED'):
for event in events:
err_msg = event.get('error')
if not (err_msg and isinstance(err_msg, string_types)):
continue
error = dict(
hadoop_error=dict(
message=err_msg,
start_line=line_num,
num_lines=1))
if isinstance(event.get('taskid'), string_types):
error['task_id'] = event['taskid']
if isinstance(event.get('attemptId'), string_types):
error['attempt_id'] = event['attemptId']
result.setdefault('errors', [])
result['errors'].append(error)
elif record_type == 'TASK_FINISHED':
for event in events:
task_id = event.get('taskid')
if not isinstance(task_id, string_types):
continue
counters_record = event.get('counters')
if not isinstance(counters_record, dict):
continue
task_to_counters[task_id] = _extract_yarn_counters(
counters_record)
elif record_type == 'JOB_FINISHED':
for event in events:
# mapCounters and reduceCounters are also available
counters_record = event.get('totalCounters')
if not isinstance(counters_record, dict):
continue
result['counters'] = _extract_yarn_counters(counters_record)
# if job failed, patch together counters from successful tasks
if 'counters' not in result and task_to_counters:
result['counters'] = _sum_counters(*task_to_counters.values())
return result
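
# An illustrative .jhist line as consumed by _parse_yarn_history_log()
# (heavily trimmed, with made-up IDs); each line is a JSON record wrapping
# a single event:
#
#   {"type": "MAP_ATTEMPT_FAILED", "event":
#    {"org.apache.hadoop.mapreduce.jobhistory.TaskAttemptUnsuccessfulCompletion":
#     {"taskid": "task_1451592123989_0001_m_000000",
#      "attemptId": "attempt_1451592123989_0001_m_000000_0",
#      "error": "Error: java.lang.RuntimeException: ..."}}}

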
def _extract_yarn_counters(counters_record):
"""Convert Avro-Json counter data structure to our
group -> counter -> amount format.
This deals gracefully with unexpected data structures.
"""
if not isinstance(counters_record, dict):
return {}
group_records = counters_record.get('groups')
if not isinstance(group_records, list):
return {}
counters = {}
for group_record in group_records:
if not isinstance(group_record, dict):
continue
group = group_record.get('displayName')
if not isinstance(group, string_types):
continue
counter_records = group_record.get('counts')
if not isinstance(counter_records, list):
continue
        for counter_record in counter_records:
            # tolerate non-dict items, in keeping with the docstring's
            # promise to deal gracefully with unexpected data
            if not isinstance(counter_record, dict):
                continue

            counter = counter_record.get('displayName')
            if not isinstance(counter, string_types):
                continue
# in YARN, counters can have an amount of 0. The Hadoop command
# prints them out, so we'll parse them
amount = counter_record.get('value')
            if not isinstance(amount, integer_types):
                continue
counters.setdefault(group, {})
counters[group].setdefault(counter, 0)
counters[group][counter] += amount
return counters
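
# An illustrative counters_record for _extract_yarn_counters() (trimmed;
# values are made up):
#
#   {'name': 'COUNTERS',
#    'groups': [{'name': 'org.apache.hadoop.mapreduce.FileSystemCounter',
#                'displayName': 'File System Counters',
#                'counts': [{'name': 'HDFS_BYTES_READ',
#                            'displayName': 'HDFS: Number of bytes read',
#                            'value': 300}]}]}
#
# would be converted to:
#
#   {'File System Counters': {'HDFS: Number of bytes read': 300}}

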
def _parse_pre_yarn_history_log(lines):
"""Collect useful info from a pre-YARN history file.
See :py:func:`_parse_yarn_history_log` for return format.
"""
# tantalizingly, STATE_STRING contains the split (URI and line numbers)
# read, but only for successful tasks, which doesn't help with debugging
result = {}
task_to_counters = {} # used for successful tasks in failed jobs
for record in _parse_pre_yarn_history_records(lines):
fields = record['fields']
# if job is successful, we get counters for the entire job at the end
if record['type'] == 'Job' and 'COUNTERS' in fields:
result['counters'] = _parse_pre_yarn_counters(fields['COUNTERS'])
# otherwise, compile counters for each successful task
#
# Note: this apparently records a higher total than the task tracker
# (possibly some tasks are duplicates?). Couldn't figure out the logic
# behind this while looking at the history file
elif (record['type'] == 'Task' and
'COUNTERS' in fields and 'TASKID' in fields):
task_id = fields['TASKID']
counters = _parse_pre_yarn_counters(fields['COUNTERS'])
task_to_counters[task_id] = counters
# only want FAILED (not KILLED) tasks with non-blank errors
elif (record['type'] in ('MapAttempt', 'ReduceAttempt') and
'TASK_ATTEMPT_ID' in fields and
fields.get('TASK_STATUS') == 'FAILED' and
fields.get('ERROR')):
result.setdefault('errors', [])
result['errors'].append(dict(
hadoop_error=dict(
message=fields['ERROR'],
start_line=record['start_line'],
num_lines=record['num_lines']),
attempt_id=fields['TASK_ATTEMPT_ID']))
# if job failed, patch together counters from successful tasks
if 'counters' not in result and task_to_counters:
result['counters'] = _sum_counters(*task_to_counters.values())
return result


def _parse_pre_yarn_history_records(lines):
r"""Yield records from the given sequence of lines. For example,
a line like this:
Task TASKID="task_201512311928_0001_m_000003" \
TASK_TYPE="MAP" START_TIME="1451590341378" \
SPLITS="/default-rack/172\.31\.22\.226" .
into a record like:
{
'fields': {'TASKID': 'task_201512311928_0001_m_00000',
'TASK_TYPE': 'MAP',
'START_TIME': '1451590341378',
'SPLITS': '/default-rack/172.31.22.226'},
'type': 'Task',
'line_num': 0,
'num_lines': 1,
}
This handles unescaping values, but doesn't do the further
unescaping needed to process counters. It can also handle multi-line
records (e.g. for Java stack traces).
"""
def yield_record_strings(lines):
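        # group physical lines into records; a record ends with " ." at
        # the end of a line, and may span several lines (e.g. Java stack
        # traces)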
record_lines = []
start_line = 0
for line_num, line in enumerate(lines):
record_lines.append(line)
if line.endswith(' .\n'):
yield start_line, len(record_lines), ''.join(record_lines)
record_lines = []
start_line = line_num + 1

    for start_line, num_lines, record_str in yield_record_strings(lines):
record_match = _PRE_YARN_HISTORY_RECORD.match(record_str)
if not record_match:
continue
record_type = record_match.group('type')
key_pairs = record_match.group('key_pairs')
fields = {}
for m in _PRE_YARN_HISTORY_KEY_PAIR.finditer(key_pairs):
key = m.group('key')
value = _pre_yarn_history_unescape(m.group('escaped_value'))
fields[key] = value
yield dict(
fields=fields,
num_lines=num_lines,
start_line=start_line,
type=record_type,
)


def _parse_pre_yarn_counters(counters_str):
"""Parse a COUNTERS field from a pre-YARN history file.
Returns a map from group to counter to amount.
"""
counters = {}
for group_match in _PRE_YARN_COUNTER_GROUP_RE.finditer(counters_str):
group_name = _pre_yarn_history_unescape(
group_match.group('group_name'))
group_counters = {}
for counter_match in _PRE_YARN_COUNTER_RE.finditer(
group_match.group('counter_list_str')):
counter_name = _pre_yarn_history_unescape(
counter_match.group('counter_name'))
amount = int(counter_match.group('amount'))
group_counters[counter_name] = amount
counters[group_name] = group_counters
return counters
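
# For example (illustrative, made-up counter values),
# _parse_pre_yarn_counters() turns an escaped COUNTERS value into
# nested dicts:
#
#   >>> _parse_pre_yarn_counters(
#   ...     '{(FileSystemCounters)(FileSystemCounters)'
#   ...     '[(HDFS_BYTES_READ)(HDFS_BYTES_READ)(300)]}')
#   {'FileSystemCounters': {'HDFS_BYTES_READ': 300}}

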
def _pre_yarn_history_unescape(s):
"""Un-escape string from a pre-YARN history file."""
return _PRE_YARN_HISTORY_ESCAPE_RE.sub(r'\1', s)
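
# For example (illustrative):
#
#   >>> _pre_yarn_history_unescape(r'172\.31\.22\.226')
#   '172.31.22.226'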