# -*- coding: utf-8 -*-
# Copyright 2015-2017 Yelp
# Copyright 2018 Yelp and Google, Inc.
# Copyright 2019 Yelp
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Parse "step" logs, which are the stdout/stderr/syslog of the
Hadoop streaming command (we call them this because EMR puts them
in the steps/ subdir of the logs on S3)."""

import errno
import logging
import re
from logging import getLogger

from .ids import _add_implied_job_id
from .ids import _add_implied_task_id
from .log4j import _parse_hadoop_log4j_records
from .task import _parse_task_stderr
from .wrap import _cat_log_lines
from .wrap import _ls_logs

# path of step logs (these only exist on EMR). Step logs on S3 are
# rotated and timestamped with YYYY-MM-DD-HH
_EMR_STEP_LOG_PATH_RE = re.compile(
r'^(?P<prefix>.*?/)'
r'(?P<step_id>s-[A-Z0-9]+)/'
r'(?P<log_type>syslog|stderr)'
r'(\.(?P<timestamp>[\d-]+))?(?P<suffix>\.\w+)?$')
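# For example (bucket, cluster, and step IDs below are made up; this just
# illustrates the shape of path the regex above accepts):
#
#   s3://mybucket/logs/j-CLUSTERID/steps/s-2EXAMPLE123ABC/syslog.2015-12-11-06.gz
#
# matches with step_id 's-2EXAMPLE123ABC' and timestamp '2015-12-11-06',
# while an unrotated .../s-2EXAMPLE123ABC/stderr.gz matches with no timestamp.
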
# hadoop streaming always prints "packageJobJar..." to stdout,
# and prints Streaming Command Failed! to stderr on failure
_HADOOP_STREAMING_NON_LOG4J_LINE_RE = re.compile(
r'^(packageJobJar: .*|Streaming Command Failed!)$')
# first line of counters message
_INDENTED_COUNTERS_MESSAGE_RE = re.compile(
r'^Counters: (?P<num_counters>\d+)\s*$', re.MULTILINE)
# header for a group of counters
_INDENTED_COUNTER_GROUP_RE = re.compile(r'^(?P<indent>\s+)(?P<group>.*)$')
# line for a counter
_INDENTED_COUNTER_RE = re.compile(
r'^(?P<indent>\s+)(?P<counter>.*)=(?P<amount>\d+)\s*$')
# line specifying output directory
_OUTPUT_DIRECTORY_RE = re.compile(
r'^Output( directory)?:'
r'\s+(?P<output_dir>\S+://\S+)\s*$')
# how to get application_id from YARN
_SUBMITTED_APPLICATION_RE = re.compile(
r'^Submitted application (?P<application_id>application_\d+_\d{4})\s*$')
# how to get job_id (all versions)
_RUNNING_JOB_RE = re.compile(
r'^Running job: (?P<job_id>job_\d+_\d{4})\s*$')
# job progress (YARN)
# no need to make this work for pre-YARN, only Dataproc runner uses it
_JOB_PROGRESS_RE = re.compile(
r'^\s*map\s+(?P<map>\d+)%\s+reduce\s+(?P<reduce>\d+)%\s*$')
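# Hypothetical examples of message bodies these match, one per regex above
# (the IDs and output path reuse the examples from the docstrings below;
# the progress percentages are made up):
#
#   Output directory: hdfs:///user/hadoop/tmp/my-output-dir
#   Submitted application application_1449857544442_0002
#   Running job: job_201512112247_0003
#    map 100% reduce 52%
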
# if you specify a bad jar, this is all you get
_NOT_A_VALID_JAR_RE = re.compile(r'^\s*Not a valid JAR:.*')
# YARN prints this (sometimes followed by a Java exception) when tasks fail
_TASK_ATTEMPT_FAILED_RE = re.compile(
r'^Task Id *:'
r' (?P<attempt_id>attempt_\d+_\d{4}_[mr]_\d+_\d+),'
r' Status *: FAILED$',
re.MULTILINE)
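# Hypothetical example of the first line of a matching message (the attempt
# ID is made up):
#
#   Task Id : attempt_201512112247_0003_m_000000_0, Status : FAILED
#
# Any lines that follow (often a Java stack trace) are used as the error
# message in _parse_step_syslog_from_log4j_records() below.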


log = getLogger(__name__)


def _ls_emr_step_syslogs(fs, log_dir_stream, step_id=None):
"""Yield matching step logs, optionally filtering by *step_id*.
Yields dicts with the keys:
path: path/URI of step file
step_id: step_id in *path* (must match *step_id* if set)
"""
matches = _ls_logs(fs, log_dir_stream, _match_emr_step_syslog_path,
step_id=step_id)
return sorted(matches, key=_match_sort_key)


def _ls_emr_step_stderr_logs(fs, log_dir_stream, step_id=None):
"""Yield matching step logs, optionally filtering by *step_id*.
Yields dicts with the keys:
path: path/URI of step file
step_id: step_id in *path* (must match *step_id* if set)
"""
matches = _ls_logs(fs, log_dir_stream, _match_emr_step_stderr_path,
step_id=step_id)
# we basically want to tail the stderr log, so search rotated
# logs in reverse
return sorted(matches, key=_match_sort_key, reverse=True)


def _match_sort_key(m):
"""sort key which treats empty timestamp as most recent"""
return (m['timestamp'] is None, m['timestamp'] or '')


def _match_emr_step_syslog_path(path, step_id=None):
"""Match path of a step syslog, optionally filtering by *step_id*.
If there is a match, return a dict with the keys *step_id* and
*timestamp* (a string). Otherwise, returns None.
"""
return _match_emr_step_log_path(path, 'syslog', step_id=step_id)


def _match_emr_step_stderr_path(path, step_id=None):
"""Match path of a step stderr log, optionally filtering by *step_id*.
If there is a match, return a dict with the keys *step_id* and
*timestamp* (a string). Otherwise, returns None.
"""
return _match_emr_step_log_path(path, 'stderr', step_id=step_id)


def _match_emr_step_log_path(path, log_type, step_id=None):
"""Helper for :py:func:`_match_emr_step_syslog_path` and
:py:func:`_match_emr_step_stderr_path`
"""
m = _EMR_STEP_LOG_PATH_RE.match(path)
if not m:
return None
if m.group('log_type') != log_type:
return None
if not (step_id is None or m.group('step_id') == step_id):
return None
return dict(step_id=m.group('step_id'), timestamp=m.group('timestamp'))
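# A hypothetical usage sketch (the path and step ID are made up):
#
#   _match_emr_step_syslog_path(
#       's3://mybucket/logs/j-CLUSTERID/steps/s-2EXAMPLE123ABC/syslog.gz')
#   # -> {'step_id': 's-2EXAMPLE123ABC', 'timestamp': None}
#
#   _match_emr_step_syslog_path(
#       's3://mybucket/logs/j-CLUSTERID/steps/s-2EXAMPLE123ABC/syslog.gz',
#       step_id='s-SOMEOTHERSTEP')
#   # -> None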


def _interpret_emr_step_syslog(fs, matches):
"""Extract information from step syslog (see :py:func:`_parse_step_log()`),
which may be split into several chunks by timestamp"""
# going to merge results for each log into final result
errors = []
result = {}
for match in matches:
path = match['path']
interpretation = _parse_step_syslog(_cat_log_lines(fs, path))
result.update(interpretation)
for error in result.get('errors') or ():
if 'hadoop_error' in error:
error['hadoop_error']['path'] = path
_add_implied_task_id(error)
errors.append(error)
_add_implied_job_id(result)
if errors:
result['errors'] = errors
return result


def _interpret_new_dataproc_step_stderr(step_interpretation, new_lines):
"""Incrementally update *step_interpretation* (a dict) with information
from new lines read from Hadoop job driver output on Dataproc."""
return _parse_step_syslog_from_log4j_records(
_parse_hadoop_log4j_records(new_lines),
step_interpretation)


def _interpret_emr_step_stderr(fs, matches):
"""Extract information from step stderr (see
:py:func:`~mrjob.logs.task._parse_task_stderr()`),
which may be split into several chunks by timestamp"""
for match in matches:
path = match['path']
error = _parse_task_stderr(_cat_log_lines(fs, path))
if error:
error['path'] = path
# We're essentially just tailing the stderr log, so stop when we
# find an error.
return dict(errors=[dict(task_error=error)])
return {}
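# When one of the matched stderr logs contains an error, the result is of the
# form {'errors': [{'task_error': {...}}]}, where the task_error dict is
# whatever _parse_task_stderr() returned (see mrjob.logs.task) plus the
# 'path' key set above; if no error is found in any log, the result is {}.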


def _eio_to_eof(pty_master):
"""Yield lines from a PTY, gracefully handling an ``IOError`` with
``errno == EIO`` as end-of-file."""
try:
for line in pty_master:
yield line
except IOError as e:
# this is just the PTY's way of saying goodbye
if e.errno == errno.EIO:
return
else:
raise
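# A hypothetical usage sketch of how a pty-backed read loop would use this
# (the os/pty wiring below is illustrative, not the runner's actual code):
#
#   import os
#   import pty
#
#   master_fd, slave_fd = pty.openpty()
#   # ... spawn the hadoop jar command with its stderr attached to slave_fd ...
#   with os.fdopen(master_fd, 'rb') as pty_master:
#       for line in _eio_to_eof(pty_master):
#           _log_line_from_driver(line.decode('utf_8', 'replace').rstrip())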


def _interpret_hadoop_jar_command_stderr(lines, record_callback=None):
"""Parse *lines* from the ``hadoop jar`` command's stderr (lines can be
either bytes or unicode). Works like :py:func:`_parse_step_syslog` (same
return format) with a few extra features to handle the output of the
``hadoop jar`` command on the fly:
- Pre-filters non-log4j stuff from Hadoop Streaming so it doesn't
get treated as part of a multi-line message
- Optionally calls *record_callback* for each log4j record (see
:py:func:`~mrjob.logs.log4j._parse_hadoop_log4j_records`).
"""
def pre_filter(line):
return bool(_HADOOP_STREAMING_NON_LOG4J_LINE_RE.match(line))
def yield_records():
for record in _parse_hadoop_log4j_records(
lines, pre_filter=pre_filter):
if record_callback:
record_callback(record)
yield record
result = _parse_step_syslog_from_log4j_records(yield_records())
_add_implied_job_id(result)
for error in result.get('errors') or ():
_add_implied_task_id(error)
return result
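# A hypothetical usage sketch (the subprocess wiring is made up; callers pass
# whatever file object the hadoop jar command's stderr is read from):
#
#   from subprocess import Popen, PIPE
#
#   step_proc = Popen(['hadoop', 'jar', streaming_jar] + args,
#                     stdout=PIPE, stderr=PIPE)
#   interpretation = _interpret_hadoop_jar_command_stderr(
#       step_proc.stderr, record_callback=_log_log4j_record)
#   interpretation.get('job_id')      # e.g. 'job_201512112247_0003'
#   interpretation.get('output_dir')  # URI of the step's output, if any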


def _parse_step_syslog(lines):
"""Parse the syslog from the ``hadoop jar`` command.
Returns a dictionary which potentially contains the following keys:
application_id: a string like 'application_1449857544442_0002'. Only
set on YARN
counters: a map from counter group -> counter -> amount, or None if
no counters found (only YARN prints counters)
errors: a list of errors, with the following keys:
hadoop_error:
            message: lines of error, as a string
start_line: first line of log containing the error (0-indexed)
num_lines: # of lines of log containing the error
attempt_id: ID of task attempt with this error
job_id: a string like 'job_201512112247_0003'. Should always be set
output_dir: a URI like 'hdfs:///user/hadoop/tmp/my-output-dir'. Should
always be set on success.
"""
return _parse_step_syslog_from_log4j_records(
_parse_hadoop_log4j_records(lines))
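# For a successful step, the parsed result might look something like this
# (all values are hypothetical; the application_id and output_dir echo the
# docstring examples above):
#
#   {
#       'application_id': 'application_1449857544442_0002',
#       'counters': {'Job Counters': {'Launched map tasks': 2}},
#       'job_id': 'job_1449857544442_0002',
#       'output_dir': 'hdfs:///user/hadoop/tmp/my-output-dir',
#   }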


def _parse_step_syslog_from_log4j_records(records, step_interpretation=None):
"""Pulls errors, counters, IDs, etc. from log4j records
emitted by Hadoop.
This powers :py:func:`_parse_step_syslog` and
:py:func:`_interpret_hadoop_jar_command_stderr`.
"""
if step_interpretation is None:
result = {}
else:
result = step_interpretation
for record in records:
message = record['message']
# counters
if _is_counter_log4j_record(record):
result['counters'] = _parse_indented_counters(
message.splitlines())
continue
# output_dir
m = _OUTPUT_DIRECTORY_RE.match(message)
if m:
result['output_dir'] = m.group('output_dir')
continue
# application_id
m = _SUBMITTED_APPLICATION_RE.match(message)
if m:
result['application_id'] = m.group('application_id')
continue
# job_id
m = _RUNNING_JOB_RE.match(message)
if m:
result['job_id'] = m.group('job_id')
continue
# progress
m = _JOB_PROGRESS_RE.match(message)
if m:
result['progress'] = dict(
map=int(m.group('map')),
reduce=int(m.group('reduce')),
message=message,
)
# invalid jar
m = _NOT_A_VALID_JAR_RE.match(message)
if m:
error = dict(
hadoop_error=dict(
message=message,
num_lines=record['num_lines'],
start_line=record['start_line'],
),
)
result.setdefault('errors', [])
result['errors'].append(error)
# task failure
m = _TASK_ATTEMPT_FAILED_RE.match(message)
if m:
error_str = '\n'.join(message.splitlines()[1:])
if not error_str: # if no exception, print something
error_str = message
error = dict(
attempt_id=m.group('attempt_id'),
hadoop_error=dict(
message=error_str,
num_lines=record['num_lines'],
start_line=record['start_line'],
)
)
result.setdefault('errors', [])
result['errors'].append(error)
return result


def _is_counter_log4j_record(record):
"""Is this the record containing counters?
(HadoopJobRunner doesn't want to log this because it's very long
and it has its own way of displaying counters.)
"""
return bool(_INDENTED_COUNTERS_MESSAGE_RE.match(record['message']))


def _parse_indented_counters(lines):
"""Parse counters in the indented format output/logged by the
Hadoop binary.
Takes input as lines (should not include log record stuff) and
returns a map from counter group to counter to amount.
"""
counters = {} # map group -> counter -> amount
group = None
group_indent = None
for line in lines:
if not (group is None or group_indent is None):
m = _INDENTED_COUNTER_RE.match(line)
if m and len(m.group('indent')) > group_indent:
counter = m.group('counter')
amount = int(m.group('amount'))
counters.setdefault(group, {})
counters[group][counter] = amount
continue
m = _INDENTED_COUNTER_GROUP_RE.match(line)
if m:
group = m.group('group')
group_indent = len(m.group('indent'))
elif not _INDENTED_COUNTERS_MESSAGE_RE.match(line):
log.warning('unexpected counter line: %s' % line)
return counters
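# A small hedged example (counter names and amounts are made up, but the
# indentation scheme is the one the regexes above expect):
#
#   _parse_indented_counters([
#       'Counters: 2',
#       '        File System Counters',
#       '                FILE: Number of bytes read=86',
#       '                FILE: Number of bytes written=359982',
#   ])
#   # -> {'File System Counters': {'FILE: Number of bytes read': 86,
#   #                              'FILE: Number of bytes written': 359982}}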


def _log_line_from_driver(line, level=None):
"""Log ``' <line>'``. *line* should be a string.
Optionally specify a logging level (default is logging.INFO).
"""
log.log(level or logging.INFO, ' %s' % line)


def _log_log4j_record(record):
"""Log a log4j message at the appropriate logging level"""
level = getattr(logging, record.get('level') or '', None)
_log_line_from_driver(record['message'], level=level)