GildedRose-Refactoring-Kata/.venv/lib/python3.12/site-packages/mrjob/logs/task.py
# -*- coding: utf-8 -*-
# Copyright 2015-2018 Yelp
# Copyright 2019 Yelp
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Parse "task" logs, which are the syslog and stderr for each individual
task and typically appear in the userlogs/ directory."""
from collections import defaultdict
from logging import getLogger
import re
from .ids import _add_implied_task_id
from .ids import _to_job_id
from .log4j import _parse_hadoop_log4j_records
from .wrap import _cat_log_lines
from .wrap import _ls_logs
from mrjob import parse
log = getLogger(__name__)
# Match a java exception, possibly preceded by 'PipeMapRed failed!', etc.
# use this with search()
_JAVA_TRACEBACK_RE = re.compile(
r'\s+at .*\((.*\.(java|scala):\d+|Native Method)\)$',
re.MULTILINE)
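# For illustration only (the class and line number are hypothetical), a
# stack-frame line this pattern is meant to catch looks like:
#
#     at org.apache.hadoop.streaming.PipeMapRed.waitOutputThreads(PipeMapRed.java:322)
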
# Match an error stating that Spark's subprocess has failed (and thus we
# should read stdout)
_SPARK_APP_EXITED_RE = re.compile(
r'^\s*User application exited with status \d+\s*$')
# the name of the logger that logs the above
_SPARK_APP_MASTER_LOGGER = 'ApplicationMaster'
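# For illustration (the exit status is hypothetical), the message this
# catches, as logged at ERROR level by the logger named above, looks like:
#
#     User application exited with status 1
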
# this seems to only happen for S3. Not sure if this happens in YARN
_OPENING_FOR_READING_RE = re.compile(
r"^Opening '(?P<path>.*?)' for reading$")
# what log paths look like pre-YARN
_PRE_YARN_TASK_LOG_PATH_RE = re.compile(
r'^(?P<prefix>.*?/)'
r'(?P<attempt_id>attempt_(?P<timestamp>\d+)_(?P<step_num>\d+)_'
r'(?P<task_type>[mr])_(?P<task_num>\d+)_'
r'(?P<attempt_num>\d+))/'
r'(?P<log_type>stderr|syslog)(?P<suffix>\.\w{1,3})?$')
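# For illustration (the IDs are hypothetical), a path this matches:
#
#     /userlogs/attempt_201512232143_0008_m_000001_3/syslog
#
# which yields attempt_id='attempt_201512232143_0008_m_000001_3' and
# log_type='syslog'
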
# ignore counters and status (only happens in sim runners, where task stderr
# is dumped straight to a file).
# convert from bytes regex to text regex
_COUNTER_RE = re.compile(parse._COUNTER_RE.pattern.decode('ascii'))
_STATUS_RE = re.compile(parse._STATUS_RE.pattern.decode('ascii'))
# ignore warnings about initializing log4j in task stderr
_LOG4J_WARN_RE = re.compile(r'^log4j:WARN .*$')
# also ignore counters and status messages (this only happens in
# local mode, where there's no real Hadoop to filter them out)
_TASK_STDERR_IGNORE_RES = [
_COUNTER_RE,
_STATUS_RE,
_LOG4J_WARN_RE,
]
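# For illustration (contents are hypothetical), the kinds of stderr lines
# these patterns skip look roughly like:
#
#     reporter:counter:my_group,my_counter,1
#     reporter:status:processing line 10000
#     log4j:WARN No appenders could be found for logger (example.Logger).
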
# this is the start of a Java stacktrace that Hadoop 1 always logs to
# stderr when tasks fail (see #1430)
_SUBPROCESS_FAILED_STACK_TRACE_START = re.compile(
r'^java\.lang\.RuntimeException: PipeMapRed\.waitOutputThreads\(\):'
r' subprocess failed with code .*$')
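# For illustration (the exit code is hypothetical), the line this matches
# looks like:
#
#     java.lang.RuntimeException: PipeMapRed.waitOutputThreads(): subprocess failed with code 1
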
# message telling us about an (input) split. Looks like this:
#
# Processing split: hdfs://ddf64167693a:9000/path/to/bootstrap.sh:0+335
_YARN_INPUT_SPLIT_RE = re.compile(
r'^Processing split:\s+(?P<path>.*)'
r':(?P<start_line>\d+)\+(?P<num_lines>\d+)$')
# what log paths look like on YARN (also used for Spark, hence stdout)
_YARN_TASK_LOG_PATH_RE = re.compile(
r'^(?P<prefix>.*?/)'
r'(?P<application_id>application_\d+_\d{4})/'
r'(?P<container_id>container(_\d+)+)/'
r'(?P<log_type>stderr|stdout|syslog)(?P<suffix>\.\w{1,3})?$')
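# For illustration (the IDs are hypothetical), a path this matches:
#
#     /userlogs/application_1450486922681_0004/container_1450486922681_0004_01_000003/stderr
#
# which yields application_id='application_1450486922681_0004',
# container_id='container_1450486922681_0004_01_000003', and
# log_type='stderr'
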
def _ls_task_logs(fs, log_dir_stream, application_id=None, job_id=None,
error_attempt_ids=None, attempt_to_container_id=None):
"""Yield matching logs, optionally filtering by application_id
or job_id.
This will yield matches for stderr logs first, followed by syslogs. stderr
logs will have a 'syslog' field pointing to the match for the
corresponding syslog (stderr logs without a corresponding syslog won't be
included).
"""
return _ls_task_logs_helper(
fs, log_dir_stream, is_spark=False,
application_id=application_id, job_id=job_id,
error_attempt_ids=error_attempt_ids,
attempt_to_container_id=attempt_to_container_id)
def _ls_spark_task_logs(
fs, log_dir_stream, application_id=None, job_id=None,
error_attempt_ids=None, attempt_to_container_id=None):
"""Yield matching Spark logs, optionally filtering by application_id
or job_id.
This will yield matches for stderr logs only. stderr
logs will have a 'stdout' field pointing to the match for the
corresponding stdout file; whether we process this depends on the content
of the stderr file.
"""
return _ls_task_logs_helper(
fs, log_dir_stream, is_spark=True,
application_id=application_id, job_id=job_id,
error_attempt_ids=error_attempt_ids,
attempt_to_container_id=attempt_to_container_id)
def _ls_task_logs_helper(fs, log_dir_stream, is_spark,
application_id=None, job_id=None,
error_attempt_ids=None, attempt_to_container_id=None):
"""Helper for _ls_task_logs() and _ls_spark_task_logs().
*syslog_type* is the type of the log to pair with stderr logs.
This is actually a bit weird; on Spark, 'stderr' is the equivalent
of syslog in Streaming, and 'stdout' is the equivalent of Streaming's
stderr.
For Streaming, we want stderr logs with corresponding syslogs; if, after
listing all task logs, we don't find any, we fall back to syslogs alone.
For Spark, we want stderr logs (which are equivalent to Streaming syslogs)
with corresponding stdouts; if we don't find any of those, we fall back to
stderr logs without corresponding stdouts.
"""
syslog_type = 'stdout' if is_spark else 'syslog'
error_attempt_ids = error_attempt_ids or ()
# figure out subdirs to look for logs in
if attempt_to_container_id:
# YARN
subdirs = [
attempt_to_container_id[a] for a in error_attempt_ids
if a in attempt_to_container_id]
else:
subdirs = list(error_attempt_ids)
# only look in subdirs corresponding to failed attempts
if subdirs:
log_subdir_stream = ([
fs.join(log_dir, subdir)
for subdir in subdirs
for log_dir in log_dir_list
] for log_dir_list in log_dir_stream)
else:
log_subdir_stream = log_dir_stream
key_to_type_to_match = defaultdict(dict)
# less desirable errors to yield if we don't find the ones we want
other_matches = []
for match in _ls_logs(
fs, log_subdir_stream, _match_task_log_path, is_spark,
application_id=application_id,
job_id=job_id):
log_key = _log_key(match)
log_type = match['log_type']
if log_type not in ('stderr', syslog_type):
continue # don't care
type_to_match = key_to_type_to_match[log_key]
if log_type in type_to_match:
continue # already seen
type_to_match[log_type] = match
# yield stderrs with syslogs as we find them
if 'stderr' in type_to_match and syslog_type in type_to_match:
stderr_match = type_to_match['stderr']
syslog_match = type_to_match[syslog_type]
stderr_match[syslog_type] = syslog_match
yield stderr_match
if log_type == ('stderr' if is_spark else syslog_type):
other_matches.append(match)
# yield logs that don't have both syslog and stderr
for other_match in other_matches:
if syslog_type not in other_match: # already yielded
yield other_match
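
# A rough sketch (values are hypothetical) of a match yielded above for a
# YARN stderr log; the paired syslog (or, for Spark, stdout) match is
# attached under the corresponding key:
#
#     {'path': '.../container_.../stderr',
#      'log_type': 'stderr',
#      'application_id': 'application_...',
#      'container_id': 'container_...',
#      'syslog': {'path': '.../container_.../syslog',
#                 'log_type': 'syslog',
#                 ...}}
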
def _log_key(match):
"""Helper method for _ls_task_logs() and _ls_spark_task_logs()."""
return tuple((k, v) for k, v in sorted(match.items())
if k not in ('log_type', 'path'))
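
# For example (IDs are hypothetical), a YARN match keys to roughly
# (('application_id', 'application_...'), ('container_id', 'container_...')),
# so the stderr and syslog matches from the same container share a key.
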
def _match_task_log_path(path, application_id=None, job_id=None):
"""Is this the path/URI of a task log? (Including Spark)
If so, return a dictionary containing application_id and container_id
(on YARN) or attempt_id (on pre-YARN Hadoop), plus log_type (either
stdout, stderr, or syslog).
Otherwise, return None
Optionally, filter by application_id (YARN) or job_id (pre-YARN).
"""
m = _PRE_YARN_TASK_LOG_PATH_RE.match(path)
if m:
if job_id and job_id != _to_job_id(m.group('attempt_id')):
return None # matches, but wrong job_id
return dict(
attempt_id=m.group('attempt_id'),
log_type=m.group('log_type'))
m = _YARN_TASK_LOG_PATH_RE.match(path)
if m:
if application_id and application_id != m.group('application_id'):
return None # matches, but wrong application_id
return dict(
application_id=m.group('application_id'),
container_id=m.group('container_id'),
log_type=m.group('log_type'))
return None
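
# A rough sketch of the mapping (the path and IDs are hypothetical); passing
# a job_id that doesn't correspond to the attempt makes this return None:
#
#     path = '/userlogs/attempt_201512232143_0008_m_000001_3/stderr'
#     _match_task_log_path(path)
#     # -> {'attempt_id': 'attempt_201512232143_0008_m_000001_3',
#     #     'log_type': 'stderr'}
#     _match_task_log_path(path, job_id='job_201512232143_0001')
#     # -> None
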
def _interpret_task_logs(fs, matches, partial=True, log_callback=None):
"""Look for errors in task syslog/stderr.
If *partial* is true (the default), stop when we find the first error
that includes a *task_error*.
If *log_callback* is set, every time we're about to parse a
file, call it with a single argument, the path of that file
Returns a dictionary possibly containing the key 'errors', which
maps to a list of dicts, each containing:
hadoop_error:
message: string containing error message and Java exception
num_lines: number of lines in syslog this takes up
path: syslog we read this error from
start_line: where in syslog exception starts (0-indexed)
split: (optional)
path: URI of input file task was processing
num_lines: (optional) number of lines in split
start_line: (optional) first line of split (0-indexed)
task_error:
message: command and error message from task, as a string
num_lines: number of lines in stderr this takes up
path: stderr we read this from
start_line: where in stderr error message starts (0-indexed)
In addition, if *partial* is set to true (and we found an error),
this dictionary will contain the key *partial*, set to True.
"""
result = {}
syslogs_parsed = set()
for match in matches:
error = {}
# is this match for a stderr file, or a syslog?
if match.get('syslog'):
stderr_path = match['path']
syslog_path = match['syslog']['path']
else:
stderr_path = None
syslog_path = match['path']
if stderr_path:
if log_callback:
log_callback(stderr_path)
task_error = _parse_task_stderr(_cat_log_lines(fs, stderr_path))
if task_error:
task_error['path'] = stderr_path
error['task_error'] = task_error
else:
continue # can parse syslog independently later
# already parsed this syslog in conjunction with an earlier task error
if syslog_path in syslogs_parsed:
continue
if log_callback:
log_callback(syslog_path)
syslog_error = _parse_task_syslog(_cat_log_lines(fs, syslog_path))
syslogs_parsed.add(syslog_path)
if not syslog_error.get('hadoop_error'):
# if no entry in Hadoop syslog, probably just noise
continue
error.update(syslog_error)
error['hadoop_error']['path'] = syslog_path
# patch in IDs we learned from path
for id_key in 'attempt_id', 'container_id':
if id_key in match:
error[id_key] = match[id_key]
_add_implied_task_id(error)
result.setdefault('errors', [])
result['errors'].append(error)
if partial:
result['partial'] = True
break
return result
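
# A rough sketch (values are hypothetical) of the result when a stderr/syslog
# pair yields an error and *partial* is true:
#
#     {'errors': [{'task_error': {'message': '...', 'path': '.../stderr',
#                                 'start_line': 0, 'num_lines': 4},
#                  'hadoop_error': {'message': '...', 'path': '.../syslog',
#                                   'start_line': 12, 'num_lines': 20},
#                  'container_id': 'container_...',
#                  ...}],
#      'partial': True}
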
def _parse_task_syslog(lines):
"""Parse an error out of a syslog file (or a Spark stderr file).
Returns a dict, possibly containing the following keys:
check_stdout:
if true, we should look for task errors in the corresponding
'stdout' file. Used for Spark logs.
hadoop_error:
message: string containing error message and Java exception
num_lines: number of lines in syslog this takes up
start_line: where in syslog exception starts (0-indexed)
split: (optional)
path: URI of input file task was processing
num_lines: (optional) number of lines in split
start_line: (optional) first line of split (0-indexed)
"""
return _parse_task_syslog_records(_parse_hadoop_log4j_records(lines))
def _parse_task_syslog_records(records):
"""Helper for _parse_task_syslog(); takes log4j records rather than
lines"""
result = {}
for record in records:
message = record['message']
m = _OPENING_FOR_READING_RE.match(message)
if m:
result['split'] = dict(path=m.group('path'))
continue
m = _YARN_INPUT_SPLIT_RE.match(message)
if m:
result['split'] = dict(
path=m.group('path'),
start_line=int(m.group('start_line')),
num_lines=int(m.group('num_lines')))
continue
m = _JAVA_TRACEBACK_RE.search(message)
if m:
result['hadoop_error'] = dict(
message=message,
num_lines=record['num_lines'],
start_line=record['start_line'],
)
break # nothing to do once we've found the error
if (record['logger'] == _SPARK_APP_MASTER_LOGGER and
record['level'] == 'ERROR'):
m = _SPARK_APP_EXITED_RE.match(message)
if m:
result['hadoop_error'] = dict(
message=message,
num_lines=record['num_lines'],
start_line=record['start_line'],
)
result['check_stdout'] = True
break # nothing else to do once we've found the error
return result
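
# For illustration (values are hypothetical), a record produced by
# _parse_hadoop_log4j_records() looks roughly like:
#
#     {'logger': 'ApplicationMaster',
#      'level': 'ERROR',
#      'message': 'User application exited with status 1',
#      'start_line': 123,
#      'num_lines': 1}
#
# which the loop above turns into a hadoop_error (with check_stdout set).
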
# TODO: allow filtering of bad lines to happen elsewhere, pass this
# function numbered lines
def _parse_task_stderr(lines):
"""Attempt to explain any error in task stderr, be it a Python
exception or a problem with a setup command (see #1203).
Looks for '+ ' followed by a command line, and then the command's
stderr. If there are no such lines (because we're not using a setup
script), assumes the entire file contents are the cause of error.
Returns a task error dictionary with the following keys, or None
if the file is empty.
message: a string (e.g. Python command line followed by Python traceback)
start_line: where in lines message appears (0-indexed)
num_lines: how many lines the message takes up
"""
task_error = None
stack_trace_start_line = None
for line_num, line in enumerate(lines):
line = line.rstrip('\r\n')
# ignore "subprocess failed" stack trace
if _SUBPROCESS_FAILED_STACK_TRACE_START.match(line):
stack_trace_start_line = line_num
continue
# once we detect a stack trace, keep ignoring lines until
# we find a non-indented one
if stack_trace_start_line is not None:
if line.lstrip() != line:
continue
else:
stack_trace_start_line = None
# ignore warnings about initializing log4j, counters, etc.
if any(ir.match(line) for ir in _TASK_STDERR_IGNORE_RES):
# ignored lines shouldn't count as part of the line range
if task_error and task_error.get('num_lines') is None:
task_error['num_lines'] = line_num - task_error['start_line']
continue
elif not task_error or line.startswith('+ '):
# stderr log should only contain counters and status
# messages in local mode
task_error = dict(
message=line,
start_line=line_num)
else:
task_error['message'] += '\n' + line
task_error['num_lines'] = None
if task_error:
if task_error.get('num_lines') is None:
end_line = stack_trace_start_line or (line_num + 1)
task_error['num_lines'] = end_line - task_error['start_line']
return task_error
else:
return None
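
# A rough sketch (contents are hypothetical) of _parse_task_stderr() on the
# stderr of a task run through a setup script:
#
#     lines = [
#         '+ python my_job.py --step-num=0 --mapper\n',
#         'Traceback (most recent call last):\n',
#         '  File "my_job.py", line 2, in <module>\n',
#         "ImportError: No module named 'foo'\n",
#     ]
#     _parse_task_stderr(lines)
#     # -> {'message': '+ python my_job.py --step-num=0 --mapper\n'
#     #                'Traceback (most recent call last):\n'
#     #                '  File "my_job.py", line 2, in <module>\n'
#     #                "ImportError: No module named 'foo'",
#     #     'start_line': 0,
#     #     'num_lines': 4}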