# Copyright 2019 Yelp
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Parse Spark driver and executor output. This can appear as either a "step"
log (output of the spark-submit binary) or as a "task" log (executors in
YARN containers), but has more or less the same format in either case."""
from .ids import _add_implied_task_id
from .log4j import _parse_hadoop_log4j_records
from .step import _SUBMITTED_APPLICATION_RE
from .wrap import _cat_log_lines


# if a message ends with this, it's the beginning of a traceback
_TRACEBACK_ENDS_WITH = 'Traceback (most recent call last):'

# if a traceback starts with this, strip it from the error message
_CAUSED_BY = 'Caused by: '
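
# Illustrative only (editor's note, not part of the original module): the
# record dicts yielded by _parse_hadoop_log4j_records() are assumed to carry
# at least the keys read below, e.g.:
#
#     {'level': 'ERROR', 'message': '...', 'num_lines': 3, 'start_line': 17}
#
# 'level', 'message', 'num_lines', and 'start_line' are the fields this
# module actually uses; any other keys are implementation details of the
# log4j parser.
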
def _parse_spark_log(lines, record_callback=None):
    """Parse a Spark log, looking for errors and application_id"""
    def yield_records():
        for record in _parse_hadoop_log4j_records(lines):
            if record_callback:
                record_callback(record)
            yield record

    return _parse_spark_log_from_log4j_records(yield_records())
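
# Illustrative usage sketch (hypothetical file name, not part of this module):
#
#     with open('spark-step.log') as f:
#         interpretation = _parse_spark_log(f, record_callback=print)
#
# *interpretation* may then contain 'application_id' and/or 'errors'.
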
def _parse_spark_log_from_log4j_records(records):
    """Helper for _parse_spark_log()"""
    # make sure *records* is a generator
    records = iter(records)

    result = {}

    for record in records:
        message = record['message']

        m = _SUBMITTED_APPLICATION_RE.match(message)
        if m:
            # need this on YARN or we won't be able to find container logs
            result['application_id'] = m.group('application_id')
            continue

        if record['level'] in ('WARN', 'ERROR'):
            # only interested in multi-line warnings
            if record['level'] == 'WARN' and record['num_lines'] == 1:
                continue

            error = dict(
                spark_error=dict(
                    message=message,
                    start_line=record['start_line'],
                    num_lines=record['num_lines'],
                )
            )

            if not result.get('errors'):
                result['errors'] = []
            result['errors'].append(error)

            continue

    return result
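
# For illustration (shape inferred from the code above, not from mrjob docs),
# a successful parse might look roughly like:
#
#     {'application_id': 'application_1568654581112_0001',
#      'errors': [{'spark_error': {'message': '...',
#                                  'start_line': 0,
#                                  'num_lines': 5}}]}
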
def _interpret_spark_logs(fs, matches, partial=True, log_callback=None):
    """Parse each matched Spark log, patching path and IDs into any errors."""
    result = {}
    errors = []

    for match in matches:
        stop_if_partial = False

        path = match['path']

        if log_callback:
            log_callback(path)

        interpretation = _parse_spark_log(_cat_log_lines(fs, path))
        result.update(interpretation)

        # don't _add_implied_job_id() because it doesn't work that way on Spark
        for error in interpretation.get('errors') or ():
            if 'spark_error' in error:
                error['spark_error']['path'] = path

                if error['spark_error']['num_lines'] > 1:
                    stop_if_partial = True
                    # still worth parsing all the errors in this log

            for id_key in 'attempt_id', 'container_id':
                if id_key in match:
                    error[id_key] = match[id_key]

            _add_implied_task_id(error)

            errors.append(error)

        if partial and stop_if_partial:
            result['partial'] = True
            break

    if errors:
        result['errors'] = errors

    return result
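
# Illustrative usage sketch (hypothetical filesystem object and log path; the
# *matches* dicts mirror the keys read above: 'path', plus optional
# 'container_id' and 'attempt_id'):
#
#     matches = [
#         dict(path='/log/dir/containers/.../stderr',
#              container_id='container_...'),
#     ]
#     interpretation = _interpret_spark_logs(fs, matches, log_callback=print)
#
# The result merges each log's parse and, when *partial* is true, stops after
# the first log containing a multi-line Spark error, setting
# result['partial'] = True.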