# Copyright 2019 Yelp
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Parse Spark driver and executor output. This can appear as either a "step"
log (output of the spark-submit binary) or as a "task" log (executors in
YARN containers), but has more or less the same format in either case."""
from .ids import _add_implied_task_id
from .log4j import _parse_hadoop_log4j_records
from .step import _SUBMITTED_APPLICATION_RE
from .wrap import _cat_log_lines


# if a message ends with this, it's the beginning of a traceback
_TRACEBACK_ENDS_WITH = 'Traceback (most recent call last):'

# if a traceback starts with this, strip it from the error message
_CAUSED_BY = 'Caused by: '
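
# Illustrative only (editor's note, not part of the original module): the
# record dicts yielded by _parse_hadoop_log4j_records() are assumed to carry
# at least the keys read below, e.g.:
#
#     {'level': 'ERROR', 'message': '...', 'num_lines': 3, 'start_line': 17}
#
# 'level', 'message', 'num_lines', and 'start_line' are the fields this
# module actually uses; any other keys are implementation details of the
# log4j parser.
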
def _parse_spark_log(lines, record_callback=None):
    """Parse a Spark log, looking for errors and application_id"""
    def yield_records():
        for record in _parse_hadoop_log4j_records(lines):
            if record_callback:
                record_callback(record)
            yield record

    return _parse_spark_log_from_log4j_records(yield_records())
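
# Illustrative usage sketch (hypothetical file name, not part of this module):
#
#     with open('spark-step.log') as f:
#         interpretation = _parse_spark_log(f, record_callback=print)
#
# *interpretation* may then contain 'application_id' and/or 'errors'.
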
def _parse_spark_log_from_log4j_records(records):
    """Helper for _parse_spark_log()"""
    # make sure *records* is a generator
    records = iter(records)

    result = {}

    for record in records:
        message = record['message']

        m = _SUBMITTED_APPLICATION_RE.match(message)
        if m:
            # need this on YARN or we won't be able to find container logs
            result['application_id'] = m.group('application_id')
            continue

        if record['level'] in ('WARN', 'ERROR'):
            # only interested in multi-line warnings
            if record['level'] == 'WARN' and record['num_lines'] == 1:
                continue

            error = dict(
                spark_error=dict(
                    message=message,
                    start_line=record['start_line'],
                    num_lines=record['num_lines'],
                )
            )

            if not result.get('errors'):
                result['errors'] = []
            result['errors'].append(error)

            continue

    return result
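
# For illustration (shape inferred from the code above, not from mrjob docs),
# a successful parse might look roughly like:
#
#     {'application_id': 'application_1568654581112_0001',
#      'errors': [{'spark_error': {'message': '...',
#                                  'start_line': 0,
#                                  'num_lines': 5}}]}
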
def _interpret_spark_logs(fs, matches, partial=True, log_callback=None):
    """Parse each matched Spark log, patching path and IDs into any errors."""
    result = {}
    errors = []

    for match in matches:
        stop_if_partial = False

        path = match['path']

        if log_callback:
            log_callback(path)

        interpretation = _parse_spark_log(_cat_log_lines(fs, path))
        result.update(interpretation)

        # don't _add_implied_job_id() because it doesn't work that way on Spark
        for error in interpretation.get('errors') or ():
            if 'spark_error' in error:
                error['spark_error']['path'] = path

                if error['spark_error']['num_lines'] > 1:
                    stop_if_partial = True
                    # still worth parsing all the errors in this log

            for id_key in 'attempt_id', 'container_id':
                if id_key in match:
                    error[id_key] = match[id_key]

            _add_implied_task_id(error)

            errors.append(error)

        if partial and stop_if_partial:
            result['partial'] = True
            break

    if errors:
        result['errors'] = errors

    return result
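
# Illustrative usage sketch (hypothetical filesystem object and log path; the
# *matches* dicts mirror the keys read above: 'path', plus optional
# 'container_id' and 'attempt_id'):
#
#     matches = [
#         dict(path='/log/dir/containers/.../stderr',
#              container_id='container_...'),
#     ]
#     interpretation = _interpret_spark_logs(fs, matches, log_callback=print)
#
# The result merges each log's parse and, when *partial* is true, stops after
# the first log containing a multi-line Spark error, setting
# result['partial'] = True.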