# Copyright 2019 Yelp
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Parse Spark driver and executor output. This can appear as either a "step"
|
|
log (output of the spark-submit binary) or as a "task" log (executors in
|
|
YARN containers), but has more or less the same format in either case."""
|
|
from .ids import _add_implied_task_id
from .log4j import _parse_hadoop_log4j_records
from .step import _SUBMITTED_APPLICATION_RE
from .wrap import _cat_log_lines
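
# The log4j records consumed below are assumed (based on how they are used
# in this module) to be dicts from _parse_hadoop_log4j_records() with at
# least these keys:
#
#   'message'    -- the log message, possibly spanning several lines
#   'level'      -- e.g. 'INFO', 'WARN', 'ERROR'
#   'start_line' -- line number where the record starts in the log
#   'num_lines'  -- how many lines the record spans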

# if a message ends with this, it's the beginning of a traceback
_TRACEBACK_ENDS_WITH = 'Traceback (most recent call last):'

# if a traceback starts with this, strip it from the error message
_CAUSED_BY = 'Caused by: '


def _parse_spark_log(lines, record_callback=None):
    """Parse a Spark log, looking for errors and application_id"""
    def yield_records():
        for record in _parse_hadoop_log4j_records(lines):
            if record_callback:
                record_callback(record)
            yield record

    return _parse_spark_log_from_log4j_records(yield_records())
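
# Hypothetical usage sketch (not part of mrjob itself): *record_callback*
# lets a caller observe records as they stream past, e.g. to count levels:
#
#     counts = {}
#
#     def count_levels(record):
#         counts[record['level']] = counts.get(record['level'], 0) + 1
#
#     _parse_spark_log(lines, record_callback=count_levels)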


def _parse_spark_log_from_log4j_records(records):
    """Helper for _parse_spark_log()"""

    # make sure *records* is a generator
    records = iter(records)

    result = {}

    for record in records:
        message = record['message']

        m = _SUBMITTED_APPLICATION_RE.match(message)
        if m:
            # need this on YARN or we won't be able to find container logs
            result['application_id'] = m.group('application_id')
            continue

        if record['level'] in ('WARN', 'ERROR'):
            # only interested in multi-line warnings
            if record['level'] == 'WARN' and record['num_lines'] == 1:
                continue

            error = dict(
                spark_error=dict(
                    message=message,
                    start_line=record['start_line'],
                    num_lines=record['num_lines'],
                )
            )

            if not result.get('errors'):
                result['errors'] = []

            result['errors'].append(error)
            continue

    return result
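
# Illustrative example (assumed, for clarity): given a record stream like
#
#     [{'level': 'ERROR', 'message': 'Job aborted.', 'start_line': 10,
#       'num_lines': 4}]
#
# this helper would return something like
#
#     {'errors': [{'spark_error': {'message': 'Job aborted.',
#                                  'start_line': 10, 'num_lines': 4}}]}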


def _interpret_spark_logs(fs, matches, partial=True, log_callback=None):
    result = {}
    errors = []

    for match in matches:
        stop_if_partial = False

        path = match['path']
        if log_callback:
            log_callback(path)

        interpretation = _parse_spark_log(_cat_log_lines(fs, path))

        result.update(interpretation)
        # don't _add_implied_job_id() because it doesn't work that way on Spark

        for error in interpretation.get('errors') or ():
            if 'spark_error' in error:
                error['spark_error']['path'] = path
                if error['spark_error']['num_lines'] > 1:
                    stop_if_partial = True
                # still worth parsing all the errors in this log

            for id_key in 'attempt_id', 'container_id':
                if id_key in match:
                    error[id_key] = match[id_key]
            _add_implied_task_id(error)

            errors.append(error)

        if partial and stop_if_partial:
            result['partial'] = True
            break

    if errors:
        result['errors'] = errors

    return result
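
# Illustrative usage sketch (assumed, not part of mrjob): *matches* appears
# to be an iterable of dicts with a 'path' key and, optionally, 'attempt_id'
# and/or 'container_id' keys, e.g.
#
#     matches = [{'path': 's3://bucket/logs/stderr',
#                 'container_id': 'container_1234_0001_01_000002'}]
#     interpretation = _interpret_spark_logs(fs, matches)
#
# The result maps 'application_id', 'errors', and possibly 'partial' (when
# parsing stopped early) to what was found in the logs.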