# -*- coding: utf-8 -*-
# Copyright 2015-2018 Yelp
# Copyright 2019 Yelp
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Parse "task" logs, which are the syslog and stderr for each individual
task and typically appear in the userlogs/ directory."""
from collections import defaultdict
from logging import getLogger
import re

from .ids import _add_implied_task_id
from .ids import _to_job_id
from .log4j import _parse_hadoop_log4j_records
from .wrap import _cat_log_lines
from .wrap import _ls_logs
from mrjob import parse


log = getLogger(__name__)

# Match a java exception, possibly preceded by 'PipeMapRed failed!', etc.
# use this with search()
_JAVA_TRACEBACK_RE = re.compile(
    r'\s+at .*\((.*\.(java|scala):\d+|Native Method)\)$',
    re.MULTILINE)

# Match an error stating that Spark's subprocess has failed (and thus we
# should read stdout)
_SPARK_APP_EXITED_RE = re.compile(
    r'^\s*User application exited with status \d+\s*$')

# the name of the logger that logs the above
_SPARK_APP_MASTER_LOGGER = 'ApplicationMaster'

# this seems to only happen for S3. Not sure if this happens in YARN
_OPENING_FOR_READING_RE = re.compile(
    r"^Opening '(?P<path>.*?)' for reading$")

# what log paths look like pre-YARN
_PRE_YARN_TASK_LOG_PATH_RE = re.compile(
    r'^(?P<prefix>.*?/)'
    r'(?P<attempt_id>attempt_(?P<timestamp>\d+)_(?P<step_num>\d+)_'
    r'(?P<task_type>[mr])_(?P<task_num>\d+)_'
    r'(?P<attempt_num>\d+))/'
    r'(?P<log_type>stderr|syslog)(?P<suffix>\.\w{1,3})?$')

# ignore counters and status (only happens in sim runners, where task stderr
# is dumped straight to a file).
# convert from bytes regex to text regex
_COUNTER_RE = re.compile(parse._COUNTER_RE.pattern.decode('ascii'))
_STATUS_RE = re.compile(parse._STATUS_RE.pattern.decode('ascii'))

# ignore warnings about initializing log4j in task stderr
_LOG4J_WARN_RE = re.compile(r'^log4j:WARN .*$')

# also ignore counters and status messages (this only happens in
# local mode, where there's no real Hadoop to filter them out)
_TASK_STDERR_IGNORE_RES = [
    _COUNTER_RE,
    _STATUS_RE,
    _LOG4J_WARN_RE,
]

# this is the start of a Java stacktrace that Hadoop 1 always logs to
# stderr when tasks fail (see #1430)
_SUBPROCESS_FAILED_STACK_TRACE_START = re.compile(
    r'^java\.lang\.RuntimeException: PipeMapRed\.waitOutputThreads\(\):'
    r' subprocess failed with code .*$')

# message telling us about an (input) split. Looks like this:
#
# Processing split: hdfs://ddf64167693a:9000/path/to/bootstrap.sh:0+335
_YARN_INPUT_SPLIT_RE = re.compile(
    r'^Processing split:\s+(?P<path>.*)'
    r':(?P<start_line>\d+)\+(?P<num_lines>\d+)$')

# what log paths look like on YARN (also used for Spark, hence stdout)
_YARN_TASK_LOG_PATH_RE = re.compile(
    r'^(?P<prefix>.*?/)'
    r'(?P<application_id>application_\d+_\d{4})/'
    r'(?P<container_id>container(_\d+)+)/'
    r'(?P<log_type>stderr|stdout|syslog)(?P<suffix>\.\w{1,3})?$')
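
# For illustration, paths matched by the two patterns above look roughly
# like this (the IDs here are made up):
#
#   pre-YARN: userlogs/attempt_201512232143_0008_m_000001_3/syslog
#   YARN: userlogs/application_1450486922681_0005/container_1450486922681_0005_01_000003/stderr
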
def _ls_task_logs(fs, log_dir_stream,
                  application_id=None, job_id=None,
                  error_attempt_ids=None,
                  attempt_to_container_id=None):
    """Yield matching logs, optionally filtering by application_id or job_id.

    This will yield matches for stderr logs first, followed by syslogs.
    stderr logs will have a 'syslog' field pointing to the match for the
    corresponding syslog (stderr logs without a corresponding syslog won't
    be included).
    """
    return _ls_task_logs_helper(
        fs, log_dir_stream, is_spark=False,
        application_id=application_id, job_id=job_id,
        error_attempt_ids=error_attempt_ids,
        attempt_to_container_id=attempt_to_container_id)


def _ls_spark_task_logs(fs, log_dir_stream,
                        application_id=None, job_id=None,
                        error_attempt_ids=None,
                        attempt_to_container_id=None):
    """Yield matching Spark logs, optionally filtering by application_id
    or job_id.

    This will yield matches for stderr logs only. stderr logs will have a
    'stdout' field pointing to the match for the corresponding stdout file;
    whether we process this depends on the content of the stderr file.
    """
    return _ls_task_logs_helper(
        fs, log_dir_stream, is_spark=True,
        application_id=application_id, job_id=job_id,
        error_attempt_ids=error_attempt_ids,
        attempt_to_container_id=attempt_to_container_id)


def _ls_task_logs_helper(fs, log_dir_stream, is_spark,
                         application_id=None, job_id=None,
                         error_attempt_ids=None,
                         attempt_to_container_id=None):
    """Helper for _ls_task_logs() and _ls_spark_task_logs().

    *syslog_type* is the type of log to pair with stderr logs. This is
    actually a bit weird; on Spark, 'stderr' is the equivalent of syslog
    in Streaming, and 'stdout' is the equivalent of Streaming's stderr.

    For Streaming, we want stderr logs with corresponding syslogs, and if
    after listing all task logs we don't find any, syslogs.

    For Spark, we want stderr logs (which are equivalent to Streaming
    syslogs) with corresponding stdouts, and if after listing all task
    logs we don't find any, stderr logs without corresponding stdouts.
    """
    syslog_type = 'stdout' if is_spark else 'syslog'

    error_attempt_ids = error_attempt_ids or ()

    # figure out subdirs to look for logs in
    if attempt_to_container_id:
        # YARN
        subdirs = [
            attempt_to_container_id[a]
            for a in error_attempt_ids
            if a in attempt_to_container_id]
    else:
        subdirs = list(error_attempt_ids)

    # only look in subdirs corresponding to failed attempts
    if subdirs:
        log_subdir_stream = ([
            fs.join(log_dir, subdir)
            for subdir in subdirs
            for log_dir in log_dir_list
        ] for log_dir_list in log_dir_stream)
    else:
        log_subdir_stream = log_dir_stream

    key_to_type_to_match = defaultdict(dict)

    # less desirable errors to yield if we don't find the ones we want
    other_matches = []

    for match in _ls_logs(fs, log_subdir_stream, _match_task_log_path,
                          application_id=application_id,
                          job_id=job_id):
        log_key = _log_key(match)
        log_type = match['log_type']

        if log_type not in ('stderr', syslog_type):
            continue  # don't care

        type_to_match = key_to_type_to_match[log_key]
        if log_type in type_to_match:
            continue  # already seen
        type_to_match[log_type] = match

        # yield stderrs with syslogs as we find them
        if 'stderr' in type_to_match and syslog_type in type_to_match:
            stderr_match = type_to_match['stderr']
            syslog_match = type_to_match[syslog_type]

            stderr_match[syslog_type] = syslog_match
            yield stderr_match

        if log_type == ('stderr' if is_spark else syslog_type):
            other_matches.append(match)

    # yield logs that don't have both syslog and stderr
    for other_match in other_matches:
        if syslog_type not in other_match:  # already yielded
            yield other_match


def _log_key(match):
    """Helper method for _ls_task_logs() and _ls_spark_task_logs()."""
    return tuple((k, v) for k, v in sorted(match.items())
                 if k not in ('log_type', 'path'))


def _match_task_log_path(path, application_id=None, job_id=None):
    """Is this the path/URI of a task log (including Spark)?

    If so, return a dictionary containing application_id and container_id
    (on YARN) or attempt_id (on pre-YARN Hadoop), plus log_type (either
    stdout, stderr, or syslog). Otherwise, return None.

    Optionally, filter by application_id (YARN) or job_id (pre-YARN).
    """
    m = _PRE_YARN_TASK_LOG_PATH_RE.match(path)
    if m:
        if job_id and job_id != _to_job_id(m.group('attempt_id')):
            return None  # matches, but wrong job_id

        return dict(
            attempt_id=m.group('attempt_id'),
            log_type=m.group('log_type'))

    m = _YARN_TASK_LOG_PATH_RE.match(path)
    if m:
        if application_id and application_id != m.group('application_id'):
            return None  # matches, but wrong application_id

        return dict(
            application_id=m.group('application_id'),
            container_id=m.group('container_id'),
            log_type=m.group('log_type'))

    return None
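
# For example, given a hypothetical YARN container log path like
# 'userlogs/application_1450486922681_0005/'
# 'container_1450486922681_0005_01_000003/stderr', _match_task_log_path()
# would return something like:
#
#   {'application_id': 'application_1450486922681_0005',
#    'container_id': 'container_1450486922681_0005_01_000003',
#    'log_type': 'stderr'}
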
def _interpret_task_logs(fs, matches, partial=True, log_callback=None):
    """Look for errors in task syslog/stderr.

    If *partial* is true (the default), stop when we find the first error
    that includes a *task_error*.

    If *log_callback* is set, every time we're about to parse a file,
    call it with a single argument, the path of that file.

    Returns a dictionary possibly containing the key 'errors', which maps
    to a list of dicts, each of which may contain:

    hadoop_error:
        message: string containing error message and Java exception
        num_lines: number of lines in syslog this takes up
        path: syslog we read this error from
        start_line: where in syslog exception starts (0-indexed)
    split: (optional)
        path: URI of input file task was processing
        num_lines: (optional) number of lines in split
        start_line: (optional) first line of split (0-indexed)
    task_error:
        message: command and error message from task, as a string
        num_lines: number of lines in stderr this takes up
        path: stderr we read this from
        start_line: where in stderr error message starts (0-indexed)

    In addition, if *partial* is set to true (and we found an error), this
    dictionary will contain the key *partial*, set to True.
    """
    result = {}
    syslogs_parsed = set()

    for match in matches:
        error = {}

        # is this match for a stderr file, or a syslog?
        if match.get('syslog'):
            stderr_path = match['path']
            syslog_path = match['syslog']['path']
        else:
            stderr_path = None
            syslog_path = match['path']

        if stderr_path:
            if log_callback:
                log_callback(stderr_path)
            task_error = _parse_task_stderr(_cat_log_lines(fs, stderr_path))

            if task_error:
                task_error['path'] = stderr_path
                error['task_error'] = task_error
            else:
                continue  # can parse syslog independently later

        # already parsed this syslog in conjunction with an earlier
        # task error
        if syslog_path in syslogs_parsed:
            continue

        if log_callback:
            log_callback(syslog_path)
        syslog_error = _parse_task_syslog(_cat_log_lines(fs, syslog_path))
        syslogs_parsed.add(syslog_path)

        if not syslog_error.get('hadoop_error'):
            # if no entry in Hadoop syslog, probably just noise
            continue

        error.update(syslog_error)
        error['hadoop_error']['path'] = syslog_path

        # patch in IDs we learned from path
        for id_key in 'attempt_id', 'container_id':
            if id_key in match:
                error[id_key] = match[id_key]
        _add_implied_task_id(error)

        result.setdefault('errors', [])
        result['errors'].append(error)

        if partial:
            result['partial'] = True
            break

    return result


def _parse_task_syslog(lines):
    """Parse an error out of a syslog file (or a Spark stderr file).

    Returns a dict, possibly containing the following keys:

    check_stdout: if true, we should look for task errors in the
        corresponding 'stdout' file. Used for Spark logs.
    hadoop_error:
        message: string containing error message and Java exception
        num_lines: number of lines in syslog this takes up
        start_line: where in syslog exception starts (0-indexed)
    split: (optional)
        path: URI of input file task was processing
        num_lines: (optional) number of lines in split
        start_line: (optional) first line of split (0-indexed)
    """
    return _parse_task_syslog_records(_parse_hadoop_log4j_records(lines))


def _parse_task_syslog_records(records):
    """Helper for _parse_task_syslog(); takes log4j records rather than
    lines."""
    result = {}

    for record in records:
        message = record['message']

        m = _OPENING_FOR_READING_RE.match(message)
        if m:
            result['split'] = dict(path=m.group('path'))
            continue

        m = _YARN_INPUT_SPLIT_RE.match(message)
        if m:
            result['split'] = dict(
                path=m.group('path'),
                start_line=int(m.group('start_line')),
                num_lines=int(m.group('num_lines')))
            continue

        m = _JAVA_TRACEBACK_RE.search(message)
        if m:
            result['hadoop_error'] = dict(
                message=message,
                num_lines=record['num_lines'],
                start_line=record['start_line'],
            )
            break  # nothing to do once we've found the error

        if (record['logger'] == _SPARK_APP_MASTER_LOGGER and
                record['level'] == 'ERROR'):
            m = _SPARK_APP_EXITED_RE.match(message)
            if m:
                result['hadoop_error'] = dict(
                    message=message,
                    num_lines=record['num_lines'],
                    start_line=record['start_line'],
                )
                result['check_stdout'] = True
                break  # nothing else to do once we've found the error

    return result
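
# As a rough illustration: a log4j record (as produced by
# _parse_hadoop_log4j_records()) whose message ends in a Java stack
# trace, e.g. (values made up)
#
#   dict(level='ERROR', logger='...', start_line=102, num_lines=2,
#        message=('Error running child : java.lang.OutOfMemoryError\n'
#                 '\tat java.util.Arrays.copyOf(Arrays.java:3332)'))
#
# would come back from _parse_task_syslog_records() as something like
# dict(hadoop_error=dict(message=..., start_line=102, num_lines=2)).
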
# TODO: allow filtering of bad lines to happen elsewhere, pass this
# function numbered lines
def _parse_task_stderr(lines):
    """Attempt to explain any error in task stderr, be it a Python
    exception or a problem with a setup command (see #1203).

    Looks for '+ ' followed by a command line, and then the command's
    stderr. If there are no such lines (because we're not using a setup
    script), assumes the entire file contents are the cause of error.

    Returns a task error dictionary with the following keys, or None if
    the file is empty:

    message: a string (e.g. Python command line followed by Python
        traceback)
    start_line: where in lines message appears (0-indexed)
    num_lines: how many lines the message takes up
    """
    task_error = None
    stack_trace_start_line = None

    for line_num, line in enumerate(lines):
        line = line.rstrip('\r\n')

        # ignore "subprocess failed" stack trace
        if _SUBPROCESS_FAILED_STACK_TRACE_START.match(line):
            stack_trace_start_line = line_num
            continue

        # once we detect a stack trace, keep ignoring lines until
        # we find a non-indented one
        if stack_trace_start_line is not None:
            if line.lstrip() != line:
                continue
            else:
                stack_trace_start_line = None

        # ignore warnings about initializing log4j, counters, etc.
        # (stderr should only contain counters and status messages
        # in local mode)
        if any(ir.match(line) for ir in _TASK_STDERR_IGNORE_RES):
            # ignored lines shouldn't count as part of the line range
            if task_error and task_error.get('num_lines') is None:
                task_error['num_lines'] = (
                    line_num - task_error['start_line'])
            continue
        elif not task_error or line.startswith('+ '):
            task_error = dict(
                message=line,
                start_line=line_num)
        else:
            task_error['message'] += '\n' + line
            task_error['num_lines'] = None

    if task_error:
        if task_error.get('num_lines') is None:
            end_line = stack_trace_start_line or (line_num + 1)
            task_error['num_lines'] = end_line - task_error['start_line']
        return task_error
    else:
        return None
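
# As a rough sketch (the script name and error below are made up),
# stderr contents like:
#
#   + python mr_boom.py --step-num=0 --mapper
#   Traceback (most recent call last):
#     ...
#   Exception: BOOM
#
# would be summarized by _parse_task_stderr() as something like:
#
#   dict(message='+ python mr_boom.py --step-num=0 --mapper\n...',
#        start_line=0, num_lines=4)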