# Copyright 2015-2016 Yelp # Copyright 2017 Yelp # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """Parse logs from EMR bootstrap actions (and, eventually, Dataproc initialization actions).""" import re from .task import _parse_task_stderr from .wrap import _cat_log_lines from .wrap import _ls_logs # match cause of failure when there's a problem with bootstrap script. Example: # # On the master instance (i-96c21a39), bootstrap action 1 returned a non-zero # return code # # On 2 slave instances (including i-105af6bf and i-b659f519), bootstrap action # 1 returned a non-zero return code # # (EMR doesn't seem to return errors that include both master and slave # instances) _BOOTSTRAP_NONZERO_RETURN_CODE_RE = re.compile( r'^.*\(.*?(?Pi-[0-9a-f]+).*\)' r'.*bootstrap action (?P\d+)' r'.*non-zero return code' r'.*$') # match a path like: # /node/i-96c21a39/bootstrap-actions/1/stderr.gz # # (may or may not actually be gzipped) _EMR_BOOTSTRAP_STDERR_PATH_RE = re.compile( r'^(?P.*?/)' r'node/' r'(?Pi-[0-9a-f]+)/' r'bootstrap-actions/' r'(?P\d+)/' r'stderr(?P\.\w+)?') def _check_for_nonzero_return_code(reason): """Given a reason for cluster termination, check if it's because a bootstrap action terminated with an error. If it is, return a dictionary with the keys action_num (0-indexed bootstrap action number) and node_id (a string). Otherwise return None. """ m = _BOOTSTRAP_NONZERO_RETURN_CODE_RE.match(reason) if m: return _extract_action_num_and_node_id(m) else: return None def _ls_emr_bootstrap_stderr_logs( fs, log_dir_stream, action_num=None, node_id=None): """Find all stderr from bootstrap actions in the given dir. Sort so the most recent one comes first, using node ID as a tiebreaker. (In practice, we look at a single a action on a single node anyway.) """ matches = _ls_logs(fs, log_dir_stream, _match_emr_bootstrap_stderr_path, action_num=None, node_id=None) return sorted(matches, key=lambda m: (-m['action_num'], m['node_id'])) def _match_emr_bootstrap_stderr_path(path, node_id=None, action_num=None): """If *path* corresponds to a bootstrap stderr file, return a dict with the keys *action_num* (an 0-indexed int) and *node_id*. Otherwise return None. Optionally, filter by *action_num* and *node_id*. """ m = _EMR_BOOTSTRAP_STDERR_PATH_RE.match(path) if not m: return result = _extract_action_num_and_node_id(m) if action_num is not None and action_num != result['action_num']: return None if node_id is not None and node_id != result['node_id']: return None return result # This strategy assumes we can ask the EMR API which node(s) the error # occurred on. This is true even after the cluster has shut down, so it's # a pretty reasonable assumption, even for after-the-fact log parsing. # # If we *had* to figure out from logs alone whether a node had an error, # we'd want to first check the controller file for a line like: # # 2016-07-07T23:26:49.565Z ERROR Execution failed with code '1' # # and then look in the corresponding stderr file, much like how we handle # task logs. This seems like overkill at the moment. def _interpret_emr_bootstrap_stderr(fs, matches, partial=True): """Extract errors from bootstrap stderr. If *partial* is true, stop when we find the first match. (In practice, we usually target a single file anyway.) """ result = {} for match in matches: stderr_path = match['path'] task_error = _parse_task_stderr(_cat_log_lines(fs, stderr_path)) if task_error: task_error = dict(task_error) # make a copy task_error['path'] = stderr_path error = dict( action_num=match['action_num'], node_id=match['node_id'], task_error=task_error) result.setdefault('errors', []) result['errors'].append(error) if partial: result['partial'] = True break return result def _extract_action_num_and_node_id(m): """Helper method: Extract *action_num* and *node_id* from the given regex match. Convert *action_num* to a 0-indexed integer.""" return dict( action_num=(int(m.group('action_num')) - 1), node_id=m.group('node_id'), )