# Copyright 2017-2018 Yelp
# Copyright 2019 Yelp
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Print probable cause of error for a failed step.

Currently this only works on EMR.

Usage::

    mrjob diagnose [opts] j-CLUSTERID

Options::

  -c CONF_PATHS, --conf-path CONF_PATHS
                        Path to alternate mrjob.conf file to read from
  --no-conf             Don't load mrjob.conf even if it's available
  --ec2-endpoint EC2_ENDPOINT
                        Force mrjob to connect to EC2 on this endpoint (e.g.
                        ec2.us-west-1.amazonaws.com). Default is to infer this
                        from region.
  --emr-endpoint EMR_ENDPOINT
                        Force mrjob to connect to EMR on this endpoint (e.g.
                        us-west-1.elasticmapreduce.amazonaws.com). Default is
                        to infer this from region.
  -h, --help            show this help message and exit
  -q, --quiet           Don't print anything to stderr
  --region REGION       GCE/AWS region to run Dataproc/EMR jobs in.
  --s3-endpoint S3_ENDPOINT
                        Force mrjob to connect to S3 on this endpoint (e.g. s3
                        -us-west-1.amazonaws.com). You usually shouldn't set
                        this; by default mrjob will choose the correct
                        endpoint for each S3 bucket based on its location.
  --step-id STEP_ID     ID of a particular failed step to diagnose
  -v, --verbose         print more messages to stderr

.. versionadded:: 0.6.1
"""
from argparse import ArgumentParser
from logging import getLogger

from mrjob.aws import _boto3_paginate
from mrjob.emr import EMRJobRunner
from mrjob.job import MRJob
from mrjob.logs.errors import _format_error
from mrjob.options import _add_basic_args
from mrjob.options import _add_runner_args
from mrjob.options import _alphabetize_actions
from mrjob.options import _filter_by_role

# module-level logger, named after this module
log = getLogger(__name__)


def main(cl_args=None):
    """Log the probable cause of failure for a step on an EMR cluster.

    :param cl_args: command-line arguments to parse; handed straight to
                    the argument parser's ``parse_args()`` (``None``
                    makes argparse read ``sys.argv``).

    Raises :py:class:`SystemExit` with status 1 if no step to diagnose
    could be found. The result is only logged; nothing is returned.
    """
    options = _make_arg_parser().parse_args(cl_args)

    MRJob.set_up_logging(quiet=options.quiet, verbose=options.verbose)

    # every parsed option other than these is handed to the runner
    # (so the cluster ID goes to the runner too)
    non_runner_opts = ('quiet', 'verbose', 'step_id')
    runner_kwargs = dict(
        (name, value) for name, value in vars(options).items()
        if name not in non_runner_opts)

    runner = EMRJobRunner(**runner_kwargs)
    emr_client = runner.make_emr_client()

    # pick step (_get_step() logs the reason when it comes back empty)
    step = _get_step(emr_client, options.cluster_id, options.step_id)
    if not step:
        raise SystemExit(1)

    # a step explicitly requested by ID might not actually have failed;
    # warn, but try to diagnose it anyway
    step_state = step['Status']['State']
    if step_state != 'FAILED':
        log.warning('step %s has state %s, not FAILED' %
                    (step['Id'], step_state))

    # interpret logs
    log.info('Diagnosing step %s (%s)' % (step['Id'], step['Name']))

    log_interpretation = {'step_id': step['Id']}
    error = runner._pick_error(log_interpretation, _infer_step_type(step))

    # print error
    if not error:
        log.warning('No error detected')
        return

    log.error('Probable cause of failure:\n\n%s\n\n' %
              _format_error(error))


def _get_step(emr_client, cluster_id, step_id=None):
    """Find the step to diagnose on cluster *cluster_id*.

    :param emr_client: client used to call ``list_steps``
    :param cluster_id: ID of the cluster whose steps we list
    :param step_id: if set, look for the step with exactly this ID;
                    otherwise take the first step listed whose state
                    is ``FAILED``.

    Returns the matching step, or ``None`` (after logging an error
    saying why) if no step matches.
    """
    # take the first match in listing order, rather than asking the API
    # to filter by step ID or status. usually it'll be the last step anyhow
    listed_steps = _boto3_paginate(
        'Steps', emr_client, 'list_steps', ClusterId=cluster_id)

    for candidate in listed_steps:
        if _step_matches(candidate, step_id=step_id):
            return candidate

    # ran out of steps without finding a match; say which kind of
    # search came up empty
    if step_id:
        log.error('step %s not found on cluster %s' %
                  (step_id, cluster_id))
    else:
        log.error('cluster %s has no failed steps' % cluster_id)

    return None


def _step_matches(step, step_id=None):
    if not step_id:
        return step['Status']['State'] == 'FAILED'
    else:
        return step['Id'] == step_id


def _infer_step_type(step):
    args = step['Config']['Args']

    # all that matters for log parsing is picking out Spark steps
    # (doesn't matter if it's spark or spark_jar or spark_script)
    #
    # and of course we don't know the logging habits of jar steps,
    # so we might as well use streaming's logic
    if '--master' in args and '--deploy-mode' in args:
        return 'spark'
    else:
        return 'streaming'


def _make_arg_parser():
    """Build the argument parser for the diagnose tool.

    The parser takes the basic options, the runner options that
    ``_filter_by_role()`` selects from ``EMRJobRunner.OPT_NAMES`` for
    the ``'connect'`` role, a positional cluster ID, and an optional
    ``--step-id``.
    """
    arg_parser = ArgumentParser(
        usage='%(prog)s diagnose [opts] [--step-id STEP_ID] CLUSTER_ID',
        description=(
            'Get probable cause of failure for step on CLUSTER_ID.'
            ' By default we look at the last failed step'),
    )

    _add_basic_args(arg_parser)

    # only the 'connect' role subset of EMR runner options is exposed
    connect_opt_names = _filter_by_role(EMRJobRunner.OPT_NAMES, 'connect')
    _add_runner_args(arg_parser, connect_opt_names)

    # options specific to this tool
    arg_parser.add_argument(
        dest='cluster_id', help='ID of cluster with failed step')
    arg_parser.add_argument(
        '--step-id',
        dest='step_id',
        help='ID of a particular failed step to diagnose')

    _alphabetize_actions(arg_parser)

    return arg_parser


# run the tool when this file is executed as a script
if __name__ == '__main__':
    main()