# -*- coding: utf-8 -*-
# Copyright 2011 Matthew Tai and Yelp
# Copyright 2012-2016 Yelp and Contributors
# Copyright 2017-2018 Yelp
# Copyright 2019 Yelp
# Coypright 2020 Affirm, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Run an MRJob inline by running all mappers and reducers through the same
process. Useful for debugging."""
import logging
import os
import sys

from mrjob.job import MRJob
from mrjob.runner import _fix_env
from mrjob.sim import SimMRJobRunner
from mrjob.util import save_current_environment
from mrjob.util import save_cwd
from mrjob.util import save_sys_path
from mrjob.util import save_sys_std

log = logging.getLogger(__name__)


class InlineMRJobRunner(SimMRJobRunner):
    """Runs an :py:class:`~mrjob.job.MRJob` in the same process, so it's easy
    to attach a debugger.

    This is the default way to run jobs (we assume you'll spend some time
    debugging your job before you're ready to run it on EMR or Hadoop).

    Unlike other runners, ``InlineMRJobRunner``\\'s ``run()`` method
    raises the actual exception that caused a step to fail (rather than
    :py:class:`~mrjob.step.StepFailedException`).

    To more accurately simulate your environment prior to running on
    Hadoop/EMR, use ``-r local`` (see
    :py:class:`~mrjob.local.LocalMRJobRunner`).

    .. versionadded:: 0.6.8

       can run :py:class:`~mrjob.step.SparkStep`\\s via the
       :py:mod:`pyspark` library.
    """
    alias = 'inline'

    def __init__(self, mrjob_cls=None, **kwargs):
        """:py:class:`~mrjob.inline.InlineMRJobRunner` takes the same keyword
        args as :py:class:`~mrjob.runner.MRJobRunner`. However, please note
        that
        *hadoop_input_format*, *hadoop_output_format*, and *partitioner*
        are ignored
        because they require Java. If you need to test these, consider
        starting up a standalone Hadoop instance and running your job with
        ``-r hadoop``."""
        super(InlineMRJobRunner, self).__init__(**kwargs)
        # if we run python -m mrjob.job, mrjob_cls is __main__.MRJob
        # which is identical to (but not a subclass of) mrjob.job.MRJob
        #
        # the base MRJob still isn't runnable, but this yields a more
        # useful error about the step having no mappers or reducers
        if not (mrjob_cls is None or issubclass(mrjob_cls, MRJob) or
                mrjob_cls.__module__ == '__main__'):
            raise TypeError

        self._mrjob_cls = mrjob_cls

        # used to explain exceptions
        self._error_while_reading_from = None

        if self._opts['py_files']:
            log.warning("inline runner doesn't import py_files")

        if self._opts['setup']:
            log.warning("inline runner can't run setup commands")

    def _check_step(self, step, step_num):
        """Don't try to run steps that include commands."""
        super(InlineMRJobRunner, self)._check_step(step, step_num)

        if step['type'] == 'streaming':
            for mrc in ('mapper', 'combiner', 'reducer'):
                if step.get(mrc):
                    if 'command' in step[mrc] or 'pre_filter' in step[mrc]:
                        raise NotImplementedError(
                            "step %d's %s runs a command, but inline"
                            " runner does not support subprocesses (try"
                            " -r local)" % (step_num, mrc))

    def _invoke_task_func(self, task_type, step_num, task_num):
        """Just run tasks in the same process."""
        manifest = (step_num == 0 and task_type == 'mapper' and
                    self._uses_input_manifest())

        # Don't care about pickleability since this runs in the same process
        def invoke_task(stdin, stdout, stderr, wd, env):
            with save_current_environment(), save_cwd(), save_sys_path(), \
                    save_sys_std():
                # pretend we're running the script in the working dir
                os.environ.update(env)
                os.chdir(wd)
                sys.path = [os.getcwd()] + sys.path

                # pretend we've redirected stdin/stdout/stderr
                sys.stdin = stdin
                sys.stdout = stdout
                sys.stderr = stderr

                input_uri = None
                try:
                    args = self._args_for_task(step_num, task_type)

                    if manifest:
                        # read input path from stdin, add to args
                        line = stdin.readline().decode('utf_8')
                        input_uri = line.split('\t')[-1].rstrip()
                        # input_uri is an absolute path, can serve
                        # as path and uri both
                        args = list(args) + [input_uri, input_uri]

                    task = self._mrjob_cls(args)
                    task.execute()
                except:
                    # so users can figure out where the exception came from;
                    # see _log_cause_of_error(). we can't wrap the exception
                    # because then we lose the stacktrace (which is the whole
                    # point of the inline runner)

                    if input_uri:  # from manifest
                        self._error_while_reading_from = input_uri
                    else:
                        self._error_while_reading_from = self._task_input_path(
                            task_type, step_num, task_num)

                    raise

        return invoke_task

    def _run_step_on_spark(self, step, step_num):
        """Set up a fake working directory and environment, and call the Spark
        method."""
        # this is kind of a Spark-specific mash-up of _run_streaming_step()
        # (in sim.py) and _invoke_task_func(), above

        # don't create the output dir for the step; that's Spark's job

        # breaking the Spark step down into tasks is pyspark's job, so
        # we just have a single dummy task

        self.fs.mkdir(self._task_dir('spark', step_num, task_num=0))
        # could potentially parse this for cause of error
        stderr_path = self._task_stderr_path('spark', step_num, task_num=0)
        stdout_path = self._task_output_path('spark', step_num, task_num=0)

        self._create_dist_cache_dir(step_num)
        wd = self._setup_working_dir('spark', step_num, task_num=0)

        # use abspath() on input URIs before changing working dir
        task_args = self._spark_script_args(step_num)

        with open(stdout_path, 'wb') as stdout, \
                open(stderr_path, 'wb') as stderr:
            with save_current_environment(), save_cwd(), save_sys_path(), \
                    save_sys_std():
                os.environ.update(_fix_env(self._cmdenv()))
                os.chdir(wd)
                sys.path = [os.getcwd()] + sys.path

                # pretend we redirected stdout and stderr
                sys.stdout, sys.stderr = stdout, stderr

                task = self._mrjob_cls(task_args)
                task.execute()

    def _log_cause_of_error(self, ex):
        """Just tell what file we were reading from (since they'll see
        the stacktrace from the actual exception)"""
        if self._error_while_reading_from:
            log.error('\nError while reading from %s:\n' %
                      self._error_while_reading_from)

    def _spark_executors_have_own_wd(self):
        return True  # because we fake it

    def _spark_driver_has_own_wd(self):
        return True  # because we fake it

    def _wd_mirror(self):
        return None  # no need for this, we set up the working dir (Spark too)