# -*- coding: utf-8 -*-
# Copyright 2009-2017 Yelp and Contributors
# Copyright 2018-2019 Yelp
# Copyright 2020 Affirm, Inc. and Contributors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Abstract base class for all runners that execute binaries/scripts
(that is, everything but inline mode).
"""
import logging
import os
import os.path
import pipes
import re
import sys
from mrjob.py2 import PY2
from platform import python_implementation
from subprocess import Popen
from subprocess import PIPE

try:
    import pty
    pty  # quiet "redefinition of unused ..." warning from pyflakes
except ImportError:
    pty = None

try:
    import pyspark
    pyspark  # quiet "redefinition of unused ..." warning from pyflakes
except ImportError:
    pyspark = None

import mrjob.step
from mrjob.compat import translate_jobconf
from mrjob.conf import combine_cmds
from mrjob.conf import combine_dicts
from mrjob.logs.log4j import _parse_hadoop_log4j_records
from mrjob.logs.spark import _parse_spark_log
from mrjob.logs.step import _eio_to_eof
from mrjob.py2 import string_types
from mrjob.runner import MRJobRunner
from mrjob.setup import parse_setup_cmd
from mrjob.util import cmd_line
from mrjob.util import shlex_split
from mrjob.util import unique
from mrjob.util import which
from mrjob.util import zip_dir

log = logging.getLogger(__name__)

# no need to escape arguments that only include these characters
_HADOOP_SAFE_ARG_RE = re.compile(r'^[\w\./=-]*$')

# used to handle manifest files
_MANIFEST_INPUT_FORMAT = 'org.apache.hadoop.mapred.lib.NLineInputFormat'

# map archive file extensions to the command used to unarchive them
_EXT_TO_UNARCHIVE_CMD = {
    '.zip': 'unzip -o %(file)s -d %(dir)s',
    '.tar': 'mkdir %(dir)s; tar xf %(file)s -C %(dir)s',
    '.tar.gz': 'mkdir %(dir)s; tar xfz %(file)s -C %(dir)s',
    '.tgz': 'mkdir %(dir)s; tar xfz %(file)s -C %(dir)s',
}


def _unarchive_cmd(path):
    """Look up the unarchive command to use with the given file extension,
    or raise KeyError if there is no matching command."""
    for ext, unarchive_cmd in sorted(_EXT_TO_UNARCHIVE_CMD.items()):
        # use this so we can match e.g. mrjob-0.7.0.tar.gz
        if path.endswith(ext):
            return unarchive_cmd

    raise KeyError('unknown archive type: %s' % path)


class MRJobBinRunner(MRJobRunner):

    OPT_NAMES = MRJobRunner.OPT_NAMES | {
        'python_bin',
        'sh_bin',
        'spark_args',
        'spark_submit_bin',
        'task_python_bin',
    }

    def __init__(self, **kwargs):
        super(MRJobBinRunner, self).__init__(**kwargs)

        # where a zip file of the mrjob library is stored locally
        self._mrjob_zip_path = None

        # we'll create the setup wrapper scripts later
        self._setup_wrapper_script_path = None
        self._manifest_setup_script_path = None
        self._spark_python_wrapper_path = None

        # self._setup is a list of shell commands with path dicts
        # interleaved; see mrjob.setup.parse_setup_cmd() for details
        self._setup = [parse_setup_cmd(cmd) for cmd in self._opts['setup']]

        for cmd in self._setup:
            for token in cmd:
                if isinstance(token, dict):
                    # convert dir archives tokens to archives
                    if token['type'] == 'dir':
                        # feed the archive's path to self._working_dir_mgr
                        token['path'] = self._dir_archive_path(token['path'])
                        token['type'] = 'archive'

                    self._working_dir_mgr.add(**token)

        # warning: no setup scripts on Spark when no working dir
        if self._setup and self._has_pyspark_steps() and not(
                self._spark_executors_have_own_wd()):
            log.warning("setup commands aren't supported on Spark master %r" %
                        self._spark_master())

        # --py-files on Spark doesn't allow '#' (see #1375)
        if any('#' in path for path in self._opts['py_files']):
            raise ValueError("py_files cannot contain '#'")

        # Keep track of where the spark-submit binary is
        self._spark_submit_bin = self._opts['spark_submit_bin']

    @classmethod
    def _default_opts(cls):
        return combine_dicts(
            super(MRJobBinRunner, cls)._default_opts(),
            dict(
                read_logs=True,
            )
        )

    def _fix_opt(self, opt_key, opt_value, source):
        """Check sh_bin"""
        opt_value = super(MRJobBinRunner, self)._fix_opt(
            opt_key, opt_value, source)

        # check that sh_bin doesn't have too many args
        if opt_key == 'sh_bin':
            # opt_value is usually a string, combiner makes it a list of args
            sh_bin = combine_cmds(opt_value)

            # empty sh_bin just means to use the default, see #1926

            # make these hard requirements in v0.7.0?
            if len(sh_bin) > 1 and not os.path.isabs(sh_bin[0]):
                log.warning('sh_bin (from %s) should use an absolute path'
                            ' if you want it to take arguments' % source)
            elif len(sh_bin) > 2:
                log.warning('sh_bin (from %s) should not take more than one'
                            ' argument' % source)

        return opt_value

    ### python binary ###

    def _python_bin(self):
        """Python binary used for everything other than invoking the job.

        For running job tasks (e.g. ``--mapper``, ``--spark``), we use
        :py:meth:`_task_python_bin`, which can be set to a different value
        by setting :mrjob-opt:`task_python_bin`.

        Ways mrjob uses Python other than running tasks:
         * file locking in setup wrapper scripts
         * finding site-packages dir to bootstrap mrjob on clusters
         * invoking ``cat.py`` in local mode
         * the Python binary for Spark (``$PYSPARK_PYTHON``)
        """
        # python_bin isn't an option for inline runners
        return self._opts['python_bin'] or self._default_python_bin()

    def _task_python_bin(self):
        """Python binary used to invoke job with ``--mapper``,
        ``--reducer``, ``--spark``, etc."""
        return (self._opts['task_python_bin'] or
                self._python_bin())

    def _default_python_bin(self, local=False):
        """The default python command. If local is true, try to use
        sys.executable. Otherwise use 'python2.7' or 'python3' as appropriate.

        This returns a single-item list (because it's a command).
        """
        is_pypy = (python_implementation() == 'PyPy')

        if local and sys.executable:
            return [sys.executable]
        else:
            if PY2:
                return ['pypy'] if is_pypy else ['python2.7']
            else:
                return ['pypy3'] if is_pypy else ['python3']

    ### running MRJob scripts ###

    def _script_args_for_step(self, step_num, mrc, input_manifest=False):
        args = (self._task_python_bin() +
                [self._working_dir_mgr.name('file', self._script_path)] +
                self._args_for_task(step_num, mrc))

        if input_manifest and mrc == 'mapper':
            wrapper = self._manifest_setup_script_path
        elif self._setup_wrapper_script_path:
            wrapper = self._setup_wrapper_script_path
        else:
            return args

        return (self._sh_bin() + [
            self._working_dir_mgr.name('file', wrapper)] + args)

    def _substep_args(self, step_num, mrc):
        step = self._get_step(step_num)

        if step[mrc]['type'] == 'command':
            cmd = step[mrc]['command']

            # never wrap custom hadoop streaming commands in bash
            if isinstance(cmd, string_types):
                return shlex_split(cmd)
            else:
                return cmd

        elif step[mrc]['type'] == 'script':
            script_args = self._script_args_for_step(
                step_num, mrc, input_manifest=step.get('input_manifest'))

            if 'pre_filter' in step[mrc]:
                return self._sh_wrap(
                    '%s | %s' % (step[mrc]['pre_filter'],
                                 cmd_line(script_args)))
            else:
                return script_args
        else:
            raise ValueError("Invalid %s step %d: %r" % (
                mrc, step_num, step[mrc]))

    ### hadoop streaming ###

    def _render_substep(self, step_num, mrc):
        step = self._get_step(step_num)

        if mrc in step:
            # cmd_line() does things that shell is fine with but
            # Hadoop Streaming finds confusing.
            return _hadoop_cmd_line(self._substep_args(step_num, mrc))
        else:
            if mrc == 'mapper':
                return 'cat'
            else:
                return None

    def _hadoop_args_for_step(self, step_num):
        """Build a list of extra arguments to the hadoop binary.

        This handles *cmdenv*, *hadoop_extra_args*, *hadoop_input_format*,
        *hadoop_output_format*, *jobconf*, and *partitioner*.

        This doesn't handle input, output, mappers, reducers, or uploading
        files.
        """
        args = []

        # -libjars, -D
        args.extend(self._hadoop_generic_args_for_step(step_num))

        # hadoop_extra_args (if defined; it's not for sim runners)
        # this has to come after -D because it may include streaming-specific
        # args (see #1332).
        args.extend(self._opts.get('hadoop_extra_args', ()))

        # partitioner
        partitioner = self._partitioner or self._sort_values_partitioner()
        if partitioner:
            args.extend(['-partitioner', partitioner])

        # cmdenv
        for key, value in sorted(self._cmdenv().items()):
            args.append('-cmdenv')
            args.append('%s=%s' % (key, value))

        # hadoop_input_format
        if step_num == 0:
            if self._uses_input_manifest():
                args.extend(['-inputformat', _MANIFEST_INPUT_FORMAT])
            elif self._hadoop_input_format:
                args.extend(['-inputformat', self._hadoop_input_format])

        # hadoop_output_format
        if (step_num == self._num_steps() - 1 and self._hadoop_output_format):
            args.extend(['-outputformat', self._hadoop_output_format])

        return args

    def _hadoop_streaming_jar_args(self, step_num):
        """The arguments that come after ``hadoop jar <streaming jar path>``
        when running a Hadoop streaming job."""
        args = []

        # get command for each part of the job
        mapper, combiner, reducer = (
            self._hadoop_streaming_commands(step_num))

        # set up uploading from HDFS/cloud storage to the working dir
        args.extend(self._upload_args())

        # if no reducer, shut off reducer tasks. This has to come before
        # extra hadoop args, which could contain jar-specific args
        # (e.g. -outputformat). See #1331.
        #
        # might want to just integrate this into _hadoop_args_for_step?
        if not reducer:
            args.extend(['-D', ('%s=0' % translate_jobconf(
                'mapreduce.job.reduces', self.get_hadoop_version()))])

        # Add extra hadoop args first as hadoop args could be a hadoop
        # specific argument which must come before job
        # specific args.
        args.extend(self._hadoop_args_for_step(step_num))

        # set up input
        for input_uri in self._step_input_uris(step_num):
            args.extend(['-input', input_uri])

        # set up output
        args.append('-output')
        args.append(self._step_output_uri(step_num))

        args.append('-mapper')
        args.append(mapper)

        if combiner:
            args.append('-combiner')
            args.append(combiner)

        if reducer:
            args.append('-reducer')
            args.append(reducer)

        return args

    def _hadoop_streaming_commands(self, step_num):
        return (
            self._render_substep(step_num, 'mapper'),
            self._render_substep(step_num, 'combiner'),
            self._render_substep(step_num, 'reducer'),
        )

    def _hadoop_generic_args_for_step(self, step_num):
        """Arguments like -D and -libjars that apply to every Hadoop
        subcommand."""
        args = []

        # libjars (#198)
        libjar_paths = self._libjar_paths()
        if libjar_paths:
            args.extend(['-libjars', ','.join(libjar_paths)])

        # jobconf (-D)
        jobconf = self._jobconf_for_step(step_num)

        for key, value in sorted(jobconf.items()):
            args.extend(['-D', '%s=%s' % (key, value)])

        return args

    def _libjar_paths(self):
        """Paths or URIs of libjars, from Hadoop/Spark's point of view.

        Override this for non-local libjars (e.g. on EMR).
        """
        return self._opts['libjars']

    def _interpolate_jar_step_args(self, args, step_num):
        """Like :py:meth:`_interpolate_step_args` except it
        also replaces `~mrjob.step.GENERIC_ARGS` with
        :py:meth:`_hadoop_generic_args_for_step`. This only
        makes sense for jar steps; Spark should raise an error
        if `~mrjob.step.GENERIC_ARGS` is encountered.
        """
        result = []

        for arg in args:
            if arg == mrjob.step.GENERIC_ARGS:
                result.extend(
                    self._hadoop_generic_args_for_step(step_num))
            else:
                result.append(arg)

        return self._interpolate_step_args(result, step_num)

    ### setup scripts ###

    def _py_files(self):
        """Everything in the *py_files* opt, plus a .zip of the mrjob
        library if needed.
        """
        py_files = list(self._opts['py_files'])

        if self._bootstrap_mrjob():
            py_files.append(self._create_mrjob_zip())

        return py_files

    def _create_setup_wrapper_scripts(self):
        """Create the setup wrapper script, and write it into our local temp
        directory (by default, to a file named setup-wrapper.sh).

        This will set ``self._setup_wrapper_script_path``, and add it to
        ``self._working_dir_mgr``

        This will do nothing if ``self._setup`` is empty or
        this method has already been called.

        If *local* is true, use local line endings (e.g. Windows). Otherwise,
        use UNIX line endings (see #1071).
        """
        if self._has_hadoop_streaming_steps():
            streaming_setup = self._py_files_setup() + self._setup

            if streaming_setup and not self._setup_wrapper_script_path:

                self._setup_wrapper_script_path = self._write_setup_script(
                    streaming_setup, 'setup-wrapper.sh',
                    'streaming setup wrapper script')

            if (self._uses_input_manifest() and not
                    self._manifest_setup_script_path):

                self._manifest_setup_script_path = self._write_setup_script(
                    streaming_setup, 'manifest-setup.sh',
                    'manifest setup wrapper script',
                    manifest=True)

        if (self._has_pyspark_steps() and
                self._spark_executors_have_own_wd() and
                not self._spark_python_wrapper_path):

            pyspark_setup = self._pyspark_setup()
            if pyspark_setup:
                self._spark_python_wrapper_path = self._write_setup_script(
                    pyspark_setup,
                    'python-wrapper.sh', 'Spark Python wrapper script',
                    wrap_python=True)

    def _pyspark_setup(self):
        """Like ``self._setup``, but prepends commands for archive
        emulation if needed."""
        setup = []

        if self._emulate_archives_on_spark():
            for name, path in sorted(
                    self._working_dir_mgr.name_to_path('archive').items()):

                archive_file_name = self._working_dir_mgr.name(
                    'archive_file', path)

                setup.append(_unarchive_cmd(path) % dict(
                    file=pipes.quote(archive_file_name),
                    dir=pipes.quote(name)))

        setup.extend(self._setup)

        return setup

    def _py_files_setup(self):
        """A list of additional setup commands to emulate Spark's
        --py-files option on Hadoop Streaming."""
        result = []

        for py_file in self._py_files():
            path_dict = {'type': 'file', 'name': None, 'path': py_file}
            self._working_dir_mgr.add(**path_dict)
            result.append(['export PYTHONPATH=', path_dict, ':$PYTHONPATH'])

        return result

    def _write_setup_script(self, setup, filename, desc,
                            manifest=False, wrap_python=False):
        """Write a setup script and return its path."""
        contents = self._setup_wrapper_script_content(
            setup, manifest=manifest, wrap_python=wrap_python)

        path = os.path.join(self._get_local_tmp_dir(), filename)
        self._write_script(contents, path, desc)

        self._working_dir_mgr.add('file', path)

        return path

    def _create_mrjob_zip(self):
        """Make a zip of the mrjob library, without .pyc or .pyo files,
        This will also set ``self._mrjob_zip_path`` and return it.

        Typically called from
        :py:meth:`_create_setup_wrapper_scripts`.

        It's safe to call this method multiple times (we'll only create
        the zip file once.)
        """
        if not self._mrjob_zip_path:
            # find mrjob library
            import mrjob

            if not os.path.basename(mrjob.__file__).startswith('__init__.'):
                raise Exception(
                    "Bad path for mrjob library: %s; can't bootstrap mrjob",
                    mrjob.__file__)

            mrjob_dir = os.path.dirname(mrjob.__file__) or '.'

            zip_path = os.path.join(self._get_local_tmp_dir(), 'mrjob.zip')

            def filter_path(path):
                filename = os.path.basename(path)
                return not(filename.lower().endswith('.pyc') or
                           filename.lower().endswith('.pyo') or
                           # filter out emacs backup files
                           filename.endswith('~') or
                           # filter out emacs lock files
                           filename.startswith('.#') or
                           # filter out MacFuse resource forks
                           filename.startswith('._'))

            log.debug('archiving %s -> %s as %s' % (
                mrjob_dir, zip_path, os.path.join('mrjob', '')))
            zip_dir(mrjob_dir, zip_path, filter=filter_path, prefix='mrjob')

            self._mrjob_zip_path = zip_path

        return self._mrjob_zip_path

    def _setup_wrapper_script_content(
            self, setup, manifest=False, wrap_python=False):
        """Return a (Bourne) shell script that runs the setup commands and then
        executes whatever is passed to it (this will be our mapper/reducer),
        as a list of strings (one for each line, including newlines).

        We obtain a file lock so that two copies of the setup commands
        cannot run simultaneously on the same machine (this helps for running
        :command:`make` on a shared source code archive, for example).
        """
        lines = []

        # TODO: this is very similar to _start_of_sh_script() in cloud.py

        if wrap_python:
            # start with shebang
            sh_bin = self._sh_bin()

            if os.path.isabs(sh_bin[0]):
                shebang_bin = sh_bin
            else:
                shebang_bin = ['/usr/bin/env'] + list(sh_bin)

            if len(shebang_bin) > 2:
                # Linux limits shebang to one binary and one arg
                shebang_bin = shebang_bin[:2]
                log.warning('Limiting shebang to two arguments:'
                            '#!%s' % cmd_line(shebang_bin))

            lines.append('#!%s' % cmd_line(shebang_bin))

        # hook for 'set -e', etc.
        pre_commands = self._sh_pre_commands()
        if pre_commands:
            for cmd in pre_commands:
                lines.append(cmd)
            lines.append('')

        if setup:
            lines.extend(self._setup_cmd_content(setup))

        # handle arguments to the script
        if wrap_python:
            # pretend to be python ($@ is arguments to the python binary)
            python_bin = self._task_python_bin()
            lines.append('%s "$@"' % cmd_line(python_bin))
        elif manifest:
            # arguments ($@) are a command
            # eventually runs: "$@" $INPUT_PATH $INPUT_URI
            lines.extend(self._manifest_download_content())
        else:
            # arguments ($@) are a command, just run it
            lines.append('"$@"')

        return lines

    def _setup_cmd_content(self, setup):
        """Write setup script content to obtain a file lock, run setup
        commands in a way that doesn't perturb the script, and then
        release the lock and return to the original working directory."""
        lines = []

        lines.append('# store $PWD')
        lines.append('__mrjob_PWD=$PWD')
        lines.append('')

        lines.append('# obtain exclusive file lock')
        # Basically, we're going to tie file descriptor 9 to our lockfile,
        # use a subprocess to obtain a lock (which we somehow inherit too),
        # and then release the lock by closing the file descriptor.
        # File descriptors 10 and higher are used internally by the shell,
        # so 9 is as out-of-the-way as we can get.
        lines.append('exec 9>/tmp/wrapper.lock.%s' % self._job_key)
        # would use flock(1), but it's not always available
        lines.append("%s -c 'import fcntl; fcntl.flock(9, fcntl.LOCK_EX)'" %
                     cmd_line(self._python_bin()))
        lines.append('')

        lines.append('# setup commands')
        # group setup commands so we can redirect their input/output (see
        # below). Don't use parens; this would invoke a subshell, which would
        # keep us from exporting environment variables to the task.
        lines.append('{')
        for cmd in setup:
            # reconstruct the command line, substituting $__mrjob_PWD/<name>
            # for path dicts
            line = '  '  # indent, since these commands are in a group
            for token in cmd:
                if isinstance(token, dict):
                    # it's a path dictionary
                    line += '$__mrjob_PWD/'
                    line += pipes.quote(self._working_dir_mgr.name(**token))
                else:
                    # it's raw script
                    line += token
            lines.append(line)
        # redirect setup commands' input/output so they don't interfere
        # with the task (see Issue #803).
        lines.append('} 0</dev/null 1>&2')
        lines.append('')

        lines.append('# release exclusive file lock')
        lines.append('exec 9>&-')
        lines.append('')

        lines.append('# run task from the original working directory')
        lines.append('cd $__mrjob_PWD')

        return lines

    def _manifest_download_content(self):
        """write the part of the manifest setup script after setup, that
        downloads the input file, runs the script, and then deletes
        the file."""
        lines = []

        lines.append('{')

        # read URI from stdin
        lines.append('  # read URI of input file from stdin')
        lines.append('  INPUT_URI=$(cut -f 2)')
        lines.append('')

        # pick file extension (e.g. ".warc.gz")
        lines.append('  # pick file extension')
        lines.append("  FILE_EXT=$(basename $INPUT_URI | sed -e 's/^[^.]*//')")
        lines.append('')

        # pick a unique name in the current directory to download the file to
        lines.append('  # pick filename to download to')
        lines.append('  INPUT_PATH=$(mktemp ./input-XXXXXXXXXX$FILE_EXT)')
        lines.append('  rm $INPUT_PATH')
        lines.append('')

        # download the file (using different commands depending on the path)
        lines.append('  # download the input file')
        lines.append('  case $INPUT_URI in')
        download_cmds = (
            list(self._manifest_download_commands()) + [('*', 'cp')])
        for glob, cmd in download_cmds:
            lines.append('    %s)' % glob)
            lines.append('      %s $INPUT_URI $INPUT_PATH' % cmd)
            lines.append('      ;;')
        lines.append('  esac')
        lines.append('')

        # unpack .bz2 and .gz files
        lines.append('  # if input file is compressed, unpack it')
        lines.append('  case $INPUT_PATH in')
        for ext, cmd in self._manifest_uncompress_commands():
            lines.append('    *.%s)' % ext)
            lines.append('      %s $INPUT_PATH' % cmd)
            lines.append("      INPUT_PATH="
                         r"$(echo $INPUT_PATH | sed -e 's/\.%s$//')" % ext)
            lines.append('      ;;')
        lines.append('  esac')
        lines.append('} 1>&2')
        lines.append('')

        # don't exit if script fails
        lines.append('# run our mrjob script')
        lines.append('set +e')
        # pass input path and URI to script
        lines.append('"$@" $INPUT_PATH $INPUT_URI')
        lines.append('')

        # save return code, turn off echo
        lines.append('# if script fails, print input URI before exiting')
        lines.append('{ RETURNCODE=$?; set +x; } 1>&2 2>/dev/null')
        lines.append('')

        lines.append('{')

        # handle errors
        lines.append('  if [ $RETURNCODE -ne 0 ]')
        lines.append('  then')
        lines.append('    echo')
        lines.append('    echo "while reading input from $INPUT_URI"')
        lines.append('  fi')
        lines.append('')

        # clean up input
        lines.append('  rm $INPUT_PATH')
        lines.append('} 1>&2')
        lines.append('')

        # exit with correct status
        lines.append('exit $RETURNCODE')

        return lines

    def _manifest_download_commands(self):
        """Return a list of ``(glob, cmd)``, where *glob*
        matches a path or URI to download, and download command is a command
        to download it (e.g. ```hadoop fs -copyToLocal``), as a
        string.

        Redefine this in your subclass. More specific blobs should come first.
        """
        return []

    def _manifest_uncompress_commands(self):
        """Return a list of ``(ext, cmd)`` where ``ext`` is a file extension
        (e.g. ``gz``) and ``cmd`` is a command to uncompress it (e.g.
        ``gunzip``)."""
        return [
            ('bz2', 'bunzip2'),
            ('gz', 'gunzip'),
        ]

    def _sh_bin(self):
        """The sh binary and any arguments, as a list. Override this
        if, for example, a runner needs different default values
        depending on circumstances (see :py:class:`~mrjob.emr.EMRJobRunner`).
        """
        return self._opts['sh_bin'] or self._default_sh_bin()

    def _default_sh_bin(self):
        """The default sh binary, if :mrjob-opt:`sh_bin` isn't set."""
        return ['/bin/sh', '-ex']

    def _sh_pre_commands(self):
        """A list of lines to put at the very start of any sh script
        (e.g. ``set -e`` when ``sh -e`` wont work, see #1549)
        """
        return []

    def _sh_wrap(self, cmd_str):
        """Helper for _substep_args()

        Wrap command in sh -c '...' to allow for pipes, etc.
        Use *sh_bin* option."""
        # prepend set -e etc.
        cmd_str = '; '.join(self._sh_pre_commands() + [cmd_str])

        return self._sh_bin() + ['-c', cmd_str]

    ### spark ###

    def _args_for_spark_step(self, step_num, last_step_num=None):
        """The actual arguments used to run the spark-submit command.

        This handles both all Spark step types (``spark``, ``spark_jar``,
        and ``spark_script``).

        *last_step_num* is only used by the Spark runner, where multiple
        streaming steps are run in a single Spark job
        """
        return (
            self.get_spark_submit_bin() +
            self._spark_submit_args(step_num) +
            [self._spark_script_path(step_num)] +
            self._spark_script_args(step_num, last_step_num)
        )

    def _run_spark_submit(self, spark_submit_args, env, record_callback):
        """Run the spark submit binary in a subprocess, using a PTY if possible

        :param spark_submit_args: spark-submit binary and arguments, as as list
        :param env: environment variables, as a dict
        :param record_callback: a function that takes a single log4j record
                                as its argument (see
                                :py:func:`~mrjob.logs.log4j\
                                ._parse_hadoop_log4j_records)

        :return: tuple of the subprocess's return code and a
                 step interpretation dictionary
        """
        log.debug('> %s' % cmd_line(spark_submit_args))
        log.debug('  with environment: %r' % sorted(env.items()))

        # these should always be set, but just in case
        returncode = 0
        step_interpretation = {}

        # try to use a PTY if it's available
        try:
            pid, master_fd = pty.fork()
        except (AttributeError, OSError):
            # no PTYs, just use Popen

            # user won't get much feedback for a while, so tell them
            # spark-submit is running
            log.debug('No PTY available, using Popen() to invoke spark-submit')

            step_proc = Popen(
                spark_submit_args, stdout=PIPE, stderr=PIPE, env=env)

            # parse driver output
            step_interpretation = _parse_spark_log(
                step_proc.stderr, record_callback=record_callback)

            # there shouldn't be much output on STDOUT, just echo it
            for record in _parse_hadoop_log4j_records(step_proc.stdout):
                record_callback(record)

            step_proc.stdout.close()
            step_proc.stderr.close()

            returncode = step_proc.wait()
        else:
            # we have PTYs
            if pid == 0:  # we are the child process
                try:
                    os.execvpe(spark_submit_args[0], spark_submit_args, env)
                    # now this process is no longer Python
                except OSError as ex:
                    # use _exit() so we don't do cleanup, etc. that's
                    # the parent process's job
                    os._exit(ex.errno)
                finally:
                    # if we get some other exception, still exit hard
                    os._exit(-1)
            else:
                log.debug('Invoking spark-submit via PTY')

                with os.fdopen(master_fd, 'rb') as master:
                    step_interpretation = (
                        _parse_spark_log(
                            _eio_to_eof(master),
                            record_callback=record_callback))

                    _, returncode = os.waitpid(pid, 0)

        return (returncode, step_interpretation)

    def get_spark_submit_bin(self):
        """Return the location of the ``spark-submit`` binary, searching for it
        if necessary."""
        if not self._spark_submit_bin:
            self._spark_submit_bin = self._find_spark_submit_bin()
        return self._spark_submit_bin

    def _find_spark_submit_bin(self):
        """Attempt to find the spark binary. Returns a list of arguments.
        Defaults to ``['spark-submit']``.

        Re-define this in your subclass if you already know where
        to find spark-submit (e.g. on cloud services).
        """
        for path in unique(self._spark_submit_bin_dirs()):
            log.info('Looking for spark-submit binary in %s...' % (
                path or '$PATH'))

            spark_submit_bin = which('spark-submit', path=path)

            if spark_submit_bin:
                log.info('Found spark-submit binary: %s' % spark_submit_bin)
                return [spark_submit_bin]
        else:
            log.info("Falling back to 'spark-submit'")
            return ['spark-submit']

    def _spark_submit_bin_dirs(self):
        # $SPARK_HOME
        spark_home = os.environ.get('SPARK_HOME')
        if spark_home:
            yield os.path.join(spark_home, 'bin')

        yield None  # use $PATH

        # look for pyspark installation (see #1984)
        if pyspark:
            yield os.path.join(os.path.dirname(pyspark.__file__), 'bin')

        # some other places recommended by install docs (see #1366)
        yield '/usr/lib/spark/bin'
        yield '/usr/local/spark/bin'
        yield '/usr/local/lib/spark/bin'

    def _spark_submit_args(self, step_num):
        """Build a list of extra args to the spark-submit binary for
        the given spark or spark_script step."""
        step = self._get_step(step_num)

        args = []

        # --conf arguments include python bin, cmdenv, jobconf. Make sure
        # that we can always override these manually
        jobconf = {}
        for key, value in self._spark_cmdenv(step_num).items():
            jobconf['spark.executorEnv.%s' % key] = value
            if self._spark_master() == 'yarn':  # YARN only, see #1919
                jobconf['spark.yarn.appMasterEnv.%s' % key] = value

        jobconf.update(self._jobconf_for_step(step_num))

        for key, value in sorted(jobconf.items()):
            args.extend(['--conf', '%s=%s' % (key, value)])

        # add --class (JAR steps)
        if step.get('main_class'):
            args.extend(['--class', step['main_class']])

        # add --jars, if any
        libjar_paths = self._libjar_paths()
        if libjar_paths:
            args.extend(['--jars', ','.join(libjar_paths)])

        # spark-submit treats --master and --deploy-mode as aliases for
        # --conf spark.master=... and --conf spark.deploy-mode=... (see #2032).
        #
        # we never want jobconf to override spark master or deploy mode, so put
        # these switches after --conf

        # add --master
        if self._spark_master():
            args.extend(['--master', self._spark_master()])

        # add --deploy-mode
        if self._spark_deploy_mode():
            args.extend(['--deploy-mode', self._spark_deploy_mode()])

        # --files and --archives
        args.extend(self._spark_upload_args())

        # --py-files (Python only)
        # spark runner can run 'streaming' steps, so just exclude
        # non-Python steps
        if 'jar' not in step['type']:
            py_file_uris = self._py_files()

            if self._upload_mgr:
                # don't assume py_files are in _upload_mgr; for example,
                # spark-submit doesn't need to upload them
                path_to_uri = self._upload_mgr.path_to_uri()
                py_file_uris = [path_to_uri.get(p, p) for p in py_file_uris]

            if py_file_uris:
                args.extend(['--py-files', ','.join(py_file_uris)])

        # spark_args option
        args.extend(self._opts['spark_args'])

        # step spark_args
        if step.get('spark_args'):
            args.extend(step['spark_args'])

        return args

    def _spark_upload_args(self):
        if not self._spark_executors_have_own_wd():
            # don't bother, there's no working dir to upload to
            return []

        return self._upload_args_helper(
            '--files', None,
            '--archives', None,
            always_use_hash=False,
            emulate_archives=self._emulate_archives_on_spark())

    def _spark_script_path(self, step_num):
        """The path of the spark script or JAR, used by
        _args_for_spark_step()."""
        step = self._get_step(step_num)

        if step['type'] == 'spark':
            path = self._script_path
        elif step['type'] == 'spark_jar':
            path = step['jar']
        elif step['type'] == 'spark_script':
            path = step['script']
        else:
            raise TypeError('Bad step type: %r' % step['type'])

        return self._interpolate_spark_script_path(path)

    def _interpolate_spark_script_path(self, path):
        """Redefine this in your subclass if the given path needs to be
        translated to a URI when running spark (e.g. on EMR)."""
        return path

    def _spark_cmdenv(self, step_num):
        """Returns a dictionary mapping environment variable to value,
        including mapping PYSPARK_PYTHON to self._python_bin()
        """
        step = self._get_step(step_num)

        cmdenv = {}

        if self._step_type_uses_pyspark(step['type']):
            driver_python = cmd_line(self._python_bin())

            if self._spark_python_wrapper_path:
                executor_python = './%s' % self._working_dir_mgr.name(
                    'file', self._spark_python_wrapper_path)
            else:
                executor_python = cmd_line(self._task_python_bin())

            if self._spark_deploy_mode() == 'cluster':
                # treat driver like executors (they run in same environment)
                cmdenv['PYSPARK_PYTHON'] = executor_python
            elif driver_python == executor_python:
                # no difference, just set $PYSPARK_PYTHON
                cmdenv['PYSPARK_PYTHON'] = driver_python
            else:
                # set different pythons for driver and executor
                cmdenv['PYSPARK_PYTHON'] = executor_python
                cmdenv['PYSPARK_DRIVER_PYTHON'] = driver_python

        cmdenv.update(self._cmdenv())
        return cmdenv


# these don't need to be methods

def _hadoop_cmd_line(args):
    """Escape args of a command line in a way that Hadoop can process
    them."""
    return ' '.join(_hadoop_escape_arg(arg) for arg in args)


def _hadoop_escape_arg(arg):
    """Escape a single command argument in a way that Hadoop can process it."""
    if _HADOOP_SAFE_ARG_RE.match(arg):
        return arg
    else:
        return "'%s'" % arg.replace("'", r"'\''")