# -*- coding: utf-8 -*-
# Copyright 2009-2017 Yelp and Contributors
# Copyright 2018 Yelp and Google, Inc.
# Copyright 2019 Yelp
# Copyright 2020 Affirm, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Base class for all runners."""
import copy
import datetime
import getpass
import logging
import os
import os.path
import posixpath
import pprint
import re
import sys
import tarfile
import tempfile
from shutil import rmtree
from mrjob.compat import translate_jobconf
from mrjob.compat import translate_jobconf_dict
from mrjob.compat import translate_jobconf_for_all_versions
from mrjob.conf import ClearedValue
from mrjob.conf import combine_jobconfs
from mrjob.conf import combine_opts
from mrjob.conf import load_opts_from_mrjob_confs
from mrjob.fs.composite import CompositeFilesystem
from mrjob.fs.local import LocalFilesystem
from mrjob.options import _combiners
from mrjob.options import _deprecated_aliases
from mrjob.options import CLEANUP_CHOICES
from mrjob.parse import is_uri
from mrjob.parse import to_uri
from mrjob.py2 import PY2
from mrjob.py2 import string_types
from mrjob.setup import WorkingDirManager
from mrjob.setup import name_uniquely
from mrjob.setup import parse_legacy_hash_path
from mrjob.step import INPUT
from mrjob.step import OUTPUT
from mrjob.step import _is_spark_step_type
from mrjob.step import _is_pyspark_step_type
log = logging.getLogger(__name__)
# used to detect globs and split a path into the part before and after the glob
GLOB_RE = re.compile(r'^(.*?)([\[\*\?].*)$')
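# e.g. (illustrative path) GLOB_RE.match('data/2020-*.gz').groups()
# gives ('data/2020-', '*.gz')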
# buffer for piping files into sort on Windows
_BUFFER_SIZE = 4096
# jobconf options for implementing SORT_VALUES
_SORT_VALUES_JOBCONF = {
'mapreduce.partition.keypartitioner.options': '-k1,1',
'stream.num.map.output.key.fields': 2
}
# partitioner for sort_values
_SORT_VALUES_PARTITIONER = \
'org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner'
class MRJobRunner(object):
"""Abstract base class for all runners"""
# this class handles the basic runner framework, options and config files,
# arguments to mrjobs, and setting up job working dirs and environments.
# this will put files from setup scripts, py_files, and bootstrap_mrjob
# into the job's working dir, but won't actually run/import them
#
# command lines to run substeps (including Spark) are handled by
# mrjob.bin.MRJobBinRunner
#: alias for this runner, used on the command line with ``-r``
alias = None
# libjars is only here because the job can set it; might want to
# handle this with a warning from the launcher instead
OPT_NAMES = {
'bootstrap_mrjob',
'check_input_paths',
'cleanup',
'cleanup_on_failure',
'cmdenv',
'jobconf',
'label',
'libjars',
'local_tmp_dir',
# no max_output_files because it doesn't go in self._opts
'owner',
'py_files',
'read_logs',
'setup',
'upload_archives',
'upload_dirs',
'upload_files'
}
# re-define this as a set of step types supported by your runner
_STEP_TYPES = None
### methods to call from your batch script ###
def __init__(self, mr_job_script=None, conf_paths=None,
extra_args=None,
hadoop_input_format=None, hadoop_output_format=None,
input_paths=None, output_dir=None, partitioner=None,
sort_values=None, stdin=None, steps=None,
step_output_dir=None,
**opts):
"""All runners take the following keyword arguments:
:type mr_job_script: str
:param mr_job_script: the path of the ``.py`` file containing the
:py:class:`~mrjob.job.MRJob`. If this is None,
you won't actually be able to :py:meth:`run` the
job, but other utilities (e.g. :py:meth:`ls`)
will work.
:type conf_paths: None or list
:param conf_paths: List of config files to combine and use, or None to
search for mrjob.conf in the default locations.
:type extra_args: list of str
:param extra_args: a list of extra cmd-line arguments to pass to the
mr_job script. This is a hook to allow jobs to take
additional arguments.
:type hadoop_input_format: str
:param hadoop_input_format: name of an optional Hadoop ``InputFormat``
class. Passed to Hadoop along with your
first step with the ``-inputformat``
option. Note that if you write your own
class, you'll need to include it in your
own custom streaming jar (see
:mrjob-opt:`hadoop_streaming_jar`).
:type hadoop_output_format: str
:param hadoop_output_format: name of an optional Hadoop
``OutputFormat`` class. Passed to Hadoop
along with your first step with the
``-outputformat`` option. Note that if you
write your own class, you'll need to
include it in your own custom streaming
jar (see
:mrjob-opt:`hadoop_streaming_jar`).
:type input_paths: list of str
:param input_paths: Input files for your job. Supports globs and
recursively walks directories (e.g.
``['data/common/', 'data/training/*.gz']``). If
this is left blank, we'll read from stdin
:type output_dir: str
:param output_dir: An empty/non-existent directory where Hadoop
should put the final output from the job.
If you don't specify an output directory, we'll
output into a subdirectory of this job's temporary
directory. You can control this from the command
line with ``--output-dir``. This option cannot be
set from configuration files. If used with the
hadoop runner, this path does not need to be fully
qualified with ``hdfs://`` URIs because it's
understood that it has to be on HDFS.
:type partitioner: str
:param partitioner: Optional name of a Hadoop partitioner class, e.g.
``'org.apache.hadoop.mapred.lib.HashPartitioner'``.
Hadoop streaming will use this to determine how
mapper output should be sorted and distributed
to reducers.
:type sort_values: bool
:param sort_values: if true, set partitioners and jobconf variables
                            so that reducers receive the values
associated with any key in sorted order (sorted by
their *encoded* value). Also known as secondary
sort.
:param stdin: an iterable (can be a ``BytesIO`` or even a list) to use
as stdin. This is a hook for testing; if you set
``stdin`` via :py:meth:`~mrjob.job.MRJob.sandbox`, it'll
get passed through to the runner. If for some reason
your lines are missing newlines, we'll add them;
this makes it easier to write automated tests.
:param steps: a list of descriptions of steps to run (see :doc:`step`
for description formats)
:type step_output_dir: str
:param step_output_dir: An empty/non-existent directory where Hadoop
should put output from all steps other than
the last one (this only matters for multi-step
jobs). Currently ignored by local runners.
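        A minimal usage sketch (the job class and paths here are
        hypothetical; runners are normally obtained from
        :py:meth:`~mrjob.job.MRJob.make_runner` rather than constructed
        directly)::

            mr_job = MRWordCount(args=['-r', 'inline', 'input.txt'])
            with mr_job.make_runner() as runner:
                runner.run()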
"""
self._ran_job = False
# opts are made from:
#
# empty defaults (everything set to None)
# runner-specific defaults
# opts from config file(s)
# opts from command line
self._opts = self._combine_confs(
[(None, {key: None for key in self.OPT_NAMES})] +
[(None, self._default_opts())] +
load_opts_from_mrjob_confs(self.alias, conf_paths) +
[('the command line', opts)]
)
log.debug('Active configuration:')
log.debug(pprint.pformat({
opt_key: self._obfuscate_opt(opt_key, opt_value)
for opt_key, opt_value in self._opts.items()
}))
self._fs = None
# a local tmp directory that will be cleaned up when we're done
# access/make this using self._get_local_tmp_dir()
self._local_tmp_dir = None
if self._emulate_archives_on_spark():
# keep Spark from auto-uncompressing tarballs
archive_file_suffix = '.file'
else:
# otherwise, leave as-is, so that --archive will
# work properly
archive_file_suffix = ''
self._working_dir_mgr = WorkingDirManager(
archive_file_suffix=archive_file_suffix)
# mapping from dir to path for corresponding archive. we pick
# paths during init(), but don't actually create the archives
# until self._create_dir_archives() is called
self._dir_to_archive_path = {}
# dir archive names (the filename minus ".tar.gz") already taken
self._dir_archive_names_taken = set()
# set of dir_archives that have actually been created
self._dir_archives_created = set()
# set this to an :py:class:`~mrjob.setup.UploadDirManager` in
# runners that upload files to HDFS, S3, etc.
#
# this manager should not handle files belonging to
# self._working_dir_mgr,
        # which, if they are uploaded, will go into self._wd_mirror()
self._upload_mgr = None
self._script_path = mr_job_script
if self._script_path:
self._working_dir_mgr.add('file', self._script_path)
# give this job a unique name
self._job_key = self._make_unique_job_key()
# extra args to our job
self._extra_args = list(extra_args) if extra_args else []
for extra_arg in self._extra_args:
if isinstance(extra_arg, dict):
if extra_arg.get('type') != 'file':
raise NotImplementedError
self._working_dir_mgr.add(**extra_arg)
# set up uploading
for hash_path in self._opts['upload_files']:
uf = parse_legacy_hash_path('file', hash_path,
must_name='upload_files')
self._working_dir_mgr.add(**uf)
for hash_path in self._opts['upload_archives']:
ua = parse_legacy_hash_path('archive', hash_path,
must_name='upload_archives')
self._working_dir_mgr.add(**ua)
for hash_path in self._opts['upload_dirs']:
# pick name based on directory path
            ud = parse_legacy_hash_path('dir', hash_path,
                                        must_name='upload_dirs')
# but feed working_dir_mgr the archive's path
archive_path = self._dir_archive_path(ud['path'])
self._working_dir_mgr.add(
'archive', archive_path, name=ud['name'])
# Where to read input from (log files, etc.)
self._input_paths = input_paths or ['-'] # by default read from stdin
if PY2:
self._stdin = stdin or sys.stdin
else:
self._stdin = stdin or sys.stdin.buffer
self._stdin_path = None # temp file containing dump from stdin
# where to keep the input manifest
self._input_manifest_path = None
# store output_dir
self._output_dir = output_dir
# store partitioner
self._partitioner = partitioner
# store sort_values
self._sort_values = sort_values
# store step_output_dir
self._step_output_dir = step_output_dir
# store hadoop input and output formats
self._hadoop_input_format = hadoop_input_format
self._hadoop_output_format = hadoop_output_format
# check and store *steps*
self._steps = []
if steps:
self._check_steps(steps)
self._steps = copy.deepcopy(steps)
# this variable marks whether a cleanup has happened and this runner's
# output stream is no longer available.
self._closed = False
    ### Options ###
@classmethod
def _default_opts(cls):
try:
owner = getpass.getuser()
        except Exception:
            # getpass.getuser() can fail, e.g. when no username is set
            owner = None
return dict(
check_input_paths=True,
cleanup=['ALL'],
cleanup_on_failure=['NONE'],
owner=owner,
)
def _combine_confs(self, source_and_opt_list):
"""Combine several opt dictionaries into one.
*source_and_opt_list* is a list of tuples of *source*,
*opts* where *opts* is a dictionary and *source* is either
None or a description of where the opts came from (usually a path).
Only override this if you need truly fine-grained control,
including knowledge of the options' source.
"""
opt_list = [
self._fix_opts(opts, source)
for source, opts in source_and_opt_list
]
return self._combine_opts(opt_list)
def _combine_opts(self, opt_list):
"""Combine several opt dictionaries into one. *opt_list*
is a list of dictionaries containing validated options
Override this if you need to base options off the values of
other options, but don't need to issue warnings etc.
about the options' source.
"""
return combine_opts(self._opt_combiners(), *opt_list)
def _opt_combiners(self):
"""A dictionary mapping opt name to combiner funciton. This
won't necessarily include every opt name (we default to
:py:func:`~mrjob.conf.combine_value`).
"""
return _combiners(self.OPT_NAMES)
def _fix_opts(self, opts, source=None):
"""Take an options dictionary, and either return a sanitized
version of it, or raise an exception.
*source* is either a string describing where the opts came from
or None.
This ensures that opt dictionaries are really dictionaries
and handles deprecated options.
"""
if source is None:
source = 'defaults' # defaults shouldn't trigger warnings
if not isinstance(opts, dict):
raise TypeError(
'options for %s (from %s) must be a dict' %
(self.alias, source))
deprecated_aliases = _deprecated_aliases(self.OPT_NAMES)
results = {}
for k, v in sorted(opts.items()):
# rewrite deprecated aliases
if k in deprecated_aliases:
if v is None: # don't care
continue
                aliased_opt = deprecated_aliases[k]
                log.warning('Deprecated option %s (from %s) has been renamed'
                            ' to %s and will be removed in v0.7.0' % (
                                k, source, aliased_opt))
                if opts.get(aliased_opt) is not None:
                    continue  # don't overwrite non-aliased opt
                k = aliased_opt
if k in self.OPT_NAMES:
if v is None:
fixed_v = None
elif isinstance(v, ClearedValue):
# _fix_opt() doesn't need to know about !clear (see #2102)
fixed_v = ClearedValue(self._fix_opt(k, v.value, source))
else:
fixed_v = self._fix_opt(k, v, source)
results[k] = fixed_v
elif v:
log.warning('Unexpected option %s (from %s)' % (k, source))
return results
def _fix_opt(self, opt_key, opt_value, source):
"""Fix a single option, returning its correct value or raising
an exception. This is not called for options that are ``None``.
This currently handles cleanup opts.
Override this if you require additional opt validation or cleanup.
"""
if opt_key in ('cleanup', 'cleanup_on_failure'):
return self._fix_cleanup_opt(opt_key, opt_value, source)
else:
return opt_value
def _fix_cleanup_opt(self, opt_key, opt_value, source):
"""Fix a cleanup option, or raise ValueError."""
if isinstance(opt_value, string_types):
opt_value = [opt_value]
if 'NONE' in opt_value and len(set(opt_value)) > 1:
raise ValueError(
'Cannot clean up both nothing and something!'
' (%s option from %s)' % (opt_key, source))
for cleanup_type in opt_value:
if cleanup_type not in CLEANUP_CHOICES:
raise ValueError(
'%s must be one of %s, not %s (from %s)' % (
opt_key, ', '.join(CLEANUP_CHOICES), opt_value,
source))
return opt_value
def _obfuscate_opt(self, opt_key, opt_value):
"""Return value of opt to show in debug printout. Used to obfuscate
credentials, etc."""
return opt_value
### Filesystem object ###
@property
def fs(self):
""":py:class:`~mrjob.fs.base.Filesystem` object for the local
filesystem.
"""
if self._fs is None:
            # wrap LocalFilesystem in CompositeFilesystem to get IOError
# on URIs (see #1185)
self._fs = CompositeFilesystem()
self._fs.add_fs('local', LocalFilesystem())
return self._fs
### Running the job and parsing output ###
def run(self):
"""Run the job, and block until it finishes.
Raise :py:class:`~mrjob.step.StepFailedException` if there
are any problems (except on
:py:class:`~mrjob.inline.InlineMRJobRunner`, where we raise the
actual exception that caused the step to fail).
"""
if self._ran_job:
raise ValueError('Job already ran!')
if self._num_steps() == 0:
raise ValueError('Job has no steps!')
self._create_dir_archives()
# TODO: no point in checking input paths if we're going to
# make a manifest out of them
self._check_input_paths()
self._add_input_files_for_upload()
self._create_input_manifest_if_needed()
self._run()
self._ran_job = True
last_step = self._get_steps()[-1]
# only print this message if the last step uses our output dir
if 'args' not in last_step or OUTPUT in last_step['args']:
log.info('job output is in %s' % self._output_dir)
def cat_output(self):
"""Stream the job's output, as a stream of ``bytes``. If there are
multiple output files, there will be an empty bytestring
(``b''``) between them.
Like Hadoop input formats, we ignore files and subdirectories whose
names start with ``"_"`` or ``"."`` (e.g. ``_SUCCESS``, ``_logs/``,
        ``.part-00000.crc``).
.. versionchanged:: 0.6.8
Ignore file/dirnames starting with ``"."`` as well as ``"_"``.
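        A typical way to consume this stream (the job object here is
        hypothetical)::

            with mr_job.make_runner() as runner:
                runner.run()
                for key, value in mr_job.parse_output(runner.cat_output()):
                    ...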
"""
output_dir = self.get_output_dir()
if output_dir is None:
raise ValueError('Run the job before streaming output')
if self._closed is True:
log.warning(
'WARNING! Trying to stream output from a closed runner, output'
' will probably be empty.')
log.info('Streaming final output from %s...' % output_dir)
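        # split_path() yields the components of *path* from last to first,
        # e.g. (illustrative) '_logs/part-00000' -> 'part-00000', '_logs'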
def split_path(path):
while True:
base, name = os.path.split(path)
# no more elements
if not name:
break
yield name
path = base
def ls_output():
for filename in self.fs.ls(output_dir):
subpath = filename[len(output_dir):]
# Hadoop ignores files and dirs inside the output dir
# whose names start with '_' or '.'. See #1337.
if not (any(name[0] in '_.'
for name in split_path(subpath))):
yield filename
for i, filename in enumerate(ls_output()):
if i > 0:
yield b'' # EOF of previous file
for chunk in self.fs._cat_file(filename):
yield chunk
def _cleanup_mode(self, mode=None):
"""Actual cleanup action to take based on various options"""
if self._script_path and not self._ran_job:
return mode or self._opts['cleanup_on_failure']
else:
return mode or self._opts['cleanup']
def _cleanup_cloud_tmp(self):
"""Cleanup any files/directories on cloud storage (e.g. S3) we created
while running this job. Should be safe to run this at any time, or
multiple times.
"""
pass # only EMR runner does this
def _cleanup_hadoop_tmp(self):
"""Cleanup any files/directories on HDFS we created
while running this job. Should be safe to run this at any time, or
multiple times.
"""
pass # only Hadoop runner does this
def _cleanup_local_tmp(self):
"""Cleanup any files/directories on the local machine we created while
running this job. Should be safe to run this at any time, or multiple
times.
        This particular function removes the local tmp directory
        (``self._local_tmp_dir``), if we created one.
This won't remove output_dir if it's outside of our tmp dir.
"""
if self._local_tmp_dir:
log.info('Removing temp directory %s...' % self._local_tmp_dir)
try:
rmtree(self._local_tmp_dir)
except OSError as e:
log.exception(e)
self._local_tmp_dir = None
def _cleanup_cluster(self):
"""Terminate the cluster if there is one."""
pass # this only happens on EMR
def _cleanup_logs(self):
"""Cleanup any log files that are created as a side-effect of the job.
"""
pass # this only happens on EMR
def _cleanup_job(self):
"""Stop any jobs that we created that are still running."""
pass # currently disabled (see #1241)
def cleanup(self, mode=None):
"""Clean up running jobs, temp files, and logs, subject to the
*cleanup* option passed to the constructor.
If you create your runner in a ``with`` block,
:py:meth:`cleanup` will be called automatically::
with mr_job.make_runner() as runner:
...
# cleanup() called automatically here
:param mode: override *cleanup* passed into the constructor. Should be
a list of strings from
:py:data:`~mrjob.options.CLEANUP_CHOICES`
"""
mode = self._cleanup_mode(mode)
def mode_has(*args):
return any((choice in mode) for choice in args)
if self._script_path and not self._ran_job:
if mode_has('CLUSTER', 'ALL'):
self._cleanup_cluster()
if mode_has('JOB', 'ALL'):
self._cleanup_job()
if mode_has('ALL', 'TMP', 'CLOUD_TMP'):
self._cleanup_cloud_tmp()
if mode_has('ALL', 'TMP', 'HADOOP_TMP'):
self._cleanup_hadoop_tmp()
if mode_has('ALL', 'TMP', 'LOCAL_TMP'):
self._cleanup_local_tmp()
if mode_has('ALL', 'LOGS'):
self._cleanup_logs()
self._closed = True
def counters(self):
"""Get counters associated with this run in this form::
[{'group name': {'counter1': 1, 'counter2': 2}},
{'group name': ...}]
The list contains an entry for every step of the current job.
"""
raise NotImplementedError
### hooks for the with statement ###
def __enter__(self):
"""Don't do anything special at start of with block"""
return self
def __exit__(self, type, value, traceback):
"""Call self.cleanup() at end of with block."""
self.cleanup()
### more runner information ###
def get_opts(self):
"""Get options set for this runner, as a dict."""
log.warning('get_opts() is deprecated and will be removed in v0.7.0')
return copy.deepcopy(self._opts)
def get_job_key(self):
"""Get the unique key for the job run by this runner.
This has the format ``label.owner.date.time.microseconds``
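        (e.g., hypothetically, ``mr_word_count.dave.20200601.063101.000123``)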
"""
return self._job_key
def get_output_dir(self):
"""Find the directory containing the job output. If the job hasn't
run yet, returns None"""
if self._script_path and not self._ran_job:
return None
return self._output_dir
### other methods you need to implement in your subclass ###
def get_hadoop_version(self):
"""Return the version number of the Hadoop environment as a string if
Hadoop is being used or simulated. Return None if not applicable.
:py:class:`~mrjob.emr.EMRJobRunner` infers this from the cluster.
:py:class:`~mrjob.hadoop.HadoopJobRunner` gets this from
``hadoop version``. :py:class:`~mrjob.local.LocalMRJobRunner` has an
additional `hadoop_version` option to specify which version it
simulates.
:py:class:`~mrjob.inline.InlineMRJobRunner` does not simulate Hadoop at
all.
"""
return None
    # you'll probably want to add your own __init__() and cleanup() as well
def _run(self):
"""Run the job."""
raise NotImplementedError
### internal utilities for implementing MRJobRunners ###
def _get_local_tmp_dir(self):
"""Create a tmp directory on the local filesystem that will be
cleaned up by self.cleanup()"""
if not self._local_tmp_dir:
tmp_dir = (self._opts['local_tmp_dir'] or
tempfile.gettempdir())
path = os.path.join(tmp_dir, self._job_key)
log.info('Creating temp directory %s' % path)
if os.path.isdir(path):
rmtree(path)
os.makedirs(path)
self._local_tmp_dir = path
return self._local_tmp_dir
def _make_unique_job_key(self, label=None, owner=None):
"""Come up with a useful unique ID for this job. Optionally,
you can specify a custom label or owner (otherwise we use
        :py:meth:`_label` and :py:meth:`_owner`).
We use this to choose the output directory, etc. for the job.
"""
if label is None:
label = self._label()
if owner is None:
owner = self._owner()
now = datetime.datetime.utcnow()
return '%s.%s.%s.%06d' % (
label, owner,
now.strftime('%Y%m%d.%H%M%S'), now.microsecond)
def _label(self):
"""Return *label* opt, or if not set, the name of the file
containing the MRJob, minus extension, or if none, ``'no_script'``"""
if self._opts['label']:
return self._opts['label']
elif self._script_path:
return os.path.basename(self._script_path).split('.')[0]
else:
return 'no_script'
def _owner(self):
"""Return *owner* opt (which defaults to :py:func:`getpass.getuser`),
or ``'no_user'`` if not set."""
if self._opts['owner']:
# owner opt defaults to getpass.getuser()
return self._opts['owner']
else:
return 'no_user'
def _get_steps(self):
"""Returns ``self._steps``.
"""
# TODO: remove this
return self._steps
def _check_steps(self, steps):
"""Look at the step definition (*steps*). If it is not supported by
the runner, raise :py:class:`NotImplementedError`. If it is not
supported by mrjob, raise :py:class:`ValueError`.
"""
if not self._STEP_TYPES:
# use __class__.__name__ because only MRJobRunner would
# trigger this
raise NotImplementedError(
'%s cannot run steps!' % self.__class__.__name__)
for step_num, step in enumerate(steps):
self._check_step(step, step_num)
def _check_step(self, step, step_num):
"""Raise an exception if the given step is invalid
(:py:class:`ValueError`) or not handled by this runner
(:py:class:`NotImplementedError`).
        By default, we check that *step* has a supported step type,
only uses an input manifest if it's the first step, and that
:py:attr:`_script_path` exists if necessary. You can re-define
this in your subclass.
"""
if step.get('type') not in self._STEP_TYPES:
raise NotImplementedError(
'step %d has type %r, but %s runner only supports:'
' %s' % (step_num, step.get('type'), self.alias,
', '.join(sorted(self._STEP_TYPES))))
if step.get('input_manifest') and step_num != 0:
raise ValueError(
'step %d may not take an input manifest (only'
                ' first step can)' % step_num)
# some step types assume a MRJob script
if not self._script_path:
if step['type'] == 'spark':
raise ValueError(
"SparkStep (step %d) can't run without a MRJob script"
" (try SparkScriptStep instead)" % step_num)
elif step['type'] == 'streaming':
for mrc in ('mapper', 'combiner', 'reducer'):
if not step.get(mrc):
continue
substep = step[mrc]
if substep['type'] == 'script':
raise ValueError(
"%s (step %d) can't run without a MRJob"
" script" % (mrc, step_num))
def _get_step(self, step_num):
"""Get a single step (calls :py:meth:`_get_steps`)."""
return self._get_steps()[step_num]
def _num_steps(self):
"""Get the number of steps (calls :py:meth:`get_steps`)."""
return len(self._get_steps())
def _uses_input_manifest(self):
"""Does the first step take an input manifest?"""
return bool(self._get_step(0).get('input_manifest'))
def _has_hadoop_streaming_steps(self):
"""Are any of our steps Hadoop Streaming steps?"""
return any(step['type'] == 'streaming'
for step in self._get_steps())
def _has_spark_steps(self):
"""Are any of our steps Spark steps? (e.g. spark, spark_jar,
spark_script)
Generally used to determine if we need to install Spark on a cluster.
"""
return any(self._step_type_uses_spark(step['type'])
for step in self._get_steps())
def _has_pyspark_steps(self):
"""Do any of our steps involve running Python on Spark?
Includes spark and spark_script types, but not spark_jar.
Generally used to tell if we need a Spark setup script.
"""
return any(self._step_type_uses_pyspark(step['type'])
for step in self._get_steps())
def _step_type_uses_spark(self, step_type):
"""Does this step run on Spark?
(This is re-defined in the Spark runner to include
streaming steps, and used by mrjob.logs.mixin)
"""
return _is_spark_step_type(step_type)
def _step_type_uses_pyspark(self, step_type):
"""Does this step involve running Python on Spark?
(This is re-defined in the Spark runner to include
streaming steps, and used by mrjob.logs.mixin)
"""
return _is_pyspark_step_type(step_type)
def _spark_master(self):
return self._opts.get('spark_master') or 'local[*]'
def _spark_deploy_mode(self):
return self._opts.get('spark_deploy_mode') or 'client'
def _spark_driver_has_own_wd(self):
"""Does the spark driver have a working directory different
from the one *spark-submit* was run in?
(Only true in cluster mode.)
"""
return (self._spark_deploy_mode() == 'cluster' and
self._spark_executors_have_own_wd())
def _spark_executors_have_own_wd(self):
"""Do spark executors have a working directory different
from the one *spark-submit* was run in?
(True on everything but local.)
"""
# note: local-cluster[...] master does in fact have working dirs
return self._spark_master().split('[')[0] != 'local'
def _emulate_archives_on_spark(self):
"""True if spark-submit's --archives doesn't work on the given Spark
master, which means we'll need to emulate archives in setup scripts.
"""
return self._spark_master() != 'yarn'
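    # illustration: on the 'yarn' master, spark-submit's --archives works and
    # executors get their own working dirs; on 'local[*]' masters, executors
    # share the submit directory and archives have to be emulated via setup
    # scripts instead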
def _args_for_task(self, step_num, mrc):
return [
'--step-num=%d' % step_num,
'--%s' % mrc,
] + self._mr_job_extra_args()
def _mr_job_extra_args(self, local=False):
"""Return arguments to add to every invocation of MRJob.
:type local: boolean
:param local: if this is True, use files' local paths rather than
the path they'll have inside Hadoop streaming
"""
result = []
for extra_arg in self._extra_args:
if isinstance(extra_arg, dict):
if local:
result.append(extra_arg['path'])
else:
result.append(self._working_dir_mgr.name(**extra_arg))
else:
result.append(extra_arg)
return result
def _spark_script_args(self, step_num, last_step_num=None):
"""A list of args to the spark script/jar/MRJob, used by
_args_for_spark_step().
*last_step_num* is only used by the Spark runner, where multiple
streaming steps are run in a single Spark job."""
step = self._get_step(step_num)
if step['type'] == 'spark':
# if on local[*] master, keep file upload args as-is (see #2031)
local = not self._spark_executors_have_own_wd()
args = (
[
'--step-num=%d' % step_num,
'--spark',
] + self._mr_job_extra_args(local=local) + [
INPUT,
OUTPUT,
]
)
elif step['type'] in ('spark_jar', 'spark_script'):
args = step['args']
else:
raise TypeError('Bad step type: %r' % step['type'])
return self._interpolate_step_args(args, step_num)
def _interpolate_step_args(self, args, step_num):
"""Replace :py:data:`~mrjob.step.INPUT` and
:py:data:`~mrjob.step.OUTPUT` in arguments to a jar or Spark
step.
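        Illustration (hypothetical URIs): for a single-step job reading
        ``s3://bucket/in`` and writing to ``s3://bucket/out``, the args
        ``[INPUT, '-o', OUTPUT]`` would become
        ``['s3://bucket/in', '-o', 's3://bucket/out']``.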
"""
result = []
for arg in args:
if arg == INPUT:
result.append(
','.join(self._step_input_uris(step_num)))
elif arg == OUTPUT:
result.append(
self._step_output_uri(step_num))
else:
result.append(arg)
return result
def _dir_archive_path(self, dir_path):
"""Assign a path for the archive of *dir_path* but don't
actually create anything."""
if dir_path not in self._dir_to_archive_path:
# we can check local paths now
if not (is_uri(dir_path) or os.path.isdir(dir_path)):
raise OSError('%s is not a directory!' % dir_path)
name = name_uniquely(
dir_path, names_taken=self._dir_archive_names_taken)
self._dir_archive_names_taken.add(name)
self._dir_to_archive_path[dir_path] = os.path.join(
self._get_local_tmp_dir(), 'archives', name + '.tar.gz')
return self._dir_to_archive_path[dir_path]
def _create_dir_archives(self):
"""Call this to create all dir archives"""
for dir_path in sorted(set(self._dir_to_archive_path)):
self._create_dir_archive(dir_path)
def _create_dir_archive(self, dir_path):
"""Helper for :py:meth:`archive_dir`"""
if not self.fs.exists(dir_path):
            raise OSError('%s does not exist' % dir_path)
tar_gz_path = self._dir_archive_path(dir_path)
if tar_gz_path in self._dir_archives_created:
return # already created
if not os.path.isdir(os.path.dirname(tar_gz_path)):
os.makedirs(os.path.dirname(tar_gz_path))
# for remote files
tmp_download_path = os.path.join(
self._get_local_tmp_dir(), 'tmp-download')
log.info('Archiving %s -> %s' % (dir_path, tar_gz_path))
with tarfile.open(tar_gz_path, mode='w:gz') as tar_gz:
for path in self.fs.ls(dir_path):
# fs.ls() only lists files
if path == dir_path:
raise OSError('%s is a file, not a directory!' % dir_path)
# TODO: do we need this?
if os.path.realpath(path) == os.path.realpath(tar_gz_path):
raise OSError(
'attempted to archive %s into itself!' % tar_gz_path)
if is_uri(path):
path_in_tar_gz = path[len(dir_path):].lstrip('/')
log.info(' downloading %s -> %s' % (
path, tmp_download_path))
with open(tmp_download_path, 'wb') as f:
for chunk in self.fs.cat(path):
f.write(chunk)
local_path = tmp_download_path
else:
path_in_tar_gz = path[len(dir_path):].lstrip(os.sep)
local_path = path
log.debug(' adding %s to %s' % (path, tar_gz_path))
tar_gz.add(local_path, path_in_tar_gz, recursive=False)
self._dir_archives_created.add(tar_gz_path)
def _bootstrap_mrjob(self):
"""Should we bootstrap mrjob?"""
if self._opts['bootstrap_mrjob'] is None:
return True
else:
return bool(self._opts['bootstrap_mrjob'])
def _get_input_paths(self):
"""Get the paths to input files, dumping STDIN to a local
file if need be."""
if self._input_manifest_path:
return [self._input_manifest_path]
if '-' in self._input_paths:
if self._stdin_path is None:
# prompt user, so they don't think the process has stalled
log.info('reading from STDIN')
stdin_path = os.path.join(self._get_local_tmp_dir(), 'STDIN')
log.debug('dumping stdin to local file %s' % stdin_path)
with open(stdin_path, 'wb') as stdin_file:
for line in self._stdin:
# catch missing newlines (often happens with test data)
if not line.endswith(b'\n'):
line += b'\n'
stdin_file.write(line)
self._stdin_path = stdin_path
return [self._stdin_path if p == '-' else p for p in self._input_paths]
def _create_input_manifest_if_needed(self):
"""Create a file with a list of URIs of input files."""
if self._input_manifest_path or not self._uses_input_manifest():
return
uris = []
log.info('finding input files to add to manifest...')
for path in self._get_input_paths():
log.debug(' in %s' % path)
if is_uri(path):
# URIs might be globs
for uri in self.fs.ls(path):
uris.append(uri)
else:
# local paths are expected to be single files
# (shell would resolve globs)
if self._upload_mgr:
uris.append(self._upload_mgr.uri(path))
else:
# just make sure job can find files from its working dir
uris.append(os.path.abspath(path))
log.info('found %d input files' % len(uris))
path = os.path.join(self._get_local_tmp_dir(), 'input-manifest.txt')
self._write_script(uris, path, 'input manifest')
self._input_manifest_path = path
if self._upload_mgr:
self._upload_mgr.add(self._input_manifest_path)
def _check_input_paths(self):
"""Check that input exists prior to running the job, if the
`check_input_paths` option is true."""
if not self._opts['check_input_paths']:
return
for path in self._input_paths:
self._check_input_path(path)
def _check_input_path(self, path):
"""Raise :py:class:`IOError` if the given input does not exist or
is otherwise invalid. Override this to provide custom check
behavior."""
if path == '-':
return # STDIN always exists
if not self.fs.can_handle_path(path):
return # no way to check (e.g. non-S3 URIs on EMR)
if not self.fs.exists(path):
raise IOError(
'Input path %s does not exist!' % (path,))
def _add_input_files_for_upload(self):
"""If there is an upload manager, add input files to it."""
if self._upload_mgr:
for path in self._get_input_paths():
self._upload_mgr.add(path)
def _upload_local_files(self):
self._copy_files_to_wd_mirror()
if self._upload_mgr:
self.fs.mkdir(self._upload_mgr.prefix)
log.info('Copying other local files to %s' %
self._upload_mgr.prefix)
for src_path, uri in self._upload_mgr.path_to_uri().items():
log.debug(' %s -> %s' % (src_path, uri))
self.fs.put(src_path, uri)
def _wd_mirror(self):
"""A directory to upload files belonging to
:py:attr:`_working_dir_mgr`. This will be a subdir of
        ``self._upload_mgr.prefix``, if it exists, and otherwise will
be ``None``."""
if self._upload_mgr and is_uri(self._upload_mgr.prefix):
return posixpath.join(self._upload_mgr.prefix, 'wd')
elif (self._has_spark_steps() and self._spark_executors_have_own_wd()):
return os.path.join(self._get_local_tmp_dir(), 'wd-mirror')
else:
return None
def _wd_filenames_must_match(self):
"""When we tell Hadoop/Spark to put files in the working directory,
must they have the same names as the files in the working dir?
This basically only happens with Spark on non-YARN masters. YARN/Hadoop
allows you to specify a name for each file (``path#name_in_wd``).
"""
return self._has_spark_steps() and self._spark_master() != 'yarn'
def _dest_in_wd_mirror(self, path, name):
"""Return the URI of where to upload *path* so it can appear in the
working dir as *name*, or ``None`` if it doesn't need to be uploaded.
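        Illustration (hypothetical paths): with a local working dir mirror
        and filenames that must match (Spark on a non-YARN master), a local
        file ``/tmp/lib.py`` that should appear as ``lib.py`` needs no copy
        (returns ``None``), but one that should appear as ``renamed.py``
        returns ``<mirror>/renamed.py``.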
"""
dest_dir = self._wd_mirror()
if not dest_dir:
return None
# the only reason to re-upload a URI is if it has the wrong name
#
# similarly, the only point of a local working dir mirror is
# to rename things
if (is_uri(path) or not is_uri(dest_dir)) and (
posixpath.basename(path) == name or
not self._wd_filenames_must_match()):
return None
return posixpath.join(dest_dir, name)
def _copy_file_to_wd_mirror(self, path, name):
"""Upload/copy *path* to the appropriate place in the working dir
mirror, if necessary.
We don't track whether something has already been uploaded.
"""
dest = self._dest_in_wd_mirror(path, name)
if not dest:
return
if is_uri(path):
# file is visible to non-YARN Spark, but has the wrong name, so
# download and re-upload it
wd_tmp = os.path.join(self._get_local_tmp_dir(), 'wd-mirror')
self.fs.mkdir(wd_tmp)
tmp_path = os.path.join(wd_tmp, name)
log.debug(' %s <- %s' % (tmp_path, path))
try:
with open(tmp_path, 'wb') as tmp_f:
for chunk in self.fs.cat(path):
tmp_f.write(chunk)
log.debug(' %s -> %s' % (tmp_path, dest))
self.fs.put(tmp_path, dest)
finally:
os.remove(tmp_path)
else:
# upload it
log.debug(' %s -> %s' % (path, dest))
self.fs.put(path, dest)
def _copy_files_to_wd_mirror(self):
"""Upload working dir files to the working dir mirror, if necessary.
This does not handle archives, which we always rename with
hash paths anyhow (see #2059).
"""
wd_mirror = self._wd_mirror()
if not wd_mirror:
return
self.fs.mkdir(wd_mirror)
log.info('%s working dir files to %s...' %
('uploading' if is_uri(wd_mirror) else 'copying', wd_mirror))
for name, path in sorted(
self._working_dir_mgr.name_to_path('file').items()):
self._copy_file_to_wd_mirror(path, name)
for name, path in sorted(
self._working_dir_mgr.name_to_path('archive_file').items()):
self._copy_file_to_wd_mirror(path, name)
def _upload_part_size(self):
"""Part size for uploads, in bytes, or ``None``,
from :mrjob-opt:`cloud_part_size_mb`"""
if self._opts.get('cloud_part_size_mb'):
return int(self._opts['cloud_part_size_mb'] * 1024 * 1024)
else:
return None
def _intermediate_output_dir(self, step_num, local=False):
"""A directory for intermediate output for the given step number."""
join = os.path.join if local else posixpath.join
return join(
self._step_output_dir or self._default_step_output_dir(),
'%04d' % step_num)
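    # e.g. (hypothetical) with step_output_dir 'hdfs:///tmp/my-job/step-output',
    # step 0's intermediate output dir is 'hdfs:///tmp/my-job/step-output/0000'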
def _default_step_output_dir(self):
"""Where to put output for steps other than the last one,
if not specified by the *output_dir* constructor keyword.
Usually you want this to be on HDFS (most efficient).
Define this in your runner subclass.
"""
raise NotImplementedError
def _step_input_uris(self, step_num):
"""A list of URIs to use as input for the given step. For all
except the first step, this list will have a single item (a
directory)."""
if step_num == 0:
return [self._upload_mgr.uri(path) if self._upload_mgr
else to_uri(path)
for path in self._get_input_paths()]
else:
return [to_uri(self._intermediate_output_dir(step_num - 1))]
def _step_output_uri(self, step_num):
"""URI to use as output for the given step. This is either an
        intermediate dir (see :py:meth:`_intermediate_output_dir`) or
``self._output_dir`` for the final step."""
if step_num == len(self._get_steps()) - 1:
return to_uri(self._output_dir)
else:
return to_uri(self._intermediate_output_dir(step_num))
def _cmdenv(self):
"""Return a copy of ``self._opts['cmdenv']``. This exists so we
can instrument cmdenv in runner subclasses."""
return dict(self._opts['cmdenv'])
def _jobconf_for_step(self, step_num):
"""Get the jobconf dictionary, optionally including step-specific
jobconf info.
Also translate jobconfs to the current Hadoop version, if necessary.
"""
step = self._get_step(step_num)
# _sort_values_jobconf() isn't relevant to Spark,
# but it doesn't do any harm either
jobconf = combine_jobconfs(self._sort_values_jobconf(),
self._opts['jobconf'],
step.get('jobconf'))
# if user is using the wrong jobconfs, add in the correct ones
# and log a warning
hadoop_version = self.get_hadoop_version()
if hadoop_version:
jobconf = translate_jobconf_dict(jobconf, hadoop_version)
return jobconf
def _sort_values_jobconf(self):
"""Jobconf dictionary to enable sorting by value.
"""
if not self._sort_values:
return {}
# translate _SORT_VALUES_JOBCONF to the correct Hadoop version,
# without logging a warning
hadoop_version = self.get_hadoop_version()
jobconf = {}
for k, v in _SORT_VALUES_JOBCONF.items():
if hadoop_version:
jobconf[translate_jobconf(k, hadoop_version)] = v
else:
for j in translate_jobconf_for_all_versions(k):
jobconf[j] = v
return jobconf
def _sort_values_partitioner(self):
"""Partitioner to use with *sort_values* keyword to the constructor."""
if self._sort_values:
return _SORT_VALUES_PARTITIONER
else:
return None
def _upload_args(self):
# just upload every file and archive in the working dir manager
return self._upload_args_helper('-files', None, '-archives', None)
def _upload_args_helper(
self, files_opt_str, files, archives_opt_str, archives,
always_use_hash=True, emulate_archives=False):
args = []
file_hash_paths = list(
self._file_arg_hash_paths(files,
always_use_hash=always_use_hash))
# if emulating --archives, upload archives with files (we'll unpack
# them later with a setup script)
if emulate_archives:
file_hash_paths.extend(
self._file_archive_hash_paths(archives))
# --files ...
if file_hash_paths:
args.append(files_opt_str)
args.append(','.join(file_hash_paths))
if not emulate_archives:
archive_hash_paths = list(self._archive_arg_hash_paths(archives))
# --archives ...
if archive_hash_paths:
args.append(archives_opt_str)
args.append(','.join(archive_hash_paths))
return args
def _file_arg_hash_paths(self, named_paths=None, always_use_hash=True):
"""Helper function for the *upload_args methods. The names of all
arguments to ``-files`` (or ``--files`` on Spark).
If *always_use_hash* is false, only use ``path#name`` syntax
when the name is different.
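        Illustration (hypothetical URI): a file mirrored at
        ``s3://bucket/wd/wc.py`` that should appear in the working dir as
        ``wc.py`` yields ``'s3://bucket/wd/wc.py#wc.py'``, or just the bare
        URI if *always_use_hash* is false (since the basename already
        matches).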
"""
if named_paths is None:
# just return every file managed by _working_dir_mgr
named_paths = sorted(
self._working_dir_mgr.name_to_path('file').items())
for name, path in named_paths:
if not name:
name = self._working_dir_mgr.name('file', path)
uri = self._dest_in_wd_mirror(path, name) or path
if not always_use_hash and _basename(uri) == name:
yield uri
else:
yield '%s#%s' % (uri, name)
def _file_archive_hash_paths(self, named_paths=None):
"""Helper function for the *upload_args methods. The names of
archives to pass to the ``--files`` switch of ``spark-submit``,
since we can't use ``--archives``.
The names in *named_paths* should be of the archive destination
(the 'archive' type in WorkingDirManager)
not of the filename we're going to copy the archive to before
unpacking it into its destination (the 'archive_file' type).
"""
if named_paths is None:
named_paths = sorted(
self._working_dir_mgr.name_to_path('archive').items())
for name, path in named_paths:
if not name:
name = self._working_dir_mgr.name('archive', path)
archive_file_name = self._working_dir_mgr.name(
'archive_file', path)
uri = self._dest_in_wd_mirror(path, archive_file_name) or path
yield uri
def _archive_arg_hash_paths(self, named_paths=None):
"""Helper function for the *upload_args methods. The names of all
arguments to ``-archives`` (or ``--archives`` on Spark).
"""
# we always use path#name syntax, even on Spark, because unlike
# with --files, Spark will either accept that syntax with --archives
# (if we're on YARN) or ignore --archives completely (if we're on
# any other Spark master)
if named_paths is None:
# just return every archive managed by _working_dir_mgr
named_paths = sorted(
self._working_dir_mgr.name_to_path('archive').items())
for name, path in named_paths:
if not name:
name = self._working_dir_mgr.name('archive', path)
# archives are uploaded to the working dir mirror by the
# name of the original archive file, not the dir it unpacks into
archive_file_name = self._working_dir_mgr.name(
'archive_file', path)
uri = self._dest_in_wd_mirror(path, archive_file_name) or path
yield '%s#%s' % (uri, name)
def _write_script(self, lines, path, description):
"""Write text of a setup script, input manifest, etc. to the given
file.
        By default, this writes binary data. Redefine
        :py:meth:`_write_script_lines`
to use other line endings.
:param lines: a list of lines as ``str``
:param path: path of file to write to
:param description: what we're writing to, for debug messages
"""
log.debug('Writing %s to %s:' % (description, path))
for line in lines:
log.debug(' ' + line)
self._write_script_lines(lines, path)
def _write_script_lines(self, lines, path):
"""Write text to the given file. By default, this writes
binary data, but can be redefined to use local line endings."""
with open(path, 'wb') as f:
for line in lines:
f.write((line + '\n').encode('utf-8'))
def _fix_env(env):
"""Convert environment dictionary to strings (Python 2.7 on Windows
doesn't allow unicode)."""
def _to_str(s):
if isinstance(s, string_types) and not isinstance(s, str):
return s.encode('utf_8')
else:
return s
return dict((_to_str(k), _to_str(v)) for k, v in env.items())
def _blank_out_conflicting_opts(opt_list, opt_names, conflicting_opts=None):
"""Utility for :py:meth:`MRJobRunner._combine_opts()`: if multiple
configs specify conflicting opts, blank them out in all but the
last config (so, for example, the command line beats the config file).
This returns a copy of *opt_list*
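    Illustration (hypothetical opts): with
    ``opt_list=[{'region': 'us-west-1'}, {'zone': 'us-west-1b'}]`` and
    ``opt_names={'region', 'zone'}``, the earlier config gets blanked out,
    yielding ``[{'region': None, 'zone': None}, {'zone': 'us-west-1b'}]``.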
"""
conflicting_opts = set(conflicting_opts or ()) | set(opt_names)
# copy opt_list so we can modify it
opt_list = [dict(opts) for opts in opt_list]
    # blank out conflicting opts before the last config where they are set
blank_out = False
for opts in reversed(opt_list):
if blank_out:
for opt_name in opt_names:
opts[opt_name] = None
elif any(opts.get(opt_name) is not None
for opt_name in conflicting_opts):
blank_out = True
return opt_list
def _runner_class(alias):
"""Get the runner subclass corresponding to the given alias
(importing code only as needed)."""
if alias == 'dataproc':
from mrjob.dataproc import DataprocJobRunner
return DataprocJobRunner
elif alias == 'emr':
from mrjob.emr import EMRJobRunner
return EMRJobRunner
elif alias == 'hadoop':
from mrjob.hadoop import HadoopJobRunner
return HadoopJobRunner
elif alias == 'inline':
from mrjob.inline import InlineMRJobRunner
return InlineMRJobRunner
elif alias == 'local':
from mrjob.local import LocalMRJobRunner
return LocalMRJobRunner
elif alias == 'spark':
from mrjob.spark.runner import SparkMRJobRunner
return SparkMRJobRunner
else:
raise ValueError('bad runner alias: %s' % alias)
def _basename(path_or_uri):
if is_uri(path_or_uri):
return posixpath.basename(path_or_uri)
else:
return os.path.basename(path_or_uri)