# Copyright 2009-2017 Yelp and Contributors
# Copyright 2018 Yelp
# Copyright 2019 Yelp
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Class to inherit your MapReduce jobs from. See :doc:`guides/writing-mrjobs`
for more information."""
# don't add imports here that aren't part of the standard Python library,
# since MRJobs need to run in Amazon's generic EMR environment
import codecs
import inspect
import itertools
import logging
import os
import os.path
import sys
import time
from io import BytesIO
from argparse import ArgumentParser
from argparse import ArgumentTypeError
# don't use relative imports, to allow this script to be invoked as __main__
from mrjob.cat import decompress
from mrjob.conf import combine_dicts
from mrjob.conf import combine_lists
from mrjob.options import _add_basic_args
from mrjob.options import _add_job_args
from mrjob.options import _add_runner_args
from mrjob.options import _add_step_args
from mrjob.options import _parse_raw_args
from mrjob.options import _print_basic_help
from mrjob.options import _print_help_for_runner
from mrjob.options import _RUNNER_OPTS
from mrjob.protocol import JSONProtocol
from mrjob.protocol import RawValueProtocol
from mrjob.py2 import integer_types
from mrjob.py2 import string_types
from mrjob.runner import _runner_class
from mrjob.setup import parse_legacy_hash_path
from mrjob.step import _JOB_STEP_FUNC_PARAMS
from mrjob.step import MRStep
from mrjob.step import SparkStep
from mrjob.step import StepFailedException
from mrjob.util import expand_path
from mrjob.util import log_to_null
from mrjob.util import log_to_stream
from mrjob.util import to_lines
log = logging.getLogger(__name__)
# sentinel value; used when running MRJob as a script
_READ_ARGS_FROM_SYS_ARGV = '_READ_ARGS_FROM_SYS_ARGV'
class UsageError(Exception):
pass
def _im_func(f):
"""Wrapper to get at the underlying function belonging to a method.
Python 2 is slightly different because classes have "unbound methods"
which wrap the underlying function, whereas on Python 3 they're just
functions. (Methods work the same way on both versions.)
"""
# "im_func" is the old Python 2 name for __func__
if hasattr(f, '__func__'):
return f.__func__
else:
return f
class MRJob(object):
"""The base class for all MapReduce jobs. See :py:meth:`__init__`
for details."""
def __init__(self, args=None):
"""Entry point for running your job from other Python code.
You can pass in command-line arguments, and the job will act the same
way it would if it were run from the command line. For example, to
run your job on EMR::
mr_job = MRYourJob(args=['-r', 'emr'])
with mr_job.make_runner() as runner:
...
Passing in ``None`` is the same as passing in ``sys.argv[1:]``.
For a full list of command-line arguments, run:
``python -m mrjob.job --help``
:param args: Arguments to your script (switches and input files)
.. versionchanged:: 0.7.0
Previously, *args* set to ``None`` was equivalent to ``[]``.
"""
# make sure we respect the $TZ (time zone) environment variable
if hasattr(time, 'tzset'):
time.tzset()
# argument dests for args to pass through
self._passthru_arg_dests = set()
self._file_arg_dests = set()
self.arg_parser = ArgumentParser(usage=self._usage(),
add_help=False)
self.configure_args()
if args is None:
self._cl_args = sys.argv[1:]
else:
# don't pass sys.argv to self.arg_parser, and have it
# raise an exception on error rather than printing to stderr
# and exiting.
self._cl_args = args
def error(msg):
raise ValueError(msg)
self.arg_parser.error = error
self.load_args(self._cl_args)
# Make it possible to redirect stdin, stdout, and stderr, for testing
# See stdin, stdout, stderr properties and sandbox(), below.
self._stdin = None
self._stdout = None
self._stderr = None
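# A minimal sketch (hypothetical user code, not part of this module) of the
# programmatic-invocation pattern described in __init__'s docstring:
# construct the job with explicit args, run it through a runner, and decode
# the output with parse_output(). MRYourJob and the input paths are
# illustrative.
#
#     mr_job = MRYourJob(args=['-r', 'emr', 'input1.txt', 'input2.txt'])
#     with mr_job.make_runner() as runner:
#         runner.run()
#         for key, value in mr_job.parse_output(runner.cat_output()):
#             print(key, value)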
# by default, self.stdin, self.stdout, and self.stderr are sys.std*.buffer
# if it exists, and sys.std* otherwise (they should always deal
# with bytes, not Unicode).
#
# *buffer* is pretty much a Python 3 thing, though some platforms
# (notably Jupyterhub) don't have it. See #1441
@property
def stdin(self):
return self._stdin or getattr(sys.stdin, 'buffer', sys.stdin)
@property
def stdout(self):
return self._stdout or getattr(sys.stdout, 'buffer', sys.stdout)
@property
def stderr(self):
return self._stderr or getattr(sys.stderr, 'buffer', sys.stderr)
def _usage(self):
return "%(prog)s [options] [input files]"
def _print_help(self, options):
"""Print help for this job. This will either print runner
or basic help. Override to allow other kinds of help."""
if options.runner:
_print_help_for_runner(
self._runner_opt_names_for_help(), options.deprecated)
else:
_print_basic_help(self.arg_parser,
self._usage(),
options.deprecated,
options.verbose)
def _runner_opt_names_for_help(self):
opts = set(self._runner_class().OPT_NAMES)
if self.options.runner == 'spark':
# specific to Spark runner, but command-line only, so it doesn't
# appear in SparkMRJobRunner.OPT_NAMES (see #2040)
opts.add('max_output_files')
return opts
def _non_option_kwargs(self):
"""Keyword arguments to runner constructor that can't be set
in mrjob.conf.
These should match the (named) arguments to
:py:meth:`~mrjob.runner.MRJobRunner.__init__`.
"""
# build extra_args
raw_args = _parse_raw_args(self.arg_parser, self._cl_args)
extra_args = []
for dest, option_string, args in raw_args:
if dest in self._file_arg_dests:
extra_args.append(option_string)
extra_args.append(parse_legacy_hash_path('file', args[0]))
elif dest in self._passthru_arg_dests:
# special case for --hadoop-args=-verbose etc.
if (option_string and len(args) == 1 and
args[0].startswith('-')):
extra_args.append('%s=%s' % (option_string, args[0]))
else:
if option_string:
extra_args.append(option_string)
extra_args.extend(args)
# max_output_files is added by _add_runner_args() but can only
# be set from the command line, so we add it here (see #2040)
return dict(
conf_paths=self.options.conf_paths,
extra_args=extra_args,
hadoop_input_format=self.hadoop_input_format(),
hadoop_output_format=self.hadoop_output_format(),
input_paths=self.options.args,
max_output_files=self.options.max_output_files,
mr_job_script=self.mr_job_script(),
output_dir=self.options.output_dir,
partitioner=self.partitioner(),
stdin=self.stdin,
step_output_dir=self.options.step_output_dir,
)
def _kwargs_from_switches(self, keys):
return dict(
(key, getattr(self.options, key))
for key in keys if hasattr(self.options, key)
)
def _job_kwargs(self):
"""Keyword arguments to the runner class that can be specified
by the job/launcher itself."""
# use the most basic combiners; leave magic like resolving paths
# and blanking out jobconf values to the runner
return dict(
# command-line has the final say on jobconf and libjars
jobconf=combine_dicts(
self.jobconf(), self.options.jobconf),
libjars=combine_lists(
self.libjars(), self.options.libjars),
partitioner=self.partitioner(),
sort_values=self.sort_values(),
# TODO: should probably put self.options last below for consistency
upload_archives=combine_lists(
self.options.upload_archives, self.archives()),
upload_dirs=combine_lists(
self.options.upload_dirs, self.dirs()),
upload_files=combine_lists(
self.options.upload_files, self.files()),
)
### Defining one-step streaming jobs ###
def mapper(self, key, value):
"""Re-define this to define the mapper for a one-step job.
Yields zero or more tuples of ``(out_key, out_value)``.
:param key: A key parsed from input.
:param value: A value parsed from input.
If you don't re-define this, your job will have a mapper that simply
yields ``(key, value)`` as-is.
By default (if you don't mess with :ref:`job-protocols`):
- ``key`` will be ``None``
- ``value`` will be the raw input line, with newline stripped.
- ``out_key`` and ``out_value`` must be JSON-encodable: numeric,
unicode, boolean, ``None``, list, or dict whose keys are unicodes.
"""
raise NotImplementedError
def reducer(self, key, values):
"""Re-define this to define the reducer for a one-step job.
Yields one or more tuples of ``(out_key, out_value)``
:param key: A key which was yielded by the mapper
:param values: A generator which yields all values yielded by the
mapper which correspond to ``key``.
By default (if you don't mess with :ref:`job-protocols`):
- ``out_key`` and ``out_value`` must be JSON-encodable.
- ``key`` and ``value`` will have been decoded from JSON (so tuples
will become lists).
"""
raise NotImplementedError
def combiner(self, key, values):
"""Re-define this to define the combiner for a one-step job.
Yields one or more tuples of ``(out_key, out_value)``
:param key: A key which was yielded by the mapper
:param values: A generator which yields all values yielded by one mapper
task/node which correspond to ``key``.
By default (if you don't mess with :ref:`job-protocols`):
- ``out_key`` and ``out_value`` must be JSON-encodable.
- ``key`` and ``value`` will have been decoded from JSON (so tuples
will become lists).
"""
raise NotImplementedError
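# A minimal sketch (hypothetical user code, not part of this module) of a
# complete one-step job that overrides mapper(), combiner(), and reducer()
# as described above. The class name MRWordFreqCount is illustrative.
#
#     from mrjob.job import MRJob
#
#     class MRWordFreqCount(MRJob):
#
#         def mapper(self, _, line):
#             for word in line.split():
#                 yield word.lower(), 1
#
#         def combiner(self, word, counts):
#             yield word, sum(counts)
#
#         def reducer(self, word, counts):
#             yield word, sum(counts)
#
#     if __name__ == '__main__':
#         MRWordFreqCount.run()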
def mapper_init(self):
"""Re-define this to define an action to run before the mapper
processes any input.
One use for this function is to initialize mapper-specific helper
structures.
Yields one or more tuples of ``(out_key, out_value)``.
By default, ``out_key`` and ``out_value`` must be JSON-encodable;
re-define :py:attr:`INTERNAL_PROTOCOL` to change this.
"""
raise NotImplementedError
def mapper_final(self):
"""Re-define this to define an action to run after the mapper reaches
the end of input.
One way to use this is to store a total in an instance variable, and
output it after reading all input data. See :py:mod:`mrjob.examples`
for an example.
Yields one or more tuples of ``(out_key, out_value)``.
By default, ``out_key`` and ``out_value`` must be JSON-encodable;
re-define :py:attr:`INTERNAL_PROTOCOL` to change this.
"""
raise NotImplementedError
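# A minimal sketch (hypothetical user code, not part of this module) of the
# pattern described above for mapper_init()/mapper_final(): accumulate a
# per-task total in an instance variable and emit it once at end of input.
#
#     class MRLineCount(MRJob):
#
#         def mapper_init(self):
#             self.num_lines = 0
#
#         def mapper(self, _, line):
#             self.num_lines += 1
#
#         def mapper_final(self):
#             yield 'lines', self.num_lines
#
#         def reducer(self, key, counts):
#             yield key, sum(counts)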
def mapper_cmd(self):
"""Re-define this to define the mapper for a one-step job **as a shell
command.** If you define your mapper this way, the command will be
passed unchanged to Hadoop Streaming, with some minor exceptions. For
important specifics, see :ref:`cmd-steps`.
Basic example::
def mapper_cmd(self):
return 'cat'
"""
raise NotImplementedError
def mapper_pre_filter(self):
"""Re-define this to specify a shell command to filter the mapper's
input before it gets to your job's mapper in a one-step job. For
important specifics, see :ref:`cmd-filters`.
Basic example::
def mapper_pre_filter(self):
return 'grep "ponies"'
"""
raise NotImplementedError
def mapper_raw(self, input_path, input_uri):
"""Re-define this to make Hadoop pass one input file to each
mapper.
:param input_path: a local path that the input file has been copied to
:param input_uri: the URI of the input file on HDFS, S3, etc
.. versionadded:: 0.6.3
"""
raise NotImplementedError
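# A minimal sketch (hypothetical user code, not part of this module) of
# mapper_raw(): each mapper task receives one whole input file, which it can
# open and parse however it likes.
#
#     class MRLinesPerFile(MRJob):
#
#         def mapper_raw(self, input_path, input_uri):
#             with open(input_path) as f:
#                 yield input_uri, sum(1 for _ in f)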
def reducer_init(self):
"""Re-define this to define an action to run before the reducer
processes any input.
One use for this function is to initialize reducer-specific helper
structures.
Yields one or more tuples of ``(out_key, out_value)``.
By default, ``out_key`` and ``out_value`` must be JSON-encodable;
re-define :py:attr:`INTERNAL_PROTOCOL` to change this.
"""
raise NotImplementedError
def reducer_final(self):
"""Re-define this to define an action to run after the reducer reaches
the end of input.
Yields one or more tuples of ``(out_key, out_value)``.
By default, ``out_key`` and ``out_value`` must be JSON-encodable;
re-define :py:attr:`INTERNAL_PROTOCOL` to change this.
"""
raise NotImplementedError
def reducer_cmd(self):
"""Re-define this to define the reducer for a one-step job **as a shell
command.** If you define your reducer this way, the command will be
passed unchanged to Hadoop Streaming, with some minor exceptions. For
specifics, see :ref:`cmd-steps`.
Basic example::
def reducer_cmd(self):
return 'cat'
"""
raise NotImplementedError
def reducer_pre_filter(self):
"""Re-define this to specify a shell command to filter the reducer's
input before it gets to your job's reducer in a one-step job. For
important specifics, see :ref:`cmd-filters`.
Basic example::
def reducer_pre_filter(self):
return 'grep "ponies"'
"""
raise NotImplementedError
def combiner_init(self):
"""Re-define this to define an action to run before the combiner
processes any input.
One use for this function is to initialize combiner-specific helper
structures.
Yields one or more tuples of ``(out_key, out_value)``.
By default, ``out_key`` and ``out_value`` must be JSON-encodable;
re-define :py:attr:`INTERNAL_PROTOCOL` to change this.
"""
raise NotImplementedError
def combiner_final(self):
"""Re-define this to define an action to run after the combiner reaches
the end of input.
Yields one or more tuples of ``(out_key, out_value)``.
By default, ``out_key`` and ``out_value`` must be JSON-encodable;
re-define :py:attr:`INTERNAL_PROTOCOL` to change this.
"""
raise NotImplementedError
def combiner_cmd(self):
"""Re-define this to define the combiner for a one-step job **as a
shell command.** If you define your combiner this way, the command will
be passed unchanged to Hadoop Streaming, with some minor exceptions.
For specifics, see :ref:`cmd-steps`.
Basic example::
def combiner_cmd(self):
return 'cat'
"""
raise NotImplementedError
def combiner_pre_filter(self):
"""Re-define this to specify a shell command to filter the combiner's
input before it gets to your job's combiner in a one-step job. For
important specifics, see :ref:`cmd-filters`.
Basic example::
def combiner_pre_filter(self):
return 'grep "ponies"'
"""
raise NotImplementedError
### Defining one-step Spark jobs ###
def spark(self, input_path, output_path):
"""Re-define this with Spark code to run. You can read input
with *input_path* and output with *output_path*.
.. warning::
Prior to v0.6.8, to pass job methods into Spark
(``rdd.flatMap(self.some_method)``), you first had to call
:py:meth:`self.sandbox() <mrjob.job.MRJob.sandbox>`; otherwise
Spark would error because *self* was not serializable.
"""
raise NotImplementedError
def spark_args(self):
"""Redefine this to pass custom arguments to Spark."""
return []
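# A minimal sketch (hypothetical user code, not part of this module) of a
# one-step Spark job defined with spark(), as described above. It assumes
# pyspark is importable where the job runs.
#
#     class MRSparkWordcount(MRJob):
#
#         def spark(self, input_path, output_path):
#             from pyspark import SparkContext
#
#             sc = SparkContext(appName='mrjob wordcount')
#             sc.textFile(input_path) \
#                 .flatMap(lambda line: line.split()) \
#                 .map(lambda word: (word, 1)) \
#                 .reduceByKey(lambda a, b: a + b) \
#                 .saveAsTextFile(output_path)
#             sc.stop()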
### Defining multi-step jobs ###
def steps(self):
"""Re-define this to make a multi-step job.
If you don't re-define this, we'll automatically create a one-step
job using any of :py:meth:`mapper`, :py:meth:`mapper_init`,
:py:meth:`mapper_final`, :py:meth:`reducer_init`,
:py:meth:`reducer_final`, and :py:meth:`reducer` that you've
re-defined. For example::
def steps(self):
return [MRStep(mapper=self.transform_input,
reducer=self.consolidate_1),
MRStep(reducer_init=self.log_mapper_init,
reducer=self.consolidate_2)]
:return: a list of steps constructed with
:py:class:`~mrjob.step.MRStep` or other classes in
:py:mod:`mrjob.step`.
"""
# only include methods that have been redefined
kwargs = dict(
(func_name, getattr(self, func_name))
for func_name in _JOB_STEP_FUNC_PARAMS + ('spark',)
if (_im_func(getattr(self, func_name)) is not
_im_func(getattr(MRJob, func_name))))
# special case for spark()
# TODO: support jobconf as well
if 'spark' in kwargs:
if sorted(kwargs) != ['spark']:
raise ValueError(
"Can't mix spark() and streaming functions")
return [SparkStep(
spark=kwargs['spark'],
spark_args=self.spark_args())]
# MRStep takes commands as strings, but the user defines them in the
# class as functions that return strings, so call the functions.
updates = {}
for k, v in kwargs.items():
if k.endswith('_cmd') or k.endswith('_pre_filter'):
updates[k] = v()
kwargs.update(updates)
if kwargs:
return [MRStep(**kwargs)]
else:
return []
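# A minimal sketch (hypothetical user code, not part of this module) of a
# multi-step job built by re-defining steps(), as described above: the first
# step counts words, the second picks the most frequent one. Method names
# are illustrative.
#
#     from mrjob.job import MRJob
#     from mrjob.step import MRStep
#
#     class MRMostUsedWord(MRJob):
#
#         def steps(self):
#             return [
#                 MRStep(mapper=self.mapper_get_words,
#                        reducer=self.reducer_count_words),
#                 MRStep(reducer=self.reducer_find_max_word),
#             ]
#
#         def mapper_get_words(self, _, line):
#             for word in line.split():
#                 yield word.lower(), 1
#
#         def reducer_count_words(self, word, counts):
#             # emit everything under a single (None) key so the next
#             # step's reducer sees all (count, word) pairs together
#             yield None, (sum(counts), word)
#
#         def reducer_find_max_word(self, _, count_word_pairs):
#             # yields one (count, word) pair: the most-used word
#             yield max(count_word_pairs)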
def increment_counter(self, group, counter, amount=1):
"""Increment a counter in Hadoop streaming by printing to stderr.
:type group: str
:param group: counter group
:type counter: str
:param counter: description of the counter
:type amount: int
:param amount: how much to increment the counter by
Commas in ``counter`` or ``group`` will be automatically replaced
with semicolons (commas confuse Hadoop streaming).
"""
# don't allow people to pass in floats
if not isinstance(amount, integer_types):
raise TypeError('amount must be an integer, not %r' % (amount,))
# cast non-strings to strings (if people pass in exceptions, etc)
if not isinstance(group, string_types):
group = str(group)
if not isinstance(counter, string_types):
counter = str(counter)
# Extra commas screw up hadoop and there's no way to escape them. So
# replace them with the next best thing: semicolons!
#
# The relevant Hadoop code is incrCounter(), here:
# http://svn.apache.org/viewvc/hadoop/mapreduce/trunk/src/contrib/streaming/src/java/org/apache/hadoop/streaming/PipeMapRed.java?view=markup # noqa
group = group.replace(',', ';')
counter = counter.replace(',', ';')
line = 'reporter:counter:%s,%s,%d\n' % (group, counter, amount)
if not isinstance(line, bytes):
line = line.encode('utf_8')
self.stderr.write(line)
self.stderr.flush()
def set_status(self, msg):
"""Set the job status in hadoop streaming by printing to stderr.
This is also a good way of doing a keepalive for a job that goes a
long time between outputs; Hadoop streaming usually times out jobs
that give no output for longer than 10 minutes.
"""
line = 'reporter:status:%s\n' % (msg,)
if not isinstance(line, bytes):
line = line.encode('utf_8')
self.stderr.write(line)
self.stderr.flush()
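# A minimal sketch (hypothetical user code, not part of this module) of
# calling increment_counter() and set_status() from inside a task, as
# described above. The counter group and names are illustrative.
#
#     class MRSkipBlankLines(MRJob):
#
#         def mapper_init(self):
#             self.set_status('mapper started')
#
#         def mapper(self, _, line):
#             if not line.strip():
#                 self.increment_counter('quality', 'blank lines')
#             else:
#                 yield None, line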
### Running the job ###
@classmethod
def run(cls):
"""Entry point for running job from the command-line.
This is also the entry point when a mapper or reducer is run
by Hadoop Streaming.
Does one of:
* Run a mapper (:option:`--mapper`). See :py:meth:`run_mapper`
* Run a combiner (:option:`--combiner`). See :py:meth:`run_combiner`
* Run a reducer (:option:`--reducer`). See :py:meth:`run_reducer`
* Run the entire job. See :py:meth:`run_job`
"""
# load options from the command line
cls().execute()
def run_job(self):
"""Run the all steps of the job, logging errors (and debugging output
if :option:`--verbose` is specified) to STDERR and streaming the
output to STDOUT.
Called from :py:meth:`run`. You'd probably only want to call this
directly from automated tests.
"""
# self.stderr is strictly binary, need to wrap it so it's possible
# to log to it in Python 3
log_stream = codecs.getwriter('utf_8')(self.stderr)
self.set_up_logging(quiet=self.options.quiet,
verbose=self.options.verbose,
stream=log_stream)
with self.make_runner() as runner:
try:
runner.run()
except StepFailedException as e:
# no need for a runner stacktrace if step failed; runners will
# log more useful information anyway
log.error(str(e))
sys.exit(1)
if self._should_cat_output():
for chunk in runner.cat_output():
self.stdout.write(chunk)
self.stdout.flush()
@classmethod
def set_up_logging(cls, quiet=False, verbose=False, stream=None):
"""Set up logging when running from the command line. This is also
used by the various command-line utilities.
:param bool quiet: If true, don't log. Overrides *verbose*.
:param bool verbose: If true, set log level to ``DEBUG`` (default is
``INFO``)
:param bool stream: Stream to log to (default is ``sys.stderr``)
"""
if quiet:
log_to_null(name='mrjob')
log_to_null(name='__main__')
else:
log_to_stream(name='mrjob', debug=verbose, stream=stream)
log_to_stream(name='__main__', debug=verbose, stream=stream)
def _should_cat_output(self):
if self.options.cat_output is None:
return not self.options.output_dir
else:
return self.options.cat_output
def execute(self):
# MRJob does Hadoop Streaming stuff, or defers to its superclass
# (MRJobLauncher) if not otherwise instructed
if self.options.run_mapper:
self.run_mapper(self.options.step_num)
elif self.options.run_combiner:
self.run_combiner(self.options.step_num)
elif self.options.run_reducer:
self.run_reducer(self.options.step_num)
elif self.options.run_spark:
self.run_spark(self.options.step_num)
else:
self.run_job()
def make_runner(self):
"""Make a runner based on command-line arguments, so we can
launch this job on EMR, on Hadoop, or locally.
:rtype: :py:class:`mrjob.runner.MRJobRunner`
"""
bad_words = (
'--mapper', '--reducer', '--combiner', '--step-num', '--spark')
for w in bad_words:
if w in sys.argv:
raise UsageError("make_runner() was called with %s. This"
" probably means you tried to use it from"
" __main__, which doesn't work." % w)
runner_class = self._runner_class()
kwargs = self._runner_kwargs()
# screen out most false-ish args so that it's readable
log.debug('making runner: %s(%s, ...)' % (
runner_class.__name__,
', '.join('%s=%s' % (k, v)
for k, v in sorted(kwargs.items())
if v not in (None, [], {}))))
return runner_class(**kwargs)
def _runner_class(self):
"""Runner class as indicated by ``--runner``. Defaults to ``'inline'``.
"""
return _runner_class(self.options.runner or 'inline')
def _runner_kwargs(self):
"""If we're building an inline or Spark runner,
include mrjob_cls in kwargs."""
kwargs = combine_dicts(
self._non_option_kwargs(),
# don't screen out irrelevant opts (see #1898)
self._kwargs_from_switches(set(_RUNNER_OPTS)),
self._job_kwargs(),
)
if self._runner_class().alias in ('inline', 'spark'):
kwargs = dict(mrjob_cls=self.__class__, **kwargs)
# pass steps to runner (see #1845)
kwargs = dict(steps=self._steps_desc(), **kwargs)
return kwargs
def _get_step(self, step_num, expected_type):
"""Helper for run_* methods"""
steps = self.steps()
if not 0 <= step_num < len(steps):
raise ValueError('Out-of-range step: %d' % step_num)
step = steps[step_num]
if not isinstance(step, expected_type):
raise TypeError('Step %d is not a %s' % (step_num, expected_type.__name__))
return step
def run_mapper(self, step_num=0):
"""Run the mapper and final mapper action for the given step.
:type step_num: int
:param step_num: which step to run (0-indexed)
Called from :py:meth:`run`. You'd probably only want to call this
directly from automated tests.
"""
# pick input and output protocol
read_lines, write_line = self._wrap_protocols(step_num, 'mapper')
for k, v in self.map_pairs(read_lines(), step_num=step_num):
write_line(k, v)
def run_combiner(self, step_num=0):
"""Run the combiner for the given step.
:type step_num: int
:param step_num: which step to run (0-indexed)
If we encounter a line that can't be decoded by our input protocol,
or a tuple that can't be encoded by our output protocol, we'll
increment a counter rather than raising an exception. If
--strict-protocols is set, an exception is raised instead.
Called from :py:meth:`run`. You'd probably only want to call this
directly from automated tests.
"""
# pick input and output protocol
read_lines, write_line = self._wrap_protocols(step_num, 'combiner')
for k, v in self.combine_pairs(read_lines(), step_num=step_num):
write_line(k, v)
def run_reducer(self, step_num=0):
"""Run the reducer for the given step.
:type step_num: int
:param step_num: which step to run (0-indexed)
Called from :py:meth:`run`. You'd probably only want to call this
directly from automated tests.
"""
# pick input and output protocol
read_lines, write_line = self._wrap_protocols(step_num, 'reducer')
for k, v in self.reduce_pairs(read_lines(), step_num=step_num):
write_line(k, v)
def map_pairs(self, pairs, step_num=0):
"""Runs :py:meth:`mapper_init`,
:py:meth:`mapper`/:py:meth:`mapper_raw`, and :py:meth:`mapper_final`
for one map task in one step.
Takes in a sequence of (key, value) pairs as input, and yields
(key, value) pairs as output.
:py:meth:`run_mapper` essentially wraps this method with code to handle
reading/decoding input and writing/encoding output.
.. versionadded:: 0.6.7
"""
step = self._get_step(step_num, MRStep)
mapper = step['mapper']
mapper_raw = step['mapper_raw']
mapper_init = step['mapper_init']
mapper_final = step['mapper_final']
if mapper_init:
for k, v in mapper_init() or ():
yield k, v
if mapper_raw:
if len(self.options.args) != 2:
raise ValueError('Wrong number of args')
input_path, input_uri = self.options.args
for k, v in mapper_raw(input_path, input_uri) or ():
yield k, v
else:
for key, value in pairs:
for k, v in mapper(key, value) or ():
yield k, v
if mapper_final:
for k, v in mapper_final() or ():
yield k, v
def combine_pairs(self, pairs, step_num=0):
"""Runs :py:meth:`combiner_init`,
:py:meth:`combiner`, and :py:meth:`combiner_final`
for one reduce task in one step.
Takes in a sequence of (key, value) pairs as input, and yields
(key, value) pairs as output.
:py:meth:`run_combiner` essentially wraps this method with code to
handle reading/decoding input and writing/encoding output.
.. versionadded:: 0.6.7
"""
for k, v in self._combine_or_reduce_pairs(pairs, 'combiner', step_num):
yield k, v
def reduce_pairs(self, pairs, step_num=0):
"""Runs :py:meth:`reducer_init`,
:py:meth:`reducer`, and :py:meth:`reducer_final`
for one reduce task in one step.
Takes in a sequence of (key, value) pairs as input, and yields
(key, value) pairs as output.
:py:meth:`run_reducer` essentially wraps this method with code to
handle reading/decoding input and writing/encoding output.
.. versionadded:: 0.6.7
"""
for k, v in self._combine_or_reduce_pairs(pairs, 'reducer', step_num):
yield k, v
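# A minimal sketch (hypothetical test code, not part of this module) of
# calling map_pairs()/reduce_pairs() directly in automated tests, bypassing
# protocol encoding. MRWordFreqCount is an assumed word-count job whose
# mapper yields (word, 1) and whose reducer sums the counts.
#
#     job = MRWordFreqCount(args=[])
#     assert sorted(job.map_pairs([(None, 'a b a')])) == \
#         [('a', 1), ('a', 1), ('b', 1)]
#     assert list(job.reduce_pairs([('a', 2), ('a', 1)])) == [('a', 3)]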
def _combine_or_reduce_pairs(self, pairs, mrc, step_num=0):
"""Helper for :py:meth:`combine_pairs` and :py:meth:`reduce_pairs`."""
step = self._get_step(step_num, MRStep)
task = step[mrc]
task_init = step[mrc + '_init']
task_final = step[mrc + '_final']
if task is None:
raise ValueError('No %s in step %d' % (mrc, step_num))
if task_init:
for k, v in task_init() or ():
yield k, v
# group all values of the same key together, and pass to the reducer
#
# be careful to use generators for everything, to allow for
# very large groupings of values
for key, pairs_for_key in itertools.groupby(pairs, lambda k_v: k_v[0]):
values = (value for _, value in pairs_for_key)
for k, v in task(key, values) or ():
yield k, v
if task_final:
for k, v in task_final() or ():
yield k, v
def run_spark(self, step_num):
"""Run the Spark code for the given step.
:type step_num: int
:param step_num: which step to run (0-indexed)
Called from :py:meth:`run`. You'd probably only want to call this
directly from automated tests.
"""
step = self._get_step(step_num, SparkStep)
if len(self.options.args) != 2:
raise ValueError('Wrong number of args')
input_path, output_path = self.options.args
spark_method = step.spark
spark_method(input_path, output_path)
def _steps_desc(self):
step_descs = []
for step_num, step in enumerate(self.steps()):
step_descs.append(step.description(step_num))
return step_descs
@classmethod
def mr_job_script(cls):
"""Path of this script. This returns the file containing
this class, or ``None`` if there isn't any (e.g. it was
defined from the command line interface.)"""
try:
return inspect.getsourcefile(cls)
except TypeError:
return None
### Other useful utilities ###
def _read_input(self):
"""Read from stdin, or one more files, or directories.
Yield one line at time.
- Resolve globs (``foo_*.gz``).
- Decompress ``.gz`` and ``.bz2`` files.
- If path is ``-``, read from STDIN.
- Recursively read all files in a directory
"""
paths = self.options.args or ['-']
for path in paths:
if path == '-':
for line in self.stdin:
yield line
else:
with open(path, 'rb') as f:
for line in to_lines(decompress(f, path)):
yield line
def _wrap_protocols(self, step_num, step_type):
"""Pick the protocol classes to use for reading and writing
for the given step.
Returns a tuple of ``(read_lines, write_line)``
``read_lines()`` is a function that reads lines from input, decodes
them, and yields key, value pairs.
``write_line()`` is a function that takes key and value as args,
encodes them, and writes a line to output.
:param step_num: which step to run (e.g. 0)
:param step_type: ``'mapper'``, ``'reducer'``, or ``'combiner'`` from
:py:mod:`mrjob.step`
"""
read, write = self.pick_protocols(step_num, step_type)
def read_lines():
for line in self._read_input():
key, value = read(line.rstrip(b'\r\n'))
yield key, value
def write_line(key, value):
self.stdout.write(write(key, value))
self.stdout.write(b'\n')
return read_lines, write_line
def _step_key(self, step_num, step_type):
return '%d-%s' % (step_num, step_type)
def _script_step_mapping(self, steps_desc):
"""Return a mapping of ``self._step_key(step_num, step_type)`` ->
(place in sort order of all *script* steps), for the purposes of
choosing which protocols to use for input and output.
Non-script steps do not appear in the mapping.
"""
mapping = {}
script_step_num = 0
for i, step in enumerate(steps_desc):
if 'mapper' in step and step['mapper']['type'] == 'script':
k = self._step_key(i, 'mapper')
mapping[k] = script_step_num
script_step_num += 1
if 'reducer' in step and step['reducer']['type'] == 'script':
k = self._step_key(i, 'reducer')
mapping[k] = script_step_num
script_step_num += 1
return mapping
def _mapper_output_protocol(self, step_num, step_map):
map_key = self._step_key(step_num, 'mapper')
if map_key in step_map:
if step_map[map_key] >= (len(step_map) - 1):
return self.output_protocol()
else:
return self.internal_protocol()
else:
# mapper is not a script substep, so protocols don't apply at all
return RawValueProtocol()
def _pick_protocol_instances(self, step_num, step_type):
steps_desc = self._steps_desc()
step_map = self._script_step_mapping(steps_desc)
# pick input protocol
if step_type == 'combiner':
# Combiners read and write the mapper's output protocol because
# they have to be able to run 0-inf times without changing the
# format of the data.
# Combiners for non-script substeps can't use protocols, so this
# function will just give us RawValueProtocol() in that case.
previous_mapper_output = self._mapper_output_protocol(
step_num, step_map)
return previous_mapper_output, previous_mapper_output
else:
step_key = self._step_key(step_num, step_type)
if step_key not in step_map:
raise ValueError(
"Can't pick a protocol for a non-script step")
real_num = step_map[step_key]
if real_num == (len(step_map) - 1):
write = self.output_protocol()
else:
write = self.internal_protocol()
if real_num == 0:
read = self.input_protocol()
else:
read = self.internal_protocol()
return read, write
def pick_protocols(self, step_num, step_type):
"""Pick the protocol classes to use for reading and writing for the
given step.
:type step_num: int
:param step_num: which step to run (e.g. ``0`` for the first step)
:type step_type: str
:param step_type: one of `'mapper'`, `'combiner'`, or `'reducer'`
:return: (read_function, write_function)
By default, we use one protocol for reading input, one
internal protocol for communication between steps, and one
protocol for final output (which is usually the same as the
internal protocol). Protocols can be controlled by setting
:py:attr:`INPUT_PROTOCOL`, :py:attr:`INTERNAL_PROTOCOL`, and
:py:attr:`OUTPUT_PROTOCOL`.
Re-define this if you need fine control over which protocols
are used by which steps.
"""
# wrapping functionality like this makes testing much simpler
p_read, p_write = self._pick_protocol_instances(step_num, step_type)
return p_read.read, p_write.write
### Command-line arguments ###
def configure_args(self):
"""Define arguments for this script. Called from :py:meth:`__init__()`.
Re-define to define custom command-line arguments or pass
through existing ones::
def configure_args(self):
super(MRYourJob, self).configure_args()
self.add_passthru_arg(...)
self.add_file_arg(...)
self.pass_arg_through(...)
...
"""
self.arg_parser.add_argument(
dest='args', nargs='*',
help=('input paths to read (or stdin if not set). If --spark'
' is set, the input and output path for the spark job.'))
_add_basic_args(self.arg_parser)
_add_job_args(self.arg_parser)
_add_runner_args(self.arg_parser)
_add_step_args(self.arg_parser, include_deprecated=True)
def load_args(self, args):
"""Load command-line options into ``self.options``.
Called from :py:meth:`__init__()` after :py:meth:`configure_args`.
:type args: list of str
:param args: a list of command line arguments. ``None`` will be
treated the same as ``[]``.
Re-define if you want to post-process command-line arguments::
def load_args(self, args):
super(MRYourJob, self).load_args(args)
self.stop_words = self.options.stop_words.split(',')
...
"""
if hasattr(self.arg_parser, 'parse_intermixed_args'):
# restore old optparse behavior on Python 3.7+. See #1701
self.options = self.arg_parser.parse_intermixed_args(args)
else:
self.options = self.arg_parser.parse_args(args)
if self.options.help:
self._print_help(self.options)
sys.exit(0)
def add_file_arg(self, *args, **kwargs):
"""Add a command-line option that sends an external file
(e.g. a SQLite DB) to Hadoop::
def configure_args(self):
super(MRYourJob, self).configure_args()
self.add_file_arg('--scoring-db', help=...)
This does the right thing: the file will be uploaded to the working
dir of the script on Hadoop, and the script will be passed the same
option, but with the local name of the file in the script's working
directory.
.. note::
If you pass a file to a job, best practice is to lazy-load its
contents (e.g. make a method that opens the file the first time
you call it) rather than loading it in your job's constructor or
:py:meth:`load_args`. Not only is this more efficient, it's
necessary if you want to run your job in a Spark executor
(because the file may not be in the same place in a Spark driver).
.. note::
We suggest against sending Berkeley DBs to your job, as
Berkeley DB is not forwards-compatible (so a Berkeley DB that you
construct on your computer may not be readable from within
Hadoop). Use SQLite databases instead. If all you need is an on-disk
hash table, try out the :py:mod:`sqlite3dbm` module.
.. versionchanged:: 0.6.6
now accepts explicit ``type=str``
.. versionchanged:: 0.6.8
fully supported on Spark, including ``local[*]`` master
"""
if kwargs.get('type') not in (None, str):
raise ArgumentTypeError(
'file options must take strings')
if kwargs.get('action') not in (None, 'append', 'store'):
raise ArgumentTypeError(
"file options must use the actions 'store' or 'append'")
pass_opt = self.arg_parser.add_argument(*args, **kwargs)
self._file_arg_dests.add(pass_opt.dest)
def add_passthru_arg(self, *args, **kwargs):
"""Function to create options which both the job runner
and the job itself respect (we use this for protocols, for example).
Use it like you would use
:py:func:`argparse.ArgumentParser.add_argument`::
def configure_args(self):
super(MRYourJob, self).configure_args()
self.add_passthru_arg(
'--max-ngram-size', type=int, default=4, help='...')
If you want to pass files through to the mapper/reducer, use
:py:meth:`add_file_arg` instead.
If you want to pass through a built-in option (e.g. ``--runner``), use
:py:meth:`pass_arg_through` instead.
"""
pass_opt = self.arg_parser.add_argument(*args, **kwargs)
self._passthru_arg_dests.add(pass_opt.dest)
def pass_arg_through(self, opt_str):
"""Pass the given argument through to the job."""
# _actions is hidden but the interface appears to be stable,
# and there's no non-hidden interface we can use
for action in self.arg_parser._actions:
if opt_str in action.option_strings or opt_str == action.dest:
self._passthru_arg_dests.add(action.dest)
break
else:
raise ValueError('unknown arg: %s' % opt_str)
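# A minimal sketch (hypothetical user code, not part of this module) of the
# custom-argument hooks described above: add_passthru_arg(), add_file_arg(),
# pass_arg_through(), and post-processing in load_args(). The option names
# are illustrative.
#
#     class MRScoreRecords(MRJob):
#
#         def configure_args(self):
#             super(MRScoreRecords, self).configure_args()
#             self.add_passthru_arg(
#                 '--max-ngram-size', type=int, default=4)
#             self.add_file_arg('--scoring-db')
#             self.pass_arg_through('--runner')
#
#         def load_args(self, args):
#             super(MRScoreRecords, self).load_args(args)
#             # keep --scoring-db lazy; open it in mapper_init(), not here
#             self.max_ngram_size = self.options.max_ngram_size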
def is_task(self):
"""True if this is a mapper, combiner, reducer, or Spark script.
This is mostly useful inside :py:meth:`load_args`, to disable
loading args when we aren't running inside Hadoop.
"""
return (self.options.run_mapper or
self.options.run_combiner or
self.options.run_reducer or
self.options.run_spark)
### protocols ###
def input_protocol(self):
"""Instance of the protocol to use to convert input lines to Python
objects. Default behavior is to return an instance of
:py:attr:`INPUT_PROTOCOL`.
"""
if not isinstance(self.INPUT_PROTOCOL, type):
log.warning('INPUT_PROTOCOL should be a class, not %s' %
self.INPUT_PROTOCOL)
return self.INPUT_PROTOCOL()
def internal_protocol(self):
"""Instance of the protocol to use to communicate between steps.
Default behavior is to return an instance of
:py:attr:`INTERNAL_PROTOCOL`.
"""
if not isinstance(self.INTERNAL_PROTOCOL, type):
log.warning('INTERNAL_PROTOCOL should be a class, not %s' %
self.INTERNAL_PROTOCOL)
return self.INTERNAL_PROTOCOL()
def output_protocol(self):
"""Instance of the protocol to use to convert Python objects to output
lines. Default behavior is to return an instance of
:py:attr:`OUTPUT_PROTOCOL`.
"""
if not isinstance(self.OUTPUT_PROTOCOL, type):
log.warning('OUTPUT_PROTOCOL should be a class, not %s' %
self.OUTPUT_PROTOCOL)
return self.OUTPUT_PROTOCOL()
#: Protocol for reading input to the first mapper in your job.
#: Default: :py:class:`RawValueProtocol`.
#:
#: For example, if you know your input data is in JSON format, you could
#: set::
#:
#: INPUT_PROTOCOL = JSONValueProtocol
#:
#: in your class, and your initial mapper would receive decoded JSONs
#: rather than strings.
#:
#: See :py:data:`mrjob.protocol` for the full list of protocols.
INPUT_PROTOCOL = RawValueProtocol
#: Protocol for communication between steps.
#: Default: :py:class:`JSONProtocol`.
#:
#: For example, if your step output weren't JSON-encodable, you could set::
#:
#: INTERNAL_PROTOCOL = PickleProtocol
#:
#: and step output would be encoded as string-escaped pickles.
#:
#: See :py:data:`mrjob.protocol` for the full list of protocols.
INTERNAL_PROTOCOL = JSONProtocol
#: Protocol to use for writing output. Default: :py:class:`JSONProtocol`.
#:
#: For example, if you wanted the final output in repr, you could set::
#:
#: OUTPUT_PROTOCOL = ReprProtocol
#:
#: See :py:data:`mrjob.protocol` for the full list of protocols.
OUTPUT_PROTOCOL = JSONProtocol
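# A minimal sketch (hypothetical user code, not part of this module) of
# overriding the protocol class attributes documented above, so that each
# input line is decoded from JSON and output is JSON-encoded.
#
#     from mrjob.job import MRJob
#     from mrjob.protocol import JSONProtocol, JSONValueProtocol
#
#     class MRSummarizeEvents(MRJob):
#
#         INPUT_PROTOCOL = JSONValueProtocol  # each line -> (None, decoded JSON)
#         INTERNAL_PROTOCOL = JSONProtocol
#         OUTPUT_PROTOCOL = JSONProtocol
#
#         def mapper(self, _, event):
#             yield event['type'], 1
#
#         def reducer(self, event_type, counts):
#             yield event_type, sum(counts)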
def parse_output(self, chunks):
"""Parse the final output of this MRJob (as a stream of byte chunks)
into a stream of ``(key, value)`` pairs.
"""
read = self.output_protocol().read
for line in to_lines(chunks):
yield read(line)
### Hadoop Input/Output Formats ###
#: Optional name of a Hadoop ``InputFormat`` class, e.g.
#: ``'org.apache.hadoop.mapred.lib.NLineInputFormat'``.
#:
#: Passed to Hadoop for the *first* step of this job, using the
#: ``-inputformat`` option.
#:
#: If you require more sophisticated behavior, try
#: :py:meth:`hadoop_input_format` or the *hadoop_input_format* argument to
#: :py:meth:`mrjob.runner.MRJobRunner.__init__`.
HADOOP_INPUT_FORMAT = None
def hadoop_input_format(self):
"""Optional Hadoop ``InputFormat`` class to parse input for
the first step of the job.
Normally, setting :py:attr:`HADOOP_INPUT_FORMAT` is sufficient;
redefining this method is only for when you want to get fancy.
"""
return self.HADOOP_INPUT_FORMAT
#: Optional name of a Hadoop ``OutputFormat`` class, e.g.
#: ``'org.apache.hadoop.mapred.FileOutputFormat'``.
#:
#: Passed to Hadoop for the *last* step of this job, using the
#: ``-outputformat`` option.
#:
#: If you require more sophisticated behavior, try
#: :py:meth:`hadoop_output_format` or the *hadoop_output_format* argument
#: to :py:meth:`mrjob.runner.MRJobRunner.__init__`.
HADOOP_OUTPUT_FORMAT = None
def hadoop_output_format(self):
"""Optional Hadoop ``OutputFormat`` class to write output for
the last step of the job.
Normally, setting :py:attr:`HADOOP_OUTPUT_FORMAT` is sufficient;
redefining this method is only for when you want to get fancy.
"""
return self.HADOOP_OUTPUT_FORMAT
### Libjars ###
#: Optional list of paths of jar files to run our job with using Hadoop's
#: ``-libjars`` option.
#:
#: ``~`` and environment variables
#: in paths will be expanded, and relative paths will be interpreted as
#: relative to the directory containing the script (not the current
#: working directory).
#:
#: If you require more sophisticated behavior, try overriding
#: :py:meth:`libjars`.
LIBJARS = []
def libjars(self):
"""Optional list of paths of jar files to run our job with using
Hadoop's ``-libjars`` option. Normally setting :py:attr:`LIBJARS`
is sufficient. Paths from :py:attr:`LIBJARS` are interpreted as
relative to the directory containing the script (paths from the
command-line are relative to the current working directory).
Note that ``~`` and environment variables in paths will always be
expanded by the job runner (see :mrjob-opt:`libjars`).
.. versionchanged:: 0.6.6
re-defining this no longer clobbers the command-line
``--libjars`` option
"""
script_dir = os.path.dirname(self.mr_job_script())
paths = []
# libjar paths will eventually be combined with combine_path_lists,
# which will expand environment variables. We don't want to assume
# a path like $MY_DIR/some.jar is always relative ($MY_DIR could start
# with /), but we also don't want to expand environment variables
# prematurely.
for path in self.LIBJARS or []:
if os.path.isabs(expand_path(path)):
paths.append(path)
else:
paths.append(os.path.join(script_dir, path))
return paths
### Partitioning ###
#: Optional Hadoop partitioner class to use to determine how mapper
#: output should be sorted and distributed to reducers. For example:
#: ``'org.apache.hadoop.mapred.lib.HashPartitioner'``.
#:
#: If you require more sophisticated behavior, try :py:meth:`partitioner`.
PARTITIONER = None
def partitioner(self):
"""Optional Hadoop partitioner class to use to determine how mapper
output should be sorted and distributed to reducers.
By default, returns :py:attr:`PARTITIONER`.
You probably don't need to re-define this; it's just here for
completeness.
"""
return self.PARTITIONER
### Uploading support files ###
#: Optional list of archives to upload and unpack in the job's working
#: directory. These can be URIs or paths on the local filesystem.
#:
#: Relative paths will be interpreted as relative to the directory
#: containing the script (not the current working directory).
#:
#: Environment variables and ``~`` in paths will be expanded.
#:
#: By default, the directory will have the same name as the archive
#: (e.g. ``foo.tar.gz/``). To change the directory's name, append
#: ``#<name>``::
#:
#: ARCHIVES = ['data/foo.tar.gz#foo']
#:
#: If you need to dynamically generate a list of files, override
#: :py:meth:`archives` instead.
#:
#: .. versionadded:: 0.6.4
ARCHIVES = []
#: Optional list of directories to upload to the job's working directory.
#: These can be URIs or paths on the local filesystem.
#:
#: Relative paths will be interpreted as relative to the directory
#: containing the script (not the current working directory).
#:
#: Environment variables and ``~`` in paths will be expanded.
#:
#: If you want a directory to be copied with a name other than its own,
#: append ``#<name>`` (e.g. ``data/foo#bar``).
#:
#: If you need to dynamically generate a list of files, override
#: :py:meth:`dirs` instead.
#:
#: .. versionadded:: 0.6.4
DIRS = []
#: Optional list of files to upload to the job's working directory.
#: These can be URIs or paths on the local filesystem.
#:
#: Relative paths will be interpreted as relative to the directory
#: containing the script (not the current working directory).
#:
#: Environment variables and ``~`` in paths will be expanded.
#:
#: If you want a file to be uploaded to a filename other than its own,
#: append ``#<name>`` (e.g. ``data/foo.json#bar.json``).
#:
#: If you need to dynamically generate a list of files, override
#: :py:meth:`files` instead.
#:
#: .. versionadded:: 0.6.4
FILES = []
def archives(self):
"""Like :py:attr:`ARCHIVES`, except that it can return a dynamically
generated list of archives to upload and unpack. Overriding
this method disables :py:attr:`ARCHIVES`.
Paths returned by this method are relative to the working directory
(not the script). Note that the job runner will *always* expand
environment variables and ``~`` in paths returned by this method.
You do not have to worry about inadvertently disabling ``--archives``;
this switch is handled separately.
.. versionadded:: 0.6.4
"""
return self._upload_attr('ARCHIVES')
def dirs(self):
"""Like :py:attr:`DIRS`, except that it can return a dynamically
generated list of directories to upload. Overriding
this method disables :py:attr:`DIRS`.
Paths returned by this method are relative to the working directory
(not the script). Note that the job runner will *always* expand
environment variables and ``~`` in paths returned by this method.
You do not have to worry about inadvertently disabling ``--dirs``;
this switch is handled separately.
.. versionadded:: 0.6.4
"""
return self._upload_attr('DIRS')
def files(self):
"""Like :py:attr:`FILES`, except that it can return a dynamically
generated list of files to upload. Overriding
this method disables :py:attr:`FILES`.
Paths returned by this method are relative to the working directory
(not the script). Note that the job runner will *always* expand
environment variables and ``~`` in paths returned by this method.
You do not have to worry about inadvertently disabling ``--files``;
this switch is handled separately.
.. versionadded:: 0.6.4
"""
return self._upload_attr('FILES')
def _upload_attr(self, attr_name):
"""Helper for :py:meth:`archives`, :py:meth:`dirs`, and
:py:meth:`files`"""
attr_value = getattr(self, attr_name)
# catch path instead of a list of paths
if isinstance(attr_value, string_types):
raise TypeError('%s must be a list or other sequence.' % attr_name)
script_dir = os.path.dirname(self.mr_job_script())
paths = []
for path in attr_value:
expanded_path = expand_path(path)
if os.path.isabs(expanded_path):
paths.append(path)
else:
# relative subdirs are confusing; people will expect them
# to appear in a subdir, not the same directory as the script,
# but Hadoop doesn't work that way
if os.sep in path.rstrip(os.sep) and '#' not in path:
log.warning(
'%s: %s will appear in same directory as job script,'
' not a subdirectory' % (attr_name, path))
paths.append(os.path.join(script_dir, path))
return paths
### Jobconf ###
#: Optional jobconf arguments we should always pass to Hadoop. This
#: is a map from property name to value. e.g.:
#:
#: ``{'stream.num.map.output.key.fields': '4'}``
#:
#: It's recommended that you only use this to hard-code things that
#: affect the semantics of your job, and leave performance tweaks to
#: the command line or whatever you use to launch your job.
JOBCONF = {}
def jobconf(self):
"""``-D`` args to pass to hadoop streaming. This should be a map
from property name to value. By default, returns :py:attr:`JOBCONF`.
.. versionchanged:: 0.6.6
re-defining this no longer clobbers command-line
``--jobconf`` options.
"""
return dict(self.JOBCONF)
### Secondary Sort ###
#: Set this to ``True`` if you would like reducers to receive the values
#: associated with any key in sorted order (sorted by their *encoded*
#: value). Also known as secondary sort.
#:
#: This can be useful if you expect more values than you can fit in memory
#: to be associated with one key, but you want to apply information in
#: a small subset of these values to information in the other values.
#: For example, you may want to convert counts to percentages, and to do
#: this you first need to know the total count.
#:
#: Even though values are sorted by their encoded value, most encodings
#: will sort strings in order. For example, you could have values like:
#: ``['A', <total>]``, ``['B', <count_name>, <count>]``, and the value
#: containing the total should come first regardless of what protocol
#: you're using.
#:
#: See :py:meth:`jobconf()` and :py:meth:`partitioner()` for more about
#: how secondary sort is implemented.
SORT_VALUES = None
def sort_values(self):
"""A method that by default, just returns the value of
:py:attr:`SORT_VALUES`. Mostly exists for the sake
of consistency, but you could override it if you wanted to make
secondary sort configurable."""
return self.SORT_VALUES
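# A minimal sketch (hypothetical user code, not part of this module) of the
# secondary-sort pattern described above. It assumes an earlier step emits,
# for each key, one ['A', total] value plus many ['B', name, count] values;
# with SORT_VALUES = True the 'A' value sorts first, so the reducer can turn
# counts into percentages in a single pass.
#
#     class MRPercentages(MRJob):
#
#         SORT_VALUES = True
#
#         def reducer(self, key, values):
#             total = None
#             for value in values:
#                 if value[0] == 'A':
#                     total = value[1]
#                 else:
#                     _, name, count = value
#                     yield (key, name), 100.0 * count / total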
### Testing ###
def sandbox(self, stdin=None, stdout=None, stderr=None):
"""Redirect stdin, stdout, and stderr for automated testing.
You can set stdin, stdout, and stderr to file objects. By
default, they'll be set to empty ``BytesIO`` objects.
You can then access the job's file handles through ``self.stdin``,
``self.stdout``, and ``self.stderr``. See :ref:`testing` for more
information about testing.
You may call sandbox multiple times (this will essentially clear
the file handles).
``stdin`` is empty by default. You can set it to anything that yields
lines::
mr_job.sandbox(stdin=BytesIO(b'some_data\\n'))
or, equivalently::
mr_job.sandbox(stdin=[b'some_data\\n'])
For convenience, this sandbox() returns self, so you can do::
mr_job = MRJobClassToTest().sandbox()
Simple testing example::
mr_job = MRYourJob().sandbox()
self.assertEqual(list(mr_job.reducer('foo', ['a', 'b'])), [...])
More complex testing example::
from io import BytesIO
from mrjob.parse import parse_mr_job_stderr
from mrjob.protocol import JSONProtocol
mr_job = MRYourJob(args=[...])
fake_input = b'"foo"\\t"bar"\\n"foo"\\t"baz"\\n'
mr_job.sandbox(stdin=BytesIO(fake_input))
mr_job.run_reducer(step_num=0)
self.assertEqual(mr_job.stdout.getvalue(), ...)
self.assertEqual(parse_mr_job_stderr(mr_job.stderr), ...)
.. note::
If you are using Spark, it's recommended you only pass in
:py:class:`io.BytesIO` or other serializable alternatives to file
objects. *stdin*, *stdout*, and *stderr* get stored as job
attributes, which means if they aren't serializable, neither
is the job instance or its methods.
"""
self._stdin = stdin or BytesIO()
self._stdout = stdout or BytesIO()
self._stderr = stderr or BytesIO()
return self
if __name__ == '__main__':
MRJob.run()