# Copyright 2009-2017 Yelp and Contributors
# Copyright 2018 Yelp
# Copyright 2019 Yelp
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Class to inherit your MapReduce jobs from. See :doc:`guides/writing-mrjobs`
for more information."""
# don't add imports here that aren't part of the standard Python library,
# since MRJobs need to run in Amazon's generic EMR environment
import codecs
import inspect
import itertools
import logging
import os
import os.path
import sys
import time
from io import BytesIO
from argparse import ArgumentParser
from argparse import ArgumentTypeError
# don't use relative imports, to allow this script to be invoked as __main__
from mrjob.cat import decompress
from mrjob.conf import combine_dicts
from mrjob.conf import combine_lists
from mrjob.options import _add_basic_args
from mrjob.options import _add_job_args
from mrjob.options import _add_runner_args
from mrjob.options import _add_step_args
from mrjob.options import _parse_raw_args
from mrjob.options import _print_basic_help
from mrjob.options import _print_help_for_runner
from mrjob.options import _RUNNER_OPTS
from mrjob.protocol import JSONProtocol
from mrjob.protocol import RawValueProtocol
from mrjob.py2 import integer_types
from mrjob.py2 import string_types
from mrjob.runner import _runner_class
from mrjob.setup import parse_legacy_hash_path
from mrjob.step import _JOB_STEP_FUNC_PARAMS
from mrjob.step import MRStep
from mrjob.step import SparkStep
from mrjob.step import StepFailedException
from mrjob.util import expand_path
from mrjob.util import log_to_null
from mrjob.util import log_to_stream
from mrjob.util import to_lines
log = logging.getLogger(__name__)
# sentinel value; used when running MRJob as a script
_READ_ARGS_FROM_SYS_ARGV = '_READ_ARGS_FROM_SYS_ARGV'
class UsageError(Exception):
pass
def _im_func(f):
"""Wrapper to get at the underlying function belonging to a method.
Python 2 is slightly different because classes have "unbound methods"
which wrap the underlying function, whereas on Python 3 they're just
functions. (Methods work the same way on both versions.)
"""
# "im_func" is the old Python 2 name for __func__
if hasattr(f, '__func__'):
return f.__func__
else:
return f
class MRJob(object):
"""The base class for all MapReduce jobs. See :py:meth:`__init__`
for details."""
def __init__(self, args=None):
"""Entry point for running your job from other Python code.
You can pass in command-line arguments, and the job will act the same
way it would if it were run from the command line. For example, to
run your job on EMR::
mr_job = MRYourJob(args=['-r', 'emr'])
with mr_job.make_runner() as runner:
...
Passing in ``None`` is the same as passing in ``sys.argv[1:]``.
For a full list of command-line arguments, run:
``python -m mrjob.job --help``
:param args: Arguments to your script (switches and input files)
.. versionchanged:: 0.7.0
Previously, *args* set to ``None`` was equivalent to ``[]``.
"""
# make sure we respect the $TZ (time zone) environment variable
if hasattr(time, 'tzset'):
time.tzset()
# argument dests for args to pass through
self._passthru_arg_dests = set()
self._file_arg_dests = set()
self.arg_parser = ArgumentParser(usage=self._usage(),
add_help=False)
self.configure_args()
if args is None:
self._cl_args = sys.argv[1:]
else:
# don't pass sys.argv to self.arg_parser, and have it
# raise an exception on error rather than printing to stderr
# and exiting.
self._cl_args = args
def error(msg):
raise ValueError(msg)
self.arg_parser.error = error
self.load_args(self._cl_args)
# Make it possible to redirect stdin, stdout, and stderr, for testing
# See stdin, stdout, stderr properties and sandbox(), below.
self._stdin = None
self._stdout = None
self._stderr = None
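# A minimal sketch (hypothetical user code, not part of this module) of the
# programmatic-invocation pattern described in __init__'s docstring:
# construct the job with explicit args, run it through a runner, and decode
# the output with parse_output(). MRYourJob and the input paths are
# illustrative.
#
#     mr_job = MRYourJob(args=['-r', 'emr', 'input1.txt', 'input2.txt'])
#     with mr_job.make_runner() as runner:
#         runner.run()
#         for key, value in mr_job.parse_output(runner.cat_output()):
#             print(key, value)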
# by default, self.stdin, self.stdout, and self.stderr are sys.std*.buffer
# if it exists, and sys.std* otherwise (they should always deal
# with bytes, not Unicode).
#
# *buffer* is pretty much a Python 3 thing, though some platforms
# (notably Jupyterhub) don't have it. See #1441
@property
def stdin(self):
return self._stdin or getattr(sys.stdin, 'buffer', sys.stdin)
@property
def stdout(self):
return self._stdout or getattr(sys.stdout, 'buffer', sys.stdout)
@property
def stderr(self):
return self._stderr or getattr(sys.stderr, 'buffer', sys.stderr)
def _usage(self):
return "%(prog)s [options] [input files]"
def _print_help(self, options):
"""Print help for this job. This will either print runner
or basic help. Override to allow other kinds of help."""
if options.runner:
_print_help_for_runner(
self._runner_opt_names_for_help(), options.deprecated)
else:
_print_basic_help(self.arg_parser,
self._usage(),
options.deprecated,
options.verbose)
def _runner_opt_names_for_help(self):
opts = set(self._runner_class().OPT_NAMES)
if self.options.runner == 'spark':
# specific to Spark runner, but command-line only, so it doesn't
# appear in SparkMRJobRunner.OPT_NAMES (see #2040)
opts.add('max_output_files')
return opts
def _non_option_kwargs(self):
"""Keyword arguments to runner constructor that can't be set
in mrjob.conf.
These should match the (named) arguments to
:py:meth:`~mrjob.runner.MRJobRunner.__init__`.
"""
# build extra_args
raw_args = _parse_raw_args(self.arg_parser, self._cl_args)
extra_args = []
for dest, option_string, args in raw_args:
if dest in self._file_arg_dests:
extra_args.append(option_string)
extra_args.append(parse_legacy_hash_path('file', args[0]))
elif dest in self._passthru_arg_dests:
# special case for --hadoop-args=-verbose etc.
if (option_string and len(args) == 1 and
args[0].startswith('-')):
extra_args.append('%s=%s' % (option_string, args[0]))
else:
if option_string:
extra_args.append(option_string)
extra_args.extend(args)
# max_output_files is added by _add_runner_args() but can only
# be set from the command line, so we add it here (see #2040)
return dict(
conf_paths=self.options.conf_paths,
extra_args=extra_args,
hadoop_input_format=self.hadoop_input_format(),
hadoop_output_format=self.hadoop_output_format(),
input_paths=self.options.args,
max_output_files=self.options.max_output_files,
mr_job_script=self.mr_job_script(),
output_dir=self.options.output_dir,
partitioner=self.partitioner(),
stdin=self.stdin,
step_output_dir=self.options.step_output_dir,
)
def _kwargs_from_switches(self, keys):
return dict(
(key, getattr(self.options, key))
for key in keys if hasattr(self.options, key)
)
def _job_kwargs(self):
"""Keyword arguments to the runner class that can be specified
by the job/launcher itself."""
# use the most basic combiners; leave magic like resolving paths
# and blanking out jobconf values to the runner
return dict(
# command-line has the final say on jobconf and libjars
jobconf=combine_dicts(
self.jobconf(), self.options.jobconf),
libjars=combine_lists(
self.libjars(), self.options.libjars),
partitioner=self.partitioner(),
sort_values=self.sort_values(),
# TODO: should probably put self.options last below for consistency
upload_archives=combine_lists(
self.options.upload_archives, self.archives()),
upload_dirs=combine_lists(
self.options.upload_dirs, self.dirs()),
upload_files=combine_lists(
self.options.upload_files, self.files()),
)
### Defining one-step streaming jobs ###
def mapper(self, key, value):
"""Re-define this to define the mapper for a one-step job.
Yields zero or more tuples of ``(out_key, out_value)``.
:param key: A key parsed from input.
:param value: A value parsed from input.
If you don't re-define this, your job will have a mapper that simply
yields ``(key, value)`` as-is.
By default (if you don't mess with :ref:`job-protocols`):
- ``key`` will be ``None``
- ``value`` will be the raw input line, with newline stripped.
- ``out_key`` and ``out_value`` must be JSON-encodable: numeric,
unicode, boolean, ``None``, list, or dict whose keys are unicodes.
"""
raise NotImplementedError
def reducer(self, key, values):
"""Re-define this to define the reducer for a one-step job.
Yields one or more tuples of ``(out_key, out_value)``
:param key: A key which was yielded by the mapper
:param values: A generator which yields all values yielded by the
mapper which correspond to ``key``.
By default (if you don't mess with :ref:`job-protocols`):
- ``out_key`` and ``out_value`` must be JSON-encodable.
- ``key`` and ``value`` will have been decoded from JSON (so tuples
will become lists).
"""
raise NotImplementedError
def combiner(self, key, values):
"""Re-define this to define the combiner for a one-step job.
Yields one or more tuples of ``(out_key, out_value)``
:param key: A key which was yielded by the mapper
:param values: A generator which yields all values yielded by one mapper
task/node which correspond to ``key``.
By default (if you don't mess with :ref:`job-protocols`):
- ``out_key`` and ``out_value`` must be JSON-encodable.
- ``key`` and ``value`` will have been decoded from JSON (so tuples
will become lists).
"""
raise NotImplementedError
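# A minimal sketch (hypothetical user code, not part of this module) of a
# complete one-step job that overrides mapper(), combiner(), and reducer()
# as described above. The class name MRWordFreqCount is illustrative.
#
#     from mrjob.job import MRJob
#
#     class MRWordFreqCount(MRJob):
#
#         def mapper(self, _, line):
#             for word in line.split():
#                 yield word.lower(), 1
#
#         def combiner(self, word, counts):
#             yield word, sum(counts)
#
#         def reducer(self, word, counts):
#             yield word, sum(counts)
#
#     if __name__ == '__main__':
#         MRWordFreqCount.run()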
def mapper_init(self):
"""Re-define this to define an action to run before the mapper
processes any input.
One use for this function is to initialize mapper-specific helper
structures.
Yields one or more tuples of ``(out_key, out_value)``.
By default, ``out_key`` and ``out_value`` must be JSON-encodable;
re-define :py:attr:`INTERNAL_PROTOCOL` to change this.
"""
raise NotImplementedError
def mapper_final(self):
"""Re-define this to define an action to run after the mapper reaches
the end of input.
One way to use this is to store a total in an instance variable, and
output it after reading all input data. See :py:mod:`mrjob.examples`
for an example.
Yields one or more tuples of ``(out_key, out_value)``.
By default, ``out_key`` and ``out_value`` must be JSON-encodable;
re-define :py:attr:`INTERNAL_PROTOCOL` to change this.
"""
raise NotImplementedError
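# A minimal sketch (hypothetical user code, not part of this module) of the
# pattern described above for mapper_init()/mapper_final(): accumulate a
# per-task total in an instance variable and emit it once at end of input.
#
#     class MRLineCount(MRJob):
#
#         def mapper_init(self):
#             self.num_lines = 0
#
#         def mapper(self, _, line):
#             self.num_lines += 1
#
#         def mapper_final(self):
#             yield 'lines', self.num_lines
#
#         def reducer(self, key, counts):
#             yield key, sum(counts)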
def mapper_cmd(self):
"""Re-define this to define the mapper for a one-step job **as a shell
command.** If you define your mapper this way, the command will be
passed unchanged to Hadoop Streaming, with some minor exceptions. For
important specifics, see :ref:`cmd-steps`.
Basic example::
def mapper_cmd(self):
return 'cat'
"""
raise NotImplementedError
def mapper_pre_filter(self):
"""Re-define this to specify a shell command to filter the mapper's
input before it gets to your job's mapper in a one-step job. For
important specifics, see :ref:`cmd-filters`.
Basic example::
def mapper_pre_filter(self):
return 'grep "ponies"'
"""
raise NotImplementedError
def mapper_raw(self, input_path, input_uri):
"""Re-define this to make Hadoop pass one input file to each
mapper.
:param input_path: a local path that the input file has been copied to
:param input_uri: the URI of the input file on HDFS, S3, etc
.. versionadded:: 0.6.3
"""
raise NotImplementedError
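# A minimal sketch (hypothetical user code, not part of this module) of
# mapper_raw(): each mapper task receives one whole input file, which it can
# open and parse however it likes.
#
#     class MRLinesPerFile(MRJob):
#
#         def mapper_raw(self, input_path, input_uri):
#             with open(input_path) as f:
#                 yield input_uri, sum(1 for _ in f)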
def reducer_init(self):
"""Re-define this to define an action to run before the reducer
processes any input.
One use for this function is to initialize reducer-specific helper
structures.
Yields one or more tuples of ``(out_key, out_value)``.
By default, ``out_key`` and ``out_value`` must be JSON-encodable;
re-define :py:attr:`INTERNAL_PROTOCOL` to change this.
"""
raise NotImplementedError
def reducer_final(self):
"""Re-define this to define an action to run after the reducer reaches
the end of input.
Yields one or more tuples of ``(out_key, out_value)``.
By default, ``out_key`` and ``out_value`` must be JSON-encodable;
re-define :py:attr:`INTERNAL_PROTOCOL` to change this.
"""
raise NotImplementedError
def reducer_cmd(self):
"""Re-define this to define the reducer for a one-step job **as a shell
command.** If you define your reducer this way, the command will be
passed unchanged to Hadoop Streaming, with some minor exceptions. For
specifics, see :ref:`cmd-steps`.
Basic example::
def reducer_cmd(self):
return 'cat'
"""
raise NotImplementedError
def reducer_pre_filter(self):
"""Re-define this to specify a shell command to filter the reducer's
input before it gets to your job's reducer in a one-step job. For
important specifics, see :ref:`cmd-filters`.
Basic example::
def reducer_pre_filter(self):
return 'grep "ponies"'
"""
raise NotImplementedError
def combiner_init(self):
"""Re-define this to define an action to run before the combiner
processes any input.
One use for this function is to initialize combiner-specific helper
structures.
Yields one or more tuples of ``(out_key, out_value)``.
By default, ``out_key`` and ``out_value`` must be JSON-encodable;
re-define :py:attr:`INTERNAL_PROTOCOL` to change this.
"""
raise NotImplementedError
def combiner_final(self):
"""Re-define this to define an action to run after the combiner reaches
the end of input.
Yields one or more tuples of ``(out_key, out_value)``.
By default, ``out_key`` and ``out_value`` must be JSON-encodable;
re-define :py:attr:`INTERNAL_PROTOCOL` to change this.
"""
raise NotImplementedError
def combiner_cmd(self):
"""Re-define this to define the combiner for a one-step job **as a
shell command.** If you define your combiner this way, the command will
be passed unchanged to Hadoop Streaming, with some minor exceptions.
For specifics, see :ref:`cmd-steps`.
Basic example::
def combiner_cmd(self):
return 'cat'
"""
raise NotImplementedError
def combiner_pre_filter(self):
"""Re-define this to specify a shell command to filter the combiner's
input before it gets to your job's combiner in a one-step job. For
important specifics, see :ref:`cmd-filters`.
Basic example::
def combiner_pre_filter(self):
return 'grep "ponies"'
"""
raise NotImplementedError
### Defining one-step Spark jobs ###
def spark(self, input_path, output_path):
"""Re-define this with Spark code to run. You can read input
with *input_path* and output with *output_path*.
.. warning::
Prior to v0.6.8, to pass job methods into Spark
(``rdd.flatMap(self.some_method)``), you first had to call
:py:meth:`self.sandbox() <mrjob.job.MRJob.sandbox>`; otherwise
Spark would error because *self* was not serializable.
"""
raise NotImplementedError
def spark_args(self):
"""Redefine this to pass custom arguments to Spark."""
return []
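# A minimal sketch (hypothetical user code, not part of this module) of a
# one-step Spark job defined with spark(), as described above. It assumes
# pyspark is importable where the job runs.
#
#     class MRSparkWordcount(MRJob):
#
#         def spark(self, input_path, output_path):
#             from pyspark import SparkContext
#
#             sc = SparkContext(appName='mrjob wordcount')
#             sc.textFile(input_path) \
#                 .flatMap(lambda line: line.split()) \
#                 .map(lambda word: (word, 1)) \
#                 .reduceByKey(lambda a, b: a + b) \
#                 .saveAsTextFile(output_path)
#             sc.stop()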
### Defining multi-step jobs ###
def steps(self):
"""Re-define this to make a multi-step job.
If you don't re-define this, we'll automatically create a one-step
job using any of :py:meth:`mapper`, :py:meth:`mapper_init`,
:py:meth:`mapper_final`, :py:meth:`reducer_init`,
:py:meth:`reducer_final`, and :py:meth:`reducer` that you've
re-defined. For example::
def steps(self):
return [MRStep(mapper=self.transform_input,
reducer=self.consolidate_1),
MRStep(reducer_init=self.log_mapper_init,
reducer=self.consolidate_2)]
:return: a list of steps constructed with
:py:class:`~mrjob.step.MRStep` or other classes in
:py:mod:`mrjob.step`.
"""
# only include methods that have been redefined
kwargs = dict(
(func_name, getattr(self, func_name))
for func_name in _JOB_STEP_FUNC_PARAMS + ('spark',)
if (_im_func(getattr(self, func_name)) is not
_im_func(getattr(MRJob, func_name))))
# special case for spark()
# TODO: support jobconf as well
if 'spark' in kwargs:
if sorted(kwargs) != ['spark']:
raise ValueError(
"Can't mix spark() and streaming functions")
return [SparkStep(
spark=kwargs['spark'],
spark_args=self.spark_args())]
# MRStep takes commands as strings, but the user defines them in the
# class as functions that return strings, so call the functions.
updates = {}
for k, v in kwargs.items():
if k.endswith('_cmd') or k.endswith('_pre_filter'):
updates[k] = v()
kwargs.update(updates)
if kwargs:
return [MRStep(**kwargs)]
else:
return []
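# A minimal sketch (hypothetical user code, not part of this module) of a
# multi-step job built by re-defining steps(), as described above: the first
# step counts words, the second picks the most frequent one. Method names
# are illustrative.
#
#     from mrjob.job import MRJob
#     from mrjob.step import MRStep
#
#     class MRMostUsedWord(MRJob):
#
#         def steps(self):
#             return [
#                 MRStep(mapper=self.mapper_get_words,
#                        reducer=self.reducer_count_words),
#                 MRStep(reducer=self.reducer_find_max_word),
#             ]
#
#         def mapper_get_words(self, _, line):
#             for word in line.split():
#                 yield word.lower(), 1
#
#         def reducer_count_words(self, word, counts):
#             # emit everything under a single (None) key so the next
#             # step's reducer sees all (count, word) pairs together
#             yield None, (sum(counts), word)
#
#         def reducer_find_max_word(self, _, count_word_pairs):
#             # yields one (count, word) pair: the most-used word
#             yield max(count_word_pairs)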
def increment_counter(self, group, counter, amount=1):
"""Increment a counter in Hadoop streaming by printing to stderr.
:type group: str
:param group: counter group
:type counter: str
:param counter: description of the counter
:type amount: int
:param amount: how much to increment the counter by
Commas in ``counter`` or ``group`` will be automatically replaced
with semicolons (commas confuse Hadoop streaming).
"""
# don't allow people to pass in floats
if not isinstance(amount, integer_types):
raise TypeError('amount must be an integer, not %r' % (amount,))
# cast non-strings to strings (if people pass in exceptions, etc)
if not isinstance(group, string_types):
group = str(group)
if not isinstance(counter, string_types):
counter = str(counter)
# Extra commas screw up hadoop and there's no way to escape them. So
# replace them with the next best thing: semicolons!
#
# The relevant Hadoop code is incrCounter(), here:
# http://svn.apache.org/viewvc/hadoop/mapreduce/trunk/src/contrib/streaming/src/java/org/apache/hadoop/streaming/PipeMapRed.java?view=markup # noqa
group = group.replace(',', ';')
counter = counter.replace(',', ';')
line = 'reporter:counter:%s,%s,%d\n' % (group, counter, amount)
if not isinstance(line, bytes):
line = line.encode('utf_8')
self.stderr.write(line)
self.stderr.flush()
def set_status(self, msg):
"""Set the job status in hadoop streaming by printing to stderr.
This is also a good way of doing a keepalive for a job that goes a
long time between outputs; Hadoop streaming usually times out jobs
that give no output for longer than 10 minutes.
"""
line = 'reporter:status:%s\n' % (msg,)
if not isinstance(line, bytes):
line = line.encode('utf_8')
self.stderr.write(line)
self.stderr.flush()
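# A minimal sketch (hypothetical user code, not part of this module) of
# calling increment_counter() and set_status() from inside a task, as
# described above. The counter group and names are illustrative.
#
#     class MRSkipBlankLines(MRJob):
#
#         def mapper_init(self):
#             self.set_status('mapper started')
#
#         def mapper(self, _, line):
#             if not line.strip():
#                 self.increment_counter('quality', 'blank lines')
#             else:
#                 yield None, line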
### Running the job ###
@classmethod
def run(cls):
"""Entry point for running job from the command-line.
This is also the entry point when a mapper or reducer is run
by Hadoop Streaming.
Does one of:
* Run a mapper (:option:`--mapper`). See :py:meth:`run_mapper`
* Run a combiner (:option:`--combiner`). See :py:meth:`run_combiner`
* Run a reducer (:option:`--reducer`). See :py:meth:`run_reducer`
* Run the entire job. See :py:meth:`run_job`
"""
# load options from the command line
cls().execute()
def run_job(self):
"""Run the all steps of the job, logging errors (and debugging output
if :option:`--verbose` is specified) to STDERR and streaming the
output to STDOUT.
Called from :py:meth:`run`. You'd probably only want to call this
directly from automated tests.
"""
# self.stderr is strictly binary, need to wrap it so it's possible
# to log to it in Python 3
log_stream = codecs.getwriter('utf_8')(self.stderr)
self.set_up_logging(quiet=self.options.quiet,
verbose=self.options.verbose,
stream=log_stream)
with self.make_runner() as runner:
try:
runner.run()
except StepFailedException as e:
# no need for a runner stacktrace if step failed; runners will
# log more useful information anyway
log.error(str(e))
sys.exit(1)
if self._should_cat_output():
for chunk in runner.cat_output():
self.stdout.write(chunk)
self.stdout.flush()
@classmethod
def set_up_logging(cls, quiet=False, verbose=False, stream=None):
"""Set up logging when running from the command line. This is also
used by the various command-line utilities.
:param bool quiet: If true, don't log. Overrides *verbose*.
:param bool verbose: If true, set log level to ``DEBUG`` (default is
``INFO``)
:param bool stream: Stream to log to (default is ``sys.stderr``)
"""
if quiet:
log_to_null(name='mrjob')
log_to_null(name='__main__')
else:
log_to_stream(name='mrjob', debug=verbose, stream=stream)
log_to_stream(name='__main__', debug=verbose, stream=stream)
def _should_cat_output(self):
if self.options.cat_output is None:
return not self.options.output_dir
else:
return self.options.cat_output
def execute(self):
# MRJob does Hadoop Streaming stuff, or defers to its superclass
# (MRJobLauncher) if not otherwise instructed
if self.options.run_mapper:
self.run_mapper(self.options.step_num)
elif self.options.run_combiner:
self.run_combiner(self.options.step_num)
elif self.options.run_reducer:
self.run_reducer(self.options.step_num)
elif self.options.run_spark:
self.run_spark(self.options.step_num)
else:
self.run_job()
def make_runner(self):
"""Make a runner based on command-line arguments, so we can
launch this job on EMR, on Hadoop, or locally.
:rtype: :py:class:`mrjob.runner.MRJobRunner`
"""
bad_words = (
'--mapper', '--reducer', '--combiner', '--step-num', '--spark')
for w in bad_words:
if w in sys.argv:
raise UsageError("make_runner() was called with %s. This"
" probably means you tried to use it from"
" __main__, which doesn't work." % w)
runner_class = self._runner_class()
kwargs = self._runner_kwargs()
# screen out most false-ish args so that it's readable
log.debug('making runner: %s(%s, ...)' % (
runner_class.__name__,
', '.join('%s=%s' % (k, v)
for k, v in sorted(kwargs.items())
if v not in (None, [], {}))))
return runner_class(**kwargs)
def _runner_class(self):
"""Runner class as indicated by ``--runner``. Defaults to ``'inline'``.
"""
return _runner_class(self.options.runner or 'inline')
def _runner_kwargs(self):
"""If we're building an inline or Spark runner,
include mrjob_cls in kwargs."""
kwargs = combine_dicts(
self._non_option_kwargs(),
# don't screen out irrelevant opts (see #1898)
self._kwargs_from_switches(set(_RUNNER_OPTS)),
self._job_kwargs(),
)
if self._runner_class().alias in ('inline', 'spark'):
kwargs = dict(mrjob_cls=self.__class__, **kwargs)
# pass steps to runner (see #1845)
kwargs = dict(steps=self._steps_desc(), **kwargs)
return kwargs
def _get_step(self, step_num, expected_type):
"""Helper for run_* methods"""
steps = self.steps()
if not 0 <= step_num < len(steps):
raise ValueError('Out-of-range step: %d' % step_num)
step = steps[step_num]
if not isinstance(step, expected_type):
raise TypeError('Step %d is not a %s' % (step_num, expected_type.__name__))
return step
def run_mapper(self, step_num=0):
"""Run the mapper and final mapper action for the given step.
:type step_num: int
:param step_num: which step to run (0-indexed)
Called from :py:meth:`run`. You'd probably only want to call this
directly from automated tests.
"""
# pick input and output protocol
read_lines, write_line = self._wrap_protocols(step_num, 'mapper')
for k, v in self.map_pairs(read_lines(), step_num=step_num):
write_line(k, v)
def run_combiner(self, step_num=0):
"""Run the combiner for the given step.
:type step_num: int
:param step_num: which step to run (0-indexed)
If we encounter a line that can't be decoded by our input protocol,
or a tuple that can't be encoded by our output protocol, we'll
increment a counter rather than raising an exception. If
--strict-protocols is set, an exception is raised instead.
Called from :py:meth:`run`. You'd probably only want to call this
directly from automated tests.
"""
# pick input and output protocol
read_lines, write_line = self._wrap_protocols(step_num, 'combiner')
for k, v in self.combine_pairs(read_lines(), step_num=step_num):
write_line(k, v)
def run_reducer(self, step_num=0):
"""Run the reducer for the given step.
:type step_num: int
:param step_num: which step to run (0-indexed)
Called from :py:meth:`run`. You'd probably only want to call this
directly from automated tests.
"""
# pick input and output protocol
read_lines, write_line = self._wrap_protocols(step_num, 'reducer')
for k, v in self.reduce_pairs(read_lines(), step_num=step_num):
write_line(k, v)
def map_pairs(self, pairs, step_num=0):
"""Runs :py:meth:`mapper_init`,
:py:meth:`mapper`/:py:meth:`mapper_raw`, and :py:meth:`mapper_final`
for one map task in one step.
Takes in a sequence of (key, value) pairs as input, and yields
(key, value) pairs as output.
:py:meth:`run_mapper` essentially wraps this method with code to handle
reading/decoding input and writing/encoding output.
.. versionadded:: 0.6.7
"""
step = self._get_step(step_num, MRStep)
mapper = step['mapper']
mapper_raw = step['mapper_raw']
mapper_init = step['mapper_init']
mapper_final = step['mapper_final']
if mapper_init:
for k, v in mapper_init() or ():
yield k, v
if mapper_raw:
if len(self.options.args) != 2:
raise ValueError('Wrong number of args')
input_path, input_uri = self.options.args
for k, v in mapper_raw(input_path, input_uri) or ():
yield k, v
else:
for key, value in pairs:
for k, v in mapper(key, value) or ():
yield k, v
if mapper_final:
for k, v in mapper_final() or ():
yield k, v
def combine_pairs(self, pairs, step_num=0):
"""Runs :py:meth:`combiner_init`,
:py:meth:`combiner`, and :py:meth:`combiner_final`
for one reduce task in one step.
Takes in a sequence of (key, value) pairs as input, and yields
(key, value) pairs as output.
:py:meth:`run_combiner` essentially wraps this method with code to
handle reading/decoding input and writing/encoding output.
.. versionadded:: 0.6.7
"""
for k, v in self._combine_or_reduce_pairs(pairs, 'combiner', step_num):
yield k, v
def reduce_pairs(self, pairs, step_num=0):
"""Runs :py:meth:`reducer_init`,
:py:meth:`reducer`, and :py:meth:`reducer_final`
for one reduce task in one step.
Takes in a sequence of (key, value) pairs as input, and yields
(key, value) pairs as output.
:py:meth:`run_reducer` essentially wraps this method with code to
handle reading/decoding input and writing/encoding output.
.. versionadded:: 0.6.7
"""
for k, v in self._combine_or_reduce_pairs(pairs, 'reducer', step_num):
yield k, v
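# A minimal sketch (hypothetical test code, not part of this module) of
# calling map_pairs()/reduce_pairs() directly in automated tests, bypassing
# protocol encoding. MRWordFreqCount is an assumed word-count job whose
# mapper yields (word, 1) and whose reducer sums the counts.
#
#     job = MRWordFreqCount(args=[])
#     assert sorted(job.map_pairs([(None, 'a b a')])) == \
#         [('a', 1), ('a', 1), ('b', 1)]
#     assert list(job.reduce_pairs([('a', 2), ('a', 1)])) == [('a', 3)]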
def _combine_or_reduce_pairs(self, pairs, mrc, step_num=0):
"""Helper for :py:meth:`combine_pairs` and :py:meth:`reduce_pairs`."""
step = self._get_step(step_num, MRStep)
task = step[mrc]
task_init = step[mrc + '_init']
task_final = step[mrc + '_final']
if task is None:
raise ValueError('No %s in step %d' % (mrc, step_num))
if task_init:
for k, v in task_init() or ():
yield k, v
# group all values of the same key together, and pass to the reducer
#
# be careful to use generators for everything, to allow for
# very large groupings of values
for key, pairs_for_key in itertools.groupby(pairs, lambda k_v: k_v[0]):
values = (value for _, value in pairs_for_key)
for k, v in task(key, values) or ():
yield k, v
if task_final:
for k, v in task_final() or ():
yield k, v
def run_spark(self, step_num):
"""Run the Spark code for the given step.
:type step_num: int
:param step_num: which step to run (0-indexed)
Called from :py:meth:`run`. You'd probably only want to call this
directly from automated tests.
"""
step = self._get_step(step_num, SparkStep)
if len(self.options.args) != 2:
raise ValueError('Wrong number of args')
input_path, output_path = self.options.args
spark_method = step.spark
spark_method(input_path, output_path)
def _steps_desc(self):
step_descs = []
for step_num, step in enumerate(self.steps()):
step_descs.append(step.description(step_num))
return step_descs
@classmethod
def mr_job_script(cls):
"""Path of this script. This returns the file containing
this class, or ``None`` if there isn't any (e.g. it was
defined from the command line interface.)"""
try:
return inspect.getsourcefile(cls)
except TypeError:
return None
### Other useful utilities ###
def _read_input(self):
"""Read from stdin, or one more files, or directories.
Yield one line at time.
- Resolve globs (``foo_*.gz``).
- Decompress ``.gz`` and ``.bz2`` files.
- If path is ``-``, read from STDIN.
- Recursively read all files in a directory
"""
paths = self.options.args or ['-']
for path in paths:
if path == '-':
for line in self.stdin:
yield line
else:
with open(path, 'rb') as f:
for line in to_lines(decompress(f, path)):
yield line
def _wrap_protocols(self, step_num, step_type):
"""Pick the protocol classes to use for reading and writing
for the given step.
Returns a tuple of ``(read_lines, write_line)``
``read_lines()`` is a function that reads lines from input, decodes
them, and yields key, value pairs.
``write_line()`` is a function that takes key and value as args,
encodes them, and writes a line to output.
:param step_num: which step to run (e.g. 0)
:param step_type: ``'mapper'``, ``'reducer'``, or ``'combiner'`` from
:py:mod:`mrjob.step`
"""
read, write = self.pick_protocols(step_num, step_type)
def read_lines():
for line in self._read_input():
key, value = read(line.rstrip(b'\r\n'))
yield key, value
def write_line(key, value):
self.stdout.write(write(key, value))
self.stdout.write(b'\n')
return read_lines, write_line
def _step_key(self, step_num, step_type):
return '%d-%s' % (step_num, step_type)
def _script_step_mapping(self, steps_desc):
"""Return a mapping of ``self._step_key(step_num, step_type)`` ->
(place in sort order of all *script* steps), for the purposes of
choosing which protocols to use for input and output.
Non-script steps do not appear in the mapping.
"""
mapping = {}
script_step_num = 0
for i, step in enumerate(steps_desc):
if 'mapper' in step and step['mapper']['type'] == 'script':
k = self._step_key(i, 'mapper')
mapping[k] = script_step_num
script_step_num += 1
if 'reducer' in step and step['reducer']['type'] == 'script':
k = self._step_key(i, 'reducer')
mapping[k] = script_step_num
script_step_num += 1
return mapping
def _mapper_output_protocol(self, step_num, step_map):
map_key = self._step_key(step_num, 'mapper')
if map_key in step_map:
if step_map[map_key] >= (len(step_map) - 1):
return self.output_protocol()
else:
return self.internal_protocol()
else:
# mapper is not a script substep, so protocols don't apply at all
return RawValueProtocol()
def _pick_protocol_instances(self, step_num, step_type):
steps_desc = self._steps_desc()
step_map = self._script_step_mapping(steps_desc)
# pick input protocol
if step_type == 'combiner':
# Combiners read and write the mapper's output protocol because
# they have to be able to run 0-inf times without changing the
# format of the data.
# Combiners for non-script substeps can't use protocols, so this
# function will just give us RawValueProtocol() in that case.
previous_mapper_output = self._mapper_output_protocol(
step_num, step_map)
return previous_mapper_output, previous_mapper_output
else:
step_key = self._step_key(step_num, step_type)
if step_key not in step_map:
raise ValueError(
"Can't pick a protocol for a non-script step")
real_num = step_map[step_key]
if real_num == (len(step_map) - 1):
write = self.output_protocol()
else:
write = self.internal_protocol()
if real_num == 0:
read = self.input_protocol()
else:
read = self.internal_protocol()
return read, write
def pick_protocols(self, step_num, step_type):
"""Pick the protocol classes to use for reading and writing for the
given step.
:type step_num: int
:param step_num: which step to run (e.g. ``0`` for the first step)
:type step_type: str
:param step_type: one of `'mapper'`, `'combiner'`, or `'reducer'`
:return: (read_function, write_function)
By default, we use one protocol for reading input, one
internal protocol for communication between steps, and one
protocol for final output (which is usually the same as the
internal protocol). Protocols can be controlled by setting
:py:attr:`INPUT_PROTOCOL`, :py:attr:`INTERNAL_PROTOCOL`, and
:py:attr:`OUTPUT_PROTOCOL`.
Re-define this if you need fine control over which protocols
are used by which steps.
"""
# wrapping functionality like this makes testing much simpler
p_read, p_write = self._pick_protocol_instances(step_num, step_type)
return p_read.read, p_write.write
### Command-line arguments ###
def configure_args(self):
"""Define arguments for this script. Called from :py:meth:`__init__()`.
Re-define to define custom command-line arguments or pass
through existing ones::
def configure_args(self):
super(MRYourJob, self).configure_args()
self.add_passthru_arg(...)
self.add_file_arg(...)
self.pass_arg_through(...)
...
"""
self.arg_parser.add_argument(
dest='args', nargs='*',
help=('input paths to read (or stdin if not set). If --spark'
' is set, the input and output path for the spark job.'))
_add_basic_args(self.arg_parser)
_add_job_args(self.arg_parser)
_add_runner_args(self.arg_parser)
_add_step_args(self.arg_parser, include_deprecated=True)
def load_args(self, args):
"""Load command-line options into ``self.options``.
Called from :py:meth:`__init__()` after :py:meth:`configure_args`.
:type args: list of str
:param args: a list of command line arguments. ``None`` will be
treated the same as ``[]``.
Re-define if you want to post-process command-line arguments::
def load_args(self, args):
super(MRYourJob, self).load_args(args)
self.stop_words = self.options.stop_words.split(',')
...
"""
if hasattr(self.arg_parser, 'parse_intermixed_args'):
# restore old optparse behavior on Python 3.7+. See #1701
self.options = self.arg_parser.parse_intermixed_args(args)
else:
self.options = self.arg_parser.parse_args(args)
if self.options.help:
self._print_help(self.options)
sys.exit(0)
def add_file_arg(self, *args, **kwargs):
"""Add a command-line option that sends an external file
(e.g. a SQLite DB) to Hadoop::
def configure_args(self):
super(MRYourJob, self).configure_args()
self.add_file_arg('--scoring-db', help=...)
This does the right thing: the file will be uploaded to the working
dir of the script on Hadoop, and the script will be passed the same
option, but with the local name of the file in the script's working
directory.
.. note::
If you pass a file to a job, best practice is to lazy-load its
contents (e.g. make a method that opens the file the first time
you call it) rather than loading it in your job's constructor or
:py:meth:`load_args`. Not only is this more efficient, it's
necessary if you want to run your job in a Spark executor
(because the file may not be in the same place in a Spark driver).
.. note::
We suggest against sending Berkeley DBs to your job, as
Berkeley DB is not forwards-compatible (so a Berkeley DB that you
construct on your computer may not be readable from within
Hadoop). Use SQLite databases instead. If all you need is an on-disk
hash table, try out the :py:mod:`sqlite3dbm` module.
.. versionchanged:: 0.6.6
now accepts explicit ``type=str``
.. versionchanged:: 0.6.8
fully supported on Spark, including ``local[*]`` master
"""
if kwargs.get('type') not in (None, str):
raise ArgumentTypeError(
'file options must take strings')
if kwargs.get('action') not in (None, 'append', 'store'):
raise ArgumentTypeError(
"file options must use the actions 'store' or 'append'")
pass_opt = self.arg_parser.add_argument(*args, **kwargs)
self._file_arg_dests.add(pass_opt.dest)
def add_passthru_arg(self, *args, **kwargs):
"""Function to create options which both the job runner
and the job itself respect (we use this for protocols, for example).
Use it like you would use
:py:func:`argparse.ArgumentParser.add_argument`::
def configure_args(self):
super(MRYourJob, self).configure_args()
self.add_passthru_arg(
'--max-ngram-size', type=int, default=4, help='...')
If you want to pass files through to the mapper/reducer, use
:py:meth:`add_file_arg` instead.
If you want to pass through a built-in option (e.g. ``--runner``), use
:py:meth:`pass_arg_through` instead.
"""
pass_opt = self.arg_parser.add_argument(*args, **kwargs)
self._passthru_arg_dests.add(pass_opt.dest)
def pass_arg_through(self, opt_str):
"""Pass the given argument through to the job."""
# _actions is hidden but the interface appears to be stable,
# and there's no non-hidden interface we can use
for action in self.arg_parser._actions:
if opt_str in action.option_strings or opt_str == action.dest:
self._passthru_arg_dests.add(action.dest)
break
else:
raise ValueError('unknown arg: %s' % opt_str)
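# A minimal sketch (hypothetical user code, not part of this module) of the
# custom-argument hooks described above: add_passthru_arg(), add_file_arg(),
# pass_arg_through(), and post-processing in load_args(). The option names
# are illustrative.
#
#     class MRScoreRecords(MRJob):
#
#         def configure_args(self):
#             super(MRScoreRecords, self).configure_args()
#             self.add_passthru_arg(
#                 '--max-ngram-size', type=int, default=4)
#             self.add_file_arg('--scoring-db')
#             self.pass_arg_through('--runner')
#
#         def load_args(self, args):
#             super(MRScoreRecords, self).load_args(args)
#             # keep --scoring-db lazy; open it in mapper_init(), not here
#             self.max_ngram_size = self.options.max_ngram_size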
def is_task(self):
"""True if this is a mapper, combiner, reducer, or Spark script.
This is mostly useful inside :py:meth:`load_args`, to disable
loading args when we aren't running inside Hadoop.
"""
return (self.options.run_mapper or
self.options.run_combiner or
self.options.run_reducer or
self.options.run_spark)
### protocols ###
def input_protocol(self):
"""Instance of the protocol to use to convert input lines to Python
objects. Default behavior is to return an instance of
:py:attr:`INPUT_PROTOCOL`.
"""
if not isinstance(self.INPUT_PROTOCOL, type):
log.warning('INPUT_PROTOCOL should be a class, not %s' %
self.INPUT_PROTOCOL)
return self.INPUT_PROTOCOL()
def internal_protocol(self):
"""Instance of the protocol to use to communicate between steps.
Default behavior is to return an instance of
:py:attr:`INTERNAL_PROTOCOL`.
"""
if not isinstance(self.INTERNAL_PROTOCOL, type):
log.warning('INTERNAL_PROTOCOL should be a class, not %s' %
self.INTERNAL_PROTOCOL)
return self.INTERNAL_PROTOCOL()
def output_protocol(self):
"""Instance of the protocol to use to convert Python objects to output
lines. Default behavior is to return an instance of
:py:attr:`OUTPUT_PROTOCOL`.
"""
if not isinstance(self.OUTPUT_PROTOCOL, type):
log.warning('OUTPUT_PROTOCOL should be a class, not %s' %
self.OUTPUT_PROTOCOL)
return self.OUTPUT_PROTOCOL()
#: Protocol for reading input to the first mapper in your job.
#: Default: :py:class:`RawValueProtocol`.
#:
#: For example, if you know your input data is in JSON format, you could
#: set::
#:
#: INPUT_PROTOCOL = JSONValueProtocol
#:
#: in your class, and your initial mapper would receive decoded JSONs
#: rather than strings.
#:
#: See :py:data:`mrjob.protocol` for the full list of protocols.
INPUT_PROTOCOL = RawValueProtocol
#: Protocol for communication between steps.
#: Default: :py:class:`JSONProtocol`.
#:
#: For example, if your step output weren't JSON-encodable, you could set::
#:
#: INTERNAL_PROTOCOL = PickleProtocol
#:
#: and step output would be encoded as string-escaped pickles.
#:
#: See :py:data:`mrjob.protocol` for the full list of protocols.
INTERNAL_PROTOCOL = JSONProtocol
#: Protocol to use for writing output. Default: :py:class:`JSONProtocol`.
#:
#: For example, if you wanted the final output in repr, you could set::
#:
#: OUTPUT_PROTOCOL = ReprProtocol
#:
#: See :py:data:`mrjob.protocol` for the full list of protocols.
OUTPUT_PROTOCOL = JSONProtocol
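# A minimal sketch (hypothetical user code, not part of this module) of
# overriding the protocol class attributes documented above, so that each
# input line is decoded from JSON and output is JSON-encoded.
#
#     from mrjob.job import MRJob
#     from mrjob.protocol import JSONProtocol, JSONValueProtocol
#
#     class MRSummarizeEvents(MRJob):
#
#         INPUT_PROTOCOL = JSONValueProtocol  # each line -> (None, decoded JSON)
#         INTERNAL_PROTOCOL = JSONProtocol
#         OUTPUT_PROTOCOL = JSONProtocol
#
#         def mapper(self, _, event):
#             yield event['type'], 1
#
#         def reducer(self, event_type, counts):
#             yield event_type, sum(counts)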
def parse_output(self, chunks):
"""Parse the final output of this MRJob (as a stream of byte chunks)
into a stream of ``(key, value)`` pairs.
"""
read = self.output_protocol().read
for line in to_lines(chunks):
yield read(line)
### Hadoop Input/Output Formats ###
#: Optional name of a Hadoop ``InputFormat`` class, e.g.
#: ``'org.apache.hadoop.mapred.lib.NLineInputFormat'``.
#:
#: Passed to Hadoop for the *first* step of this job, using the
#: ``-inputformat`` option.
#:
#: If you require more sophisticated behavior, try
#: :py:meth:`hadoop_input_format` or the *hadoop_input_format* argument to
#: :py:meth:`mrjob.runner.MRJobRunner.__init__`.
HADOOP_INPUT_FORMAT = None
def hadoop_input_format(self):
"""Optional Hadoop ``InputFormat`` class to parse input for
the first step of the job.
Normally, setting :py:attr:`HADOOP_INPUT_FORMAT` is sufficient;
redefining this method is only for when you want to get fancy.
"""
return self.HADOOP_INPUT_FORMAT
#: Optional name of a Hadoop ``OutputFormat`` class, e.g.
#: ``'org.apache.hadoop.mapred.FileOutputFormat'``.
#:
#: Passed to Hadoop for the *last* step of this job, using the
#: ``-outputformat`` option.
#:
#: If you require more sophisticated behavior, try
#: :py:meth:`hadoop_output_format` or the *hadoop_output_format* argument
#: to :py:meth:`mrjob.runner.MRJobRunner.__init__`.
HADOOP_OUTPUT_FORMAT = None
def hadoop_output_format(self):
"""Optional Hadoop ``OutputFormat`` class to write output for
the last step of the job.
Normally, setting :py:attr:`HADOOP_OUTPUT_FORMAT` is sufficient;
redefining this method is only for when you want to get fancy.
"""
return self.HADOOP_OUTPUT_FORMAT
### Libjars ###
#: Optional list of paths of jar files to run our job with using Hadoop's
#: ``-libjars`` option.
#:
#: ``~`` and environment variables
#: in paths will be expanded, and relative paths will be interpreted as
#: relative to the directory containing the script (not the current
#: working directory).
#:
#: If you require more sophisticated behavior, try overriding
#: :py:meth:`libjars`.
LIBJARS = []
def libjars(self):
"""Optional list of paths of jar files to run our job with using
Hadoop's ``-libjars`` option. Normally setting :py:attr:`LIBJARS`
is sufficient. Paths from :py:attr:`LIBJARS` are interpreted as
relative to the directory containing the script (paths from the
command-line are relative to the current working directory).
Note that ``~`` and environment variables in paths will always be
expanded by the job runner (see :mrjob-opt:`libjars`).
.. versionchanged:: 0.6.6
re-defining this no longer clobbers the command-line
``--libjars`` option
"""
script_dir = os.path.dirname(self.mr_job_script())
paths = []
# libjar paths will eventually be combined with combine_path_lists,
# which will expand environment variables. We don't want to assume
# a path like $MY_DIR/some.jar is always relative ($MY_DIR could start
# with /), but we also don't want to expand environment variables
# prematurely.
for path in self.LIBJARS or []:
if os.path.isabs(expand_path(path)):
paths.append(path)
else:
paths.append(os.path.join(script_dir, path))
return paths
### Partitioning ###
#: Optional Hadoop partitioner class to use to determine how mapper
#: output should be sorted and distributed to reducers. For example:
#: ``'org.apache.hadoop.mapred.lib.HashPartitioner'``.
#:
#: If you require more sophisticated behavior, try :py:meth:`partitioner`.
PARTITIONER = None
def partitioner(self):
"""Optional Hadoop partitioner class to use to determine how mapper
output should be sorted and distributed to reducers.
By default, returns :py:attr:`PARTITIONER`.
You probably don't need to re-define this; it's just here for
completeness.
"""
return self.PARTITIONER
### Uploading support files ###
#: Optional list of archives to upload and unpack in the job's working
#: directory. These can be URIs or paths on the local filesystem.
#:
#: Relative paths will be interpreted as relative to the directory
#: containing the script (not the current working directory).
#:
#: Environment variables and ``~`` in paths will be expanded.
#:
#: By default, the directory will have the same name as the archive
#: (e.g. ``foo.tar.gz/``). To change the directory's name, append
#: ``#<name>``::
#:
#: ARCHIVES = ['data/foo.tar.gz#foo']
#:
#: If you need to dynamically generate a list of files, override
#: :py:meth:`archives` instead.
#:
#: .. versionadded:: 0.6.4
ARCHIVES = []
#: Optional list of directories to upload to the job's working directory.
#: These can be URIs or paths on the local filesystem.
#:
#: Relative paths will be interpreted as relative to the directory
#: containing the script (not the current working directory).
#:
#: Environment variables and ``~`` in paths will be expanded.
#:
#: If you want a directory to be copied with a name other than its own,
#: append ``#<name>`` (e.g. ``data/foo#bar``).
#:
#: If you need to dynamically generate a list of files, override
#: :py:meth:`dirs` instead.
#:
#: .. versionadded:: 0.6.4
DIRS = []
#: Optional list of files to upload to the job's working directory.
#: These can be URIs or paths on the local filesystem.
#:
#: Relative paths will be interpreted as relative to the directory
#: containing the script (not the current working directory).
#:
#: Environment variables and ``~`` in paths will be expanded.
#:
#: If you want a file to be uploaded to a filename other than its own,
#: append ``#<name>`` (e.g. ``data/foo.json#bar.json``).
#:
#: If you need to dynamically generate a list of files, override
#: :py:meth:`files` instead.
#:
#: .. versionadded:: 0.6.4
FILES = []
def archives(self):
"""Like :py:attr:`ARCHIVES`, except that it can return a dynamically
generated list of archives to upload and unpack. Overriding
this method disables :py:attr:`ARCHIVES`.
Paths returned by this method are relative to the working directory
(not the script). Note that the job runner will *always* expand
environment variables and ``~`` in paths returned by this method.
You do not have to worry about inadvertently disabling ``--archives``;
this switch is handled separately.
.. versionadded:: 0.6.4
"""
return self._upload_attr('ARCHIVES')
def dirs(self):
"""Like :py:attr:`DIRS`, except that it can return a dynamically
generated list of directories to upload. Overriding
this method disables :py:attr:`DIRS`.
Paths returned by this method are relative to the working directory
(not the script). Note that the job runner will *always* expand
environment variables and ``~`` in paths returned by this method.
You do not have to worry about inadvertently disabling ``--dirs``;
this switch is handled separately.
.. versionadded:: 0.6.4
"""
return self._upload_attr('DIRS')
def files(self):
"""Like :py:attr:`FILES`, except that it can return a dynamically
generated list of files to upload. Overriding
this method disables :py:attr:`FILES`.
Paths returned by this method are relative to the working directory
(not the script). Note that the job runner will *always* expand
environment variables and ``~`` in paths returned by this method.
You do not have to worry about inadvertently disabling ``--files``;
this switch is handled separately.
.. versionadded:: 0.6.4
"""
return self._upload_attr('FILES')
def _upload_attr(self, attr_name):
"""Helper for :py:meth:`archives`, :py:meth:`dirs`, and
:py:meth:`files`"""
attr_value = getattr(self, attr_name)
# catch path instead of a list of paths
if isinstance(attr_value, string_types):
raise TypeError('%s must be a list or other sequence.' % attr_name)
script_dir = os.path.dirname(self.mr_job_script())
paths = []
for path in attr_value:
expanded_path = expand_path(path)
if os.path.isabs(expanded_path):
paths.append(path)
else:
# relative subdirs are confusing; people will expect them
# to appear in a subdir, not the same directory as the script,
# but Hadoop doesn't work that way
if os.sep in path.rstrip(os.sep) and '#' not in path:
log.warning(
'%s: %s will appear in same directory as job script,'
' not a subdirectory' % (attr_name, path))
paths.append(os.path.join(script_dir, path))
return paths
### Jobconf ###
#: Optional jobconf arguments we should always pass to Hadoop. This
#: is a map from property name to value. e.g.:
#:
#: ``{'stream.num.map.output.key.fields': '4'}``
#:
#: It's recommended that you only use this to hard-code things that
#: affect the semantics of your job, and leave performance tweaks to
#: the command line or whatever you use to launch your job.
JOBCONF = {}
def jobconf(self):
"""``-D`` args to pass to hadoop streaming. This should be a map
from property name to value. By default, returns :py:attr:`JOBCONF`.
.. versionchanged:: 0.6.6
re-defining this no longer clobbers command-line
``--jobconf`` options.
"""
return dict(self.JOBCONF)
### Secondary Sort ###
#: Set this to ``True`` if you would like reducers to receive the values
#: associated with any key in sorted order (sorted by their *encoded*
#: value). Also known as secondary sort.
#:
#: This can be useful if you expect more values than you can fit in memory
#: to be associated with one key, but you want to apply information in
#: a small subset of these values to information in the other values.
#: For example, you may want to convert counts to percentages, and to do
#: this you first need to know the total count.
#:
#: Even though values are sorted by their encoded value, most encodings
#: will sort strings in order. For example, you could have values like:
#: ``['A', <total>]``, ``['B', <count_name>, <count>]``, and the value
#: containing the total should come first regardless of what protocol
#: you're using.
#:
#: See :py:meth:`jobconf()` and :py:meth:`partitioner()` for more about
#: how secondary sort is implemented.
SORT_VALUES = None
def sort_values(self):
"""A method that by default, just returns the value of
:py:attr:`SORT_VALUES`. Mostly exists for the sake
of consistency, but you could override it if you wanted to make
secondary sort configurable."""
return self.SORT_VALUES
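# A minimal sketch (hypothetical user code, not part of this module) of the
# secondary-sort pattern described above. It assumes an earlier step emits,
# for each key, one ['A', total] value plus many ['B', name, count] values;
# with SORT_VALUES = True the 'A' value sorts first, so the reducer can turn
# counts into percentages in a single pass.
#
#     class MRPercentages(MRJob):
#
#         SORT_VALUES = True
#
#         def reducer(self, key, values):
#             total = None
#             for value in values:
#                 if value[0] == 'A':
#                     total = value[1]
#                 else:
#                     _, name, count = value
#                     yield (key, name), 100.0 * count / total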
### Testing ###
def sandbox(self, stdin=None, stdout=None, stderr=None):
"""Redirect stdin, stdout, and stderr for automated testing.
You can set stdin, stdout, and stderr to file objects. By
default, they'll be set to empty ``BytesIO`` objects.
You can then access the job's file handles through ``self.stdin``,
``self.stdout``, and ``self.stderr``. See :ref:`testing` for more
information about testing.
You may call sandbox multiple times (this will essentially clear
the file handles).
``stdin`` is empty by default. You can set it to anything that yields
lines::
mr_job.sandbox(stdin=BytesIO(b'some_data\\n'))
or, equivalently::
mr_job.sandbox(stdin=[b'some_data\\n'])
For convenience, this sandbox() returns self, so you can do::
mr_job = MRJobClassToTest().sandbox()
Simple testing example::
mr_job = MRYourJob().sandbox()
self.assertEqual(list(mr_job.reducer('foo', ['a', 'b'])), [...])
More complex testing example::
from io import BytesIO
from mrjob.parse import parse_mr_job_stderr
from mrjob.protocol import JSONProtocol
mr_job = MRYourJob(args=[...])
fake_input = b'"foo"\\t"bar"\\n"foo"\\t"baz"\\n'
mr_job.sandbox(stdin=BytesIO(fake_input))
mr_job.run_reducer(step_num=0)
self.assertEqual(mr_job.stdout.getvalue(), ...)
self.assertEqual(parse_mr_job_stderr(mr_job.stderr), ...)
.. note::
If you are using Spark, it's recommended you only pass in
:py:class:`io.BytesIO` or other serializable alternatives to file
objects. *stdin*, *stdout*, and *stderr* get stored as job
attributes, which means if they aren't serializable, neither
is the job instance or its methods.
"""
self._stdin = stdin or BytesIO()
self._stdout = stdout or BytesIO()
self._stderr = stderr or BytesIO()
return self
if __name__ == '__main__':
MRJob.run()