# Copyright 2012 Yelp and Contributors
# Copyright 2013 David Marin and Contributors
# Copyright 2015-2018 Yelp
# Copyright 2019 Yelp
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Representations of job steps, to use in your
:py:class:`~mrjob.job.MRJob`'s :py:meth:`~mrjob.job.MRJob.steps` method.

Because :py:class:`the runner <mrjob.runner.MRJobRunner>` just needs to know
how to invoke your MRJob script, not how it works internally, each step
instance's ``description()`` method produces a simplified, JSON-able
description of the step, to pass to the runner.
"""
import logging

from mrjob.py2 import string_types
from mrjob.util import cmd_line


STEP_TYPES = ('jar', 'spark', 'spark_jar', 'spark_script', 'streaming')


# Function names mapping to mapper, reducer, and combiner operations
_MAPPER_FUNCS = ('mapper', 'mapper_init', 'mapper_final', 'mapper_cmd',
                 'mapper_pre_filter', 'mapper_raw')
_COMBINER_FUNCS = ('combiner', 'combiner_init', 'combiner_final',
                   'combiner_cmd', 'combiner_pre_filter')
_REDUCER_FUNCS = ('reducer', 'reducer_init', 'reducer_final', 'reducer_cmd',
                  'reducer_pre_filter')
_HADOOP_OPTS = ('jobconf',)

# params to specify how to run the step. need at least one of these
_JOB_STEP_FUNC_PARAMS = _MAPPER_FUNCS + _COMBINER_FUNCS + _REDUCER_FUNCS

# all allowable MRStep params
_JOB_STEP_PARAMS = _JOB_STEP_FUNC_PARAMS + _HADOOP_OPTS

# all allowable JarStep constructor keyword args
_JAR_STEP_KWARGS = ['args', 'main_class']

# all allowable SparkStep constructor keyword args
_SPARK_STEP_KWARGS = ['spark', 'spark_args']

# all allowable SparkJarStep constructor keyword args
_SPARK_JAR_STEP_KWARGS = ['args', 'jar', 'main_class', 'spark_args']

# all allowable SparkScriptStep constructor keyword args
_SPARK_SCRIPT_STEP_KWARGS = ['args', 'script', 'spark_args']


#: If passed as an argument to :py:class:`JarStep`, :py:class:`SparkJarStep`,
#: or :py:class:`SparkScriptStep`, it'll be replaced with the step's input
#: path(s). If there are multiple paths, they'll be joined with commas.
INPUT = '<input>'

#: If this is passed as an argument to :py:class:`JarStep`,
#: :py:class:`SparkJarStep`, or :py:class:`SparkScriptStep`, it'll be replaced
#: with the step's output path
OUTPUT = '<output>'

#: If this is passed as an argument to :py:class:`JarStep`,
#: it'll be replaced with generic hadoop args (-D and -libjars)
GENERIC_ARGS = '<generic args>'


log = logging.getLogger(__name__)


# used by MRStep below, to fake no mapper
def _IDENTITY_MAPPER(key, value):
    yield key, value


# used by MRStep below, to fake no reducer
def _IDENTITY_REDUCER(key, values):
    for value in values:
        yield key, value


class StepFailedException(Exception):
    """Exception to throw when a step fails. This will automatically be
    caught and converted to an error message by
    :py:meth:`mrjob.job.MRJob.run`, but you may wish to catch it if you
    :ref:`run your job programmatically <runners-programmatically>`.
    """
    _FIELDS = ('reason', 'step_num', 'num_steps', 'step_desc')

    def __init__(self, reason=None, step_num=None, num_steps=None,
                 step_desc=None, last_step_num=None):
        """Initialize a reason for step failure.
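
        For example, a runner might raise this exception like so (a sketch;
        the reason string and step numbers are hypothetical)::

            raise StepFailedException(
                reason='returned non-zero exit status 1',
                step_num=0,
                num_steps=3,
            )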

        :param string reason: brief explanation of why the step failed
        :param int step_num: which step failed (0-indexed)
        :param int num_steps: number of steps in the job
        :param string step_desc: description of step (if we don't like the
                                 default "Step X of Y")
        :param int last_step_num: if one of a range of steps failed, the
                                  (0-indexed) last step in that range. If
                                  this equals *step_num*, it will be
                                  ignored.

        *reason* should not be several lines long; use ``log.error(...)``
        for that.
        """
        self.reason = reason
        self.step_num = step_num
        self.num_steps = num_steps
        self.step_desc = step_desc

        # we only need this for streaming steps run by the Spark harness,
        # so don't create noise
        if last_step_num is None or last_step_num == step_num:
            self.last_step_num = None
        else:
            self.last_step_num = last_step_num

    def __str__(self):
        """Human-readable version of the exception. Note that this
        1-indexes *step_num*."""
        if self.step_desc:
            step_desc = self.step_desc
        else:
            if self.step_num is not None:
                # 1-index step numbers
                if self.last_step_num is not None:
                    step_name = 'Steps %d-%d' % (
                        self.step_num + 1, self.last_step_num + 1)
                else:
                    step_name = 'Step %d' % (self.step_num + 1)

                if self.num_steps:
                    step_desc = '%s of %d' % (step_name, self.num_steps)
                else:
                    step_desc = step_name
            else:
                step_desc = 'Step'

        if self.reason:
            return '%s failed: %s' % (step_desc, self.reason)
        else:
            return '%s failed' % step_desc

    def __repr__(self):
        return '%s(%s)' % (
            self.__class__.__name__,
            ', '.join(('%s=%r' % (k, getattr(self, k))
                       for k in self._FIELDS
                       if getattr(self, k) is not None)))


class MRStep(object):
    # this docstring excludes mapper_cmd, etc.
    """Represents steps handled by the script containing your job.

    Used by :py:meth:`MRJob.steps <mrjob.job.MRJob.steps>`. See
    :ref:`writing-multi-step-jobs` for sample usage.

    Takes the following keyword arguments: `combiner`, `combiner_cmd`,
    `combiner_final`, `combiner_init`, `combiner_pre_filter`, `mapper`,
    `mapper_cmd`, `mapper_final`, `mapper_init`, `mapper_pre_filter`,
    `mapper_raw`, `reducer`, `reducer_cmd`, `reducer_final`, `reducer_init`,
    `reducer_pre_filter`.

    These should be set to ``None`` or a function with the same signature
    as the corresponding method in :py:class:`~mrjob.job.MRJob`.

    Also accepts `jobconf`, a dictionary with custom jobconf arguments to
    pass to Hadoop.

    An MRStep's description looks like::

        {
            'type': 'streaming',
            'mapper': { ... },
            'combiner': { ... },
            'reducer': { ... },
            'jobconf': { ... },  # dict of Hadoop configuration properties
        }

    At least one of ``mapper``, ``combiner``, and ``reducer`` must be
    included. ``jobconf`` is completely optional.
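
    For example, a multi-step job might build its steps like this (a
    sketch; ``mapper_get_words()`` and the other methods referenced are
    hypothetical methods defined on the job itself)::

        def steps(self):
            return [
                MRStep(mapper=self.mapper_get_words,
                       combiner=self.combiner_count_words,
                       reducer=self.reducer_count_words),
                MRStep(reducer=self.reducer_find_max_word),
            ]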

    ``mapper``, ``combiner``, and ``reducer`` are either handled by the
    script containing your job definition, in which case they look like::

        {
            'type': 'script',
            'pre_filter': 'grep -v bad',  # optional cmd to filter input
        }

    or they simply run a command, which looks like::

        {
            'type': 'command',
            'command': 'cut -f 1-2',  # command to run, as a string
        }
    """
    def __init__(self, **kwargs):
        # limit which keyword args can be specified
        bad_kwargs = sorted(set(kwargs) - set(_JOB_STEP_PARAMS))
        if bad_kwargs:
            raise TypeError('MRStep() got an unexpected keyword argument %r'
                            % bad_kwargs[0])

        if not set(kwargs) & set(_JOB_STEP_FUNC_PARAMS):
            raise ValueError("Step has no mappers and no reducers")

        self.has_explicit_mapper = any(
            value for name, value in kwargs.items()
            if name in _MAPPER_FUNCS)

        self.has_explicit_combiner = any(
            value for name, value in kwargs.items()
            if name in _COMBINER_FUNCS)

        self.has_explicit_reducer = any(
            value for name, value in kwargs.items()
            if name in _REDUCER_FUNCS)

        steps = dict((f, None) for f in _JOB_STEP_PARAMS)
        steps.update(kwargs)

        def _check_conflict(func, other_funcs):
            if steps[func]:
                for other_func in other_funcs:
                    if steps[other_func] and other_func != func:
                        raise ValueError("Can't specify both %s and %s" % (
                            func, other_func))

        _check_conflict('mapper_cmd', _MAPPER_FUNCS)
        _check_conflict('mapper_raw', ('mapper', 'mapper_pre_filter'))
        _check_conflict('combiner_cmd', _COMBINER_FUNCS)
        _check_conflict('reducer_cmd', _REDUCER_FUNCS)

        self._steps = steps

    def __repr__(self):
        not_none = dict((k, v) for k, v in self._steps.items()
                        if v is not None)

        return '%s(%s)' % (
            self.__class__.__name__,
            ', '.join('%s=%r' % (k, v) for k, v in not_none.items()))

    def __eq__(self, other):
        return (isinstance(other, MRStep) and self._steps == other._steps)

    def __getitem__(self, key):
        # always be prepared to run a mapper, since Hadoop Streaming
        # requires it
        if key == 'mapper' and self._steps['mapper'] is None:
            return _IDENTITY_MAPPER

        # identity reducer should only show up if you specified
        # 'reducer_init', 'reducer_final', or 'reducer_pre_filter',
        # but not 'reducer' itself
        if (key == 'reducer' and self._steps['reducer'] is None and
                self.has_explicit_reducer):
            return _IDENTITY_REDUCER

        # identity combiner should only show up if you specified
        # 'combiner_init', 'combiner_final', or 'combiner_pre_filter',
        # but not 'combiner' itself
        if (key == 'combiner' and self._steps['combiner'] is None and
                self.has_explicit_combiner):
            return _IDENTITY_REDUCER

        return self._steps[key]

    def _render_substep(self, cmd_key, pre_filter_key):
        if self._steps[cmd_key]:
            cmd = self._steps[cmd_key]
            if not isinstance(cmd, string_types):
                cmd = cmd_line(cmd)
            if (pre_filter_key and self._steps[pre_filter_key]):
                raise ValueError('Cannot specify both %s and %s' % (
                    cmd_key, pre_filter_key))
            return {'type': 'command', 'command': cmd}
        else:
            substep = {'type': 'script'}
            if (pre_filter_key and self._steps[pre_filter_key]):
                substep['pre_filter'] = self._steps[pre_filter_key]
            return substep

    def render_mapper(self):
        return self._render_substep('mapper_cmd', 'mapper_pre_filter')

    def render_combiner(self):
        return self._render_substep('combiner_cmd', 'combiner_pre_filter')

    def render_reducer(self):
        return self._render_substep('reducer_cmd', 'reducer_pre_filter')
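
    # A quick sketch of what the render_*() helpers above produce for the
    # two kinds of substeps (hypothetical step definitions; my_mapper is a
    # stand-in for a real mapper function):
    #
    #   MRStep(mapper_cmd='cat').render_mapper()
    #   # -> {'type': 'command', 'command': 'cat'}
    #
    #   MRStep(mapper=my_mapper,
    #          mapper_pre_filter='grep -v bad').render_mapper()
    #   # -> {'type': 'script', 'pre_filter': 'grep -v bad'}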

    def description(self, step_num=0):
        desc = {'type': 'streaming'}

        # Use a mapper if:
        # - the user writes one
        # - it is the first step and we don't want to mess up protocols
        # - there are only combiners and we don't want to mess up protocols
        if (step_num == 0 or self.has_explicit_mapper or
                self.has_explicit_combiner):
            desc['mapper'] = self.render_mapper()

        if self.has_explicit_combiner:
            desc['combiner'] = self.render_combiner()

        if self.has_explicit_reducer:
            desc['reducer'] = self.render_reducer()

        if self._steps['mapper_raw']:
            desc['input_manifest'] = True

        # TODO: verify this is a dict, convert booleans to strings
        if self._steps['jobconf']:
            desc['jobconf'] = self._steps['jobconf']

        return desc


class _Step(object):
    """Generic implementation of steps which are basically just simple
    objects that hold attributes."""
    # MRStep is different enough that I'm going to leave it as-is for now.

    # unique string for this step type (e.g. 'jar'). Redefine in your subclass
    _STEP_TYPE = None

    # all keyword arguments we accept. Redefine in your subclass
    _STEP_ATTRS = []

    # attributes that don't show up in the step description because they
    # are handled by the job, not the runner
    _HIDDEN_ATTRS = []

    # map from keyword argument to type(s), if we check. You can also use
    # "callable" (which is actually a builtin, not a type) for callables
    _STEP_ATTR_TYPES = {
        'args': (list, tuple),
        'jar': string_types,
        'jobconf': dict,
        'main_class': string_types,
        'script': string_types,
        'spark': callable,
        'spark_args': (list, tuple),
    }

    # map from keyword argument to constructor that produces default values
    _STEP_ATTR_DEFAULTS = {
        'args': list,
        'jobconf': dict,
        'spark_args': list,
    }

    # use your own __init__() method to make arguments required
    def __init__(self, **kwargs):
        """Set all attributes to the corresponding value in *kwargs*, or
        the default value. Raise :py:class:`TypeError` for unknown
        arguments or values with the wrong type."""
        bad_kwargs = sorted(set(kwargs) - set(self._STEP_ATTRS))
        if bad_kwargs:
            raise TypeError('%s() got unexpected keyword arguments: %s' % (
                self.__class__.__name__, ', '.join(bad_kwargs)))

        for k in self._STEP_ATTRS:
            v = kwargs.get(k)

            if v is None:
                v = self._default(k)
            elif k in self._STEP_ATTR_TYPES:
                attr_type = self._STEP_ATTR_TYPES[k]
                if attr_type is callable:
                    if not callable(v):
                        raise TypeError('%s is not callable: %r' % (k, v))
                elif not isinstance(v, attr_type):
                    raise TypeError('%s is not an instance of %r: %r' % (
                        k, self._STEP_ATTR_TYPES[k], v))

            setattr(self, k, v)

    def __repr__(self):
        kwargs = dict(
            (k, getattr(self, k)) for k in self._STEP_ATTR_TYPES
            if hasattr(self, k))

        return '%s(%s)' % (
            self.__class__.__name__,
            ', '.join(
                '%s=%s' % (k, v)
                for k, v in sorted(kwargs.items())
                if v != self._default(k)))

    def __eq__(self, other):
        return (isinstance(other, self.__class__) and
                all(getattr(self, key) == getattr(other, key)
                    for key in set(self._STEP_ATTRS)))

    def _default(self, k):
        if k in self._STEP_ATTR_DEFAULTS:
            return self._STEP_ATTR_DEFAULTS[k]()
        else:
            return None

    def description(self, step_num=0):
        """Return a dictionary representation of this step. See
        :ref:`steps-format` for examples."""
        result = dict(
            (k, getattr(self, k)) for k in self._STEP_ATTRS
            if k not in self._HIDDEN_ATTRS
        )
        result['type'] = self._STEP_TYPE
        return result


class JarStep(_Step):
    """Represents running a custom Jar as a step.

    Accepts the following keyword arguments:

    :param jar: The local path to the Jar. On EMR, this can also be an
                ``s3://`` URI, or ``file://`` to reference a jar on the
                local filesystem of your EMR instance(s).
    :param args: (optional) A list of arguments to the jar. Use
                 :py:data:`mrjob.step.INPUT` and :py:data:`OUTPUT` to
                 interpolate input and output paths.
    :param jobconf: (optional) A dictionary of Hadoop properties
    :param main_class: (optional) The main class to run from the jar.
                       If not specified, Hadoop will use the main class
                       in the jar's manifest file.
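
    For example, a job might return a step like this from its
    :py:meth:`~mrjob.job.MRJob.steps` method (a sketch; the jar path and
    the ``--verbose`` switch are hypothetical)::

        JarStep(
            jar='my-filter.jar',
            args=[GENERIC_ARGS, INPUT, OUTPUT, '--verbose'],
        )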

    *jar* can also be passed as a positional argument.

    See :ref:`non-hadoop-streaming-jar-steps` for sample usage.

    Sample description of a JarStep::

        {
            'type': 'jar',
            'jar': 'binks.jar.jar',
            'main_class': 'MyMainMan',  # optional
            'args': ['argh', 'argh'],  # optional
            'jobconf': { ... },  # optional
        }

    To give your jar access to input files, an empty output directory,
    configuration properties, and libjars managed by mrjob, you may include
    :py:data:`INPUT`, :py:data:`OUTPUT`, and :py:data:`GENERIC_ARGS` in
    *args*.
    """
    _STEP_TYPE = 'jar'
    _STEP_ATTRS = ['args', 'jar', 'jobconf', 'main_class']

    def __init__(self, jar, **kwargs):
        super(JarStep, self).__init__(jar=jar, **kwargs)


class SparkStep(_Step):
    """Represents running a Spark step defined in your job.

    Accepts the following keyword arguments:

    :param spark: function containing your Spark code, with the same
                  signature as :py:meth:`~mrjob.job.MRJob.spark`
    :param jobconf: (optional) A dictionary of Hadoop properties
    :param spark_args: (optional) a list of arguments to pass to
                       ``spark-submit`` (e.g.
                       ``['--executor-memory', '2G']``).

    Sample description of a SparkStep::

        {
            'type': 'spark',
            'jobconf': { ... },  # optional
            'spark_args': ['--executor-memory', '2G'],  # optional
        }
    """
    _STEP_TYPE = 'spark'
    _STEP_ATTRS = ['jobconf', 'spark', 'spark_args']
    _HIDDEN_ATTRS = ['spark']

    def __init__(self, spark, **kwargs):
        super(SparkStep, self).__init__(spark=spark, **kwargs)


class SparkJarStep(_Step):
    """Represents running a separate Jar through Spark.

    Accepts the following keyword arguments:

    :param jar: The local path to the Jar to run. On EMR, this can also
                be an ``s3://`` URI, or ``file://`` to reference a jar on
                the local filesystem of your EMR instance(s).
    :param main_class: Your application's main class
                       (e.g. ``'org.apache.spark.examples.SparkPi'``)
    :param args: (optional) A list of arguments to the jar. Use
                 :py:data:`mrjob.step.INPUT` and :py:data:`OUTPUT` to
                 interpolate input and output paths.
    :param jobconf: (optional) A dictionary of Hadoop properties
    :param spark_args: (optional) a list of arguments to pass to
                       ``spark-submit`` (e.g.
                       ``['--executor-memory', '2G']``).

    *jar* and *main_class* can also be passed as positional arguments.

    Sample description of a SparkJarStep::

        {
            'type': 'spark_jar',
            'jar': 'binks.jar.jar',
            'main_class': 'MyMainMan',
            'args': ['argh', 'argh'],  # optional
            'jobconf': { ... },  # optional
            'spark_args': ['--executor-memory', '2G'],  # optional
        }

    To give your Spark JAR access to input files and an empty output
    directory managed by mrjob, you may include :py:data:`INPUT` and
    :py:data:`OUTPUT` in *args*.
    """
    _STEP_TYPE = 'spark_jar'
    _STEP_ATTRS = ['args', 'jar', 'jobconf', 'main_class', 'spark_args']

    def __init__(self, jar, main_class, **kwargs):
        super(SparkJarStep, self).__init__(
            jar=jar, main_class=main_class, **kwargs)


class SparkScriptStep(_Step):
    """Represents running a separate Python script through Spark.

    Accepts the following keyword arguments:

    :param script: The local path to the Python script to run. On EMR,
                   this can also be an ``s3://`` URI, or ``file://`` to
                   reference a script on the local filesystem of your EMR
                   instance(s).
    :param args: (optional) A list of arguments to the script. Use
                 :py:data:`mrjob.step.INPUT` and :py:data:`OUTPUT` to
                 interpolate input and output paths.
    :param jobconf: (optional) A dictionary of Hadoop properties
    :param spark_args: (optional) a list of arguments to pass to
                       ``spark-submit`` (e.g.
                       ``['--executor-memory', '2G']``).
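
    For example (a sketch; the script path is hypothetical and matches the
    sample description below)::

        SparkScriptStep(
            script='my_spark_script.py',
            args=[INPUT, OUTPUT],
        )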

    *script* can also be passed as a positional argument.

    Sample description of a SparkScriptStep::

        {
            'type': 'spark_script',
            'script': 'my_spark_script.py',
            'args': ['script_arg1', 'script_arg2'],
            'jobconf': { ... },  # optional
            'spark_args': ['--executor-memory', '2G'],  # optional
        }

    To give your Spark script access to input files and an empty output
    directory managed by mrjob, you may include :py:data:`INPUT` and
    :py:data:`OUTPUT` in *args*.
    """
    _STEP_TYPE = 'spark_script'
    _STEP_ATTRS = ['args', 'jobconf', 'script', 'spark_args']

    def __init__(self, script, **kwargs):
        super(SparkScriptStep, self).__init__(script=script, **kwargs)


def _is_spark_step_type(step_type):
    """Does the given step type indicate that it uses Spark?"""
    return step_type.split('_')[0] == 'spark'


def _is_pyspark_step_type(step_type):
    """Does the given step type indicate that it uses Spark and Python?"""
    return step_type in ('spark', 'spark_script')
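

# A quick sketch of how the helpers above classify the step types listed
# in STEP_TYPES at the top of this module:
#
#   _is_spark_step_type('spark_jar')       -> True
#   _is_spark_step_type('streaming')       -> False
#   _is_pyspark_step_type('spark_script')  -> True
#   _is_pyspark_step_type('spark_jar')     -> False  (JVM, not Python)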