# -*- coding: utf-8 -*- # Copyright 2009-2017 Yelp and Contributors # Copyright 2018 Yelp and Google, Inc. # Copyright 2019 Yelp # Copyright 2020 Affirm, Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """Base class for all runners.""" import copy import datetime import getpass import logging import os import os.path import posixpath import pprint import re import sys import tarfile import tempfile from shutil import rmtree from mrjob.compat import translate_jobconf from mrjob.compat import translate_jobconf_dict from mrjob.compat import translate_jobconf_for_all_versions from mrjob.conf import ClearedValue from mrjob.conf import combine_jobconfs from mrjob.conf import combine_opts from mrjob.conf import load_opts_from_mrjob_confs from mrjob.fs.composite import CompositeFilesystem from mrjob.fs.local import LocalFilesystem from mrjob.options import _combiners from mrjob.options import _deprecated_aliases from mrjob.options import CLEANUP_CHOICES from mrjob.parse import is_uri from mrjob.parse import to_uri from mrjob.py2 import PY2 from mrjob.py2 import string_types from mrjob.setup import WorkingDirManager from mrjob.setup import name_uniquely from mrjob.setup import parse_legacy_hash_path from mrjob.step import INPUT from mrjob.step import OUTPUT from mrjob.step import _is_spark_step_type from mrjob.step import _is_pyspark_step_type log = logging.getLogger(__name__) # use to detect globs and break into the part before and after the glob GLOB_RE = re.compile(r'^(.*?)([\[\*\?].*)$') # buffer for piping files into sort on Windows _BUFFER_SIZE = 4096 # jobconf options for implementing SORT_VALUES _SORT_VALUES_JOBCONF = { 'mapreduce.partition.keypartitioner.options': '-k1,1', 'stream.num.map.output.key.fields': 2 } # partitioner for sort_values _SORT_VALUES_PARTITIONER = \ 'org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner' class MRJobRunner(object): """Abstract base class for all runners""" # this class handles the basic runner framework, options and config files, # arguments to mrjobs, and setting up job working dirs and environments. 
# this will put files from setup scripts, py_files, and bootstrap_mrjob # into the job's working dir, but won't actually run/import them # # command lines to run substeps (including Spark) are handled by # mrjob.bin.MRJobBinRunner #: alias for this runner, used on the command line with ``-r`` alias = None # libjars is only here because the job can set it; might want to # handle this with a warning from the launcher instead OPT_NAMES = { 'bootstrap_mrjob', 'check_input_paths', 'cleanup', 'cleanup_on_failure', 'cmdenv', 'jobconf', 'label', 'libjars', 'local_tmp_dir', # no max_output_files because it doesn't go in self._opts 'owner', 'py_files', 'read_logs', 'setup', 'upload_archives', 'upload_dirs', 'upload_files' } # re-define this as a set of step types supported by your runner _STEP_TYPES = None ### methods to call from your batch script ### def __init__(self, mr_job_script=None, conf_paths=None, extra_args=None, hadoop_input_format=None, hadoop_output_format=None, input_paths=None, output_dir=None, partitioner=None, sort_values=None, stdin=None, steps=None, step_output_dir=None, **opts): """All runners take the following keyword arguments: :type mr_job_script: str :param mr_job_script: the path of the ``.py`` file containing the :py:class:`~mrjob.job.MRJob`. If this is None, you won't actually be able to :py:meth:`run` the job, but other utilities (e.g. :py:meth:`ls`) will work. :type conf_paths: None or list :param conf_paths: List of config files to combine and use, or None to search for mrjob.conf in the default locations. :type extra_args: list of str :param extra_args: a list of extra cmd-line arguments to pass to the mr_job script. This is a hook to allow jobs to take additional arguments. :type hadoop_input_format: str :param hadoop_input_format: name of an optional Hadoop ``InputFormat`` class. Passed to Hadoop along with your first step with the ``-inputformat`` option. Note that if you write your own class, you'll need to include it in your own custom streaming jar (see :mrjob-opt:`hadoop_streaming_jar`). :type hadoop_output_format: str :param hadoop_output_format: name of an optional Hadoop ``OutputFormat`` class. Passed to Hadoop along with your first step with the ``-outputformat`` option. Note that if you write your own class, you'll need to include it in your own custom streaming jar (see :mrjob-opt:`hadoop_streaming_jar`). :type input_paths: list of str :param input_paths: Input files for your job. Supports globs and recursively walks directories (e.g. ``['data/common/', 'data/training/*.gz']``). If this is left blank, we'll read from stdin :type output_dir: str :param output_dir: An empty/non-existent directory where Hadoop should put the final output from the job. If you don't specify an output directory, we'll output into a subdirectory of this job's temporary directory. You can control this from the command line with ``--output-dir``. This option cannot be set from configuration files. If used with the hadoop runner, this path does not need to be fully qualified with ``hdfs://`` URIs because it's understood that it has to be on HDFS. :type partitioner: str :param partitioner: Optional name of a Hadoop partitioner class, e.g. ``'org.apache.hadoop.mapred.lib.HashPartitioner'``. Hadoop streaming will use this to determine how mapper output should be sorted and distributed to reducers. 
        :type sort_values: bool
        :param sort_values: if true, set partitioners and jobconf variables
                            so that reducers receive the values associated
                            with any key in sorted order (sorted by their
                            *encoded* value). Also known as secondary sort.
        :param stdin: an iterable (can be a ``BytesIO`` or even a list) to
                      use as stdin. This is a hook for testing; if you set
                      ``stdin`` via :py:meth:`~mrjob.job.MRJob.sandbox`,
                      it'll get passed through to the runner. If for some
                      reason your lines are missing newlines, we'll add them;
                      this makes it easier to write automated tests.
        :param steps: a list of descriptions of steps to run (see :doc:`step`
                      for description formats)
        :type step_output_dir: str
        :param step_output_dir: An empty/non-existent directory where Hadoop
                                should put output from all steps other than
                                the last one (this only matters for multi-step
                                jobs). Currently ignored by local runners.
        """
        self._ran_job = False

        # opts are made from:
        #
        # empty defaults (everything set to None)
        # runner-specific defaults
        # opts from config file(s)
        # opts from command line
        self._opts = self._combine_confs(
            [(None, {key: None for key in self.OPT_NAMES})] +
            [(None, self._default_opts())] +
            load_opts_from_mrjob_confs(self.alias, conf_paths) +
            [('the command line', opts)]
        )

        log.debug('Active configuration:')
        log.debug(pprint.pformat({
            opt_key: self._obfuscate_opt(opt_key, opt_value)
            for opt_key, opt_value in self._opts.items()
        }))

        self._fs = None

        # a local tmp directory that will be cleaned up when we're done
        # access/make this using self._get_local_tmp_dir()
        self._local_tmp_dir = None

        if self._emulate_archives_on_spark():
            # keep Spark from auto-uncompressing tarballs
            archive_file_suffix = '.file'
        else:
            # otherwise, leave as-is, so that --archive will
            # work properly
            archive_file_suffix = ''

        self._working_dir_mgr = WorkingDirManager(
            archive_file_suffix=archive_file_suffix)

        # mapping from dir to path for corresponding archive. we pick
        # paths during init(), but don't actually create the archives
        # until self._create_dir_archives() is called
        self._dir_to_archive_path = {}
        # dir archive names (the filename minus ".tar.gz") already taken
        self._dir_archive_names_taken = set()
        # set of dir_archives that have actually been created
        self._dir_archives_created = set()

        # set this to an :py:class:`~mrjob.setup.UploadDirManager` in
        # runners that upload files to HDFS, S3, etc.
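        #
        # e.g. a cloud runner might point it somewhere like this (the
        # bucket and prefix below are purely illustrative)::
        #
        #   self._upload_mgr = UploadDirManager('s3://bucket/tmp/job/files/')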
        #
        # this manager should not handle files belonging to
        # self._working_dir_mgr,
        # which, if they are uploaded, will go into self._wd_upload_dir()
        self._upload_mgr = None

        self._script_path = mr_job_script
        if self._script_path:
            self._working_dir_mgr.add('file', self._script_path)

        # give this job a unique name
        self._job_key = self._make_unique_job_key()

        # extra args to our job
        self._extra_args = list(extra_args) if extra_args else []

        for extra_arg in self._extra_args:
            if isinstance(extra_arg, dict):
                if extra_arg.get('type') != 'file':
                    raise NotImplementedError
                self._working_dir_mgr.add(**extra_arg)

        # set up uploading
        for hash_path in self._opts['upload_files']:
            uf = parse_legacy_hash_path('file', hash_path,
                                        must_name='upload_files')
            self._working_dir_mgr.add(**uf)

        for hash_path in self._opts['upload_archives']:
            ua = parse_legacy_hash_path('archive', hash_path,
                                        must_name='upload_archives')
            self._working_dir_mgr.add(**ua)

        for hash_path in self._opts['upload_dirs']:
            # pick name based on directory path
            ud = parse_legacy_hash_path('dir', hash_path,
                                        must_name='upload_archives')
            # but feed working_dir_mgr the archive's path
            archive_path = self._dir_archive_path(ud['path'])
            self._working_dir_mgr.add(
                'archive', archive_path, name=ud['name'])

        # Where to read input from (log files, etc.)
        self._input_paths = input_paths or ['-']  # by default read from stdin
        if PY2:
            self._stdin = stdin or sys.stdin
        else:
            self._stdin = stdin or sys.stdin.buffer
        self._stdin_path = None  # temp file containing dump from stdin

        # where to keep the input manifest
        self._input_manifest_path = None

        # store output_dir
        self._output_dir = output_dir

        # store partitioner
        self._partitioner = partitioner

        # store sort_values
        self._sort_values = sort_values

        # store step_output_dir
        self._step_output_dir = step_output_dir

        # store hadoop input and output formats
        self._hadoop_input_format = hadoop_input_format
        self._hadoop_output_format = hadoop_output_format

        # check and store *steps*
        self._steps = []
        if steps:
            self._check_steps(steps)
            self._steps = copy.deepcopy(steps)

        # this variable marks whether a cleanup has happened and this runner's
        # output stream is no longer available.
        self._closed = False

    ### Options ###

    @classmethod
    def _default_opts(cls):
        try:
            owner = getpass.getuser()
        except:
            owner = None

        return dict(
            check_input_paths=True,
            cleanup=['ALL'],
            cleanup_on_failure=['NONE'],
            owner=owner,
        )

    def _combine_confs(self, source_and_opt_list):
        """Combine several opt dictionaries into one.

        *source_and_opt_list* is a list of tuples of *source*, *opts*
        where *opts* is a dictionary and *source* is either None or a
        description of where the opts came from (usually a path).

        Only override this if you need truly fine-grained control,
        including knowledge of the options' source.
        """
        opt_list = [
            self._fix_opts(opts, source)
            for source, opts in source_and_opt_list
        ]

        return self._combine_opts(opt_list)

    def _combine_opts(self, opt_list):
        """Combine several opt dictionaries into one.

        *opt_list* is a list of dictionaries containing validated options.

        Override this if you need to base options off the values of
        other options, but don't need to issue warnings etc. about the
        options' source.
        """
        return combine_opts(self._opt_combiners(), *opt_list)

    def _opt_combiners(self):
        """A dictionary mapping opt name to combiner function. This
        won't necessarily include every opt name (we default to
        :py:func:`~mrjob.conf.combine_value`).
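
        A subclass could (hypothetically) register an extra combiner like
        this, assuming it defines a dict-valued ``extra_env`` opt and
        imports :py:func:`~mrjob.conf.combine_dicts`::

            def _opt_combiners(self):
                combiners = super(MyRunner, self)._opt_combiners()
                combiners['extra_env'] = combine_dicts
                return combiners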
""" return _combiners(self.OPT_NAMES) def _fix_opts(self, opts, source=None): """Take an options dictionary, and either return a sanitized version of it, or raise an exception. *source* is either a string describing where the opts came from or None. This ensures that opt dictionaries are really dictionaries and handles deprecated options. """ if source is None: source = 'defaults' # defaults shouldn't trigger warnings if not isinstance(opts, dict): raise TypeError( 'options for %s (from %s) must be a dict' % (self.alias, source)) deprecated_aliases = _deprecated_aliases(self.OPT_NAMES) results = {} for k, v in sorted(opts.items()): # rewrite deprecated aliases if k in deprecated_aliases: if v is None: # don't care continue aliased_opt = deprecated_aliases log.warning('Deprecated option %s (from %s) has been renamed' ' to %s and will be removed in v0.7.0' % ( k, source, aliased_opt)) if opts.get(aliased_opt) is not None: return # don't overwrite non-aliased opt k = aliased_opt if k in self.OPT_NAMES: if v is None: fixed_v = None elif isinstance(v, ClearedValue): # _fix_opt() doesn't need to know about !clear (see #2102) fixed_v = ClearedValue(self._fix_opt(k, v.value, source)) else: fixed_v = self._fix_opt(k, v, source) results[k] = fixed_v elif v: log.warning('Unexpected option %s (from %s)' % (k, source)) return results def _fix_opt(self, opt_key, opt_value, source): """Fix a single option, returning its correct value or raising an exception. This is not called for options that are ``None``. This currently handles cleanup opts. Override this if you require additional opt validation or cleanup. """ if opt_key in ('cleanup', 'cleanup_on_failure'): return self._fix_cleanup_opt(opt_key, opt_value, source) else: return opt_value def _fix_cleanup_opt(self, opt_key, opt_value, source): """Fix a cleanup option, or raise ValueError.""" if isinstance(opt_value, string_types): opt_value = [opt_value] if 'NONE' in opt_value and len(set(opt_value)) > 1: raise ValueError( 'Cannot clean up both nothing and something!' ' (%s option from %s)' % (opt_key, source)) for cleanup_type in opt_value: if cleanup_type not in CLEANUP_CHOICES: raise ValueError( '%s must be one of %s, not %s (from %s)' % ( opt_key, ', '.join(CLEANUP_CHOICES), opt_value, source)) return opt_value def _obfuscate_opt(self, opt_key, opt_value): """Return value of opt to show in debug printout. Used to obfuscate credentials, etc.""" return opt_value ### Filesystem object ### @property def fs(self): """:py:class:`~mrjob.fs.base.Filesystem` object for the local filesystem. """ if self._fs is None: # wrap LocalFilesystem in LocalFilesystem to get IOError # on URIs (see #1185) self._fs = CompositeFilesystem() self._fs.add_fs('local', LocalFilesystem()) return self._fs ### Running the job and parsing output ### def run(self): """Run the job, and block until it finishes. Raise :py:class:`~mrjob.step.StepFailedException` if there are any problems (except on :py:class:`~mrjob.inline.InlineMRJobRunner`, where we raise the actual exception that caused the step to fail). 
""" if self._ran_job: raise ValueError('Job already ran!') if self._num_steps() == 0: raise ValueError('Job has no steps!') self._create_dir_archives() # TODO: no point in checking input paths if we're going to # make a manifest out of them self._check_input_paths() self._add_input_files_for_upload() self._create_input_manifest_if_needed() self._run() self._ran_job = True last_step = self._get_steps()[-1] # only print this message if the last step uses our output dir if 'args' not in last_step or OUTPUT in last_step['args']: log.info('job output is in %s' % self._output_dir) def cat_output(self): """Stream the job's output, as a stream of ``bytes``. If there are multiple output files, there will be an empty bytestring (``b''``) between them. Like Hadoop input formats, we ignore files and subdirectories whose names start with ``"_"`` or ``"."`` (e.g. ``_SUCCESS``, ``_logs/``, ``.part-00000.crc``. .. versionchanged:: 0.6.8 Ignore file/dirnames starting with ``"."`` as well as ``"_"``. """ output_dir = self.get_output_dir() if output_dir is None: raise ValueError('Run the job before streaming output') if self._closed is True: log.warning( 'WARNING! Trying to stream output from a closed runner, output' ' will probably be empty.') log.info('Streaming final output from %s...' % output_dir) def split_path(path): while True: base, name = os.path.split(path) # no more elements if not name: break yield name path = base def ls_output(): for filename in self.fs.ls(output_dir): subpath = filename[len(output_dir):] # Hadoop ignores files and dirs inside the output dir # whose names start with '_' or '.'. See #1337. if not (any(name[0] in '_.' for name in split_path(subpath))): yield filename for i, filename in enumerate(ls_output()): if i > 0: yield b'' # EOF of previous file for chunk in self.fs._cat_file(filename): yield chunk def _cleanup_mode(self, mode=None): """Actual cleanup action to take based on various options""" if self._script_path and not self._ran_job: return mode or self._opts['cleanup_on_failure'] else: return mode or self._opts['cleanup'] def _cleanup_cloud_tmp(self): """Cleanup any files/directories on cloud storage (e.g. S3) we created while running this job. Should be safe to run this at any time, or multiple times. """ pass # only EMR runner does this def _cleanup_hadoop_tmp(self): """Cleanup any files/directories on HDFS we created while running this job. Should be safe to run this at any time, or multiple times. """ pass # only Hadoop runner does this def _cleanup_local_tmp(self): """Cleanup any files/directories on the local machine we created while running this job. Should be safe to run this at any time, or multiple times. This particular function removes any local tmp directories added to the list self._local_tmp_dirs This won't remove output_dir if it's outside of our tmp dir. """ if self._local_tmp_dir: log.info('Removing temp directory %s...' % self._local_tmp_dir) try: rmtree(self._local_tmp_dir) except OSError as e: log.exception(e) self._local_tmp_dir = None def _cleanup_cluster(self): """Terminate the cluster if there is one.""" pass # this only happens on EMR def _cleanup_logs(self): """Cleanup any log files that are created as a side-effect of the job. """ pass # this only happens on EMR def _cleanup_job(self): """Stop any jobs that we created that are still running.""" pass # currently disabled (see #1241) def cleanup(self, mode=None): """Clean up running jobs, temp files, and logs, subject to the *cleanup* option passed to the constructor. 

        If you create your runner in a ``with`` block,
        :py:meth:`cleanup` will be called automatically::

            with mr_job.make_runner() as runner:
                ...
            # cleanup() called automatically here

        :param mode: override *cleanup* passed into the constructor. Should
                     be a list of strings from
                     :py:data:`~mrjob.options.CLEANUP_CHOICES`
        """
        mode = self._cleanup_mode(mode)

        def mode_has(*args):
            return any((choice in mode) for choice in args)

        if self._script_path and not self._ran_job:
            if mode_has('CLUSTER', 'ALL'):
                self._cleanup_cluster()

            if mode_has('JOB', 'ALL'):
                self._cleanup_job()

        if mode_has('ALL', 'TMP', 'CLOUD_TMP'):
            self._cleanup_cloud_tmp()

        if mode_has('ALL', 'TMP', 'HADOOP_TMP'):
            self._cleanup_hadoop_tmp()

        if mode_has('ALL', 'TMP', 'LOCAL_TMP'):
            self._cleanup_local_tmp()

        if mode_has('ALL', 'LOGS'):
            self._cleanup_logs()

        self._closed = True

    def counters(self):
        """Get counters associated with this run in this form::

            [{'group name': {'counter1': 1, 'counter2': 2}},
             {'group name': ...}]

        The list contains an entry for every step of the current job.
        """
        raise NotImplementedError

    ### hooks for the with statement ###

    def __enter__(self):
        """Don't do anything special at start of with block"""
        return self

    def __exit__(self, type, value, traceback):
        """Call self.cleanup() at end of with block."""
        self.cleanup()

    ### more runner information ###

    def get_opts(self):
        """Get options set for this runner, as a dict."""
        log.warning('get_opts() is deprecated and will be removed in v0.7.0')
        return copy.deepcopy(self._opts)

    def get_job_key(self):
        """Get the unique key for the job run by this runner.
        This has the format ``label.owner.date.time.microseconds``
        """
        return self._job_key

    def get_output_dir(self):
        """Find the directory containing the job output. If the job hasn't
        run yet, returns None"""
        if self._script_path and not self._ran_job:
            return None

        return self._output_dir

    ### other methods you need to implement in your subclass ###

    def get_hadoop_version(self):
        """Return the version number of the Hadoop environment as a string if
        Hadoop is being used or simulated. Return None if not applicable.

        :py:class:`~mrjob.emr.EMRJobRunner` infers this from the cluster.
        :py:class:`~mrjob.hadoop.HadoopJobRunner` gets this from
        ``hadoop version``. :py:class:`~mrjob.local.LocalMRJobRunner` has an
        additional `hadoop_version` option to specify which version it
        simulates.
        :py:class:`~mrjob.inline.InlineMRJobRunner` does not simulate Hadoop
        at all.
        """
        return None

    # you'll probably want to add your own __init__() and cleanup() as well

    def _run(self):
        """Run the job."""
        raise NotImplementedError

    ### internal utilities for implementing MRJobRunners ###

    def _get_local_tmp_dir(self):
        """Create a tmp directory on the local filesystem that will be
        cleaned up by self.cleanup()"""
        if not self._local_tmp_dir:
            tmp_dir = (self._opts['local_tmp_dir'] or
                       tempfile.gettempdir())

            path = os.path.join(tmp_dir, self._job_key)
            log.info('Creating temp directory %s' % path)

            if os.path.isdir(path):
                rmtree(path)

            os.makedirs(path)

            self._local_tmp_dir = path

        return self._local_tmp_dir

    def _make_unique_job_key(self, label=None, owner=None):
        """Come up with a useful unique ID for this job.

        Optionally, you can specify a custom label or owner (otherwise we
        use :py:meth:`_label` and :py:meth:`_owner`).

        We use this to choose the output directory, etc. for the job.
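
        A key looks something like this (the label, owner, and timestamp
        below are purely illustrative)::

            mr_word_count.someuser.20200720.012345.678901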
""" if label is None: label = self._label() if owner is None: owner = self._owner() now = datetime.datetime.utcnow() return '%s.%s.%s.%06d' % ( label, owner, now.strftime('%Y%m%d.%H%M%S'), now.microsecond) def _label(self): """Return *label* opt, or if not set, the name of the file containing the MRJob, minus extension, or if none, ``'no_script'``""" if self._opts['label']: return self._opts['label'] elif self._script_path: return os.path.basename(self._script_path).split('.')[0] else: return 'no_script' def _owner(self): """Return *owner* opt (which defaults to :py:func:`getpass.getuser`), or ``'no_user'`` if not set.""" if self._opts['owner']: # owner opt defaults to getpass.getuser() return self._opts['owner'] else: return 'no_user' def _get_steps(self): """Returns ``self._steps``. """ # TODO: remove this return self._steps def _check_steps(self, steps): """Look at the step definition (*steps*). If it is not supported by the runner, raise :py:class:`NotImplementedError`. If it is not supported by mrjob, raise :py:class:`ValueError`. """ if not self._STEP_TYPES: # use __class__.__name__ because only MRJobRunner would # trigger this raise NotImplementedError( '%s cannot run steps!' % self.__class__.__name__) for step_num, step in enumerate(steps): self._check_step(step, step_num) def _check_step(self, step, step_num): """Raise an exception if the given step is invalid (:py:class:`ValueError`) or not handled by this runner (:py:class:`NotImplementedError`). By default, we check that *step* has a support step type, only uses an input manifest if it's the first step, and that :py:attr:`_script_path` exists if necessary. You can re-define this in your subclass. """ if step.get('type') not in self._STEP_TYPES: raise NotImplementedError( 'step %d has type %r, but %s runner only supports:' ' %s' % (step_num, step.get('type'), self.alias, ', '.join(sorted(self._STEP_TYPES)))) if step.get('input_manifest') and step_num != 0: raise ValueError( 'step %d may not take an input manifest (only' ' first step can' % step_num) # some step types assume a MRJob script if not self._script_path: if step['type'] == 'spark': raise ValueError( "SparkStep (step %d) can't run without a MRJob script" " (try SparkScriptStep instead)" % step_num) elif step['type'] == 'streaming': for mrc in ('mapper', 'combiner', 'reducer'): if not step.get(mrc): continue substep = step[mrc] if substep['type'] == 'script': raise ValueError( "%s (step %d) can't run without a MRJob" " script" % (mrc, step_num)) def _get_step(self, step_num): """Get a single step (calls :py:meth:`_get_steps`).""" return self._get_steps()[step_num] def _num_steps(self): """Get the number of steps (calls :py:meth:`get_steps`).""" return len(self._get_steps()) def _uses_input_manifest(self): """Does the first step take an input manifest?""" return bool(self._get_step(0).get('input_manifest')) def _has_hadoop_streaming_steps(self): """Are any of our steps Hadoop Streaming steps?""" return any(step['type'] == 'streaming' for step in self._get_steps()) def _has_spark_steps(self): """Are any of our steps Spark steps? (e.g. spark, spark_jar, spark_script) Generally used to determine if we need to install Spark on a cluster. """ return any(self._step_type_uses_spark(step['type']) for step in self._get_steps()) def _has_pyspark_steps(self): """Do any of our steps involve running Python on Spark? Includes spark and spark_script types, but not spark_jar. Generally used to tell if we need a Spark setup script. 
""" return any(self._step_type_uses_pyspark(step['type']) for step in self._get_steps()) def _step_type_uses_spark(self, step_type): """Does this step run on Spark? (This is re-defined in the Spark runner to include streaming steps, and used by mrjob.logs.mixin) """ return _is_spark_step_type(step_type) def _step_type_uses_pyspark(self, step_type): """Does this step involve running Python on Spark? (This is re-defined in the Spark runner to include streaming steps, and used by mrjob.logs.mixin) """ return _is_pyspark_step_type(step_type) def _spark_master(self): return self._opts.get('spark_master') or 'local[*]' def _spark_deploy_mode(self): return self._opts.get('spark_deploy_mode') or 'client' def _spark_driver_has_own_wd(self): """Does the spark driver have a working directory different from the one *spark-submit* was run in? (Only true in cluster mode.) """ return (self._spark_deploy_mode() == 'cluster' and self._spark_executors_have_own_wd()) def _spark_executors_have_own_wd(self): """Do spark executors have a working directory different from the one *spark-submit* was run in? (True on everything but local.) """ # note: local-cluster[...] master does in fact have working dirs return self._spark_master().split('[')[0] != 'local' def _emulate_archives_on_spark(self): """True if spark-submit's --archives doesn't work on the given Spark master, which means we'll need to emulate archives in setup scripts. """ return self._spark_master() != 'yarn' def _args_for_task(self, step_num, mrc): return [ '--step-num=%d' % step_num, '--%s' % mrc, ] + self._mr_job_extra_args() def _mr_job_extra_args(self, local=False): """Return arguments to add to every invocation of MRJob. :type local: boolean :param local: if this is True, use files' local paths rather than the path they'll have inside Hadoop streaming """ result = [] for extra_arg in self._extra_args: if isinstance(extra_arg, dict): if local: result.append(extra_arg['path']) else: result.append(self._working_dir_mgr.name(**extra_arg)) else: result.append(extra_arg) return result def _spark_script_args(self, step_num, last_step_num=None): """A list of args to the spark script/jar/MRJob, used by _args_for_spark_step(). *last_step_num* is only used by the Spark runner, where multiple streaming steps are run in a single Spark job.""" step = self._get_step(step_num) if step['type'] == 'spark': # if on local[*] master, keep file upload args as-is (see #2031) local = not self._spark_executors_have_own_wd() args = ( [ '--step-num=%d' % step_num, '--spark', ] + self._mr_job_extra_args(local=local) + [ INPUT, OUTPUT, ] ) elif step['type'] in ('spark_jar', 'spark_script'): args = step['args'] else: raise TypeError('Bad step type: %r' % step['type']) return self._interpolate_step_args(args, step_num) def _interpolate_step_args(self, args, step_num): """Replace :py:data:`~mrjob.step.INPUT` and :py:data:`~mrjob.step.OUTPUT` in arguments to a jar or Spark step. """ result = [] for arg in args: if arg == INPUT: result.append( ','.join(self._step_input_uris(step_num))) elif arg == OUTPUT: result.append( self._step_output_uri(step_num)) else: result.append(arg) return result def _dir_archive_path(self, dir_path): """Assign a path for the archive of *dir_path* but don't actually create anything.""" if dir_path not in self._dir_to_archive_path: # we can check local paths now if not (is_uri(dir_path) or os.path.isdir(dir_path)): raise OSError('%s is not a directory!' 
                              % dir_path)

            name = name_uniquely(
                dir_path, names_taken=self._dir_archive_names_taken)
            self._dir_archive_names_taken.add(name)

            self._dir_to_archive_path[dir_path] = os.path.join(
                self._get_local_tmp_dir(), 'archives', name + '.tar.gz')

        return self._dir_to_archive_path[dir_path]

    def _create_dir_archives(self):
        """Call this to create all dir archives"""
        for dir_path in sorted(set(self._dir_to_archive_path)):
            self._create_dir_archive(dir_path)

    def _create_dir_archive(self, dir_path):
        """Helper for :py:meth:`_create_dir_archives`"""
        if not self.fs.exists(dir_path):
            raise OSError('%s does not exist' % dir_path)

        tar_gz_path = self._dir_archive_path(dir_path)

        if tar_gz_path in self._dir_archives_created:
            return  # already created

        if not os.path.isdir(os.path.dirname(tar_gz_path)):
            os.makedirs(os.path.dirname(tar_gz_path))

        # for remote files
        tmp_download_path = os.path.join(
            self._get_local_tmp_dir(), 'tmp-download')

        log.info('Archiving %s -> %s' % (dir_path, tar_gz_path))

        with tarfile.open(tar_gz_path, mode='w:gz') as tar_gz:
            for path in self.fs.ls(dir_path):
                # fs.ls() only lists files
                if path == dir_path:
                    raise OSError('%s is a file, not a directory!' % dir_path)

                # TODO: do we need this?
                if os.path.realpath(path) == os.path.realpath(tar_gz_path):
                    raise OSError(
                        'attempted to archive %s into itself!' % tar_gz_path)

                if is_uri(path):
                    path_in_tar_gz = path[len(dir_path):].lstrip('/')

                    log.info('  downloading %s -> %s' % (
                        path, tmp_download_path))
                    with open(tmp_download_path, 'wb') as f:
                        for chunk in self.fs.cat(path):
                            f.write(chunk)
                    local_path = tmp_download_path
                else:
                    path_in_tar_gz = path[len(dir_path):].lstrip(os.sep)
                    local_path = path

                log.debug('  adding %s to %s' % (path, tar_gz_path))
                tar_gz.add(local_path, path_in_tar_gz, recursive=False)

        self._dir_archives_created.add(tar_gz_path)

    def _bootstrap_mrjob(self):
        """Should we bootstrap mrjob?"""
        if self._opts['bootstrap_mrjob'] is None:
            return True
        else:
            return bool(self._opts['bootstrap_mrjob'])

    def _get_input_paths(self):
        """Get the paths to input files, dumping STDIN to a local
        file if need be."""
        if self._input_manifest_path:
            return [self._input_manifest_path]

        if '-' in self._input_paths:
            if self._stdin_path is None:
                # prompt user, so they don't think the process has stalled
                log.info('reading from STDIN')

                stdin_path = os.path.join(self._get_local_tmp_dir(), 'STDIN')
                log.debug('dumping stdin to local file %s' % stdin_path)
                with open(stdin_path, 'wb') as stdin_file:
                    for line in self._stdin:
                        # catch missing newlines (often happens with test
                        # data)
                        if not line.endswith(b'\n'):
                            line += b'\n'
                        stdin_file.write(line)

                self._stdin_path = stdin_path

        return [self._stdin_path if p == '-' else p
                for p in self._input_paths]

    def _create_input_manifest_if_needed(self):
        """Create a file with a list of URIs of input files."""
        if self._input_manifest_path or not self._uses_input_manifest():
            return

        uris = []

        log.info('finding input files to add to manifest...')

        for path in self._get_input_paths():
            log.debug('  in %s' % path)
            if is_uri(path):
                # URIs might be globs
                for uri in self.fs.ls(path):
                    uris.append(uri)
            else:
                # local paths are expected to be single files
                # (shell would resolve globs)
                if self._upload_mgr:
                    uris.append(self._upload_mgr.uri(path))
                else:
                    # just make sure job can find files from its working dir
                    uris.append(os.path.abspath(path))

        log.info('found %d input files' % len(uris))

        path = os.path.join(self._get_local_tmp_dir(), 'input-manifest.txt')
        self._write_script(uris, path, 'input manifest')

        self._input_manifest_path = path

        if self._upload_mgr:
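            # the manifest itself needs to be uploaded too, so the job
            # can read it from HDFS/S3/etc.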
self._upload_mgr.add(self._input_manifest_path) def _check_input_paths(self): """Check that input exists prior to running the job, if the `check_input_paths` option is true.""" if not self._opts['check_input_paths']: return for path in self._input_paths: self._check_input_path(path) def _check_input_path(self, path): """Raise :py:class:`IOError` if the given input does not exist or is otherwise invalid. Override this to provide custom check behavior.""" if path == '-': return # STDIN always exists if not self.fs.can_handle_path(path): return # no way to check (e.g. non-S3 URIs on EMR) if not self.fs.exists(path): raise IOError( 'Input path %s does not exist!' % (path,)) def _add_input_files_for_upload(self): """If there is an upload manager, add input files to it.""" if self._upload_mgr: for path in self._get_input_paths(): self._upload_mgr.add(path) def _upload_local_files(self): self._copy_files_to_wd_mirror() if self._upload_mgr: self.fs.mkdir(self._upload_mgr.prefix) log.info('Copying other local files to %s' % self._upload_mgr.prefix) for src_path, uri in self._upload_mgr.path_to_uri().items(): log.debug(' %s -> %s' % (src_path, uri)) self.fs.put(src_path, uri) def _wd_mirror(self): """A directory to upload files belonging to :py:attr:`_working_dir_mgr`. This will be a subdir of ``self._upload_mgr.prefix`, if it exists, and otherwise will be ``None``.""" if self._upload_mgr and is_uri(self._upload_mgr.prefix): return posixpath.join(self._upload_mgr.prefix, 'wd') elif (self._has_spark_steps() and self._spark_executors_have_own_wd()): return os.path.join(self._get_local_tmp_dir(), 'wd-mirror') else: return None def _wd_filenames_must_match(self): """When we tell Hadoop/Spark to put files in the working directory, must they have the same names as the files in the working dir? This basically only happens with Spark on non-YARN masters. YARN/Hadoop allows you to specify a name for each file (``path#name_in_wd``). """ return self._has_spark_steps() and self._spark_master() != 'yarn' def _dest_in_wd_mirror(self, path, name): """Return the URI of where to upload *path* so it can appear in the working dir as *name*, or ``None`` if it doesn't need to be uploaded. """ dest_dir = self._wd_mirror() if not dest_dir: return None # the only reason to re-upload a URI is if it has the wrong name # # similarly, the only point of a local working dir mirror is # to rename things if (is_uri(path) or not is_uri(dest_dir)) and ( posixpath.basename(path) == name or not self._wd_filenames_must_match()): return None return posixpath.join(dest_dir, name) def _copy_file_to_wd_mirror(self, path, name): """Upload/copy *path* to the appropriate place in the working dir mirror, if necessary. We don't track whether something has already been uploaded. """ dest = self._dest_in_wd_mirror(path, name) if not dest: return if is_uri(path): # file is visible to non-YARN Spark, but has the wrong name, so # download and re-upload it wd_tmp = os.path.join(self._get_local_tmp_dir(), 'wd-mirror') self.fs.mkdir(wd_tmp) tmp_path = os.path.join(wd_tmp, name) log.debug(' %s <- %s' % (tmp_path, path)) try: with open(tmp_path, 'wb') as tmp_f: for chunk in self.fs.cat(path): tmp_f.write(chunk) log.debug(' %s -> %s' % (tmp_path, dest)) self.fs.put(tmp_path, dest) finally: os.remove(tmp_path) else: # upload it log.debug(' %s -> %s' % (path, dest)) self.fs.put(path, dest) def _copy_files_to_wd_mirror(self): """Upload working dir files to the working dir mirror, if necessary. 
This does not handle archives, which we always rename with hash paths anyhow (see #2059). """ wd_mirror = self._wd_mirror() if not wd_mirror: return self.fs.mkdir(wd_mirror) log.info('%s working dir files to %s...' % ('uploading' if is_uri(wd_mirror) else 'copying', wd_mirror)) for name, path in sorted( self._working_dir_mgr.name_to_path('file').items()): self._copy_file_to_wd_mirror(path, name) for name, path in sorted( self._working_dir_mgr.name_to_path('archive_file').items()): self._copy_file_to_wd_mirror(path, name) def _upload_part_size(self): """Part size for uploads, in bytes, or ``None``, from :mrjob-opt:`cloud_part_size_mb`""" if self._opts.get('cloud_part_size_mb'): return int(self._opts['cloud_part_size_mb'] * 1024 * 1024) else: return None def _intermediate_output_dir(self, step_num, local=False): """A directory for intermediate output for the given step number.""" join = os.path.join if local else posixpath.join return join( self._step_output_dir or self._default_step_output_dir(), '%04d' % step_num) def _default_step_output_dir(self): """Where to put output for steps other than the last one, if not specified by the *output_dir* constructor keyword. Usually you want this to be on HDFS (most efficient). Define this in your runner subclass. """ raise NotImplementedError def _step_input_uris(self, step_num): """A list of URIs to use as input for the given step. For all except the first step, this list will have a single item (a directory).""" if step_num == 0: return [self._upload_mgr.uri(path) if self._upload_mgr else to_uri(path) for path in self._get_input_paths()] else: return [to_uri(self._intermediate_output_dir(step_num - 1))] def _step_output_uri(self, step_num): """URI to use as output for the given step. This is either an intermediate dir (see :py:meth:`intermediate_output_uri`) or ``self._output_dir`` for the final step.""" if step_num == len(self._get_steps()) - 1: return to_uri(self._output_dir) else: return to_uri(self._intermediate_output_dir(step_num)) def _cmdenv(self): """Return a copy of ``self._opts['cmdenv']``. This exists so we can instrument cmdenv in runner subclasses.""" return dict(self._opts['cmdenv']) def _jobconf_for_step(self, step_num): """Get the jobconf dictionary, optionally including step-specific jobconf info. Also translate jobconfs to the current Hadoop version, if necessary. """ step = self._get_step(step_num) # _sort_values_jobconf() isn't relevant to Spark, # but it doesn't do any harm either jobconf = combine_jobconfs(self._sort_values_jobconf(), self._opts['jobconf'], step.get('jobconf')) # if user is using the wrong jobconfs, add in the correct ones # and log a warning hadoop_version = self.get_hadoop_version() if hadoop_version: jobconf = translate_jobconf_dict(jobconf, hadoop_version) return jobconf def _sort_values_jobconf(self): """Jobconf dictionary to enable sorting by value. 
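
        With *sort_values* enabled, on Hadoop 2 this comes out along the
        lines of::

            {'mapreduce.partition.keypartitioner.options': '-k1,1',
             'stream.num.map.output.key.fields': 2}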
""" if not self._sort_values: return {} # translate _SORT_VALUES_JOBCONF to the correct Hadoop version, # without logging a warning hadoop_version = self.get_hadoop_version() jobconf = {} for k, v in _SORT_VALUES_JOBCONF.items(): if hadoop_version: jobconf[translate_jobconf(k, hadoop_version)] = v else: for j in translate_jobconf_for_all_versions(k): jobconf[j] = v return jobconf def _sort_values_partitioner(self): """Partitioner to use with *sort_values* keyword to the constructor.""" if self._sort_values: return _SORT_VALUES_PARTITIONER else: return None def _upload_args(self): # just upload every file and archive in the working dir manager return self._upload_args_helper('-files', None, '-archives', None) def _upload_args_helper( self, files_opt_str, files, archives_opt_str, archives, always_use_hash=True, emulate_archives=False): args = [] file_hash_paths = list( self._file_arg_hash_paths(files, always_use_hash=always_use_hash)) # if emulating --archives, upload archives with files (we'll unpack # them later with a setup script) if emulate_archives: file_hash_paths.extend( self._file_archive_hash_paths(archives)) # --files ... if file_hash_paths: args.append(files_opt_str) args.append(','.join(file_hash_paths)) if not emulate_archives: archive_hash_paths = list(self._archive_arg_hash_paths(archives)) # --archives ... if archive_hash_paths: args.append(archives_opt_str) args.append(','.join(archive_hash_paths)) return args def _file_arg_hash_paths(self, named_paths=None, always_use_hash=True): """Helper function for the *upload_args methods. The names of all arguments to ``-files`` (or ``--files`` on Spark). If *always_use_hash* is false, only use ``path#name`` syntax when the name is different. """ if named_paths is None: # just return every file managed by _working_dir_mgr named_paths = sorted( self._working_dir_mgr.name_to_path('file').items()) for name, path in named_paths: if not name: name = self._working_dir_mgr.name('file', path) uri = self._dest_in_wd_mirror(path, name) or path if not always_use_hash and _basename(uri) == name: yield uri else: yield '%s#%s' % (uri, name) def _file_archive_hash_paths(self, named_paths=None): """Helper function for the *upload_args methods. The names of archives to pass to the ``--files`` switch of ``spark-submit``, since we can't use ``--archives``. The names in *named_paths* should be of the archive destination (the 'archive' type in WorkingDirManager) not of the filename we're going to copy the archive to before unpacking it into its destination (the 'archive_file' type). """ if named_paths is None: named_paths = sorted( self._working_dir_mgr.name_to_path('archive').items()) for name, path in named_paths: if not name: name = self._working_dir_mgr.name('archive', path) archive_file_name = self._working_dir_mgr.name( 'archive_file', path) uri = self._dest_in_wd_mirror(path, archive_file_name) or path yield uri def _archive_arg_hash_paths(self, named_paths=None): """Helper function for the *upload_args methods. The names of all arguments to ``-archives`` (or ``--archives`` on Spark). 
""" # we always use path#name syntax, even on Spark, because unlike # with --files, Spark will either accept that syntax with --archives # (if we're on YARN) or ignore --archives completely (if we're on # any other Spark master) if named_paths is None: # just return every archive managed by _working_dir_mgr named_paths = sorted( self._working_dir_mgr.name_to_path('archive').items()) for name, path in named_paths: if not name: name = self._working_dir_mgr.name('archive', path) # archives are uploaded to the working dir mirror by the # name of the original archive file, not the dir it unpacks into archive_file_name = self._working_dir_mgr.name( 'archive_file', path) uri = self._dest_in_wd_mirror(path, archive_file_name) or path yield '%s#%s' % (uri, name) def _write_script(self, lines, path, description): """Write text of a setup script, input manifest, etc. to the given file. By default, this writes binary data. Redefine :py:meth:`write_lines` to use other line endings. :param lines: a list of lines as ``str`` :param path: path of file to write to :param description: what we're writing to, for debug messages """ log.debug('Writing %s to %s:' % (description, path)) for line in lines: log.debug(' ' + line) self._write_script_lines(lines, path) def _write_script_lines(self, lines, path): """Write text to the given file. By default, this writes binary data, but can be redefined to use local line endings.""" with open(path, 'wb') as f: for line in lines: f.write((line + '\n').encode('utf-8')) def _fix_env(env): """Convert environment dictionary to strings (Python 2.7 on Windows doesn't allow unicode).""" def _to_str(s): if isinstance(s, string_types) and not isinstance(s, str): return s.encode('utf_8') else: return s return dict((_to_str(k), _to_str(v)) for k, v in env.items()) def _blank_out_conflicting_opts(opt_list, opt_names, conflicting_opts=None): """Utility for :py:meth:`MRJobRunner._combine_opts()`: if multiple configs specify conflicting opts, blank them out in all but the last config (so, for example, the command line beats the config file). This returns a copy of *opt_list* """ conflicting_opts = set(conflicting_opts or ()) | set(opt_names) # copy opt_list so we can modify it opt_list = [dict(opts) for opts in opt_list] # blank out region/zone before the last config where they are set blank_out = False for opts in reversed(opt_list): if blank_out: for opt_name in opt_names: opts[opt_name] = None elif any(opts.get(opt_name) is not None for opt_name in conflicting_opts): blank_out = True return opt_list def _runner_class(alias): """Get the runner subclass corresponding to the given alias (importing code only as needed).""" if alias == 'dataproc': from mrjob.dataproc import DataprocJobRunner return DataprocJobRunner elif alias == 'emr': from mrjob.emr import EMRJobRunner return EMRJobRunner elif alias == 'hadoop': from mrjob.hadoop import HadoopJobRunner return HadoopJobRunner elif alias == 'inline': from mrjob.inline import InlineMRJobRunner return InlineMRJobRunner elif alias == 'local': from mrjob.local import LocalMRJobRunner return LocalMRJobRunner elif alias == 'spark': from mrjob.spark.runner import SparkMRJobRunner return SparkMRJobRunner else: raise ValueError('bad runner alias: %s' % alias) def _basename(path_or_uri): if is_uri(path_or_uri): return posixpath.basename(path_or_uri) else: return os.path.basename(path_or_uri)