# -*- coding: utf-8 -*-
# Copyright 2009-2017 Yelp and Contributors
# Copyright 2018-2019 Yelp
# Copyright 2020 Affirm, Inc. and Contributors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Abstract base class for all runners that execute binaries/scripts
(that is, everything but inline mode).
"""
import logging
import os
import os.path
import pipes
import re
import sys

from mrjob.py2 import PY2
from platform import python_implementation
from subprocess import Popen
from subprocess import PIPE

try:
    import pty
    pty  # quiet "redefinition of unused ..." warning from pyflakes
except ImportError:
    pty = None

try:
    import pyspark
    pyspark  # quiet "redefinition of unused ..." warning from pyflakes
except ImportError:
    pyspark = None

import mrjob.step
from mrjob.compat import translate_jobconf
from mrjob.conf import combine_cmds
from mrjob.conf import combine_dicts
from mrjob.logs.log4j import _parse_hadoop_log4j_records
from mrjob.logs.spark import _parse_spark_log
from mrjob.logs.step import _eio_to_eof
from mrjob.py2 import string_types
from mrjob.runner import MRJobRunner
from mrjob.setup import parse_setup_cmd
from mrjob.util import cmd_line
from mrjob.util import shlex_split
from mrjob.util import unique
from mrjob.util import which
from mrjob.util import zip_dir


log = logging.getLogger(__name__)


# no need to escape arguments that only include these characters
_HADOOP_SAFE_ARG_RE = re.compile(r'^[\w\./=-]*$')

# used to handle manifest files
_MANIFEST_INPUT_FORMAT = 'org.apache.hadoop.mapred.lib.NLineInputFormat'

# map archive file extensions to the command used to unarchive them
_EXT_TO_UNARCHIVE_CMD = {
    '.zip': 'unzip -o %(file)s -d %(dir)s',
    '.tar': 'mkdir %(dir)s; tar xf %(file)s -C %(dir)s',
    '.tar.gz': 'mkdir %(dir)s; tar xfz %(file)s -C %(dir)s',
    '.tgz': 'mkdir %(dir)s; tar xfz %(file)s -C %(dir)s',
}


def _unarchive_cmd(path):
    """Look up the unarchive command to use with the given file extension,
    or raise KeyError if there is no matching command."""
    for ext, unarchive_cmd in sorted(_EXT_TO_UNARCHIVE_CMD.items()):
        # use this so we can match e.g. mrjob-0.7.0.tar.gz
        if path.endswith(ext):
            return unarchive_cmd

    raise KeyError('unknown archive type: %s' % path)
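
# Illustrative example (not part of the original module), based on the table
# above: _unarchive_cmd('mrjob-0.7.0.tar.gz') returns
# 'mkdir %(dir)s; tar xfz %(file)s -C %(dir)s', ready to be filled in with
# quoted 'file' and 'dir' values, while _unarchive_cmd('data.rar') raises
# KeyError.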


class MRJobBinRunner(MRJobRunner):

    OPT_NAMES = MRJobRunner.OPT_NAMES | {
        'python_bin',
        'sh_bin',
        'spark_args',
        'spark_submit_bin',
        'task_python_bin',
    }

    def __init__(self, **kwargs):
        super(MRJobBinRunner, self).__init__(**kwargs)

        # where a zip file of the mrjob library is stored locally
        self._mrjob_zip_path = None

        # we'll create the setup wrapper scripts later
        self._setup_wrapper_script_path = None
        self._manifest_setup_script_path = None
        self._spark_python_wrapper_path = None

        # self._setup is a list of shell commands with path dicts
        # interleaved; see mrjob.setup.parse_setup_cmd() for details
        self._setup = [parse_setup_cmd(cmd) for cmd in self._opts['setup']]

        for cmd in self._setup:
            for token in cmd:
                if isinstance(token, dict):
                    # convert dir archives tokens to archives
                    if token['type'] == 'dir':
                        # feed the archive's path to self._working_dir_mgr
                        token['path'] = self._dir_archive_path(token['path'])
                        token['type'] = 'archive'

                    self._working_dir_mgr.add(**token)

        # warning: no setup scripts on Spark when no working dir
        if self._setup and self._has_pyspark_steps() and not(
                self._spark_executors_have_own_wd()):
            log.warning("setup commands aren't supported on Spark master %r" %
                        self._spark_master())

        # --py-files on Spark doesn't allow '#' (see #1375)
        if any('#' in path for path in self._opts['py_files']):
            raise ValueError("py_files cannot contain '#'")

        # Keep track of where the spark-submit binary is
        self._spark_submit_bin = self._opts['spark_submit_bin']

    @classmethod
    def _default_opts(cls):
        return combine_dicts(
            super(MRJobBinRunner, cls)._default_opts(),
            dict(
                read_logs=True,
            )
        )

    def _fix_opt(self, opt_key, opt_value, source):
        """Check sh_bin"""
        opt_value = super(MRJobBinRunner, self)._fix_opt(
            opt_key, opt_value, source)

        # check that sh_bin doesn't have too many args
        if opt_key == 'sh_bin':
            # opt_value is usually a string, combiner makes it a list of args
            sh_bin = combine_cmds(opt_value)

            # empty sh_bin just means to use the default, see #1926

            # make these hard requirements in v0.7.0?
            if len(sh_bin) > 1 and not os.path.isabs(sh_bin[0]):
                log.warning('sh_bin (from %s) should use an absolute path'
                            ' if you want it to take arguments' % source)
            elif len(sh_bin) > 2:
                log.warning('sh_bin (from %s) should not take more than one'
                            ' argument' % source)

        return opt_value

    ### python binary ###

    def _python_bin(self):
        """Python binary used for everything other than invoking the job.

        For running job tasks (e.g. ``--mapper``, ``--spark``), we use
        :py:meth:`_task_python_bin`, which can be set to a different value
        by setting :mrjob-opt:`task_python_bin`.

        Ways mrjob uses Python other than running tasks:
         * file locking in setup wrapper scripts
         * finding site-packages dir to bootstrap mrjob on clusters
         * invoking ``cat.py`` in local mode
         * the Python binary for Spark (``$PYSPARK_PYTHON``)
        """
        # python_bin isn't an option for inline runners
        return self._opts['python_bin'] or self._default_python_bin()

    def _task_python_bin(self):
        """Python binary used to invoke job with ``--mapper``,
        ``--reducer``, ``--spark``, etc."""
        return (self._opts['task_python_bin'] or
                self._python_bin())

    def _default_python_bin(self, local=False):
        """The default python command. If local is true, try to use
        sys.executable. Otherwise use 'python2.7' or 'python3' as appropriate.

        This returns a single-item list (because it's a command).
        """
        is_pypy = (python_implementation() == 'PyPy')

        if local and sys.executable:
            return [sys.executable]
        else:
            if PY2:
                return ['pypy'] if is_pypy else ['python2.7']
            else:
                return ['pypy3'] if is_pypy else ['python3']
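
    # A quick sketch of what _default_python_bin() returns (illustrative, not
    # part of the original source): on CPython 3 it returns ['python3'], on
    # PyPy 3 it returns ['pypy3'], and with local=True it prefers
    # [sys.executable] when that is set.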

    ### running MRJob scripts ###

    def _script_args_for_step(self, step_num, mrc, input_manifest=False):
        args = (self._task_python_bin() +
                [self._working_dir_mgr.name('file', self._script_path)] +
                self._args_for_task(step_num, mrc))

        if input_manifest and mrc == 'mapper':
            wrapper = self._manifest_setup_script_path
        elif self._setup_wrapper_script_path:
            wrapper = self._setup_wrapper_script_path
        else:
            return args

        return (self._sh_bin() + [
            self._working_dir_mgr.name('file', wrapper)] + args)

    def _substep_args(self, step_num, mrc):
        step = self._get_step(step_num)

        if step[mrc]['type'] == 'command':
            cmd = step[mrc]['command']

            # never wrap custom hadoop streaming commands in bash
            if isinstance(cmd, string_types):
                return shlex_split(cmd)
            else:
                return cmd

        elif step[mrc]['type'] == 'script':
            script_args = self._script_args_for_step(
                step_num, mrc, input_manifest=step.get('input_manifest'))

            if 'pre_filter' in step[mrc]:
                return self._sh_wrap(
                    '%s | %s' % (step[mrc]['pre_filter'],
                                 cmd_line(script_args)))
            else:
                return script_args
        else:
            raise ValueError("Invalid %s step %d: %r" % (
                mrc, step_num, step[mrc]))

    ### hadoop streaming ###

    def _render_substep(self, step_num, mrc):
        step = self._get_step(step_num)

        if mrc in step:
            # cmd_line() does things that shell is fine with but
            # Hadoop Streaming finds confusing.
            return _hadoop_cmd_line(self._substep_args(step_num, mrc))
        else:
            if mrc == 'mapper':
                return 'cat'
            else:
                return None

    def _hadoop_args_for_step(self, step_num):
        """Build a list of extra arguments to the hadoop binary.

        This handles *cmdenv*, *hadoop_extra_args*, *hadoop_input_format*,
        *hadoop_output_format*, *jobconf*, and *partitioner*.

        This doesn't handle input, output, mappers, reducers, or uploading
        files.
        """
        args = []

        # -libjars, -D
        args.extend(self._hadoop_generic_args_for_step(step_num))

        # hadoop_extra_args (if defined; it's not for sim runners)
        # this has to come after -D because it may include streaming-specific
        # args (see #1332).
        args.extend(self._opts.get('hadoop_extra_args', ()))

        # partitioner
        partitioner = self._partitioner or self._sort_values_partitioner()
        if partitioner:
            args.extend(['-partitioner', partitioner])

        # cmdenv
        for key, value in sorted(self._cmdenv().items()):
            args.append('-cmdenv')
            args.append('%s=%s' % (key, value))

        # hadoop_input_format
        if step_num == 0:
            if self._uses_input_manifest():
                args.extend(['-inputformat', _MANIFEST_INPUT_FORMAT])
            elif self._hadoop_input_format:
                args.extend(['-inputformat', self._hadoop_input_format])

        # hadoop_output_format
        if (step_num == self._num_steps() - 1 and self._hadoop_output_format):
            args.extend(['-outputformat', self._hadoop_output_format])

        return args

    def _hadoop_streaming_jar_args(self, step_num):
        """The arguments that come after ``hadoop jar <streaming jar path>``
        when running a Hadoop streaming job."""
        args = []

        # get command for each part of the job
        mapper, combiner, reducer = (
            self._hadoop_streaming_commands(step_num))

        # set up uploading from HDFS/cloud storage to the working dir
        args.extend(self._upload_args())

        # if no reducer, shut off reducer tasks. This has to come before
        # extra hadoop args, which could contain jar-specific args
        # (e.g. -outputformat). See #1331.
        #
        # might want to just integrate this into _hadoop_args_for_step?
        if not reducer:
            args.extend(['-D', ('%s=0' % translate_jobconf(
                'mapreduce.job.reduces', self.get_hadoop_version()))])

        # add extra Hadoop args next, since they may include Hadoop-specific
        # arguments that must come before job-specific args
        args.extend(self._hadoop_args_for_step(step_num))

        # set up input
        for input_uri in self._step_input_uris(step_num):
            args.extend(['-input', input_uri])

        # set up output
        args.append('-output')
        args.append(self._step_output_uri(step_num))

        args.append('-mapper')
        args.append(mapper)

        if combiner:
            args.append('-combiner')
            args.append(combiner)

        if reducer:
            args.append('-reducer')
            args.append(reducer)

        return args
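
    # Illustrative sketch (not from the original source) of the kind of
    # argument list _hadoop_streaming_jar_args() builds for a map-only step:
    #
    #   ['-files', '<URIs of job script and setup wrapper>',
    #    '-D', 'mapreduce.job.reduces=0',
    #    '-input', 'hdfs:///path/to/input',
    #    '-output', 'hdfs:///path/to/step-output',
    #    '-mapper', 'sh -ex setup-wrapper.sh python3 mr_your_job.py --mapper']
    #
    # The exact upload and -D switches depend on the runner's upload manager
    # and Hadoop version.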

    def _hadoop_streaming_commands(self, step_num):
        return (
            self._render_substep(step_num, 'mapper'),
            self._render_substep(step_num, 'combiner'),
            self._render_substep(step_num, 'reducer'),
        )

    def _hadoop_generic_args_for_step(self, step_num):
        """Arguments like -D and -libjars that apply to every Hadoop
        subcommand."""
        args = []

        # libjars (#198)
        libjar_paths = self._libjar_paths()
        if libjar_paths:
            args.extend(['-libjars', ','.join(libjar_paths)])

        # jobconf (-D)
        jobconf = self._jobconf_for_step(step_num)

        for key, value in sorted(jobconf.items()):
            args.extend(['-D', '%s=%s' % (key, value)])

        return args
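
    # For example (illustrative, not in the original source): with libjars
    # ['a.jar', 'b.jar'] and jobconf {'mapreduce.job.name': 'my job'}, this
    # yields ['-libjars', 'a.jar,b.jar', '-D', 'mapreduce.job.name=my job'].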

    def _libjar_paths(self):
        """Paths or URIs of libjars, from Hadoop/Spark's point of view.

        Override this for non-local libjars (e.g. on EMR).
        """
        return self._opts['libjars']

    def _interpolate_jar_step_args(self, args, step_num):
        """Like :py:meth:`_interpolate_step_args` except it
        also replaces `~mrjob.step.GENERIC_ARGS` with
        :py:meth:`_hadoop_generic_args_for_step`. This only
        makes sense for jar steps; Spark should raise an error
        if `~mrjob.step.GENERIC_ARGS` is encountered.
        """
        result = []

        for arg in args:
            if arg == mrjob.step.GENERIC_ARGS:
                result.extend(
                    self._hadoop_generic_args_for_step(step_num))
            else:
                result.append(arg)

        return self._interpolate_step_args(result, step_num)

    ### setup scripts ###

    def _py_files(self):
        """Everything in the *py_files* opt, plus a .zip of the mrjob
        library if needed.
        """
        py_files = list(self._opts['py_files'])

        if self._bootstrap_mrjob():
            py_files.append(self._create_mrjob_zip())

        return py_files

    def _create_setup_wrapper_scripts(self):
        """Create the setup wrapper script, and write it into our local temp
        directory (by default, to a file named setup-wrapper.sh).

        This will set ``self._setup_wrapper_script_path``, and add it to
        ``self._working_dir_mgr``.

        This will do nothing if ``self._setup`` is empty or
        this method has already been called.
        """
        if self._has_hadoop_streaming_steps():
            streaming_setup = self._py_files_setup() + self._setup

            if streaming_setup and not self._setup_wrapper_script_path:

                self._setup_wrapper_script_path = self._write_setup_script(
                    streaming_setup, 'setup-wrapper.sh',
                    'streaming setup wrapper script')

            if (self._uses_input_manifest() and not
                    self._manifest_setup_script_path):

                self._manifest_setup_script_path = self._write_setup_script(
                    streaming_setup, 'manifest-setup.sh',
                    'manifest setup wrapper script',
                    manifest=True)

        if (self._has_pyspark_steps() and
                self._spark_executors_have_own_wd() and
                not self._spark_python_wrapper_path):

            pyspark_setup = self._pyspark_setup()
            if pyspark_setup:
                self._spark_python_wrapper_path = self._write_setup_script(
                    pyspark_setup,
                    'python-wrapper.sh', 'Spark Python wrapper script',
                    wrap_python=True)

    def _pyspark_setup(self):
        """Like ``self._setup``, but prepends commands for archive
        emulation if needed."""
        setup = []

        if self._emulate_archives_on_spark():
            for name, path in sorted(
                    self._working_dir_mgr.name_to_path('archive').items()):

                archive_file_name = self._working_dir_mgr.name(
                    'archive_file', path)

                setup.append(_unarchive_cmd(path) % dict(
                    file=pipes.quote(archive_file_name),
                    dir=pipes.quote(name)))

        setup.extend(self._setup)

        return setup

    def _py_files_setup(self):
        """A list of additional setup commands to emulate Spark's
        --py-files option on Hadoop Streaming."""
        result = []

        for py_file in self._py_files():
            path_dict = {'type': 'file', 'name': None, 'path': py_file}
            self._working_dir_mgr.add(**path_dict)
            result.append(['export PYTHONPATH=', path_dict, ':$PYTHONPATH'])

        return result
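
    # Illustrative example (not in the original source): if _py_files()
    # returns ['/path/to/mrjob.zip'], the setup command rendered into the
    # wrapper script becomes something like
    #   export PYTHONPATH=$__mrjob_PWD/mrjob.zip:$PYTHONPATH
    # (the path dict is replaced with the file's name in the task's working
    # dir; see _setup_cmd_content() below).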

    def _write_setup_script(self, setup, filename, desc,
                            manifest=False, wrap_python=False):
        """Write a setup script and return its path."""
        contents = self._setup_wrapper_script_content(
            setup, manifest=manifest, wrap_python=wrap_python)

        path = os.path.join(self._get_local_tmp_dir(), filename)
        self._write_script(contents, path, desc)

        self._working_dir_mgr.add('file', path)

        return path

    def _create_mrjob_zip(self):
        """Make a zip of the mrjob library, without .pyc or .pyo files.

        This will also set ``self._mrjob_zip_path`` and return it.

        Typically called from
        :py:meth:`_create_setup_wrapper_scripts`.

        It's safe to call this method multiple times (we'll only create
        the zip file once).
        """
        if not self._mrjob_zip_path:
            # find mrjob library
            import mrjob

            if not os.path.basename(mrjob.__file__).startswith('__init__.'):
                raise Exception(
                    "Bad path for mrjob library: %s; can't bootstrap mrjob" %
                    mrjob.__file__)

            mrjob_dir = os.path.dirname(mrjob.__file__) or '.'

            zip_path = os.path.join(self._get_local_tmp_dir(), 'mrjob.zip')

            def filter_path(path):
                filename = os.path.basename(path)
                return not(filename.lower().endswith('.pyc') or
                           filename.lower().endswith('.pyo') or
                           # filter out emacs backup files
                           filename.endswith('~') or
                           # filter out emacs lock files
                           filename.startswith('.#') or
                           # filter out MacFuse resource forks
                           filename.startswith('._'))

            log.debug('archiving %s -> %s as %s' % (
                mrjob_dir, zip_path, os.path.join('mrjob', '')))
            zip_dir(mrjob_dir, zip_path, filter=filter_path, prefix='mrjob')

            self._mrjob_zip_path = zip_path

        return self._mrjob_zip_path

    def _setup_wrapper_script_content(
            self, setup, manifest=False, wrap_python=False):
        """Return a (Bourne) shell script that runs the setup commands and
        then executes whatever is passed to it (this will be our
        mapper/reducer), as a list of strings (one for each line, including
        newlines).

        We obtain a file lock so that two copies of the setup commands
        cannot run simultaneously on the same machine (this helps for running
        :command:`make` on a shared source code archive, for example).
        """
        lines = []

        # TODO: this is very similar to _start_of_sh_script() in cloud.py

        if wrap_python:
            # start with shebang
            sh_bin = self._sh_bin()

            if os.path.isabs(sh_bin[0]):
                shebang_bin = sh_bin
            else:
                shebang_bin = ['/usr/bin/env'] + list(sh_bin)

            if len(shebang_bin) > 2:
                # Linux limits shebang to one binary and one arg
                shebang_bin = shebang_bin[:2]
                log.warning('Limiting shebang to two arguments:'
                            ' #!%s' % cmd_line(shebang_bin))

            lines.append('#!%s' % cmd_line(shebang_bin))

        # hook for 'set -e', etc.
        pre_commands = self._sh_pre_commands()
        if pre_commands:
            for cmd in pre_commands:
                lines.append(cmd)
            lines.append('')

        if setup:
            lines.extend(self._setup_cmd_content(setup))

        # handle arguments to the script
        if wrap_python:
            # pretend to be python ($@ is arguments to the python binary)
            python_bin = self._task_python_bin()
            lines.append('%s "$@"' % cmd_line(python_bin))
        elif manifest:
            # arguments ($@) are a command
            # eventually runs: "$@" $INPUT_PATH $INPUT_URI
            lines.extend(self._manifest_download_content())
        else:
            # arguments ($@) are a command, just run it
            lines.append('"$@"')

        return lines

    def _setup_cmd_content(self, setup):
        """Write setup script content to obtain a file lock, run setup
        commands in a way that doesn't perturb the script, and then
        release the lock and return to the original working directory."""
        lines = []

        lines.append('# store $PWD')
        lines.append('__mrjob_PWD=$PWD')
        lines.append('')

        lines.append('# obtain exclusive file lock')
        # Basically, we're going to tie file descriptor 9 to our lockfile,
        # use a subprocess to obtain a lock (which we somehow inherit too),
        # and then release the lock by closing the file descriptor.
        # File descriptors 10 and higher are used internally by the shell,
        # so 9 is as out-of-the-way as we can get.
        lines.append('exec 9>/tmp/wrapper.lock.%s' % self._job_key)
        # would use flock(1), but it's not always available
        lines.append("%s -c 'import fcntl; fcntl.flock(9, fcntl.LOCK_EX)'" %
                     cmd_line(self._python_bin()))
        lines.append('')

        lines.append('# setup commands')
        # group setup commands so we can redirect their input/output (see
        # below). Don't use parens; this would invoke a subshell, which would
        # keep us from exporting environment variables to the task.
        lines.append('{')
        for cmd in setup:
            # reconstruct the command line, substituting $__mrjob_PWD/<name>
            # for path dicts
            line = ' '  # indent, since these commands are in a group
            for token in cmd:
                if isinstance(token, dict):
                    # it's a path dictionary
                    line += '$__mrjob_PWD/'
                    line += pipes.quote(self._working_dir_mgr.name(**token))
                else:
                    # it's raw script
                    line += token
            lines.append(line)
        # redirect setup commands' input/output so they don't interfere
        # with the task (see Issue #803).
        lines.append('} 0</dev/null 1>&2')
        lines.append('')

        lines.append('# release exclusive file lock')
        lines.append('exec 9>&-')
        lines.append('')

        lines.append('# run task from the original working directory')
        lines.append('cd $__mrjob_PWD')

        return lines
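
    # A sketch (illustrative, with a made-up job key and a single
    # 'make -C src' setup command) of the setup-wrapper.sh these methods
    # produce for a Hadoop Streaming task:
    #
    #   # store $PWD
    #   __mrjob_PWD=$PWD
    #
    #   # obtain exclusive file lock
    #   exec 9>/tmp/wrapper.lock.mr_your_job.user.20200101.000000.000000
    #   python3 -c 'import fcntl; fcntl.flock(9, fcntl.LOCK_EX)'
    #
    #   # setup commands
    #   {
    #    make -C src
    #   } 0</dev/null 1>&2
    #
    #   # release exclusive file lock
    #   exec 9>&-
    #
    #   # run task from the original working directory
    #   cd $__mrjob_PWD
    #   "$@"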

    def _manifest_download_content(self):
        """Write the part of the manifest setup script after setup, which
        downloads the input file, runs the script, and then deletes
        the file."""
        lines = []

        lines.append('{')

        # read URI from stdin
        lines.append(' # read URI of input file from stdin')
        lines.append(' INPUT_URI=$(cut -f 2)')
        lines.append('')

        # pick file extension (e.g. ".warc.gz")
        lines.append(' # pick file extension')
        lines.append(" FILE_EXT=$(basename $INPUT_URI | sed -e 's/^[^.]*//')")
        lines.append('')

        # pick a unique name in the current directory to download the file to
        lines.append(' # pick filename to download to')
        lines.append(' INPUT_PATH=$(mktemp ./input-XXXXXXXXXX$FILE_EXT)')
        lines.append(' rm $INPUT_PATH')
        lines.append('')

        # download the file (using different commands depending on the path)
        lines.append(' # download the input file')
        lines.append(' case $INPUT_URI in')
        download_cmds = (
            list(self._manifest_download_commands()) + [('*', 'cp')])
        for glob, cmd in download_cmds:
            lines.append(' %s)' % glob)
            lines.append(' %s $INPUT_URI $INPUT_PATH' % cmd)
            lines.append(' ;;')
        lines.append(' esac')
        lines.append('')

        # unpack .bz2 and .gz files
        lines.append(' # if input file is compressed, unpack it')
        lines.append(' case $INPUT_PATH in')
        for ext, cmd in self._manifest_uncompress_commands():
            lines.append(' *.%s)' % ext)
            lines.append(' %s $INPUT_PATH' % cmd)
            lines.append(" INPUT_PATH="
                         r"$(echo $INPUT_PATH | sed -e 's/\.%s$//')" % ext)
            lines.append(' ;;')
        lines.append(' esac')
        lines.append('} 1>&2')
        lines.append('')

        # don't exit if script fails
        lines.append('# run our mrjob script')
        lines.append('set +e')
        # pass input path and URI to script
        lines.append('"$@" $INPUT_PATH $INPUT_URI')
        lines.append('')

        # save return code, turn off echo
        lines.append('# if script fails, print input URI before exiting')
        lines.append('{ RETURNCODE=$?; set +x; } 1>&2 2>/dev/null')
        lines.append('')

        lines.append('{')

        # handle errors
        lines.append(' if [ $RETURNCODE -ne 0 ]')
        lines.append(' then')
        lines.append(' echo')
        lines.append(' echo "while reading input from $INPUT_URI"')
        lines.append(' fi')
        lines.append('')

        # clean up input
        lines.append(' rm $INPUT_PATH')
        lines.append('} 1>&2')
        lines.append('')

        # exit with correct status
        lines.append('exit $RETURNCODE')

        return lines

    def _manifest_download_commands(self):
        """Return a list of ``(glob, cmd)``, where *glob*
        matches a path or URI to download, and *cmd* is a command
        to download it (e.g. ``hadoop fs -copyToLocal``), as a
        string.

        Redefine this in your subclass. More specific globs should come first.
        """
        return []

    def _manifest_uncompress_commands(self):
        """Return a list of ``(ext, cmd)`` where ``ext`` is a file extension
        (e.g. ``gz``) and ``cmd`` is a command to uncompress it (e.g.
        ``gunzip``)."""
        return [
            ('bz2', 'bunzip2'),
            ('gz', 'gunzip'),
        ]

    def _sh_bin(self):
        """The sh binary and any arguments, as a list. Override this
        if, for example, a runner needs different default values
        depending on circumstances (see :py:class:`~mrjob.emr.EMRJobRunner`).
        """
        return self._opts['sh_bin'] or self._default_sh_bin()

    def _default_sh_bin(self):
        """The default sh binary, if :mrjob-opt:`sh_bin` isn't set."""
        return ['/bin/sh', '-ex']

    def _sh_pre_commands(self):
        """A list of lines to put at the very start of any sh script
        (e.g. ``set -e`` when ``sh -e`` won't work, see #1549).
        """
        return []

    def _sh_wrap(self, cmd_str):
        """Helper for _substep_args().

        Wrap command in sh -c '...' to allow for pipes, etc.
        Use the *sh_bin* option."""
        # prepend set -e etc.
        cmd_str = '; '.join(self._sh_pre_commands() + [cmd_str])

        return self._sh_bin() + ['-c', cmd_str]
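
    # Illustrative example (not in the original source): with the default
    # sh_bin and no pre-commands, _sh_wrap("grep foo | wc -l") returns
    # ['/bin/sh', '-ex', '-c', 'grep foo | wc -l'], which lets Hadoop
    # Streaming run a pre-filter pipeline as a single command.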

    ### spark ###

    def _args_for_spark_step(self, step_num, last_step_num=None):
        """The actual arguments used to run the spark-submit command.

        This handles all Spark step types (``spark``, ``spark_jar``,
        and ``spark_script``).

        *last_step_num* is only used by the Spark runner, where multiple
        streaming steps are run in a single Spark job.
        """
        return (
            self.get_spark_submit_bin() +
            self._spark_submit_args(step_num) +
            [self._spark_script_path(step_num)] +
            self._spark_script_args(step_num, last_step_num)
        )

    def _run_spark_submit(self, spark_submit_args, env, record_callback):
        """Run the spark-submit binary in a subprocess, using a PTY if
        possible.

        :param spark_submit_args: spark-submit binary and arguments, as a list
        :param env: environment variables, as a dict
        :param record_callback: a function that takes a single log4j record
                                as its argument (see
                                :py:func:`~mrjob.logs.log4j\
                                ._parse_hadoop_log4j_records`)

        :return: tuple of the subprocess's return code and a
                 step interpretation dictionary
        """
        log.debug('> %s' % cmd_line(spark_submit_args))
        log.debug(' with environment: %r' % sorted(env.items()))

        # these should always be set, but just in case
        returncode = 0
        step_interpretation = {}

        # try to use a PTY if it's available
        try:
            pid, master_fd = pty.fork()
        except (AttributeError, OSError):
            # no PTYs, just use Popen

            # user won't get much feedback for a while, so tell them
            # spark-submit is running
            log.debug('No PTY available, using Popen() to invoke spark-submit')

            step_proc = Popen(
                spark_submit_args, stdout=PIPE, stderr=PIPE, env=env)

            # parse driver output
            step_interpretation = _parse_spark_log(
                step_proc.stderr, record_callback=record_callback)

            # there shouldn't be much output on STDOUT, just echo it
            for record in _parse_hadoop_log4j_records(step_proc.stdout):
                record_callback(record)

            step_proc.stdout.close()
            step_proc.stderr.close()

            returncode = step_proc.wait()
        else:
            # we have PTYs
            if pid == 0:  # we are the child process
                try:
                    os.execvpe(spark_submit_args[0], spark_submit_args, env)
                    # now this process is no longer Python
                except OSError as ex:
                    # use _exit() so we don't do cleanup, etc. that's
                    # the parent process's job
                    os._exit(ex.errno)
                finally:
                    # if we get some other exception, still exit hard
                    os._exit(-1)
            else:
                log.debug('Invoking spark-submit via PTY')

                with os.fdopen(master_fd, 'rb') as master:
                    step_interpretation = (
                        _parse_spark_log(
                            _eio_to_eof(master),
                            record_callback=record_callback))

                    _, returncode = os.waitpid(pid, 0)

        return (returncode, step_interpretation)

    def get_spark_submit_bin(self):
        """Return the location of the ``spark-submit`` binary, searching for
        it if necessary."""
        if not self._spark_submit_bin:
            self._spark_submit_bin = self._find_spark_submit_bin()
        return self._spark_submit_bin

    def _find_spark_submit_bin(self):
        """Attempt to find the spark binary. Returns a list of arguments.
        Defaults to ``['spark-submit']``.

        Re-define this in your subclass if you already know where
        to find spark-submit (e.g. on cloud services).
        """
        for path in unique(self._spark_submit_bin_dirs()):
            log.info('Looking for spark-submit binary in %s...' % (
                path or '$PATH'))

            spark_submit_bin = which('spark-submit', path=path)

            if spark_submit_bin:
                log.info('Found spark-submit binary: %s' % spark_submit_bin)
                return [spark_submit_bin]
        else:
            log.info("Falling back to 'spark-submit'")
            return ['spark-submit']

    def _spark_submit_bin_dirs(self):
        # $SPARK_HOME
        spark_home = os.environ.get('SPARK_HOME')
        if spark_home:
            yield os.path.join(spark_home, 'bin')

        yield None  # use $PATH

        # look for pyspark installation (see #1984)
        if pyspark:
            yield os.path.join(os.path.dirname(pyspark.__file__), 'bin')

        # some other places recommended by install docs (see #1366)
        yield '/usr/lib/spark/bin'
        yield '/usr/local/spark/bin'
        yield '/usr/local/lib/spark/bin'

    def _spark_submit_args(self, step_num):
        """Build a list of extra args to the spark-submit binary for
        the given spark or spark_script step."""
        step = self._get_step(step_num)

        args = []

        # --conf arguments include python bin, cmdenv, jobconf. Make sure
        # that we can always override these manually
        jobconf = {}
        for key, value in self._spark_cmdenv(step_num).items():
            jobconf['spark.executorEnv.%s' % key] = value
            if self._spark_master() == 'yarn':  # YARN only, see #1919
                jobconf['spark.yarn.appMasterEnv.%s' % key] = value

        jobconf.update(self._jobconf_for_step(step_num))

        for key, value in sorted(jobconf.items()):
            args.extend(['--conf', '%s=%s' % (key, value)])

        # add --class (JAR steps)
        if step.get('main_class'):
            args.extend(['--class', step['main_class']])

        # add --jars, if any
        libjar_paths = self._libjar_paths()
        if libjar_paths:
            args.extend(['--jars', ','.join(libjar_paths)])

        # spark-submit treats --master and --deploy-mode as aliases for
        # --conf spark.master=... and --conf spark.submit.deployMode=...
        # (see #2032).
        #
        # we never want jobconf to override spark master or deploy mode, so
        # put these switches after --conf

        # add --master
        if self._spark_master():
            args.extend(['--master', self._spark_master()])

        # add --deploy-mode
        if self._spark_deploy_mode():
            args.extend(['--deploy-mode', self._spark_deploy_mode()])

        # --files and --archives
        args.extend(self._spark_upload_args())

        # --py-files (Python only)
        # spark runner can run 'streaming' steps, so just exclude
        # non-Python steps
        if 'jar' not in step['type']:
            py_file_uris = self._py_files()

            if self._upload_mgr:
                # don't assume py_files are in _upload_mgr; for example,
                # spark-submit doesn't need to upload them
                path_to_uri = self._upload_mgr.path_to_uri()
                py_file_uris = [path_to_uri.get(p, p) for p in py_file_uris]

            if py_file_uris:
                args.extend(['--py-files', ','.join(py_file_uris)])

        # spark_args option
        args.extend(self._opts['spark_args'])

        # step spark_args
        if step.get('spark_args'):
            args.extend(step['spark_args'])

        return args
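
    # Illustrative sketch (not from the original source) of spark-submit args
    # for a pyspark step on YARN with cmdenv {'TZ': 'UTC'} and client deploy
    # mode:
    #
    #   ['--conf', 'spark.executorEnv.PYSPARK_PYTHON=python3',
    #    '--conf', 'spark.executorEnv.TZ=UTC',
    #    '--conf', 'spark.yarn.appMasterEnv.PYSPARK_PYTHON=python3',
    #    '--conf', 'spark.yarn.appMasterEnv.TZ=UTC',
    #    '--master', 'yarn', '--deploy-mode', 'client',
    #    '--files', '<uploaded files>', '--py-files', '<py_files .zips>']
    #
    # The exact list depends on the runner's options and upload manager.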

    def _spark_upload_args(self):
        if not self._spark_executors_have_own_wd():
            # don't bother, there's no working dir to upload to
            return []

        return self._upload_args_helper(
            '--files', None,
            '--archives', None,
            always_use_hash=False,
            emulate_archives=self._emulate_archives_on_spark())

    def _spark_script_path(self, step_num):
        """The path of the spark script or JAR, used by
        _args_for_spark_step()."""
        step = self._get_step(step_num)

        if step['type'] == 'spark':
            path = self._script_path
        elif step['type'] == 'spark_jar':
            path = step['jar']
        elif step['type'] == 'spark_script':
            path = step['script']
        else:
            raise TypeError('Bad step type: %r' % step['type'])

        return self._interpolate_spark_script_path(path)

    def _interpolate_spark_script_path(self, path):
        """Redefine this in your subclass if the given path needs to be
        translated to a URI when running spark (e.g. on EMR)."""
        return path

    def _spark_cmdenv(self, step_num):
        """Returns a dictionary mapping environment variable to value,
        including mapping PYSPARK_PYTHON to self._python_bin()
        """
        step = self._get_step(step_num)

        cmdenv = {}

        if self._step_type_uses_pyspark(step['type']):
            driver_python = cmd_line(self._python_bin())

            if self._spark_python_wrapper_path:
                executor_python = './%s' % self._working_dir_mgr.name(
                    'file', self._spark_python_wrapper_path)
            else:
                executor_python = cmd_line(self._task_python_bin())

            if self._spark_deploy_mode() == 'cluster':
                # treat driver like executors (they run in same environment)
                cmdenv['PYSPARK_PYTHON'] = executor_python
            elif driver_python == executor_python:
                # no difference, just set $PYSPARK_PYTHON
                cmdenv['PYSPARK_PYTHON'] = driver_python
            else:
                # set different pythons for driver and executor
                cmdenv['PYSPARK_PYTHON'] = executor_python
                cmdenv['PYSPARK_DRIVER_PYTHON'] = driver_python

        cmdenv.update(self._cmdenv())
        return cmdenv


# these don't need to be methods

def _hadoop_cmd_line(args):
    """Escape args of a command line in a way that Hadoop can process
    them."""
    return ' '.join(_hadoop_escape_arg(arg) for arg in args)


def _hadoop_escape_arg(arg):
    """Escape a single command argument in a way that Hadoop can process it."""
    if _HADOOP_SAFE_ARG_RE.match(arg):
        return arg
    else:
        return "'%s'" % arg.replace("'", r"'\''")
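
# Illustrative examples (not in the original source):
# _hadoop_escape_arg('max=10') returns 'max=10' unchanged (it matches
# _HADOOP_SAFE_ARG_RE), while _hadoop_escape_arg("it's a test") returns the
# characters 'it'\''s a test', i.e. the argument wrapped in single quotes
# with embedded quotes escaped for the shell.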