# -*- coding: utf-8 -*-
# Copyright 2009-2013 Yelp and Contributors
# Copyright 2015-2017 Yelp
# Copyright 2018 Yelp and Contributors
# Copyright 2019 Yelp
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Run an MRJob locally by forking off a bunch of processes and piping
them together. Useful for testing, not terrible for running medium-sized
jobs on all CPUs."""
import logging
import math
import os
import platform
from functools import partial
from multiprocessing import Pool
from subprocess import CalledProcessError
from subprocess import check_call

try:
    import pty
    pty  # quiet "redefinition of unused ..." warning from pyflakes
except ImportError:
    pty = None

from mrjob.bin import MRJobBinRunner
from mrjob.logs.errors import _log_probable_cause_of_failure
from mrjob.logs.errors import _pick_error
from mrjob.logs.step import _log_log4j_record
from mrjob.logs.task import _parse_task_stderr
from mrjob.py2 import string_types
from mrjob.sim import SimMRJobRunner
from mrjob.sim import _sort_lines_in_memory
from mrjob.step import StepFailedException
from mrjob.util import cmd_line

log = logging.getLogger(__name__)


_DEFAULT_EXECUTOR_MEMORY = '1g'


class _TaskFailedException(StepFailedException):
    """Extension of :py:class:`~mrjob.step.StepFailedException` that
    blames one particular task."""
    _FIELDS = StepFailedException._FIELDS + ('task_type', 'task_num')

    def __init__(
            self, reason=None, step_num=None, num_steps=None, step_desc=None,
            task_type=None, task_num=None):
        super(_TaskFailedException, self).__init__(
            reason=reason, step_num=step_num,
            num_steps=num_steps, step_desc=step_desc)

        self.task_type = task_type
        self.task_num = task_num


class LocalMRJobRunner(SimMRJobRunner, MRJobBinRunner):
    """Runs an :py:class:`~mrjob.job.MRJob` locally, for testing purposes.
    Invoked when you run your job with ``-r local``.

    Unlike :py:class:`~mrjob.inline.InlineMRJobRunner`, this actually spawns
    multiple subprocesses for each task.

    It's rare to need to instantiate this class directly (see
    :py:meth:`~LocalMRJobRunner.__init__` for details).

    .. versionadded:: 0.6.8

       can run Spark steps as well, on the ``local-cluster`` Spark master.
    """
    alias = 'local'

    OPT_NAMES = SimMRJobRunner.OPT_NAMES | MRJobBinRunner.OPT_NAMES | {
        'sort_bin',
    }

    _STEP_TYPES = (
        SimMRJobRunner._STEP_TYPES | {'spark_jar', 'spark_script'})

    def __init__(self, **kwargs):
        """Arguments to this constructor may also appear in :file:`mrjob.conf`
        under ``runners/local``.

        :py:class:`~mrjob.local.LocalMRJobRunner`'s constructor takes the
        same keyword args as
        :py:class:`~mrjob.runner.MRJobRunner`. However, please note:

        * *cmdenv* is combined with :py:func:`~mrjob.conf.combine_local_envs`
        * *python_bin* defaults to ``sys.executable`` (the current python
          interpreter)
        * *hadoop_input_format*, *hadoop_output_format*,
          and *partitioner* are ignored because they
          require Java. If you need to test these, consider starting up a
          standalone Hadoop instance and running your job with ``-r hadoop``.
        """
        super(LocalMRJobRunner, self).__init__(**kwargs)
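
    # For reference, a minimal sketch of what those options look like in
    # mrjob.conf (YAML; the values shown are illustrative, not defaults):
    #
    #   runners:
    #     local:
    #       cmdenv:
    #         TZ: America/Los_Angeles
    #       sort_bin: ['sort', '-S', '10%']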

    def _invoke_task_func(self, task_type, step_num, task_num):
        args = self._substep_args(step_num, task_type)
        num_steps = self._num_steps()

        # stdin, stdout, stderr, wd, and env will be passed in later
        return partial(
            _invoke_task_in_subprocess,
            task_type, step_num, task_num,
            args, num_steps)

    def _run_step_on_spark(self, step, step_num):
        if self._opts['upload_archives']:
            log.warning('Spark master %r will probably ignore archives' %
                        self._spark_master())

        spark_submit_args = self._args_for_spark_step(step_num)

        env = dict(os.environ)
        env.update(self._spark_cmdenv(step_num))

        returncode, step_interpretation = self._run_spark_submit(
            spark_submit_args, env, record_callback=_log_log4j_record)

        if returncode:
            error = _pick_error(dict(step=step_interpretation))
            if error:
                _log_probable_cause_of_failure(log, error)

            reason = str(CalledProcessError(returncode, spark_submit_args))
            raise StepFailedException(
                reason=reason, step_num=step_num,
                num_steps=self._num_steps())

    def _run_multiple(self, funcs, num_processes=None):
        """Use multiprocessing to run in parallel."""
        pool = Pool(processes=self._opts['num_cores'])

        try:
            results = [
                pool.apply_async(partial(_pickle_safe, func))
                for func in funcs
            ]

            for result in results:
                result.get()

            # make sure that the pool (and its file descriptors, etc.)
            # doesn't stay open. This doesn't matter much for individual jobs,
            # but leaving pools open makes our automated tests run out of
            # file descriptors.
            pool.close()
        except:
            # if there's an error in one task, terminate all others
            pool.terminate()
            raise
        finally:
            pool.join()
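
    # (Each func in *funcs* typically derives from a partial built by
    # _invoke_task_func() above, with stdin/stdout/etc. bound in by the
    # simulation runner; wrapping it in _pickle_safe() ensures that whatever
    # exception it raises can be pickled back to this parent process.)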

    def _log_cause_of_error(self, ex):
        if not isinstance(ex, _TaskFailedException):
            # if something went wrong inside mrjob, the stacktrace
            # will bubble up to the top level
            return

        # not using LogInterpretationMixin because it would be overkill

        if not self._opts['read_logs']:
            return

        input_path = self._task_input_path(
            ex.task_type, ex.step_num, ex.task_num)
        stderr_path = self._task_stderr_path(
            ex.task_type, ex.step_num, ex.task_num)

        if self.fs.exists(stderr_path):  # it should, but just to be safe
            # log-parsing code expects "str", not bytes; open in text mode
            with open(stderr_path) as stderr:
                task_error = _parse_task_stderr(stderr)
                if task_error:
                    task_error['path'] = stderr_path
                    error = dict(
                        split=dict(path=input_path),
                        task_error=task_error)
                    _log_probable_cause_of_failure(log, error)
                    return

        # fallback if we can't find the error (e.g. the job does something
        # weird to stderr or stack traces)
        log.error('Error while reading from %s:\n' % input_path)

    def _default_python_bin(self, local=False):
        """Always return *sys.executable*, if defined"""
        return super(LocalMRJobRunner, self)._default_python_bin(
            local=True)

    def _sort_input_func(self):
        """Try sorting with the :command:`sort` binary before falling
        back to in-memory sort."""
        if platform.system() == 'Windows':  # we assume Unix sort
            return super(LocalMRJobRunner, self)._sort_input_func()
        else:
            return partial(
                _sort_lines_with_sort_bin,
                sort_bin=self._sort_bin(),
                tmp_dir=self._get_local_tmp_dir())

    def _sort_bin(self):
        """The binary to use to sort input.

        (On Windows, we go straight to sorting in memory.)
        """
        if self._opts['sort_bin']:
            return self._opts['sort_bin']
        elif self._sort_values:
            return ['sort']
        else:
            # only sort on the reducer key (see #660)
            return ['sort', '-t', '\t', '-k', '1,1', '-s']
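
    # With the default sort_bin above, input files get sorted with a command
    # like:
    #
    #   sort -t '\t' -k 1,1 -s part-00000 part-00001
    #
    # i.e. a stable sort on the first tab-separated field only, which mimics
    # how Hadoop Streaming sorts lines by key before the reducer.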

    # Spark steps

    # TODO: _spark_master() should probably take step_num, to allow for
    # step-specific jobconf
    def _spark_master(self):
        """Use the local-cluster master, which simulates a Spark cluster."""
        # figure out the required parameters to local-cluster
        num_executors = self._num_cores()

        # for now assigning one core per executor, so we don't have to worry
        # about a number of cores that's not evenly divisible
        cores_per_executor = 1

        executor_mem_bytes = _to_num_bytes(
            self._opts['jobconf'].get('spark.executor.memory') or
            _DEFAULT_EXECUTOR_MEMORY)
        executor_mem_mb = math.ceil(executor_mem_bytes / 1024.0 / 1024.0)

        return 'local-cluster[%d,%d,%d]' % (
            num_executors, cores_per_executor, executor_mem_mb)
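
    # For example, with 4 cores and the default executor memory of '1g'
    # (1024 MB), this returns 'local-cluster[4,1,1024]': 4 executors,
    # 1 core each, 1024 MB of memory per executor.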


def _to_num_bytes(java_mem_str):
    if isinstance(java_mem_str, string_types):
        for i, magnitude in enumerate(('k', 'm', 'g', 't'), start=1):
            if java_mem_str.lower().endswith(magnitude):
                return int(java_mem_str[:-1]) * 1024 ** i

    return int(java_mem_str)
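
# A few worked examples of the conversion above:
#
#   _to_num_bytes('512m')  # -> 512 * 1024 ** 2 == 536870912
#   _to_num_bytes('1g')    # -> 1024 ** 3 == 1073741824
#   _to_num_bytes(2048)    # -> 2048 (plain numbers are already bytes)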


# pickle utilities, to protect multiprocessing from itself

def _invoke_task_in_subprocess(
        task_type, step_num, task_num,
        args, num_steps,
        stdin, stdout, stderr, wd, env):
    """A pickleable function that invokes a task in a subprocess."""
    log.debug('> %s' % cmd_line(args))

    try:
        check_call(args, stdin=stdin, stdout=stdout, stderr=stderr,
                   cwd=wd, env=env)
    except Exception as ex:
        raise _TaskFailedException(
            reason=str(ex),
            step_num=step_num,
            num_steps=num_steps,
            task_type=task_type,
            task_num=task_num,
        )
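
# (For a streaming step, *args* is roughly the command line for a single
# task; for example, something like ['/usr/bin/python', 'mr_my_job.py',
# '--step-num=0', '--mapper']. This is illustrative only; the real command
# comes from self._substep_args() in _invoke_task_func() above.)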


def _pickle_safe(func):
    """Call no-args function *func*, returning *None* and ensuring
    that any exception raised is pickleable."""
    try:
        func()  # always return None
    except _TaskFailedException:
        raise  # we know these are pickleable
    except Exception as ex:
        raise Exception(repr(ex))  # we know this is pickleable


# other utilities

def _sort_lines_with_sort_bin(input_paths, output_path, sort_bin,
                              sort_values=False, tmp_dir=None):
    """Sort lines from the given *input_paths* into *output_path*,
    using *sort_bin*. If there is a problem, fall back to in-memory sort.

    This is a helper for :py:meth:`LocalMRJobRunner._sort_input_func`.

    *tmp_dir* determines the value of :envvar:`$TMP` and :envvar:`$TMPDIR`
    that *sort_bin* sees.
    """
    if input_paths:
        env = os.environ.copy()

        # ignore locale when sorting
        env['LC_ALL'] = 'C'

        # Make sure that the tmp dir environment variables are changed if
        # the default is changed.
        env['TMP'] = tmp_dir
        env['TMPDIR'] = tmp_dir

        with open(output_path, 'wb') as output:
            args = sort_bin + list(input_paths)
            log.debug('> %s' % cmd_line(args))

            try:
                check_call(args, stdout=output, env=env)
                return
            except CalledProcessError:
                log.error(
                    '`%s` failed, falling back to in-memory sort' %
                    cmd_line(sort_bin))
            except OSError:
                log.error(
                    'no sort binary, falling back to in-memory sort')

    _sort_lines_in_memory(input_paths, output_path, sort_values=sort_values)
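
# A hedged usage sketch of the helper above (all paths are illustrative):
#
#   _sort_lines_with_sort_bin(
#       ['/tmp/job/step/000/mapper-sorted/part-00000'],
#       '/tmp/job/step/000/reducer-input/part-00000',
#       sort_bin=['sort', '-t', '\t', '-k', '1,1', '-s'],
#       tmp_dir='/tmp/job')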