# GildedRose-Refactoring-Kata/.venv/lib/python3.12/site-packages/mrjob/local.py
# -*- coding: utf-8 -*-
# Copyright 2009-2013 Yelp and Contributors
# Copyright 2015-2017 Yelp
# Copyright 2018 Yelp and Contributors
# Copyright 2019 Yelp
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Run an MRJob locally by forking off a bunch of processes and piping
them together. Useful for testing, not terrible for running medium-sized
jobs on all CPUs."""
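# Illustrative only: this runner is normally selected from the command line
# rather than instantiated directly, e.g.
#
#   python my_job.py -r local input1.txt input2.txt > output
#
# (my_job.py stands in for whatever MRJob subclass you're running).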
import logging
import math
import os
import platform
from functools import partial
from multiprocessing import Pool
from subprocess import CalledProcessError
from subprocess import check_call
try:
import pty
pty # quiet "redefinition of unused ..." warning from pyflakes
except ImportError:
pty = None
from mrjob.bin import MRJobBinRunner
from mrjob.logs.errors import _log_probable_cause_of_failure
from mrjob.logs.errors import _pick_error
from mrjob.logs.step import _log_log4j_record
from mrjob.logs.task import _parse_task_stderr
from mrjob.py2 import string_types
from mrjob.sim import SimMRJobRunner
from mrjob.sim import _sort_lines_in_memory
from mrjob.step import StepFailedException
from mrjob.util import cmd_line
log = logging.getLogger(__name__)
_DEFAULT_EXECUTOR_MEMORY = '1g'
class _TaskFailedException(StepFailedException):
"""Extension of :py:class:`~mrjob.step.StepFailedException` that
blames one particular task."""
_FIELDS = StepFailedException._FIELDS + ('task_type', 'task_num')
def __init__(
self, reason=None, step_num=None, num_steps=None, step_desc=None,
task_type=None, task_num=None):
super(_TaskFailedException, self).__init__(
reason=reason, step_num=step_num,
num_steps=num_steps, step_desc=step_desc)
self.task_type = task_type
self.task_num = task_num
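# Illustrative only: code that catches this exception can name the failing
# task, along the lines of
#
#   try:
#       ...  # run the step's tasks
#   except _TaskFailedException as ex:
#       log.error('%s task #%s of step %s failed' % (
#           ex.task_type, ex.task_num, ex.step_num))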
class LocalMRJobRunner(SimMRJobRunner, MRJobBinRunner):
"""Runs an :py:class:`~mrjob.job.MRJob` locally, for testing purposes.
Invoked when you run your job with ``-r local``.
    Unlike :py:class:`~mrjob.inline.InlineMRJobRunner`, this actually spawns
    multiple subprocesses for each task.
It's rare to need to instantiate this class directly (see
:py:meth:`~LocalMRJobRunner.__init__` for details).
.. versionadded:: 0.6.8
can run Spark steps as well, on the ``local-cluster`` Spark master.
"""
alias = 'local'
OPT_NAMES = SimMRJobRunner.OPT_NAMES | MRJobBinRunner.OPT_NAMES | {
'sort_bin',
}
_STEP_TYPES = (
SimMRJobRunner._STEP_TYPES | {'spark_jar', 'spark_script'})
def __init__(self, **kwargs):
"""Arguments to this constructor may also appear in :file:`mrjob.conf`
under ``runners/local``.
:py:class:`~mrjob.local.LocalMRJobRunner`'s constructor takes the
same keyword args as
:py:class:`~mrjob.runner.MRJobRunner`. However, please note:
* *cmdenv* is combined with :py:func:`~mrjob.conf.combine_local_envs`
* *python_bin* defaults to ``sys.executable`` (the current python
interpreter)
* *hadoop_input_format*, *hadoop_output_format*,
and *partitioner* are ignored because they
require Java. If you need to test these, consider starting up a
standalone Hadoop instance and running your job with ``-r hadoop``.
"""
super(LocalMRJobRunner, self).__init__(**kwargs)
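    # Illustrative only: options for this runner usually come from the
    # command line (``-r local``) or from a (hypothetical) mrjob.conf
    # stanza such as
    #
    #   runners:
    #     local:
    #       num_cores: 4
    #       sort_bin: ['sort', '-S', '10%']
    #
    # in addition to keyword arguments passed to the constructor.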
def _invoke_task_func(self, task_type, step_num, task_num):
args = self._substep_args(step_num, task_type)
num_steps = self._num_steps()
# stdin, stdout, stderr, wd, and env will be passed in later
return partial(
_invoke_task_in_subprocess,
task_type, step_num, task_num,
args, num_steps)
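    # Illustrative only: the partial returned above is completed later by
    # the simulation machinery, roughly
    #
    #   func = self._invoke_task_func('mapper', step_num=0, task_num=3)
    #   func(stdin, stdout, stderr, wd, env)  # runs the task subprocess
    #
    # ('mapper' and the numbers are arbitrary example values).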
def _run_step_on_spark(self, step, step_num):
if self._opts['upload_archives']:
log.warning('Spark master %r will probably ignore archives' %
self._spark_master())
spark_submit_args = self._args_for_spark_step(step_num)
env = dict(os.environ)
env.update(self._spark_cmdenv(step_num))
returncode, step_interpretation = self._run_spark_submit(
spark_submit_args, env, record_callback=_log_log4j_record)
if returncode:
error = _pick_error(dict(step=step_interpretation))
if error:
_log_probable_cause_of_failure(log, error)
reason = str(CalledProcessError(returncode, spark_submit_args))
raise StepFailedException(
reason=reason, step_num=step_num,
num_steps=self._num_steps())
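    # Illustrative only: when spark-submit exits nonzero, the *reason* above
    # is the stringified CalledProcessError, e.g. something like
    #
    #   Command '['spark-submit', '--master', 'local-cluster[...]', ...]'
    #   returned non-zero exit status 1.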
def _run_multiple(self, funcs, num_processes=None):
"""Use multiprocessing to run in parallel."""
        # respect *num_processes* if given; otherwise fall back to the
        # *num_cores* opt (``None`` means "use all CPUs")
        pool = Pool(processes=num_processes or self._opts['num_cores'])
try:
results = [
pool.apply_async(partial(_pickle_safe, func))
for func in funcs
]
for result in results:
result.get()
            # make sure that the pool (and its file descriptors, etc.)
            # doesn't stay open. This doesn't matter much for individual
            # jobs, but otherwise our automated tests eventually run out
            # of file descriptors.
pool.close()
except:
# if there's an error in one task, terminate all others
pool.terminate()
raise
finally:
pool.join()
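    # Illustrative only: callers hand this method a list of pickleable,
    # no-argument callables (e.g. the task partials from _invoke_task_func
    # with their remaining arguments bound); each callable then runs in its
    # own worker process, and the first failure terminates the rest.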
def _log_cause_of_error(self, ex):
if not isinstance(ex, _TaskFailedException):
# if something went wrong inside mrjob, the stacktrace
# will bubble up to the top level
return
# not using LogInterpretationMixin because it would be overkill
if not self._opts['read_logs']:
return
input_path = self._task_input_path(
ex.task_type, ex.step_num, ex.task_num)
stderr_path = self._task_stderr_path(
ex.task_type, ex.step_num, ex.task_num)
if self.fs.exists(stderr_path): # it should, but just to be safe
# log-parsing code expects "str", not bytes; open in text mode
with open(stderr_path) as stderr:
task_error = _parse_task_stderr(stderr)
if task_error:
task_error['path'] = stderr_path
error = dict(
split=dict(path=input_path),
task_error=task_error)
_log_probable_cause_of_failure(log, error)
return
# fallback if we can't find the error (e.g. the job does something
# weird to stderr or stack traces)
log.error('Error while reading from %s:\n' % input_path)
def _default_python_bin(self, local=False):
"""Always return *sys.executable*, if defined"""
return super(LocalMRJobRunner, self)._default_python_bin(
local=True)
    def _sort_input_func(self):
        """Try sorting with the :command:`sort` binary before falling
        back to in-memory sort."""
        if platform.system() == 'Windows':  # we assume Unix sort
            return super(LocalMRJobRunner, self)._sort_input_func()
        else:
            return partial(
                _sort_lines_with_sort_bin,
                sort_bin=self._sort_bin(),
                sort_values=self._sort_values,
                tmp_dir=self._get_local_tmp_dir())
def _sort_bin(self):
"""The binary to use to sort input.
(On Windows, we go straight to sorting in memory.)
"""
if self._opts['sort_bin']:
return self._opts['sort_bin']
elif self._sort_values:
return ['sort']
else:
# only sort on the reducer key (see #660)
return ['sort', '-t', '\t', '-k', '1,1', '-s']
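    # Illustrative only: with the default options, the command line run over
    # the input files looks like
    #
    #   sort -t <TAB> -k 1,1 -s <input files...>
    #
    # i.e. a stable sort on the first tab-separated field (the reducer key).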
# Spark steps
# TODO: _spark_master() should probably take step_num, to allow for
# step-specific jobconf
def _spark_master(self):
"""Use the local-cluster master, which simulates a Spark cluster."""
# figure out the required parameters to local-cluster
num_executors = self._num_cores()
        # for now, assign one core per executor, so we don't have to worry
        # about a core count that doesn't divide evenly among executors
cores_per_executor = 1
executor_mem_bytes = _to_num_bytes(
self._opts['jobconf'].get('spark.executor.memory') or
_DEFAULT_EXECUTOR_MEMORY)
executor_mem_mb = math.ceil(executor_mem_bytes / 1024.0 / 1024.0)
return 'local-cluster[%d,%d,%d]' % (
num_executors, cores_per_executor, executor_mem_mb)
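# Illustrative only: on a machine with 4 cores and the default executor
# memory of '1g', _spark_master() returns 'local-cluster[4,1,1024]'
# (4 executors, 1 core each, 1024 MB of memory per executor).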
def _to_num_bytes(java_mem_str):
    """Convert a Java-style memory string (e.g. ``'1g'``, ``'512m'``) or a
    plain number of bytes to an integer number of bytes."""
    if isinstance(java_mem_str, string_types):
        for i, magnitude in enumerate(('k', 'm', 'g', 't'), start=1):
            if java_mem_str.lower().endswith(magnitude):
                return int(java_mem_str[:-1]) * 1024 ** i

    return int(java_mem_str)
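# Illustrative sketch (not part of mrjob): how _to_num_bytes() interprets
# its argument. Defined but never called, so importing this module is
# unaffected.
def _example_to_num_bytes_usage():
    assert _to_num_bytes('512m') == 512 * 1024 ** 2
    assert _to_num_bytes('2G') == 2 * 1024 ** 3  # suffix is case-insensitive
    assert _to_num_bytes('1024') == 1024         # plain numbers are bytes
    assert _to_num_bytes(4096) == 4096           # ints pass straight through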
# pickle utilities, to protect multiprocessing from itself
def _invoke_task_in_subprocess(
task_type, step_num, task_num,
args, num_steps,
stdin, stdout, stderr, wd, env):
"""A pickleable function that invokes a task in a subprocess."""
log.debug('> %s' % cmd_line(args))
try:
check_call(args, stdin=stdin, stdout=stdout, stderr=stderr,
cwd=wd, env=env)
except Exception as ex:
raise _TaskFailedException(
reason=str(ex),
step_num=step_num,
num_steps=num_steps,
task_type=task_type,
task_num=task_num,
)
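# Illustrative only: this helper is what actually runs in each worker
# process. Conceptually it's close to a shell pipeline stage such as
#
#   python my_job.py --step-num=0 --mapper < input > output 2> stderr
#
# (my_job.py and the switch values are examples) except that stdin, stdout,
# stderr, the working directory and the environment are wired up explicitly.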
def _pickle_safe(func):
"""Call no-args function *func*, returning *None* and ensuring
that any exception raised is pickleable."""
try:
func() # always return None
except _TaskFailedException:
raise # we know these are pickleable
except Exception as ex:
raise Exception(repr(ex)) # we know this is pickleable
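# Illustrative sketch (not part of mrjob): why _pickle_safe() re-raises a
# plain repr()-wrapped Exception. Exceptions carrying unpicklable state (the
# lambda below is one example) can't be sent back from a worker process, but
# the wrapped version can. Defined but never called.
def _example_pickle_safe_usage():
    import pickle

    class _UnpicklableError(Exception):
        def __init__(self):
            super(_UnpicklableError, self).__init__('bad state')
            self.callback = lambda: None  # lambdas can't be pickled

    def _task():
        raise _UnpicklableError()

    try:
        _pickle_safe(_task)
    except Exception as ex:
        pickle.dumps(ex)  # the repr()-wrapped Exception pickles fine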
# other utilities
def _sort_lines_with_sort_bin(input_paths, output_path, sort_bin,
sort_values=False, tmp_dir=None):
"""Sort lines the given *input_paths* into *output_path*,
using *sort_bin*. If there is a problem, fall back to in-memory sort.
This is a helper for :py:meth:`LocalMRJobRunner._sort_input_func`.
*tmp_dir* determines the value of :envvar:`$TMP` and :envvar:`$TMPDIR`
that *sort_bin* sees.
"""
if input_paths:
env = os.environ.copy()
# ignore locale when sorting
env['LC_ALL'] = 'C'
        # Make sure that the tmp dir environment variables are changed if
        # the default is changed.
        if tmp_dir:
            env['TMP'] = tmp_dir
            env['TMPDIR'] = tmp_dir
with open(output_path, 'wb') as output:
args = sort_bin + list(input_paths)
log.debug('> %s' % cmd_line(args))
try:
check_call(args, stdout=output, env=env)
return
except CalledProcessError:
log.error(
'`%s` failed, falling back to in-memory sort' %
cmd_line(sort_bin))
except OSError:
log.error(
'no sort binary, falling back to in-memory sort')
_sort_lines_in_memory(input_paths, output_path, sort_values=sort_values)
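# Illustrative sketch (not part of mrjob): calling _sort_lines_with_sort_bin()
# directly. The file names and contents below are arbitrary example values;
# the sort flags match LocalMRJobRunner._sort_bin()'s defaults. Defined but
# never called.
def _example_sort_lines_with_sort_bin():
    import tempfile

    tmp_dir = tempfile.mkdtemp()
    input_path = os.path.join(tmp_dir, 'part-00000')
    output_path = os.path.join(tmp_dir, 'sorted')

    with open(input_path, 'wb') as f:
        f.write(b'b\t2\na\t1\nc\t3\n')

    _sort_lines_with_sort_bin(
        [input_path], output_path,
        sort_bin=['sort', '-t', '\t', '-k', '1,1', '-s'],
        tmp_dir=tmp_dir)

    # whether via a Unix sort binary or the in-memory fallback, the output
    # ends up sorted by the first tab-separated field
    with open(output_path, 'rb') as f:
        assert f.read() == b'a\t1\nb\t2\nc\t3\n'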