# GildedRose-Refactoring-Kata/.venv/lib/python3.12/site-packages/mrjob/local.py
# -*- coding: utf-8 -*-
# Copyright 2009-2013 Yelp and Contributors
# Copyright 2015-2017 Yelp
# Copyright 2018 Yelp and Contributors
# Copyright 2019 Yelp
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Run an MRJob locally by forking off a bunch of processes and piping
them together. Useful for testing, not terrible for running medium-sized
jobs on all CPUs."""
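# Illustrative only: this runner is normally selected from the command line
# rather than instantiated directly, e.g.
#
#   python my_job.py -r local input1.txt input2.txt > output
#
# (my_job.py stands in for whatever MRJob subclass you're running).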
import logging
import math
import os
import platform
from functools import partial
from multiprocessing import Pool
from subprocess import CalledProcessError
from subprocess import check_call
try:
import pty
pty # quiet "redefinition of unused ..." warning from pyflakes
except ImportError:
pty = None
from mrjob.bin import MRJobBinRunner
from mrjob.logs.errors import _log_probable_cause_of_failure
from mrjob.logs.errors import _pick_error
from mrjob.logs.step import _log_log4j_record
from mrjob.logs.task import _parse_task_stderr
from mrjob.py2 import string_types
from mrjob.sim import SimMRJobRunner
from mrjob.sim import _sort_lines_in_memory
from mrjob.step import StepFailedException
from mrjob.util import cmd_line
log = logging.getLogger(__name__)
_DEFAULT_EXECUTOR_MEMORY = '1g'
class _TaskFailedException(StepFailedException):
"""Extension of :py:class:`~mrjob.step.StepFailedException` that
blames one particular task."""
_FIELDS = StepFailedException._FIELDS + ('task_type', 'task_num')
def __init__(
self, reason=None, step_num=None, num_steps=None, step_desc=None,
task_type=None, task_num=None):
super(_TaskFailedException, self).__init__(
reason=reason, step_num=step_num,
num_steps=num_steps, step_desc=step_desc)
self.task_type = task_type
self.task_num = task_num
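# Illustrative only: code that catches this exception can name the failing
# task, along the lines of
#
#   try:
#       ...  # run the step's tasks
#   except _TaskFailedException as ex:
#       log.error('%s task #%s of step %s failed' % (
#           ex.task_type, ex.task_num, ex.step_num))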
class LocalMRJobRunner(SimMRJobRunner, MRJobBinRunner):
"""Runs an :py:class:`~mrjob.job.MRJob` locally, for testing purposes.
Invoked when you run your job with ``-r local``.
    Unlike :py:class:`~mrjob.inline.InlineMRJobRunner`, this actually spawns
    multiple subprocesses for each task.
It's rare to need to instantiate this class directly (see
:py:meth:`~LocalMRJobRunner.__init__` for details).
.. versionadded:: 0.6.8
can run Spark steps as well, on the ``local-cluster`` Spark master.
"""
alias = 'local'
OPT_NAMES = SimMRJobRunner.OPT_NAMES | MRJobBinRunner.OPT_NAMES | {
'sort_bin',
}
_STEP_TYPES = (
SimMRJobRunner._STEP_TYPES | {'spark_jar', 'spark_script'})
def __init__(self, **kwargs):
"""Arguments to this constructor may also appear in :file:`mrjob.conf`
under ``runners/local``.
:py:class:`~mrjob.local.LocalMRJobRunner`'s constructor takes the
same keyword args as
:py:class:`~mrjob.runner.MRJobRunner`. However, please note:
* *cmdenv* is combined with :py:func:`~mrjob.conf.combine_local_envs`
* *python_bin* defaults to ``sys.executable`` (the current python
interpreter)
* *hadoop_input_format*, *hadoop_output_format*,
and *partitioner* are ignored because they
require Java. If you need to test these, consider starting up a
standalone Hadoop instance and running your job with ``-r hadoop``.
"""
super(LocalMRJobRunner, self).__init__(**kwargs)
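    # Illustrative only: options for this runner usually come from the
    # command line (``-r local``) or from a (hypothetical) mrjob.conf
    # stanza such as
    #
    #   runners:
    #     local:
    #       num_cores: 4
    #       sort_bin: ['sort', '-S', '10%']
    #
    # in addition to keyword arguments passed to the constructor.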
def _invoke_task_func(self, task_type, step_num, task_num):
args = self._substep_args(step_num, task_type)
num_steps = self._num_steps()
# stdin, stdout, stderr, wd, and env will be passed in later
return partial(
_invoke_task_in_subprocess,
task_type, step_num, task_num,
args, num_steps)
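    # Illustrative only: the partial returned above is completed later by
    # the simulation machinery, roughly
    #
    #   func = self._invoke_task_func('mapper', step_num=0, task_num=3)
    #   func(stdin, stdout, stderr, wd, env)  # runs the task subprocess
    #
    # ('mapper' and the numbers are arbitrary example values).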
def _run_step_on_spark(self, step, step_num):
if self._opts['upload_archives']:
log.warning('Spark master %r will probably ignore archives' %
self._spark_master())
spark_submit_args = self._args_for_spark_step(step_num)
env = dict(os.environ)
env.update(self._spark_cmdenv(step_num))
returncode, step_interpretation = self._run_spark_submit(
spark_submit_args, env, record_callback=_log_log4j_record)
if returncode:
error = _pick_error(dict(step=step_interpretation))
if error:
_log_probable_cause_of_failure(log, error)
reason = str(CalledProcessError(returncode, spark_submit_args))
raise StepFailedException(
reason=reason, step_num=step_num,
num_steps=self._num_steps())
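    # Illustrative only: when spark-submit exits nonzero, the *reason* above
    # is the stringified CalledProcessError, e.g. something like
    #
    #   Command '['spark-submit', '--master', 'local-cluster[...]', ...]'
    #   returned non-zero exit status 1.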
def _run_multiple(self, funcs, num_processes=None):
"""Use multiprocessing to run in parallel."""
        # respect *num_processes* if given; otherwise fall back to the
        # *num_cores* opt (``None`` means "use all CPUs")
        pool = Pool(processes=num_processes or self._opts['num_cores'])
try:
results = [
pool.apply_async(partial(_pickle_safe, func))
for func in funcs
]
for result in results:
result.get()
            # make sure that the pool (and its file descriptors, etc.)
            # doesn't stay open. This doesn't matter much for individual
            # jobs, but otherwise our automated tests eventually run out
            # of file descriptors.
pool.close()
except:
# if there's an error in one task, terminate all others
pool.terminate()
raise
finally:
pool.join()
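    # Illustrative only: callers hand this method a list of pickleable,
    # no-argument callables (e.g. the task partials from _invoke_task_func
    # with their remaining arguments bound); each callable then runs in its
    # own worker process, and the first failure terminates the rest.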
def _log_cause_of_error(self, ex):
if not isinstance(ex, _TaskFailedException):
# if something went wrong inside mrjob, the stacktrace
# will bubble up to the top level
return
# not using LogInterpretationMixin because it would be overkill
if not self._opts['read_logs']:
return
input_path = self._task_input_path(
ex.task_type, ex.step_num, ex.task_num)
stderr_path = self._task_stderr_path(
ex.task_type, ex.step_num, ex.task_num)
if self.fs.exists(stderr_path): # it should, but just to be safe
# log-parsing code expects "str", not bytes; open in text mode
with open(stderr_path) as stderr:
task_error = _parse_task_stderr(stderr)
if task_error:
task_error['path'] = stderr_path
error = dict(
split=dict(path=input_path),
task_error=task_error)
_log_probable_cause_of_failure(log, error)
return
# fallback if we can't find the error (e.g. the job does something
# weird to stderr or stack traces)
log.error('Error while reading from %s:\n' % input_path)
def _default_python_bin(self, local=False):
"""Always return *sys.executable*, if defined"""
return super(LocalMRJobRunner, self)._default_python_bin(
local=True)
    def _sort_input_func(self):
        """Try sorting with the :command:`sort` binary before falling
        back to in-memory sort."""
        if platform.system() == 'Windows':  # we assume Unix sort
            return super(LocalMRJobRunner, self)._sort_input_func()
        else:
            return partial(
                _sort_lines_with_sort_bin,
                sort_bin=self._sort_bin(),
                sort_values=self._sort_values,
                tmp_dir=self._get_local_tmp_dir())
def _sort_bin(self):
"""The binary to use to sort input.
(On Windows, we go straight to sorting in memory.)
"""
if self._opts['sort_bin']:
return self._opts['sort_bin']
elif self._sort_values:
return ['sort']
else:
# only sort on the reducer key (see #660)
return ['sort', '-t', '\t', '-k', '1,1', '-s']
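    # Illustrative only: with the default options, the command line run over
    # the input files looks like
    #
    #   sort -t <TAB> -k 1,1 -s <input files...>
    #
    # i.e. a stable sort on the first tab-separated field (the reducer key).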
# Spark steps
# TODO: _spark_master() should probably take step_num, to allow for
# step-specific jobconf
def _spark_master(self):
"""Use the local-cluster master, which simulates a Spark cluster."""
# figure out the required parameters to local-cluster
num_executors = self._num_cores()
        # for now, assign one core per executor, so we don't have to worry
        # about a core count that doesn't divide evenly among executors
cores_per_executor = 1
executor_mem_bytes = _to_num_bytes(
self._opts['jobconf'].get('spark.executor.memory') or
_DEFAULT_EXECUTOR_MEMORY)
executor_mem_mb = math.ceil(executor_mem_bytes / 1024.0 / 1024.0)
return 'local-cluster[%d,%d,%d]' % (
num_executors, cores_per_executor, executor_mem_mb)
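# Illustrative only: on a machine with 4 cores and the default executor
# memory of '1g', _spark_master() returns 'local-cluster[4,1,1024]'
# (4 executors, 1 core each, 1024 MB of memory per executor).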
def _to_num_bytes(java_mem_str):
    """Convert a Java-style memory string (e.g. ``'1g'``, ``'512m'``) or a
    plain number of bytes to an integer number of bytes."""
    if isinstance(java_mem_str, string_types):
        for i, magnitude in enumerate(('k', 'm', 'g', 't'), start=1):
            if java_mem_str.lower().endswith(magnitude):
                return int(java_mem_str[:-1]) * 1024 ** i

    return int(java_mem_str)
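# Illustrative sketch (not part of mrjob): how _to_num_bytes() interprets
# its argument. Defined but never called, so importing this module is
# unaffected.
def _example_to_num_bytes_usage():
    assert _to_num_bytes('512m') == 512 * 1024 ** 2
    assert _to_num_bytes('2G') == 2 * 1024 ** 3  # suffix is case-insensitive
    assert _to_num_bytes('1024') == 1024         # plain numbers are bytes
    assert _to_num_bytes(4096) == 4096           # ints pass straight through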
# pickle utilities, to protect multiprocessing from itself
def _invoke_task_in_subprocess(
task_type, step_num, task_num,
args, num_steps,
stdin, stdout, stderr, wd, env):
"""A pickleable function that invokes a task in a subprocess."""
log.debug('> %s' % cmd_line(args))
try:
check_call(args, stdin=stdin, stdout=stdout, stderr=stderr,
cwd=wd, env=env)
except Exception as ex:
raise _TaskFailedException(
reason=str(ex),
step_num=step_num,
num_steps=num_steps,
task_type=task_type,
task_num=task_num,
)
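# Illustrative only: this helper is what actually runs in each worker
# process. Conceptually it's close to a shell pipeline stage such as
#
#   python my_job.py --step-num=0 --mapper < input > output 2> stderr
#
# (my_job.py and the switch values are examples) except that stdin, stdout,
# stderr, the working directory and the environment are wired up explicitly.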
def _pickle_safe(func):
"""Call no-args function *func*, returning *None* and ensuring
that any exception raised is pickleable."""
try:
func() # always return None
except _TaskFailedException:
raise # we know these are pickleable
except Exception as ex:
raise Exception(repr(ex)) # we know this is pickleable
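# Illustrative sketch (not part of mrjob): why _pickle_safe() re-raises a
# plain repr()-wrapped Exception. Exceptions carrying unpicklable state (the
# lambda below is one example) can't be sent back from a worker process, but
# the wrapped version can. Defined but never called.
def _example_pickle_safe_usage():
    import pickle

    class _UnpicklableError(Exception):
        def __init__(self):
            super(_UnpicklableError, self).__init__('bad state')
            self.callback = lambda: None  # lambdas can't be pickled

    def _task():
        raise _UnpicklableError()

    try:
        _pickle_safe(_task)
    except Exception as ex:
        pickle.dumps(ex)  # the repr()-wrapped Exception pickles fine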
# other utilities
def _sort_lines_with_sort_bin(input_paths, output_path, sort_bin,
sort_values=False, tmp_dir=None):
"""Sort lines the given *input_paths* into *output_path*,
using *sort_bin*. If there is a problem, fall back to in-memory sort.
This is a helper for :py:meth:`LocalMRJobRunner._sort_input_func`.
*tmp_dir* determines the value of :envvar:`$TMP` and :envvar:`$TMPDIR`
that *sort_bin* sees.
"""
if input_paths:
env = os.environ.copy()
# ignore locale when sorting
env['LC_ALL'] = 'C'
        # Make sure that the tmp dir environment variables are changed if
        # the default is changed.
        if tmp_dir:
            env['TMP'] = tmp_dir
            env['TMPDIR'] = tmp_dir
with open(output_path, 'wb') as output:
args = sort_bin + list(input_paths)
log.debug('> %s' % cmd_line(args))
try:
check_call(args, stdout=output, env=env)
return
except CalledProcessError:
log.error(
'`%s` failed, falling back to in-memory sort' %
cmd_line(sort_bin))
except OSError:
log.error(
'no sort binary, falling back to in-memory sort')
_sort_lines_in_memory(input_paths, output_path, sort_values=sort_values)
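# Illustrative sketch (not part of mrjob): calling _sort_lines_with_sort_bin()
# directly. The file names and contents below are arbitrary example values;
# the sort flags match LocalMRJobRunner._sort_bin()'s defaults. Defined but
# never called.
def _example_sort_lines_with_sort_bin():
    import tempfile

    tmp_dir = tempfile.mkdtemp()
    input_path = os.path.join(tmp_dir, 'part-00000')
    output_path = os.path.join(tmp_dir, 'sorted')

    with open(input_path, 'wb') as f:
        f.write(b'b\t2\na\t1\nc\t3\n')

    _sort_lines_with_sort_bin(
        [input_path], output_path,
        sort_bin=['sort', '-t', '\t', '-k', '1,1', '-s'],
        tmp_dir=tmp_dir)

    # whether via a Unix sort binary or the in-memory fallback, the output
    # ends up sorted by the first tab-separated field
    with open(output_path, 'rb') as f:
        assert f.read() == b'a\t1\nb\t2\nc\t3\n'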