# Copyright 2019 Yelp and Google, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""A runner that can run jobs on Spark, with or without Hadoop."""
import json
import logging
import os.path
import posixpath
import re
from copy import deepcopy
from subprocess import CalledProcessError
from tempfile import gettempdir
from mrjob.bin import MRJobBinRunner
from mrjob.cloud import _DEFAULT_CLOUD_PART_SIZE_MB
from mrjob.conf import combine_dicts
from mrjob.compat import jobconf_from_dict
from mrjob.dataproc import _DEFAULT_CLOUD_TMP_DIR_OBJECT_TTL_DAYS
from mrjob.fs.composite import CompositeFilesystem
from mrjob.fs.gcs import GCSFilesystem
from mrjob.fs.gcs import google as google_libs_installed
from mrjob.fs.gcs import _is_permanent_google_error
from mrjob.fs.hadoop import HadoopFilesystem
from mrjob.fs.local import LocalFilesystem
from mrjob.fs.s3 import S3Filesystem
from mrjob.fs.s3 import boto3 as boto3_installed
from mrjob.fs.s3 import _is_permanent_boto3_error
from mrjob.hadoop import fully_qualify_hdfs_path
from mrjob.logs.counters import _format_counters
from mrjob.logs.errors import _log_probable_cause_of_failure
from mrjob.logs.errors import _pick_error
from mrjob.logs.step import _log_log4j_record
from mrjob.parse import is_uri
from mrjob.parse import to_uri
from mrjob.py2 import to_unicode
from mrjob.setup import UploadDirManager
from mrjob.step import StepFailedException
from mrjob.util import cmd_line
from mrjob.util import _create_zip_file

log = logging.getLogger(__name__)

_CLOSE_BRACE_AFTER_CLOSE_BRACE_RE = re.compile(r'(?<=\})\}')


class SparkMRJobRunner(MRJobBinRunner):
"""Runs a :py:class:`~mrjob.job.MRJob` on your Spark cluster (with or
without Hadoop). Invoked when you run your job with ``-r spark``.
See :ref:`running-on-your-spark-cluster` for more information.
The Spark runner can also run "classic" MRJobs directly on Spark, without
using Hadoop streaming. See :ref:`classic-mrjobs-on-spark`.
.. versionadded:: 0.6.8
"""
alias = 'spark'
# other than ``spark_*``, these options are only used for filesystems
#
# max_output_files doesn't appear here because it can only be read from
# the command line, not mrjob.conf (see #2040)
OPT_NAMES = MRJobBinRunner.OPT_NAMES | {
'aws_access_key_id',
'aws_secret_access_key',
'aws_session_token',
'cloud_fs_sync_secs',
'cloud_part_size_mb',
'emulate_map_input_file',
'gcs_region', # used when creating buckets on GCS
'hadoop_bin',
'project_id', # used by GCS filesystem
's3_endpoint',
's3_region', # used when creating buckets on S3
'skip_internal_protocol',
'spark_deploy_mode',
'spark_master',
'spark_tmp_dir', # where to put temp files in Spark
}
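    # For reference, these options would normally be set in mrjob.conf
    # under the 'spark' runner; a sketch with purely illustrative values:
    #
    #   runners:
    #     spark:
    #       spark_master: yarn
    #       spark_tmp_dir: hdfs:///user/me/tmp/mrjob
    #       cloud_part_size_mb: 100
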
# everything except Hadoop JARs
# streaming jobs will be run using mrjob/spark/harness.py (see #1972)
_STEP_TYPES = {
'spark', 'spark_jar', 'spark_script', 'streaming',
}
    def __init__(self, max_output_files=None, mrjob_cls=None, **kwargs):
        """Create a Spark runner.

        :param max_output_files: limit on number of output files when
                                 running streaming jobs. Can only be
                                 set on command line (not config file)
        :param mrjob_cls: class of the job you want to run. Used for
                          running streaming steps in Spark
        """
# need to set this before checking steps in superclass __init__()
self._mrjob_cls = mrjob_cls
super(SparkMRJobRunner, self).__init__(**kwargs)
self._max_output_files = max_output_files
if self._opts['spark_tmp_dir']:
self._check_spark_tmp_dir_opt()
self._spark_tmp_dir = self._pick_spark_tmp_dir()
# where local files are uploaded into Spark
if is_uri(self._spark_tmp_dir):
spark_files_dir = posixpath.join(self._spark_tmp_dir, 'files', '')
self._upload_mgr = UploadDirManager(spark_files_dir)
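            # (illustrative) a local file such as ./emoji_data.json would
            # then be uploaded to <spark_tmp_dir>/files/emoji_data.json and
            # referenced by that URI when building spark-submit args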
# where to put job output (if not set explicitly)
if not self._output_dir:
self._output_dir = self.fs.join(self._spark_tmp_dir, 'output')
# keep track of where the spark-submit binary is
self._spark_submit_bin = self._opts['spark_submit_bin']
# where to store a .zip file containing the MRJob, with a unique
# module name
self._job_script_zip_path = None
# counters, one per job step. (Counters will be {} for non-streaming
# steps because Spark doesn't have counters).
self._counters = []
# TODO: we may eventually want log interpretation, but it shouldn't
# include counters, as they are not found in logs.
def _check_spark_tmp_dir_opt(self):
# warn if spark_tmp_dir isn't actually visible to Spark executors
# (see #2062)
tmp_dir_is_local = to_uri(
self._opts['spark_tmp_dir']).startswith('file://')
spark_master_is_local = self._spark_master().startswith('local')
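        # (illustrative) spark_tmp_dir='/scratch/mrjob' resolves to a
        # file:// URI, so with spark_master='yarn' executors on other hosts
        # can't see it; likewise an hdfs:// tmp dir with a local[*] master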
if tmp_dir_is_local != spark_master_is_local:
log.warning(
'Warning: executors on Spark master %s may not be able to'
' access spark_tmp_dir %s' %
(self._spark_master(), self._opts['spark_tmp_dir']))
def _check_step(self, step, step_num):
"""Don't try to run steps that include commands or use manifests."""
super(SparkMRJobRunner, self)._check_step(step, step_num)
if step.get('input_manifest'):
raise NotImplementedError(
'spark runner does not support input manifests')
# we don't currently support commands, but we *could* (see #1956).
if step['type'] == 'streaming':
if not self._mrjob_cls:
raise ValueError(
'You must set mrjob_cls to run streaming steps')
for mrc in ('mapper', 'combiner', 'reducer'):
if step.get(mrc):
if 'command' in step[mrc] or 'pre_filter' in step[mrc]:
raise NotImplementedError(
"step %d's %s runs a command, but spark"
" runner does not support commands" % (
step_num, mrc))
@classmethod
def _default_opts(cls):
return combine_dicts(
super(SparkMRJobRunner, cls)._default_opts(),
dict(
cloud_part_size_mb=_DEFAULT_CLOUD_PART_SIZE_MB,
),
)
def _run(self):
self.get_spark_submit_bin() # find spark-submit up front
self._create_setup_wrapper_scripts()
self._upload_local_files()
self._run_steps_on_spark()
def _pick_spark_tmp_dir(self):
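        # (illustrative) with a made-up job key this yields something like
        # /tmp/mr_word_count.alice.20250622.081500.000000-spark on a
        # local[*] master, or hdfs:///user/alice/tmp/mrjob/<job key> when
        # running against a real cluster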
if self._opts['spark_tmp_dir']:
return self.fs.join(self._opts['spark_tmp_dir'], self._job_key)
else:
master = self._spark_master() or 'local'
if master.startswith('local'): # including local-cluster
# need a local temp dir
# add "-spark" so we don't collide with default local temp dir
return os.path.join(
gettempdir(), self._job_key + '-spark')
else:
# use HDFS (same default as HadoopJobRunner)
return posixpath.join(
fully_qualify_hdfs_path('tmp/mrjob'), self._job_key)
def _default_step_output_dir(self):
return self.fs.join(self._spark_tmp_dir, 'step-output')
def _counter_output_dir(self, step_num):
return self.fs.join(
self._spark_tmp_dir, 'counter-output-step-%d' % step_num)
def counters(self):
return deepcopy(self._counters)
@property
def fs(self):
# Spark supports basically every filesystem there is
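        # (illustrative) s3://bucket/key goes to S3Filesystem,
        # gs://bucket/key to GCSFilesystem, other URIs (e.g. hdfs:///tmp)
        # to HadoopFilesystem, and plain local paths to LocalFilesystem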
if not self._fs:
self._fs = CompositeFilesystem()
if boto3_installed:
self._fs.add_fs('s3', S3Filesystem(
aws_access_key_id=self._opts['aws_access_key_id'],
aws_secret_access_key=self._opts['aws_secret_access_key'],
aws_session_token=self._opts['aws_session_token'],
s3_endpoint=self._opts['s3_endpoint'],
s3_region=self._opts['s3_region'],
), disable_if=_is_permanent_boto3_error)
if google_libs_installed:
self._fs.add_fs('gcs', GCSFilesystem(
project_id=self._opts['project_id'],
location=self._opts['gcs_region'],
object_ttl_days=_DEFAULT_CLOUD_TMP_DIR_OBJECT_TTL_DAYS,
), disable_if=_is_permanent_google_error)
# Hadoop FS is responsible for all URIs that fall through to it
self._fs.add_fs('hadoop', HadoopFilesystem(
self._opts['hadoop_bin']))
self._fs.add_fs('local', LocalFilesystem())
return self._fs
# making mr_job_script visible in Spark
def _job_script_module_name(self):
"""A unique module name to use with the MRJob script."""
return re.sub(r'[^\w\d]', '_', self._job_key)
def _create_job_script_zip(self):
if not self._job_script_zip_path:
zip_path = os.path.join(self._get_local_tmp_dir(), 'script.zip')
name_in_zip = self._job_script_module_name() + '.py'
log.debug('archiving %s -> %s as %s' % (
self._script_path, zip_path, name_in_zip))
with _create_zip_file(zip_path) as zip_file:
zip_file.write(self._script_path, arcname=name_in_zip)
self._job_script_zip_path = zip_path
return self._job_script_zip_path
def _py_files(self):
"""Patch in :py:attr:`_job_script_zip_path`, if running streaming
steps."""
py_files = super(SparkMRJobRunner, self)._py_files()
if self._has_streaming_steps():
py_files.append(self._create_job_script_zip())
return py_files
# running the job
def _run_steps_on_spark(self):
steps = self._get_steps()
for group in self._group_steps(steps):
step_num = group['step_num']
last_step_num = step_num + len(group['steps']) - 1
# the Spark harness can run several streaming steps in one job
if step_num == last_step_num:
step_desc = 'step %d' % (step_num + 1)
else:
step_desc = 'steps %d-%d' % (step_num + 1, last_step_num + 1)
log.info('Running %s of %d' % (step_desc, len(steps)))
self._run_step_on_spark(group['steps'][0], step_num, last_step_num)
def _group_steps(self, steps):
"""Group streaming steps together."""
# a list of dicts with:
#
# type -- shared type of steps
# steps -- list of steps in group
# step_num -- (0-indexed) number of first step
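        #
        # e.g. (illustrative) step types
        #   [streaming, streaming, spark_script, streaming]
        # (where the first two share a jobconf) group as:
        #   [dict(type='streaming', steps=[s0, s1], step_num=0),
        #    dict(type='spark_script', steps=[s2], step_num=2),
        #    dict(type='streaming', steps=[s3], step_num=3)]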
groups = []
for step_num, step in enumerate(steps):
# should we add *step* to existing group of streaming steps?
if (step['type'] == 'streaming' and groups and
groups[-1]['type'] == 'streaming' and
step.get('jobconf') ==
groups[-1]['steps'][0].get('jobconf')):
groups[-1]['steps'].append(step)
else:
# start a new step group
groups.append(dict(
type=step['type'],
steps=[step],
step_num=step_num))
return groups
def _run_step_on_spark(self, step, step_num, last_step_num=None):
spark_submit_args = self._args_for_spark_step(step_num, last_step_num)
env = dict(os.environ)
env.update(self._spark_cmdenv(step_num))
returncode, step_interpretation = self._run_spark_submit(
spark_submit_args, env, record_callback=_log_log4j_record)
counters = None
if step['type'] == 'streaming':
counter_file = self.fs.join(
self._counter_output_dir(step_num), 'part-*')
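            # (illustrative) the harness dumps counters as a JSON list with
            # one dict per streaming step in the group, e.g.:
            #   [{"group name": {"counter name": 123}}, {}]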
counter_json = b''.join(self.fs.cat(counter_file))
if counter_json.strip():
# json.loads() on Python 3.4/3.5 can't take bytes
counters = json.loads(to_unicode(counter_json))
if isinstance(counters, list):
self._counters.extend(counters)
# desc_num is 1-indexed user-readable step num
for desc_num, counter_dict in enumerate(
counters, start=(step_num + 1)):
if counter_dict:
log.info(_format_counters(
counter_dict,
desc=('Counters for step %d' % desc_num)))
# for non-streaming steps, there are no counters.
# pad self._counters to match number of steps
while len(self._counters) < (last_step_num or step_num) + 1:
self._counters.append({})
if returncode:
error = _pick_error(dict(step=step_interpretation))
if error:
_log_probable_cause_of_failure(log, error)
reason = str(CalledProcessError(returncode, spark_submit_args))
raise StepFailedException(
reason=reason, step_num=step_num, last_step_num=last_step_num,
num_steps=self._num_steps())
def _spark_script_path(self, step_num):
"""For streaming steps, return the path of the harness script
(and handle other spark step types the usual way)."""
step = self._get_step(step_num)
if step['type'] == 'streaming':
return self._spark_harness_path()
else:
return super(SparkMRJobRunner, self)._spark_script_path(step_num)
def _spark_script_args(self, step_num, last_step_num=None):
"""Generate spark harness args for streaming steps (and handle
other spark step types the usual way).
"""
if last_step_num is None:
last_step_num = step_num
steps = self._get_steps()[step_num:last_step_num + 1]
if steps[0]['type'] != 'streaming':
return super(SparkMRJobRunner, self)._spark_script_args(
step_num, last_step_num)
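        # (illustrative) for a single streaming step with default options,
        # the args built below come out roughly as:
        #   ['mr_word_count_alice_..._000000.MRWordCount',
        #    '<comma-separated input URIs>', '<output URI>',
        #    '--no-hadoop-input-format', '--no-hadoop-output-format',
        #    '--no-sort-values',
        #    '--steps-desc', '<JSON describing the steps>',
        #    '--counter-output-dir', '<URI>',
        #    '--first-step-num', '0', '--last-step-num', '0']
        # plus any --job-args, codec, or reducer options that apply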
args = []
# class name
args.append('%s.%s' % (self._job_script_module_name(),
self._mrjob_cls.__name__))
# INPUT
args.append(
','.join(self._step_input_uris(step_num)))
# OUTPUT
# note that we use the output dir for the *last* step
args.append(
self._step_output_uri(last_step_num))
# --hadoop-input-format
if self._hadoop_input_format:
args.extend(['--hadoop-input-format', self._hadoop_input_format])
else:
# you can't pass --hadoop-input-format '' to EMR's script runner,
# so pass something that doesn't use an empty string (see #2055)
args.append('--no-hadoop-input-format')
# --hadoop-output-format
if self._hadoop_output_format:
args.extend(['--hadoop-output-format', self._hadoop_output_format])
else:
# alternative to --hadoop-output-format '' (see #2055)
args.append('--no-hadoop-output-format')
# --sort-values
if self._sort_values:
args.append('--sort-values')
else:
args.append('--no-sort-values')
# --steps-desc
args.extend(['--steps-desc',
_emr_proof_steps_desc(json.dumps(steps))])
# --counter-output-dir, to simulate counters
args.extend(['--counter-output-dir',
self._counter_output_dir(step_num)])
# --first-step-num, --last-step-num (step range)
args.extend(['--first-step-num', str(step_num),
'--last-step-num', str(last_step_num)])
# --job-args (passthrough args)
# if on local[*] master, keep file upload args as-is (see #2031)
job_args = self._mr_job_extra_args(
local=not self._spark_executors_have_own_wd())
if job_args:
args.extend(['--job-args', cmd_line(job_args)])
# --compression-codec
jobconf = self._jobconf_for_step(step_num)
compress_conf = jobconf_from_dict(
jobconf, 'mapreduce.output.fileoutputformat.compress')
codec_conf = jobconf_from_dict(
jobconf, 'mapreduce.output.fileoutputformat.compress.codec')
if compress_conf and compress_conf != 'false' and codec_conf:
args.extend(['--compression-codec', codec_conf])
# --num-reducers
num_reducers = jobconf_from_dict(jobconf, 'mapreduce.job.reduces')
if num_reducers and int(num_reducers) > 0:
args.extend(['--num-reducers', str(num_reducers)])
# --max-output-files
if self._max_output_files:
args.extend(['--max-output-files',
str(self._max_output_files)])
# --emulate-map-input-file
if self._opts['emulate_map_input_file']:
args.append('--emulate-map-input-file')
        # --skip-internal-protocol
if self._opts['skip_internal_protocol']:
args.append('--skip-internal-protocol')
return args
def _spark_harness_path(self):
"""Where to find the Spark harness."""
# harness requires pyspark, which may be in spark-submit's PYTHONPATH
# but not ours. So don't import the harness unless we need it.
# (See #2091)
import mrjob.spark.harness
path = mrjob.spark.harness.__file__
if path.endswith('.pyc'):
path = path[:-1]
return path
# "streaming" steps run on Spark too
def _has_spark_steps(self):
"""Treat streaming steps as Spark steps."""
return (super(SparkMRJobRunner, self)._has_spark_steps() or
self._has_streaming_steps())
def _has_hadoop_streaming_steps(self):
# the Spark runner doesn't run "streaming" steps on Hadoop
return False
def _has_streaming_steps(self):
"""Are any of our steps "streaming" steps that would normally run
on Hadoop Streaming?"""
return any(step['type'] == 'streaming' for step in self._get_steps())
def _step_type_uses_pyspark(self, step_type):
"""Treat streaming steps as Spark steps that use Python."""
return (
super(SparkMRJobRunner, self)._step_type_uses_pyspark(step_type) or
step_type == 'streaming')
def _step_type_uses_spark(self, step_type):
"""Treat streaming steps as Spark steps that use Python."""
return (
super(SparkMRJobRunner, self)._step_type_uses_spark(step_type) or
step_type == 'streaming')


def _emr_proof_steps_desc(steps_desc):
# EMR's command-runner.jar does some very strange things to
# arguments, including deleting empty args and deleting
# '}}' from arguments. See #2070
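    # e.g. (illustrative) '..., "jobconf": {}}]' becomes
    # '..., "jobconf": {} }]', which survives command-runner.jar and still
    # parses as JSON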
return _CLOSE_BRACE_AFTER_CLOSE_BRACE_RE.sub(' }', steps_desc)