# Copyright 2019 Yelp and Google, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""A runner that can run jobs on Spark, with or without Hadoop."""
|
|
import json
|
|
import logging
|
|
import os.path
|
|
import posixpath
|
|
import re
|
|
from copy import deepcopy
|
|
from subprocess import CalledProcessError
|
|
from tempfile import gettempdir
|
|
|
|
from mrjob.bin import MRJobBinRunner
|
|
from mrjob.cloud import _DEFAULT_CLOUD_PART_SIZE_MB
|
|
from mrjob.conf import combine_dicts
|
|
from mrjob.compat import jobconf_from_dict
|
|
from mrjob.dataproc import _DEFAULT_CLOUD_TMP_DIR_OBJECT_TTL_DAYS
|
|
from mrjob.fs.composite import CompositeFilesystem
|
|
from mrjob.fs.gcs import GCSFilesystem
|
|
from mrjob.fs.gcs import google as google_libs_installed
|
|
from mrjob.fs.gcs import _is_permanent_google_error
|
|
from mrjob.fs.hadoop import HadoopFilesystem
|
|
from mrjob.fs.local import LocalFilesystem
|
|
from mrjob.fs.s3 import S3Filesystem
|
|
from mrjob.fs.s3 import boto3 as boto3_installed
|
|
from mrjob.fs.s3 import _is_permanent_boto3_error
|
|
from mrjob.hadoop import fully_qualify_hdfs_path
|
|
from mrjob.logs.counters import _format_counters
|
|
from mrjob.logs.errors import _log_probable_cause_of_failure
|
|
from mrjob.logs.errors import _pick_error
|
|
from mrjob.logs.step import _log_log4j_record
|
|
from mrjob.parse import is_uri
|
|
from mrjob.parse import to_uri
|
|
from mrjob.py2 import to_unicode
|
|
from mrjob.setup import UploadDirManager
|
|
from mrjob.step import StepFailedException
|
|
from mrjob.util import cmd_line
|
|
from mrjob.util import _create_zip_file
|
|
|
|
log = logging.getLogger(__name__)
|
|
|
|
_CLOSE_BRACE_AFTER_CLOSE_BRACE_RE = re.compile(r'(?<=\})\}')
|
|
|
|
|
|
class SparkMRJobRunner(MRJobBinRunner):
|
|
"""Runs a :py:class:`~mrjob.job.MRJob` on your Spark cluster (with or
|
|
without Hadoop). Invoked when you run your job with ``-r spark``.
|
|
|
|
See :ref:`running-on-your-spark-cluster` for more information.
|
|
|
|
The Spark runner can also run "classic" MRJobs directly on Spark, without
|
|
using Hadoop streaming. See :ref:`classic-mrjobs-on-spark`.
|
|
|
|
.. versionadded:: 0.6.8
|
|
"""
|
|
alias = 'spark'
|
|
|
|
# other than ``spark_*``, these options are only used for filesystems
|
|
#
|
|
# max_output_files doesn't appear here because it can only be read from
|
|
# the command line, not mrjob.conf (see #2040)
|
|
OPT_NAMES = MRJobBinRunner.OPT_NAMES | {
|
|
'aws_access_key_id',
|
|
'aws_secret_access_key',
|
|
'aws_session_token',
|
|
'cloud_fs_sync_secs',
|
|
'cloud_part_size_mb',
|
|
'emulate_map_input_file',
|
|
'gcs_region', # used when creating buckets on GCS
|
|
'hadoop_bin',
|
|
'project_id', # used by GCS filesystem
|
|
's3_endpoint',
|
|
's3_region', # used when creating buckets on S3
|
|
'skip_internal_protocol',
|
|
'spark_deploy_mode',
|
|
'spark_master',
|
|
'spark_tmp_dir', # where to put temp files in Spark
|
|
}
|
|
|
|
# everything except Hadoop JARs
|
|
# streaming jobs will be run using mrjob/spark/harness.py (see #1972)
|
|
_STEP_TYPES = {
|
|
'spark', 'spark_jar', 'spark_script', 'streaming',
|
|
}
|
|
|
|
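
    # Illustrative usage (not part of this module): with mrjob installed, a
    # job script is typically launched against this runner from the command
    # line, e.g.
    #
    #   python your_mr_job.py -r spark --spark-master yarn input.txt
    #
    # where your_mr_job.py and input.txt are hypothetical; --spark-master
    # corresponds to the spark_master opt above.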

    def __init__(self, max_output_files=None, mrjob_cls=None, **kwargs):
        """Create a Spark runner.

        :param max_output_files: limit on number of output files when
                                 running streaming jobs. Can only be
                                 set on command line (not config file)
        :param mrjob_cls: class of the job you want to run. Used for
                          running streaming steps in Spark
        """
        # need to set this before checking steps in superclass __init__()
        self._mrjob_cls = mrjob_cls

        super(SparkMRJobRunner, self).__init__(**kwargs)

        self._max_output_files = max_output_files

        if self._opts['spark_tmp_dir']:
            self._check_spark_tmp_dir_opt()

        self._spark_tmp_dir = self._pick_spark_tmp_dir()

        # where local files are uploaded into Spark
        if is_uri(self._spark_tmp_dir):
            spark_files_dir = posixpath.join(self._spark_tmp_dir, 'files', '')
            self._upload_mgr = UploadDirManager(spark_files_dir)

        # where to put job output (if not set explicitly)
        if not self._output_dir:
            self._output_dir = self.fs.join(self._spark_tmp_dir, 'output')

        # keep track of where the spark-submit binary is
        self._spark_submit_bin = self._opts['spark_submit_bin']

        # where to store a .zip file containing the MRJob, with a unique
        # module name
        self._job_script_zip_path = None

        # counters, one per job step. (Counters will be {} for non-streaming
        # steps because Spark doesn't have counters).
        self._counters = []

        # TODO: we may eventually want log interpretation, but it shouldn't
        # include counters, as they are not found in logs.

    def _check_spark_tmp_dir_opt(self):
        # warn if spark_tmp_dir isn't actually visible to Spark executors
        # (see #2062)
        tmp_dir_is_local = to_uri(
            self._opts['spark_tmp_dir']).startswith('file://')
        spark_master_is_local = self._spark_master().startswith('local')

        if tmp_dir_is_local != spark_master_is_local:
            log.warning(
                'Warning: executors on Spark master %s may not be able to'
                ' access spark_tmp_dir %s' %
                (self._spark_master(), self._opts['spark_tmp_dir']))
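
    # Hypothetical example of the mismatch warned about above: setting
    # spark_tmp_dir to a local path like /tmp/mrjob-spark while running
    # against spark_master=yarn would trigger the warning, since YARN
    # executors can't see the driver machine's local filesystem; an
    # hdfs:// or s3:// tmp dir would not.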

    def _check_step(self, step, step_num):
        """Don't try to run steps that include commands or use manifests."""
        super(SparkMRJobRunner, self)._check_step(step, step_num)

        if step.get('input_manifest'):
            raise NotImplementedError(
                'spark runner does not support input manifests')

        # we don't currently support commands, but we *could* (see #1956).
        if step['type'] == 'streaming':
            if not self._mrjob_cls:
                raise ValueError(
                    'You must set mrjob_cls to run streaming steps')

            for mrc in ('mapper', 'combiner', 'reducer'):
                if step.get(mrc):
                    if 'command' in step[mrc] or 'pre_filter' in step[mrc]:
                        raise NotImplementedError(
                            "step %d's %s runs a command, but spark"
                            " runner does not support commands" % (
                                step_num, mrc))

    @classmethod
    def _default_opts(cls):
        return combine_dicts(
            super(SparkMRJobRunner, cls)._default_opts(),
            dict(
                cloud_part_size_mb=_DEFAULT_CLOUD_PART_SIZE_MB,
            ),
        )

    def _run(self):
        self.get_spark_submit_bin()  # find spark-submit up front
        self._create_setup_wrapper_scripts()
        self._upload_local_files()
        self._run_steps_on_spark()

    def _pick_spark_tmp_dir(self):
        if self._opts['spark_tmp_dir']:
            return self.fs.join(self._opts['spark_tmp_dir'], self._job_key)
        else:
            master = self._spark_master() or 'local'
            if master.startswith('local'):  # including local-cluster
                # need a local temp dir
                # add "-spark" so we don't collide with default local temp dir
                return os.path.join(
                    gettempdir(), self._job_key + '-spark')
            else:
                # use HDFS (same default as HadoopJobRunner)
                return posixpath.join(
                    fully_qualify_hdfs_path('tmp/mrjob'), self._job_key)
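
    # Sketch of the resulting defaults (paths illustrative, not verified):
    # on a local[*] master this picks something like
    # /tmp/<job_key>-spark, while on yarn or spark:// masters it picks
    # something like hdfs:///user/<you>/tmp/mrjob/<job_key>.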

    def _default_step_output_dir(self):
        return self.fs.join(self._spark_tmp_dir, 'step-output')

    def _counter_output_dir(self, step_num):
        return self.fs.join(
            self._spark_tmp_dir, 'counter-output-step-%d' % step_num)

    def counters(self):
        return deepcopy(self._counters)

    @property
    def fs(self):
        # Spark supports basically every filesystem there is

        if not self._fs:
            self._fs = CompositeFilesystem()

            if boto3_installed:
                self._fs.add_fs('s3', S3Filesystem(
                    aws_access_key_id=self._opts['aws_access_key_id'],
                    aws_secret_access_key=self._opts['aws_secret_access_key'],
                    aws_session_token=self._opts['aws_session_token'],
                    s3_endpoint=self._opts['s3_endpoint'],
                    s3_region=self._opts['s3_region'],
                ), disable_if=_is_permanent_boto3_error)

            if google_libs_installed:
                self._fs.add_fs('gcs', GCSFilesystem(
                    project_id=self._opts['project_id'],
                    location=self._opts['gcs_region'],
                    object_ttl_days=_DEFAULT_CLOUD_TMP_DIR_OBJECT_TTL_DAYS,
                ), disable_if=_is_permanent_google_error)

            # Hadoop FS is responsible for all URIs that fall through to it
            self._fs.add_fs('hadoop', HadoopFilesystem(
                self._opts['hadoop_bin']))

            self._fs.add_fs('local', LocalFilesystem())

        return self._fs
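
    # Dispatch, as implied by the composition above: s3:// URIs go to
    # S3Filesystem, gs:// URIs to GCSFilesystem, other URIs (hdfs:// etc.)
    # fall through to HadoopFilesystem, and plain local paths are handled
    # by LocalFilesystem.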

    # making mr_job_script visible in Spark

    def _job_script_module_name(self):
        """A unique module name to use with the MRJob script."""
        return re.sub(r'[^\w\d]', '_', self._job_key)
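
    # For example (hypothetical job key), a job key like
    # 'mr_word_count.jane.20190101.000000.000000' would yield the module
    # name 'mr_word_count_jane_20190101_000000_000000'.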

    def _create_job_script_zip(self):
        if not self._job_script_zip_path:
            zip_path = os.path.join(self._get_local_tmp_dir(), 'script.zip')
            name_in_zip = self._job_script_module_name() + '.py'

            log.debug('archiving %s -> %s as %s' % (
                self._script_path, zip_path, name_in_zip))
            with _create_zip_file(zip_path) as zip_file:
                zip_file.write(self._script_path, arcname=name_in_zip)

            self._job_script_zip_path = zip_path

        return self._job_script_zip_path

    def _py_files(self):
        """Patch in :py:attr:`_job_script_zip_path`, if running streaming
        steps."""
        py_files = super(SparkMRJobRunner, self)._py_files()

        if self._has_streaming_steps():
            py_files.append(self._create_job_script_zip())

        return py_files

    # running the job

    def _run_steps_on_spark(self):
        steps = self._get_steps()

        for group in self._group_steps(steps):
            step_num = group['step_num']
            last_step_num = step_num + len(group['steps']) - 1

            # the Spark harness can run several streaming steps in one job
            if step_num == last_step_num:
                step_desc = 'step %d' % (step_num + 1)
            else:
                step_desc = 'steps %d-%d' % (step_num + 1, last_step_num + 1)

            log.info('Running %s of %d' % (step_desc, len(steps)))

            self._run_step_on_spark(group['steps'][0], step_num, last_step_num)

    def _group_steps(self, steps):
        """Group streaming steps together."""
        # a list of dicts with:
        #
        # type -- shared type of steps
        # steps -- list of steps in group
        # step_num -- (0-indexed) number of first step
        groups = []

        for step_num, step in enumerate(steps):
            # should we add *step* to existing group of streaming steps?
            if (step['type'] == 'streaming' and groups and
                    groups[-1]['type'] == 'streaming' and
                    step.get('jobconf') ==
                    groups[-1]['steps'][0].get('jobconf')):
                groups[-1]['steps'].append(step)
            else:
                # start a new step group
                groups.append(dict(
                    type=step['type'],
                    steps=[step],
                    step_num=step_num))

        return groups
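
    # Hypothetical example: given steps [streaming, streaming, spark_script]
    # where both streaming steps share the same jobconf, this returns two
    # groups: a 'streaming' group holding steps 0-1 (step_num=0), and a
    # 'spark_script' group holding step 2 (step_num=2).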

    def _run_step_on_spark(self, step, step_num, last_step_num=None):
        spark_submit_args = self._args_for_spark_step(step_num, last_step_num)

        env = dict(os.environ)
        env.update(self._spark_cmdenv(step_num))

        returncode, step_interpretation = self._run_spark_submit(
            spark_submit_args, env, record_callback=_log_log4j_record)

        counters = None
        if step['type'] == 'streaming':
            counter_file = self.fs.join(
                self._counter_output_dir(step_num), 'part-*')
            counter_json = b''.join(self.fs.cat(counter_file))
            if counter_json.strip():
                # json.loads() on Python 3.4/3.5 can't take bytes
                counters = json.loads(to_unicode(counter_json))

        if isinstance(counters, list):
            self._counters.extend(counters)

            # desc_num is 1-indexed user-readable step num
            for desc_num, counter_dict in enumerate(
                    counters, start=(step_num + 1)):
                if counter_dict:
                    log.info(_format_counters(
                        counter_dict,
                        desc=('Counters for step %d' % desc_num)))

        # for non-streaming steps, there are no counters.
        # pad self._counters to match number of steps
        while len(self._counters) < (last_step_num or step_num) + 1:
            self._counters.append({})

        if returncode:
            error = _pick_error(dict(step=step_interpretation))
            if error:
                _log_probable_cause_of_failure(log, error)

            reason = str(CalledProcessError(returncode, spark_submit_args))
            raise StepFailedException(
                reason=reason, step_num=step_num, last_step_num=last_step_num,
                num_steps=self._num_steps())
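
    # Note on the counter file read above: judging from how it's used, the
    # harness is expected to write a JSON list with one
    # {group: {counter: amount}} dict per streaming step, e.g.
    # [{"words": {"lines_seen": 12}}] (example values are made up).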

    def _spark_script_path(self, step_num):
        """For streaming steps, return the path of the harness script
        (and handle other spark step types the usual way)."""
        step = self._get_step(step_num)

        if step['type'] == 'streaming':
            return self._spark_harness_path()
        else:
            return super(SparkMRJobRunner, self)._spark_script_path(step_num)

    def _spark_script_args(self, step_num, last_step_num=None):
        """Generate spark harness args for streaming steps (and handle
        other spark step types the usual way).
        """
        if last_step_num is None:
            last_step_num = step_num

        steps = self._get_steps()[step_num:last_step_num + 1]

        if steps[0]['type'] != 'streaming':
            return super(SparkMRJobRunner, self)._spark_script_args(
                step_num, last_step_num)

        args = []

        # class name
        args.append('%s.%s' % (self._job_script_module_name(),
                               self._mrjob_cls.__name__))

        # INPUT
        args.append(
            ','.join(self._step_input_uris(step_num)))

        # OUTPUT
        # note that we use the output dir for the *last* step
        args.append(
            self._step_output_uri(last_step_num))

        # --hadoop-input-format
        if self._hadoop_input_format:
            args.extend(['--hadoop-input-format', self._hadoop_input_format])
        else:
            # you can't pass --hadoop-input-format '' to EMR's script runner,
            # so pass something that doesn't use an empty string (see #2055)
            args.append('--no-hadoop-input-format')

        # --hadoop-output-format
        if self._hadoop_output_format:
            args.extend(['--hadoop-output-format', self._hadoop_output_format])
        else:
            # alternative to --hadoop-output-format '' (see #2055)
            args.append('--no-hadoop-output-format')

        # --sort-values
        if self._sort_values:
            args.append('--sort-values')
        else:
            args.append('--no-sort-values')

        # --steps-desc
        args.extend(['--steps-desc',
                     _emr_proof_steps_desc(json.dumps(steps))])

        # --counter-output-dir, to simulate counters
        args.extend(['--counter-output-dir',
                     self._counter_output_dir(step_num)])

        # --first-step-num, --last-step-num (step range)
        args.extend(['--first-step-num', str(step_num),
                     '--last-step-num', str(last_step_num)])

        # --job-args (passthrough args)

        # if on local[*] master, keep file upload args as-is (see #2031)
        job_args = self._mr_job_extra_args(
            local=not self._spark_executors_have_own_wd())

        if job_args:
            args.extend(['--job-args', cmd_line(job_args)])

        # --compression-codec
        jobconf = self._jobconf_for_step(step_num)

        compress_conf = jobconf_from_dict(
            jobconf, 'mapreduce.output.fileoutputformat.compress')
        codec_conf = jobconf_from_dict(
            jobconf, 'mapreduce.output.fileoutputformat.compress.codec')

        if compress_conf and compress_conf != 'false' and codec_conf:
            args.extend(['--compression-codec', codec_conf])

        # --num-reducers
        num_reducers = jobconf_from_dict(jobconf, 'mapreduce.job.reduces')
        if num_reducers and int(num_reducers) > 0:
            args.extend(['--num-reducers', str(num_reducers)])

        # --max-output-files
        if self._max_output_files:
            args.extend(['--max-output-files',
                         str(self._max_output_files)])

        # --emulate-map-input-file
        if self._opts['emulate_map_input_file']:
            args.append('--emulate-map-input-file')

        # --skip-internal-protocol
        if self._opts['skip_internal_protocol']:
            args.append('--skip-internal-protocol')

        return args
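
    # Rough sketch of the result for a single streaming step (all values
    # illustrative, not taken from a real run):
    #
    #   ['mr_your_job_<job_key>.MRYourJob',        # class name
    #    'hdfs://.../input', 'hdfs://.../output',  # INPUT, OUTPUT
    #    '--no-hadoop-input-format', '--no-hadoop-output-format',
    #    '--no-sort-values',
    #    '--steps-desc', '[{"type": "streaming", ...}]',
    #    '--counter-output-dir', 'hdfs://.../counter-output-step-0',
    #    '--first-step-num', '0', '--last-step-num', '0']
    #
    # plus whichever optional switches above apply.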

    def _spark_harness_path(self):
        """Where to find the Spark harness."""
        # harness requires pyspark, which may be in spark-submit's PYTHONPATH
        # but not ours. So don't import the harness unless we need it.
        # (See #2091)
        import mrjob.spark.harness
        path = mrjob.spark.harness.__file__
        if path.endswith('.pyc'):
            path = path[:-1]
        return path

    # "streaming" steps run on Spark too

    def _has_spark_steps(self):
        """Treat streaming steps as Spark steps."""
        return (super(SparkMRJobRunner, self)._has_spark_steps() or
                self._has_streaming_steps())

    def _has_hadoop_streaming_steps(self):
        # the Spark runner doesn't run "streaming" steps on Hadoop
        return False

    def _has_streaming_steps(self):
        """Are any of our steps "streaming" steps that would normally run
        on Hadoop Streaming?"""
        return any(step['type'] == 'streaming' for step in self._get_steps())

    def _step_type_uses_pyspark(self, step_type):
        """Treat streaming steps as Spark steps that use Python."""
        return (
            super(SparkMRJobRunner, self)._step_type_uses_pyspark(step_type) or
            step_type == 'streaming')

    def _step_type_uses_spark(self, step_type):
        """Treat streaming steps as Spark steps."""
        return (
            super(SparkMRJobRunner, self)._step_type_uses_spark(step_type) or
            step_type == 'streaming')


def _emr_proof_steps_desc(steps_desc):
    # EMR's command-runner.jar does some very strange things to
    # arguments, including deleting empty args and deleting
    # '}}' from arguments. See #2070
    return _CLOSE_BRACE_AFTER_CLOSE_BRACE_RE.sub(' }', steps_desc)
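
# For example, _emr_proof_steps_desc('{"jobconf": {"a": {}}}') returns
# '{"jobconf": {"a": {} } }': every '}' that immediately follows another '}'
# gets a space inserted in front of it, so command-runner.jar won't strip the
# '}}' sequences. The extra whitespace is harmless to a JSON parser.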