# GildedRose-Refactoring-Kata/.venv/lib/python3.12/site-packages/mrjob/hadoop.py
# Copyright 2009-2016 Yelp and Contributors
# Copyright 2017 Yelp
# Copyright 2018 Yelp and Google, Inc.
# Copyright 2019 Yelp
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import getpass
import logging
import os
import posixpath
import re
from subprocess import CalledProcessError
from subprocess import Popen
from subprocess import PIPE
try:
    import pty
    pty  # quiet "redefinition of unused ..." warning from pyflakes
except ImportError:
    pty = None
from mrjob.bin import MRJobBinRunner
from mrjob.compat import uses_yarn
from mrjob.conf import combine_dicts
from mrjob.fs.composite import CompositeFilesystem
from mrjob.fs.hadoop import HadoopFilesystem
from mrjob.fs.local import LocalFilesystem
from mrjob.logs.counters import _pick_counters
from mrjob.logs.errors import _log_probable_cause_of_failure
from mrjob.logs.mixin import LogInterpretationMixin
from mrjob.logs.step import _eio_to_eof
from mrjob.logs.step import _interpret_hadoop_jar_command_stderr
from mrjob.logs.step import _is_counter_log4j_record
from mrjob.logs.step import _log_line_from_driver
from mrjob.logs.step import _log_log4j_record
from mrjob.logs.wrap import _logs_exist
from mrjob.parse import is_uri
from mrjob.py2 import to_unicode
from mrjob.runner import _fix_env
from mrjob.setup import UploadDirManager
from mrjob.step import StepFailedException
from mrjob.step import _is_spark_step_type
from mrjob.util import cmd_line
from mrjob.util import unique
from mrjob.util import which
log = logging.getLogger(__name__)
# don't look for the hadoop streaming jar here!
_BAD_HADOOP_HOMES = ['/', '/usr', '/usr/local']
# where YARN stores history logs, etc. on HDFS by default
_DEFAULT_YARN_HDFS_LOG_DIR = 'hdfs:///tmp/hadoop-yarn/staging'
# places to look for the Hadoop streaming jar if we're inside EMR
_EMR_HADOOP_STREAMING_JAR_DIRS = [
    # for the 2.x and 3.x AMIs (the 2.x AMIs also set $HADOOP_HOME properly)
    '/home/hadoop/contrib',
    # for the 4.x AMIs
    '/usr/lib/hadoop-mapreduce',
]
# these are fairly standard places to keep Hadoop logs
_FALLBACK_HADOOP_LOG_DIRS = [
    '/var/log/hadoop',
    '/mnt/var/log/hadoop',  # EMR's 2.x and 3.x AMIs use this
]
# fairly standard places to keep YARN logs (see #1339)
_FALLBACK_HADOOP_YARN_LOG_DIRS = [
    '/var/log/hadoop-yarn',
    '/mnt/var/log/hadoop-yarn',
]
# start of Counters printed by Hadoop
_HADOOP_COUNTERS_START_RE = re.compile(br'^Counters: (?P<amount>\d+)\s*$')
# header for a group of counters
_HADOOP_COUNTER_GROUP_RE = re.compile(br'^(?P<indent>\s+)(?P<group>.*)$')
# line for a counter
_HADOOP_COUNTER_RE = re.compile(
    br'^(?P<indent>\s+)(?P<counter>.*)=(?P<amount>\d+)\s*$')
# the one thing Hadoop streaming prints to stderr not in log format
_HADOOP_NON_LOG_LINE_RE = re.compile(r'^Streaming Command Failed!')
# if we see this from Hadoop, it actually came from stdout and shouldn't
# be logged
_HADOOP_STDOUT_RE = re.compile(br'^packageJobJar: ')
# match the filename of a hadoop streaming jar
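# (illustrative examples, not from the original source: this pattern matches
# 'hadoop-streaming.jar' and 'hadoop-streaming-2.7.3.jar', but the negative
# lookbehind rejects 'hadoop-streaming-2.7.3-sources.jar')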
_HADOOP_STREAMING_JAR_RE = re.compile(
    r'^hadoop.*streaming.*(?<!-sources)\.jar$')


def fully_qualify_hdfs_path(path):
    """If path isn't an ``hdfs://`` URL, turn it into one."""
    if is_uri(path):
        return path
    elif path.startswith('/'):
        return 'hdfs://' + path
    else:
        return 'hdfs:///user/%s/%s' % (getpass.getuser(), path)
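# Illustrative behavior of fully_qualify_hdfs_path() above (examples not from
# the original source; 'alice' stands in for the current user):
#   fully_qualify_hdfs_path('hdfs:///data/in') -> 'hdfs:///data/in'
#   fully_qualify_hdfs_path('/data/in')        -> 'hdfs:///data/in'
#   fully_qualify_hdfs_path('tmp/mrjob')       -> 'hdfs:///user/alice/tmp/mrjob'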


class HadoopJobRunner(MRJobBinRunner, LogInterpretationMixin):
    """Runs an :py:class:`~mrjob.job.MRJob` on your Hadoop cluster.
    Invoked when you run your job with ``-r hadoop``.

    Input and support files can be either local or on HDFS; use ``hdfs://...``
    URLs to refer to files on HDFS.
    """

    alias = 'hadoop'

    OPT_NAMES = MRJobBinRunner.OPT_NAMES | {
        'hadoop_bin',
        'hadoop_extra_args',
        'hadoop_log_dirs',
        'hadoop_streaming_jar',
        'hadoop_tmp_dir',
        'spark_deploy_mode',
        'spark_master',
    }

    # supports everything (so far)
    _STEP_TYPES = {
        'jar', 'spark', 'spark_jar', 'spark_script', 'streaming'}

    def __init__(self, **kwargs):
        """:py:class:`~mrjob.hadoop.HadoopJobRunner` takes the same arguments
        as :py:class:`~mrjob.runner.MRJobRunner`, plus some additional options
        which can be defaulted in :ref:`mrjob.conf <mrjob.conf>`.
        """
        super(HadoopJobRunner, self).__init__(**kwargs)

        self._hadoop_tmp_dir = fully_qualify_hdfs_path(
            posixpath.join(
                self._opts['hadoop_tmp_dir'], self._job_key))
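        # (illustrative, not from the original source: with the default
        # hadoop_tmp_dir of 'tmp/mrjob', this resolves to something like
        # 'hdfs:///user/alice/tmp/mrjob/<job_key>')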

        # Keep track of local files to upload to HDFS. We'll add them
        # to this manager just before we need them.
        hdfs_files_dir = posixpath.join(self._hadoop_tmp_dir, 'files', '')
        self._upload_mgr = UploadDirManager(hdfs_files_dir)

        # Set output dir if it wasn't set explicitly
        self._output_dir = fully_qualify_hdfs_path(
            self._output_dir or
            posixpath.join(self._hadoop_tmp_dir, 'output'))

        # Fully qualify step_output_dir, if set
        if self._step_output_dir:
            self._step_output_dir = fully_qualify_hdfs_path(
                self._step_output_dir)

        # Track job and (YARN) application ID to enable log parsing
        self._application_id = None
        self._job_id = None

        # Keep track of where the hadoop streaming jar is
        self._hadoop_streaming_jar = self._opts['hadoop_streaming_jar']
        self._searched_for_hadoop_streaming_jar = False

        # List of dicts (one for each step) potentially containing
        # the keys 'history', 'step', and 'task' ('step' will always
        # be filled because it comes from the hadoop jar command output,
        # others will be filled as needed)
        self._log_interpretations = []

    @classmethod
    def _default_opts(cls):
        return combine_dicts(
            super(HadoopJobRunner, cls)._default_opts(),
            dict(
                hadoop_tmp_dir='tmp/mrjob',
            )
        )

    @property
    def fs(self):
        """:py:class:`mrjob.fs.base.Filesystem` object for HDFS and the local
        filesystem.
        """
        if self._fs is None:
            self._fs = CompositeFilesystem()

            # don't pass [] to fs; this means not to use hadoop until
            # fs.set_hadoop_bin() is called (used for running hadoop over SSH).
            hadoop_bin = self._opts['hadoop_bin'] or None

            self._fs.add_fs('hadoop',
                            HadoopFilesystem(hadoop_bin))
            self._fs.add_fs('local', LocalFilesystem())

        return self._fs

    def get_hadoop_version(self):
        """Invoke the hadoop executable to determine its version"""
        return self.fs.hadoop.get_hadoop_version()

    def get_hadoop_bin(self):
        """Find the hadoop binary. A list: binary followed by arguments."""
        return self.fs.hadoop.get_hadoop_bin()

    def get_hadoop_streaming_jar(self):
        """Find the path of the hadoop streaming jar, or None if not found."""
        if not (self._hadoop_streaming_jar or
                self._searched_for_hadoop_streaming_jar):
            self._hadoop_streaming_jar = self._find_hadoop_streaming_jar()

            if self._hadoop_streaming_jar:
                log.info('Found Hadoop streaming jar: %s' %
                         self._hadoop_streaming_jar)
            else:
                log.warning('Hadoop streaming jar not found. Use'
                            ' --hadoop-streaming-jar')

            self._searched_for_hadoop_streaming_jar = True

        return self._hadoop_streaming_jar

    def _find_hadoop_streaming_jar(self):
        """Search for the hadoop streaming jar. See
        :py:meth:`_hadoop_streaming_jar_dirs` for where we search."""
        for path in unique(self._hadoop_streaming_jar_dirs()):
            log.info('Looking for Hadoop streaming jar in %s...' % path)

            streaming_jars = []
            for path in self.fs.ls(path):
                if _HADOOP_STREAMING_JAR_RE.match(posixpath.basename(path)):
                    streaming_jars.append(path)

            if streaming_jars:
                # prefer shorter names and shallower paths
                def sort_key(p):
                    return (len(p.split('/')),
                            len(posixpath.basename(p)),
                            p)
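                # (illustrative ordering, not from the original source:
                # '/usr/lib/hadoop-mapreduce/hadoop-streaming.jar' sorts ahead
                # of '/usr/lib/hadoop-mapreduce/lib/hadoop-streaming-2.7.3.jar'
                # because it is both shallower and shorter)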

                streaming_jars.sort(key=sort_key)

                return streaming_jars[0]

        return None

    def _hadoop_dirs(self):
        """Yield all possible hadoop directories (used for streaming jar
        and logs). May yield duplicates"""
        for name in ('HADOOP_PREFIX', 'HADOOP_HOME', 'HADOOP_INSTALL',
                     'HADOOP_MAPRED_HOME'):
            path = os.environ.get(name)
            if path:
                yield path

        # guess it from the path of the Hadoop binary
        hadoop_home = _hadoop_prefix_from_bin(self.get_hadoop_bin()[0])
        if hadoop_home:
            yield hadoop_home

        # try HADOOP_*_HOME
        for name, path in sorted(os.environ.items()):
            if name.startswith('HADOOP_') and name.endswith('_HOME'):
                yield path

    def _hadoop_streaming_jar_dirs(self):
        """Yield all possible places to look for the Hadoop streaming jar.
        May yield duplicates.
        """
        for hadoop_dir in self._hadoop_dirs():
            yield hadoop_dir

        # use hard-coded paths to work out-of-the-box on EMR
        for path in _EMR_HADOOP_STREAMING_JAR_DIRS:
            yield path

    def _hadoop_log_dirs(self, output_dir=None):
        """Yield all possible places to look for hadoop logs."""
        # hadoop_log_dirs opt overrides all this
        if self._opts['hadoop_log_dirs']:
            for path in self._opts['hadoop_log_dirs']:
                yield path
            return

        hadoop_log_dir = os.environ.get('HADOOP_LOG_DIR')
        if hadoop_log_dir:
            yield hadoop_log_dir

        yarn = uses_yarn(self.get_hadoop_version())

        if yarn:
            yarn_log_dir = os.environ.get('YARN_LOG_DIR')
            if yarn_log_dir:
                yield yarn_log_dir

            yield _DEFAULT_YARN_HDFS_LOG_DIR

        if output_dir:
            # Cloudera style of logging
            yield posixpath.join(output_dir, '_logs')

        for hadoop_dir in self._hadoop_dirs():
            yield posixpath.join(hadoop_dir, 'logs')

        # hard-coded fallback paths
        if yarn:
            for path in _FALLBACK_HADOOP_YARN_LOG_DIRS:
                yield path

        for path in _FALLBACK_HADOOP_LOG_DIRS:
            yield path

    def _run(self):
        self._find_binaries_and_jars()
        self._create_setup_wrapper_scripts()
        self._add_job_files_for_upload()
        self._upload_local_files()
        self._run_job_in_hadoop()

    def _find_binaries_and_jars(self):
        """Find hadoop and (if needed) spark-submit bin up-front, before
        continuing with the job.

        (This is just for user-interaction purposes; these would otherwise
        lazy-load as needed.)
        """
        # this triggers looking for Hadoop binary
        self.get_hadoop_version()

        if self._has_hadoop_streaming_steps():
            self.get_hadoop_streaming_jar()

        if self._has_spark_steps():
            self.get_spark_submit_bin()

    def _add_job_files_for_upload(self):
        """Add files needed for running the job (setup and input)
        to self._upload_mgr."""
        for path in self._py_files():
            self._upload_mgr.add(path)

    def _dump_stdin_to_local_file(self):
        """Dump sys.stdin to a local file, and return the path to it."""
        stdin_path = posixpath.join(self._get_local_tmp_dir(), 'STDIN')

        # prompt user, so they don't think the process has stalled
        log.info('reading from STDIN')

        log.debug('dumping stdin to local file %s...' % stdin_path)
        stdin_file = open(stdin_path, 'wb')
        for line in self._stdin:
            stdin_file.write(line)

        return stdin_path

    def _run_job_in_hadoop(self):
        for step_num, step in enumerate(self._get_steps()):
            step_type = step['type']

            step_args = self._args_for_step(step_num)
            env = _fix_env(self._env_for_step(step_num))

            # log this *after* _args_for_step(), which can start a search
            # for the Hadoop streaming jar
            log.info('Running step %d of %d...' %
                     (step_num + 1, self._num_steps()))
            log.debug('> %s' % cmd_line(step_args))
            log.debug(' with environment: %r' % sorted(env.items()))

            log_interpretation = {}
            self._log_interpretations.append(log_interpretation)

            if self._step_type_uses_spark(step_type):
                returncode, step_interpretation = self._run_spark_submit(
                    step_args, env, record_callback=_log_log4j_record)
            else:
                returncode, step_interpretation = self._run_hadoop(
                    step_args, env, record_callback=_log_record_from_hadoop)

            # make sure output_dir is filled (used for history log)
            if 'output_dir' not in step_interpretation:
                step_interpretation['output_dir'] = (
                    self._step_output_uri(step_num))

            log_interpretation['step'] = step_interpretation

            self._log_counters(log_interpretation, step_num)

            if returncode:
                error = self._pick_error(log_interpretation, step_type)
                if error:
                    _log_probable_cause_of_failure(log, error)

                # use CalledProcessError's well-known message format
                reason = str(CalledProcessError(returncode, step_args))
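                # (illustrative, not from the original source: the resulting
                # reason string looks like
                # "Command '[...]' returned non-zero exit status 1.")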
                raise StepFailedException(
                    reason=reason, step_num=step_num,
                    num_steps=self._num_steps())

    def _run_hadoop(self, hadoop_args, env, record_callback):
        # try to use a PTY if it's available
        try:
            pid, master_fd = pty.fork()
        except (AttributeError, OSError):
            # no PTYs, just use Popen

            # user won't get much feedback for a while, so tell them
            # Hadoop is running
            log.debug('No PTY available, using Popen() to invoke Hadoop')

            step_proc = Popen(hadoop_args, stdout=PIPE, stderr=PIPE, env=env)

            step_interpretation = _interpret_hadoop_jar_command_stderr(
                step_proc.stderr,
                record_callback=_log_record_from_hadoop)

            # there shouldn't be much output to STDOUT
            for line in step_proc.stdout:
                _log_line_from_driver(to_unicode(line).strip('\r\n'))

            step_proc.stdout.close()
            step_proc.stderr.close()

            returncode = step_proc.wait()
        else:
            # we have PTYs
            if pid == 0:  # we are the child process
                try:
                    os.execvpe(hadoop_args[0], hadoop_args, env)
                    # now we are no longer Python
                except OSError as ex:
                    # use _exit() so we don't do cleanup, etc. that's
                    # the parent process's job
                    os._exit(ex.errno)
                finally:
                    # if we got some other exception, still exit hard
                    os._exit(-1)
            else:
                log.debug('Invoking Hadoop via PTY')

                with os.fdopen(master_fd, 'rb') as master:
                    # reading from master gives us the subprocess's
                    # stderr and stdout (it's a fake terminal)
                    step_interpretation = (
                        _interpret_hadoop_jar_command_stderr(
                            _eio_to_eof(master),
                            record_callback=_log_record_from_hadoop))
                    _, returncode = os.waitpid(pid, 0)

        return returncode, step_interpretation

    def _spark_master(self):
        return self._opts['spark_master'] or 'yarn'

    def _args_for_step(self, step_num):
        step = self._get_step(step_num)

        if step['type'] == 'streaming':
            return self._args_for_streaming_step(step_num)
        elif step['type'] == 'jar':
            return self._args_for_jar_step(step_num)
        elif _is_spark_step_type(step['type']):
            return self._args_for_spark_step(step_num)
        else:
            raise ValueError('Bad step type: %r' % (step['type'],))

    def _args_for_streaming_step(self, step_num):
        hadoop_streaming_jar = self.get_hadoop_streaming_jar()
        if not hadoop_streaming_jar:
            raise Exception('no Hadoop streaming jar')

        return (self.get_hadoop_bin() + ['jar', hadoop_streaming_jar] +
                self._hadoop_streaming_jar_args(step_num))

    def _args_for_jar_step(self, step_num):
        step = self._get_step(step_num)

        args = []

        args.extend(self.get_hadoop_bin())

        # special case for consistency with EMR runner.
        #
        # This might look less like duplicated code if we ever
        # implement #780 (fetching jars from URIs)
        if step['jar'].startswith('file:///'):
            jar = step['jar'][7:]  # keep leading slash
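            # (illustrative, not from the original source:
            # 'file:///usr/lib/foo.jar'[7:] == '/usr/lib/foo.jar')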
        else:
            jar = step['jar']

        args.extend(['jar', jar])

        if step.get('main_class'):
            args.append(step['main_class'])

        if step.get('args'):
            args.extend(
                self._interpolate_jar_step_args(step['args'], step_num))

        return args

    def _env_for_step(self, step_num):
        step = self._get_step(step_num)

        env = dict(os.environ)

        # when running spark-submit, set its environment directly. See #1464
        if _is_spark_step_type(step['type']):
            env.update(self._spark_cmdenv(step_num))

        return env

    def _default_step_output_dir(self):
        return posixpath.join(self._hadoop_tmp_dir, 'step-output')

    def _cleanup_hadoop_tmp(self):
        if self._hadoop_tmp_dir:
            log.info('Removing HDFS temp directory %s...' %
                     self._hadoop_tmp_dir)
            try:
                self.fs.rm(self._hadoop_tmp_dir)
            except Exception as e:
                log.exception(e)

    def _manifest_download_commands(self):
        cp_to_local = self.get_hadoop_bin() + ['fs', '-copyToLocal']
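        # (illustrative, not from the original source: if the hadoop binary is
        # just ['hadoop'], cp_to_local is ['hadoop', 'fs', '-copyToLocal'] and
        # cmd_line() below renders it as 'hadoop fs -copyToLocal')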
        return [
            ('*://*', cmd_line(cp_to_local)),
        ]

    ### LOG (implementation of LogInterpretationMixin) ###

    def _stream_history_log_dirs(self, output_dir=None):
        """Yield lists of directories to look for the history log in."""
        if not self._read_logs():
            return

        for log_dir in unique(self._hadoop_log_dirs(output_dir=output_dir)):
            if _logs_exist(self.fs, log_dir):
                log.info('Looking for history log in %s...' % log_dir)
                # logs aren't always in a subdir named history/
                yield [log_dir]

    def _stream_task_log_dirs(self, application_id=None, output_dir=None):
        """Yield lists of directories to look for the task logs in."""
        # Note: this is unlikely to be super-helpful on "real" (multi-node)
        # pre-YARN Hadoop because task logs aren't generally shipped to a
        # local directory. It's a start, anyways. See #1201.
        if not self._read_logs():
            return

        for log_dir in unique(self._hadoop_log_dirs(output_dir=output_dir)):
            if application_id:
                path = self.fs.join(log_dir, 'userlogs', application_id)
            else:
                path = self.fs.join(log_dir, 'userlogs')

            if _logs_exist(self.fs, path):
                log.info('Looking for task syslogs in %s...' % path)
                yield [path]

    def counters(self):
        return [_pick_counters(log_interpretation)
                for log_interpretation in self._log_interpretations]


# These don't require state from HadoopJobRunner, so making them functions.
# Feel free to convert them back into methods as need be
def _hadoop_prefix_from_bin(hadoop_bin):
    """Given a path to the hadoop binary, return the path of the implied
    hadoop home, or None if we don't know.

    Don't return the parent directory of directories in the default
    path (not ``/``, ``/usr``, or ``/usr/local``).
    """
    # resolve unqualified binary name (relative paths are okay)
    if '/' not in hadoop_bin:
        hadoop_bin = which(hadoop_bin)
        if not hadoop_bin:
            return None

    # use parent of hadoop_bin's directory
    hadoop_home = posixpath.abspath(
        posixpath.join(posixpath.realpath(posixpath.dirname(hadoop_bin)), '..')
    )

    if hadoop_home in _BAD_HADOOP_HOMES:
        return None

    return hadoop_home
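# Illustrative behavior of _hadoop_prefix_from_bin() above (examples not from
# the original source, assuming no symlinks are involved):
#   '/opt/hadoop-2.8.5/bin/hadoop' -> '/opt/hadoop-2.8.5'
#   '/usr/bin/hadoop'              -> None ('/usr' is in _BAD_HADOOP_HOMES)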


def _log_record_from_hadoop(record):
    """Log log4j record parsed from hadoop stderr."""
    if not _is_counter_log4j_record(record):  # counters are printed separately
        _log_log4j_record(record)