# Copyright 2009-2016 Yelp and Contributors
# Copyright 2017 Yelp
# Copyright 2018 Yelp and Google, Inc.
# Copyright 2019 Yelp
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import getpass
import logging
import os
import posixpath
import re
from subprocess import CalledProcessError
from subprocess import Popen
from subprocess import PIPE

try:
    import pty
    pty  # quiet "redefinition of unused ..." warning from pyflakes
except ImportError:
    pty = None

from mrjob.bin import MRJobBinRunner
from mrjob.compat import uses_yarn
from mrjob.conf import combine_dicts
from mrjob.fs.composite import CompositeFilesystem
from mrjob.fs.hadoop import HadoopFilesystem
from mrjob.fs.local import LocalFilesystem
from mrjob.logs.counters import _pick_counters
from mrjob.logs.errors import _log_probable_cause_of_failure
from mrjob.logs.mixin import LogInterpretationMixin
from mrjob.logs.step import _eio_to_eof
from mrjob.logs.step import _interpret_hadoop_jar_command_stderr
from mrjob.logs.step import _is_counter_log4j_record
from mrjob.logs.step import _log_line_from_driver
from mrjob.logs.step import _log_log4j_record
from mrjob.logs.wrap import _logs_exist
from mrjob.parse import is_uri
from mrjob.py2 import to_unicode
from mrjob.runner import _fix_env
from mrjob.setup import UploadDirManager
from mrjob.step import StepFailedException
from mrjob.step import _is_spark_step_type
from mrjob.util import cmd_line
from mrjob.util import unique
from mrjob.util import which


log = logging.getLogger(__name__)

# don't look for the hadoop streaming jar here!
_BAD_HADOOP_HOMES = ['/', '/usr', '/usr/local']

# where YARN stores history logs, etc. on HDFS by default
_DEFAULT_YARN_HDFS_LOG_DIR = 'hdfs:///tmp/hadoop-yarn/staging'

# places to look for the Hadoop streaming jar if we're inside EMR
_EMR_HADOOP_STREAMING_JAR_DIRS = [
    # for the 2.x and 3.x AMIs (the 2.x AMIs also set $HADOOP_HOME properly)
    '/home/hadoop/contrib',
    # for the 4.x AMIs
    '/usr/lib/hadoop-mapreduce',
]

# these are fairly standard places to keep Hadoop logs
_FALLBACK_HADOOP_LOG_DIRS = [
    '/var/log/hadoop',
    '/mnt/var/log/hadoop',  # EMR's 2.x and 3.x AMIs use this
]

# fairly standard places to keep YARN logs (see #1339)
_FALLBACK_HADOOP_YARN_LOG_DIRS = [
    '/var/log/hadoop-yarn',
    '/mnt/var/log/hadoop-yarn',
]

# start of Counters printed by Hadoop
_HADOOP_COUNTERS_START_RE = re.compile(br'^Counters: (?P<amount>\d+)\s*$')

# header for a group of counters
_HADOOP_COUNTER_GROUP_RE = re.compile(br'^(?P<indent>\s+)(?P<group>.*)$')

# line for a counter
_HADOOP_COUNTER_RE = re.compile(
    br'^(?P<indent>\s+)(?P<counter>.*)=(?P<amount>\d+)\s*$')

# the one thing Hadoop streaming prints to stderr not in log format
_HADOOP_NON_LOG_LINE_RE = re.compile(r'^Streaming Command Failed!')

# if we see this from Hadoop, it actually came from stdout and shouldn't
# be logged
_HADOOP_STDOUT_RE = re.compile(br'^packageJobJar: ')

# match the filename of a hadoop streaming jar
_HADOOP_STREAMING_JAR_RE = re.compile(
    r'^hadoop.*streaming.*(?<!-sources)\.jar$')

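# A quick illustration (example filenames, not an exhaustive list):
# _HADOOP_STREAMING_JAR_RE matches jars such as 'hadoop-streaming.jar' and
# 'hadoop-streaming-2.8.5.jar', while the negative lookbehind rejects
# 'hadoop-streaming-2.8.5-sources.jar':
#
#     >>> bool(_HADOOP_STREAMING_JAR_RE.match('hadoop-streaming-2.8.5.jar'))
#     True
#     >>> bool(_HADOOP_STREAMING_JAR_RE.match(
#     ...     'hadoop-streaming-2.8.5-sources.jar'))
#     False
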
def fully_qualify_hdfs_path(path):
    """If path isn't an ``hdfs://`` URL, turn it into one."""
    if is_uri(path):
        return path
    elif path.startswith('/'):
        return 'hdfs://' + path
    else:
        return 'hdfs:///user/%s/%s' % (getpass.getuser(), path)

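# For example (illustrative; the username depends on who runs the job):
#
#     >>> fully_qualify_hdfs_path('hdfs://namenode/data')  # already a URI
#     'hdfs://namenode/data'
#     >>> fully_qualify_hdfs_path('/tmp/mrjob')  # absolute path
#     'hdfs:///tmp/mrjob'
#     >>> fully_qualify_hdfs_path('tmp/mrjob')  # relative path
#     'hdfs:///user/<your username>/tmp/mrjob'
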
class HadoopJobRunner(MRJobBinRunner, LogInterpretationMixin):
    """Runs an :py:class:`~mrjob.job.MRJob` on your Hadoop cluster.
    Invoked when you run your job with ``-r hadoop``.

    Input and support files can be either local or on HDFS; use ``hdfs://...``
    URLs to refer to files on HDFS.
    """
    alias = 'hadoop'

    OPT_NAMES = MRJobBinRunner.OPT_NAMES | {
        'hadoop_bin',
        'hadoop_extra_args',
        'hadoop_log_dirs',
        'hadoop_streaming_jar',
        'hadoop_tmp_dir',
        'spark_deploy_mode',
        'spark_master',
    }

    # supports everything (so far)
    _STEP_TYPES = {
        'jar', 'spark', 'spark_jar', 'spark_script', 'streaming'}

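    # Typical invocation (illustrative; 'mr_word_count.py' is a stand-in for
    # your own MRJob script):
    #
    #     python mr_word_count.py -r hadoop hdfs:///user/you/input.txt
    #
    # Any of the options in OPT_NAMES above can also be defaulted in
    # mrjob.conf.
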
    def __init__(self, **kwargs):
        """:py:class:`~mrjob.hadoop.HadoopJobRunner` takes the same arguments
        as :py:class:`~mrjob.runner.MRJobRunner`, plus some additional options
        which can be defaulted in :ref:`mrjob.conf <mrjob.conf>`.
        """
        super(HadoopJobRunner, self).__init__(**kwargs)

        self._hadoop_tmp_dir = fully_qualify_hdfs_path(
            posixpath.join(
                self._opts['hadoop_tmp_dir'], self._job_key))

        # Keep track of local files to upload to HDFS. We'll add them
        # to this manager just before we need them.
        hdfs_files_dir = posixpath.join(self._hadoop_tmp_dir, 'files', '')
        self._upload_mgr = UploadDirManager(hdfs_files_dir)

        # Set output dir if it wasn't set explicitly
        self._output_dir = fully_qualify_hdfs_path(
            self._output_dir or
            posixpath.join(self._hadoop_tmp_dir, 'output'))

        # Fully qualify step_output_dir, if set
        if self._step_output_dir:
            self._step_output_dir = fully_qualify_hdfs_path(
                self._step_output_dir)

        # Track job and (YARN) application ID to enable log parsing
        self._application_id = None
        self._job_id = None

        # Keep track of where the hadoop streaming jar is
        self._hadoop_streaming_jar = self._opts['hadoop_streaming_jar']
        self._searched_for_hadoop_streaming_jar = False

        # List of dicts (one for each step) potentially containing
        # the keys 'history', 'step', and 'task' ('step' will always
        # be filled because it comes from the hadoop jar command output,
        # others will be filled as needed)
        self._log_interpretations = []

    @classmethod
    def _default_opts(cls):
        return combine_dicts(
            super(HadoopJobRunner, cls)._default_opts(),
            dict(
                hadoop_tmp_dir='tmp/mrjob',
            )
        )

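    # Since the default hadoop_tmp_dir is the relative path 'tmp/mrjob',
    # fully_qualify_hdfs_path() normally puts per-job scratch space somewhere
    # like (illustrative):
    #
    #     hdfs:///user/<your username>/tmp/mrjob/<job key>/
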
    @property
    def fs(self):
        """:py:class:`mrjob.fs.base.Filesystem` object for HDFS and the local
        filesystem.
        """
        if self._fs is None:
            self._fs = CompositeFilesystem()

            # don't pass [] to fs; this means not to use hadoop until
            # fs.set_hadoop_bin() is called (used for running hadoop over SSH).
            hadoop_bin = self._opts['hadoop_bin'] or None

            self._fs.add_fs('hadoop',
                            HadoopFilesystem(hadoop_bin))
            self._fs.add_fs('local', LocalFilesystem())

        return self._fs

    def get_hadoop_version(self):
        """Invoke the hadoop executable to determine its version"""
        return self.fs.hadoop.get_hadoop_version()

    def get_hadoop_bin(self):
        """Find the hadoop binary. A list: binary followed by arguments."""
        return self.fs.hadoop.get_hadoop_bin()

    def get_hadoop_streaming_jar(self):
        """Find the path of the hadoop streaming jar, or None if not found."""
        if not (self._hadoop_streaming_jar or
                self._searched_for_hadoop_streaming_jar):

            self._hadoop_streaming_jar = self._find_hadoop_streaming_jar()

            if self._hadoop_streaming_jar:
                log.info('Found Hadoop streaming jar: %s' %
                         self._hadoop_streaming_jar)
            else:
                log.warning('Hadoop streaming jar not found. Use'
                            ' --hadoop-streaming-jar')

            self._searched_for_hadoop_streaming_jar = True

        return self._hadoop_streaming_jar

    def _find_hadoop_streaming_jar(self):
        """Search for the hadoop streaming jar. See
        :py:meth:`_hadoop_streaming_jar_dirs` for where we search."""
        for path in unique(self._hadoop_streaming_jar_dirs()):
            log.info('Looking for Hadoop streaming jar in %s...' % path)

            streaming_jars = []
            for path in self.fs.ls(path):
                if _HADOOP_STREAMING_JAR_RE.match(posixpath.basename(path)):
                    streaming_jars.append(path)

            if streaming_jars:
                # prefer shorter names and shallower paths
                def sort_key(p):
                    return (len(p.split('/')),
                            len(posixpath.basename(p)),
                            p)

                streaming_jars.sort(key=sort_key)

                return streaming_jars[0]

        return None

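    # _find_hadoop_streaming_jar()'s sort_key prefers jars that sit higher in
    # the directory tree and have shorter filenames. For example (hypothetical
    # paths), given
    #
    #     /usr/lib/hadoop-mapreduce/hadoop-streaming-2.8.5.jar
    #     /usr/lib/hadoop-mapreduce/extra/hadoop-streaming-2.8.5-test.jar
    #
    # the first jar wins because its path is shallower (and, as a tiebreaker,
    # its basename is shorter).
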
    def _hadoop_dirs(self):
        """Yield all possible hadoop directories (used for streaming jar
        and logs). May yield duplicates"""
        for name in ('HADOOP_PREFIX', 'HADOOP_HOME', 'HADOOP_INSTALL',
                     'HADOOP_MAPRED_HOME'):
            path = os.environ.get(name)
            if path:
                yield path

        # guess it from the path of the Hadoop binary
        hadoop_home = _hadoop_prefix_from_bin(self.get_hadoop_bin()[0])
        if hadoop_home:
            yield hadoop_home

        # try HADOOP_*_HOME
        for name, path in sorted(os.environ.items()):
            if name.startswith('HADOOP_') and name.endswith('_HOME'):
                yield path

    def _hadoop_streaming_jar_dirs(self):
        """Yield all possible places to look for the Hadoop streaming jar.
        May yield duplicates.
        """
        for hadoop_dir in self._hadoop_dirs():
            yield hadoop_dir

        # use hard-coded paths to work out-of-the-box on EMR
        for path in _EMR_HADOOP_STREAMING_JAR_DIRS:
            yield path

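    # In practice, pointing $HADOOP_HOME at your Hadoop installation is
    # usually enough for both the streaming jar and the log directories to be
    # discovered via _hadoop_dirs() above, e.g. (illustrative path):
    #
    #     export HADOOP_HOME=/usr/local/hadoop
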
    def _hadoop_log_dirs(self, output_dir=None):
        """Yield all possible places to look for hadoop logs."""
        # hadoop_log_dirs opt overrides all this
        if self._opts['hadoop_log_dirs']:
            for path in self._opts['hadoop_log_dirs']:
                yield path
            return

        hadoop_log_dir = os.environ.get('HADOOP_LOG_DIR')
        if hadoop_log_dir:
            yield hadoop_log_dir

        yarn = uses_yarn(self.get_hadoop_version())

        if yarn:
            yarn_log_dir = os.environ.get('YARN_LOG_DIR')
            if yarn_log_dir:
                yield yarn_log_dir

            yield _DEFAULT_YARN_HDFS_LOG_DIR

        if output_dir:
            # Cloudera style of logging
            yield posixpath.join(output_dir, '_logs')

        for hadoop_dir in self._hadoop_dirs():
            yield posixpath.join(hadoop_dir, 'logs')

        # hard-coded fallback paths
        if yarn:
            for path in _FALLBACK_HADOOP_YARN_LOG_DIRS:
                yield path

        for path in _FALLBACK_HADOOP_LOG_DIRS:
            yield path

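    # If your cluster keeps logs somewhere unusual, the hadoop_log_dirs opt
    # short-circuits the search above. A minimal mrjob.conf sketch
    # (illustrative paths):
    #
    #     runners:
    #       hadoop:
    #         hadoop_log_dirs:
    #           - /data/hadoop/logs
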
    def _run(self):
        self._find_binaries_and_jars()
        self._create_setup_wrapper_scripts()
        self._add_job_files_for_upload()
        self._upload_local_files()
        self._run_job_in_hadoop()

    def _find_binaries_and_jars(self):
        """Find hadoop and (if needed) spark-submit bin up-front, before
        continuing with the job.

        (This is just for user-interaction purposes; these would otherwise
        lazy-load as needed.)
        """
        # this triggers looking for Hadoop binary
        self.get_hadoop_version()

        if self._has_hadoop_streaming_steps():
            self.get_hadoop_streaming_jar()

        if self._has_spark_steps():
            self.get_spark_submit_bin()

    def _add_job_files_for_upload(self):
        """Add files needed for running the job (setup and input)
        to self._upload_mgr."""
        for path in self._py_files():
            self._upload_mgr.add(path)

    def _dump_stdin_to_local_file(self):
        """Dump sys.stdin to a local file, and return the path to it."""
        stdin_path = posixpath.join(self._get_local_tmp_dir(), 'STDIN')
        # prompt user, so they don't think the process has stalled
        log.info('reading from STDIN')

        log.debug('dumping stdin to local file %s...' % stdin_path)
        stdin_file = open(stdin_path, 'wb')
        for line in self._stdin:
            stdin_file.write(line)

        return stdin_path

    def _run_job_in_hadoop(self):
        for step_num, step in enumerate(self._get_steps()):
            step_type = step['type']
            step_args = self._args_for_step(step_num)
            env = _fix_env(self._env_for_step(step_num))

            # log this *after* _args_for_step(), which can start a search
            # for the Hadoop streaming jar
            log.info('Running step %d of %d...' %
                     (step_num + 1, self._num_steps()))
            log.debug('> %s' % cmd_line(step_args))
            log.debug('  with environment: %r' % sorted(env.items()))

            log_interpretation = {}
            self._log_interpretations.append(log_interpretation)

            if self._step_type_uses_spark(step_type):
                returncode, step_interpretation = self._run_spark_submit(
                    step_args, env, record_callback=_log_log4j_record)
            else:
                returncode, step_interpretation = self._run_hadoop(
                    step_args, env, record_callback=_log_record_from_hadoop)

            # make sure output_dir is filled (used for history log)
            if 'output_dir' not in step_interpretation:
                step_interpretation['output_dir'] = (
                    self._step_output_uri(step_num))

            log_interpretation['step'] = step_interpretation

            self._log_counters(log_interpretation, step_num)

            if returncode:
                error = self._pick_error(log_interpretation, step_type)
                if error:
                    _log_probable_cause_of_failure(log, error)

                # use CalledProcessError's well-known message format
                reason = str(CalledProcessError(returncode, step_args))
                raise StepFailedException(
                    reason=reason, step_num=step_num,
                    num_steps=self._num_steps())

    def _run_hadoop(self, hadoop_args, env, record_callback):
        # try to use a PTY if it's available
        try:
            pid, master_fd = pty.fork()
        except (AttributeError, OSError):
            # no PTYs, just use Popen

            # user won't get much feedback for a while, so tell them
            # Hadoop is running
            log.debug('No PTY available, using Popen() to invoke Hadoop')

            step_proc = Popen(hadoop_args, stdout=PIPE, stderr=PIPE, env=env)

            step_interpretation = _interpret_hadoop_jar_command_stderr(
                step_proc.stderr,
                record_callback=_log_record_from_hadoop)

            # there shouldn't be much output to STDOUT
            for line in step_proc.stdout:
                _log_line_from_driver(to_unicode(line).strip('\r\n'))

            step_proc.stdout.close()
            step_proc.stderr.close()

            returncode = step_proc.wait()
        else:
            # we have PTYs
            if pid == 0:  # we are the child process
                try:
                    os.execvpe(hadoop_args[0], hadoop_args, env)
                    # now we are no longer Python
                except OSError as ex:
                    # use _exit() so we don't do cleanup, etc. that's
                    # the parent process's job
                    os._exit(ex.errno)
                finally:
                    # if we got some other exception, still exit hard
                    os._exit(-1)
            else:
                log.debug('Invoking Hadoop via PTY')

                with os.fdopen(master_fd, 'rb') as master:
                    # reading from master gives us the subprocess's
                    # stderr and stdout (it's a fake terminal)
                    step_interpretation = (
                        _interpret_hadoop_jar_command_stderr(
                            _eio_to_eof(master),
                            record_callback=_log_record_from_hadoop))
                    _, returncode = os.waitpid(pid, 0)

        return returncode, step_interpretation

    def _spark_master(self):
        return self._opts['spark_master'] or 'yarn'

    def _args_for_step(self, step_num):
        step = self._get_step(step_num)

        if step['type'] == 'streaming':
            return self._args_for_streaming_step(step_num)
        elif step['type'] == 'jar':
            return self._args_for_jar_step(step_num)
        elif _is_spark_step_type(step['type']):
            return self._args_for_spark_step(step_num)
        else:
            raise ValueError('Bad step type: %r' % (step['type'],))

    def _args_for_streaming_step(self, step_num):
        hadoop_streaming_jar = self.get_hadoop_streaming_jar()
        if not hadoop_streaming_jar:
            raise Exception('no Hadoop streaming jar')

        return (self.get_hadoop_bin() + ['jar', hadoop_streaming_jar] +
                self._hadoop_streaming_jar_args(step_num))

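    # The command line built by _args_for_streaming_step() looks roughly like
    # this (paths and arguments are illustrative):
    #
    #     hadoop jar /usr/lib/hadoop-mapreduce/hadoop-streaming.jar \
    #         -files ... -D ... \
    #         -input hdfs://... -output hdfs://... \
    #         -mapper ... -reducer ...
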
    def _args_for_jar_step(self, step_num):
        step = self._get_step(step_num)

        args = []

        args.extend(self.get_hadoop_bin())

        # special case for consistency with EMR runner.
        #
        # This might look less like duplicated code if we ever
        # implement #780 (fetching jars from URIs)
        if step['jar'].startswith('file:///'):
            jar = step['jar'][7:]  # keep leading slash
        else:
            jar = step['jar']

        args.extend(['jar', jar])

        if step.get('main_class'):
            args.append(step['main_class'])

        if step.get('args'):
            args.extend(
                self._interpolate_jar_step_args(step['args'], step_num))

        return args

    def _env_for_step(self, step_num):
        step = self._get_step(step_num)

        env = dict(os.environ)

        # when running spark-submit, set its environment directly. See #1464
        if _is_spark_step_type(step['type']):
            env.update(self._spark_cmdenv(step_num))

        return env

    def _default_step_output_dir(self):
        return posixpath.join(self._hadoop_tmp_dir, 'step-output')

    def _cleanup_hadoop_tmp(self):
        if self._hadoop_tmp_dir:
            log.info('Removing HDFS temp directory %s...' %
                     self._hadoop_tmp_dir)
            try:
                self.fs.rm(self._hadoop_tmp_dir)
            except Exception as e:
                log.exception(e)

    def _manifest_download_commands(self):
        cp_to_local = self.get_hadoop_bin() + ['fs', '-copyToLocal']

        return [
            ('*://*', cmd_line(cp_to_local)),
        ]

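    # The '*://*' glob maps any URI to the download command above, so a file
    # listed in an input manifest ends up being fetched with something like
    # (illustrative):
    #
    #     hadoop fs -copyToLocal hdfs:///user/you/data/input.gz ./input.gz
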
    ### LOG (implementation of LogInterpretationMixin) ###

    def _stream_history_log_dirs(self, output_dir=None):
        """Yield lists of directories to look for the history log in."""
        if not self._read_logs():
            return

        for log_dir in unique(self._hadoop_log_dirs(output_dir=output_dir)):
            if _logs_exist(self.fs, log_dir):
                log.info('Looking for history log in %s...' % log_dir)
                # logs aren't always in a subdir named history/
                yield [log_dir]

    def _stream_task_log_dirs(self, application_id=None, output_dir=None):
        """Yield lists of directories to look for the task logs in."""
        # Note: this is unlikely to be super-helpful on "real" (multi-node)
        # pre-YARN Hadoop because task logs aren't generally shipped to a
        # local directory. It's a start, anyways. See #1201.
        if not self._read_logs():
            return

        for log_dir in unique(self._hadoop_log_dirs(output_dir=output_dir)):
            if application_id:
                path = self.fs.join(log_dir, 'userlogs', application_id)
            else:
                path = self.fs.join(log_dir, 'userlogs')

            if _logs_exist(self.fs, path):
                log.info('Looking for task syslogs in %s...' % path)
                yield [path]

    def counters(self):
        return [_pick_counters(log_interpretation)
                for log_interpretation in self._log_interpretations]

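    # counters() returns one entry per step; each entry is roughly a dict of
    # the form {group: {counter name: amount}}, as accumulated by the log
    # interpretation code. An illustrative sketch:
    #
    #     [{'Job Counters': {'Launched map tasks': 4}}]
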
# These don't require state from HadoopJobRunner, so making them functions.
# Feel free to convert them back into methods as need be


def _hadoop_prefix_from_bin(hadoop_bin):
    """Given a path to the hadoop binary, return the path of the implied
    hadoop home, or None if we don't know.

    Don't return the parent directory of directories in the default
    path (not ``/``, ``/usr``, or ``/usr/local``).
    """
    # resolve unqualified binary name (relative paths are okay)
    if '/' not in hadoop_bin:
        hadoop_bin = which(hadoop_bin)
        if not hadoop_bin:
            return None

    # use parent of hadoop_bin's directory
    hadoop_home = posixpath.abspath(
        posixpath.join(posixpath.realpath(posixpath.dirname(hadoop_bin)), '..')
    )

    if hadoop_home in _BAD_HADOOP_HOMES:
        return None

    return hadoop_home

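# For example (illustrative paths, assuming no symlinks are involved):
#
#     _hadoop_prefix_from_bin('/usr/local/hadoop-2.8.5/bin/hadoop')
#     # -> '/usr/local/hadoop-2.8.5'
#     _hadoop_prefix_from_bin('/usr/bin/hadoop')
#     # -> None ('/usr' is in _BAD_HADOOP_HOMES)
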
def _log_record_from_hadoop(record):
    """Log log4j record parsed from hadoop stderr."""
    if not _is_counter_log4j_record(record):  # counters are printed separately
        _log_log4j_record(record)