# Copyright 2009-2012 Yelp and Contributors
# Copyright 2013 David Marin
# Copyright 2015-2018 Yelp
# Copyright 2019 Yelp
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import os.path
import re
from io import BytesIO
from subprocess import Popen
from subprocess import PIPE
from subprocess import CalledProcessError
from mrjob.cat import decompress
from mrjob.compat import uses_yarn
from mrjob.fs.base import Filesystem
from mrjob.py2 import to_unicode
from mrjob.parse import is_uri
from mrjob.parse import urlparse
from mrjob.util import cmd_line
from mrjob.util import unique
from mrjob.util import which
log = logging.getLogger(__name__)
# used by mkdir()
_HADOOP_FILE_EXISTS_RE = re.compile(br'.*File exists.*')
# used by ls() and exists()
_HADOOP_LS_NO_SUCH_FILE = re.compile(br'^lsr?: .*No such file.*$')
# used by rm() (see below)
_HADOOP_RM_NO_SUCH_FILE = re.compile(br'^rmr?: .*No such file.*$')
# find version string in "Hadoop 0.20.203" etc.
_HADOOP_VERSION_RE = re.compile(br'^.*?(?P<version>(\d|\.)+).*?$')
class HadoopFilesystem(Filesystem):
"""Filesystem for URIs accepted by ``hadoop fs``. Typically you will get
one of these via ``HadoopJobRunner().fs``, composed with
:py:class:`~mrjob.fs.local.LocalFilesystem`.
This also helps with other invocations of the ``hadoop`` binary, such
as ``hadoop version`` (see :py:meth:`invoke_hadoop`).
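
    A minimal usage sketch (the HDFS URI below is hypothetical)::

        fs = HadoopFilesystem()  # auto-detect the ``hadoop`` binary
        if fs.exists('hdfs:///user/hadoop/output/'):
            for uri in fs.ls('hdfs:///user/hadoop/output/'):
                print(uri)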
"""
def __init__(self, hadoop_bin=None):
"""Create a Hadoop filesystem
:param hadoop_bin: ``hadoop`` binary, as a list of args. If set to
``None``, we'll auto-detect the Hadoop binary.
If set to ``[]``, this FS will be disabled
until you call :py:meth:`set_hadoop_bin`.
"""
super(HadoopFilesystem, self).__init__()
self._hadoop_bin = hadoop_bin
self._hadoop_version = None # cache for get_hadoop_version()
def can_handle_path(self, path):
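        # an empty hadoop_bin (``[]``) disables this filesystem; ``None``
        # means "auto-detect the binary", which still counts as enabled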
if not (self._hadoop_bin or self._hadoop_bin is None):
return False
return is_uri(path)
def get_hadoop_bin(self):
"""Return the hadoop binary, searching for it if need be."""
if self._hadoop_bin is None:
self._hadoop_bin = self._find_hadoop_bin()
return self._hadoop_bin
def set_hadoop_bin(self, hadoop_bin):
"""Manually set the hadoop binary, as a list of args."""
self._hadoop_bin = hadoop_bin
def _find_hadoop_bin(self):
"""Look for the hadoop binary in any plausible place. If all
else fails, return ``['hadoop']``.
"""
def yield_paths():
for name in 'HADOOP_PREFIX', 'HADOOP_HOME', 'HADOOP_INSTALL':
path = os.environ.get(name)
if path:
yield os.path.join(path, 'bin')
# They use $HADOOP_INSTALL/hadoop/bin here:
# https://wiki.apache.org/hadoop/GettingStartedWithHadoop
if os.environ.get('HADOOP_INSTALL'):
yield os.path.join(
os.environ['HADOOP_INSTALL'], 'hadoop', 'bin')
yield None # use $PATH
# Maybe it's in $HADOOP_MAPRED_HOME? $HADOOP_YARN_HOME? Don't give
# up. Don't worry about duplicates; they're de-duplicated below
for name, path in sorted(os.environ.items()):
if name.startswith('HADOOP_') and name.endswith('_HOME'):
yield os.path.join(path, 'bin')
for path in unique(yield_paths()):
log.info('Looking for hadoop binary in %s...' % (path or '$PATH'))
hadoop_bin = which('hadoop', path=path)
if hadoop_bin:
log.info('Found hadoop binary: %s' % hadoop_bin)
return [hadoop_bin]
else:
log.info("Falling back to 'hadoop'")
return ['hadoop']
def get_hadoop_version(self):
"""Invoke the hadoop executable to determine its version"""
        # mkdir(), ls(), and rm() need the version to pick the right flags
if not self._hadoop_version:
stdout = self.invoke_hadoop(['version'], return_stdout=True)
if stdout:
first_line = stdout.split(b'\n')[0]
m = _HADOOP_VERSION_RE.match(first_line)
if m:
self._hadoop_version = to_unicode(m.group('version'))
log.info("Using Hadoop version %s" % self._hadoop_version)
else:
raise Exception('Unable to determine Hadoop version.')
return self._hadoop_version
def invoke_hadoop(self, args, ok_returncodes=None, ok_stderr=None,
return_stdout=False):
"""Run the given hadoop command, raising an exception on non-zero
return code. This only works for commands whose output we don't
care about.
Args:
ok_returncodes -- a list/tuple/set of return codes we expect to
get back from hadoop (e.g. [0,1]). By default, we only expect 0.
If we get an unexpected return code, we raise a CalledProcessError.
ok_stderr -- don't log STDERR or raise CalledProcessError if stderr
matches a regex in this list (even if the returncode is bad)
return_stdout -- return the stdout from the hadoop command rather
than logging it. If this is False, we return the returncode
instead.
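
        A minimal example, assuming *fs* is a :py:class:`HadoopFilesystem`
        and using a hypothetical URI::

            returncode = fs.invoke_hadoop(
                ['fs', '-test', '-e', 'hdfs:///tmp/example'],
                ok_returncodes=[0, 1])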
"""
args = self.get_hadoop_bin() + args
log.debug('> %s' % cmd_line(args))
proc = Popen(args, stdout=PIPE, stderr=PIPE)
stdout, stderr = proc.communicate()
log_func = log.debug if proc.returncode == 0 else log.error
if not return_stdout:
for line in BytesIO(stdout):
log_func('STDOUT: ' + to_unicode(line.rstrip(b'\r\n')))
# check if STDERR is okay
stderr_is_ok = False
if ok_stderr:
for stderr_re in ok_stderr:
if stderr_re.match(stderr):
stderr_is_ok = True
break
if not stderr_is_ok:
for line in BytesIO(stderr):
log_func('STDERR: ' + to_unicode(line.rstrip(b'\r\n')))
ok_returncodes = ok_returncodes or [0]
if not stderr_is_ok and proc.returncode not in ok_returncodes:
raise CalledProcessError(proc.returncode, args)
if return_stdout:
return stdout
else:
return proc.returncode
def du(self, path_glob):
"""Get the size of a file or directory (recursively), or 0
if it doesn't exist."""
try:
stdout = self.invoke_hadoop(['fs', '-du', path_glob],
return_stdout=True,
ok_returncodes=[0, 1, 255])
except CalledProcessError:
return 0
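        # each line of -du output starts with a size in bytes followed by
        # the path; sum the sizes to get the total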
try:
return sum(int(line.split()[0])
for line in stdout.split(b'\n')
if line.strip())
except (ValueError, TypeError, IndexError):
raise IOError(
'Unexpected output from hadoop fs -du: %r' % stdout)
def ls(self, path_glob):
components = urlparse(path_glob)
hdfs_prefix = '%s://%s' % (components.scheme, components.netloc)
version = self.get_hadoop_version()
# use ls -R on Hadoop 2 (see #1152)
if uses_yarn(version):
args = ['fs', '-ls', '-R', path_glob]
else:
args = ['fs', '-lsr', path_glob]
try:
stdout = self.invoke_hadoop(args, return_stdout=True,
ok_stderr=[_HADOOP_LS_NO_SUCH_FILE])
except CalledProcessError:
raise IOError("Could not ls %s" % path_glob)
for line in BytesIO(stdout):
line = line.rstrip(b'\r\n')
# ignore total item count
if line.startswith(b'Found '):
continue
fields = line.split(b' ')
# Throw out directories
if fields[0].startswith(b'd'):
continue
# Try to figure out which part of the line is the path
# Expected lines:
#
# HDFS:
# -rw-r--r-- 3 dave users 3276 2010-01-13 14:00 /foo/bar
#
# S3:
            # -rwxrwxrwx 1 3276 2010-01-13 14:00 /foo/bar
path_index = None
for index, field in enumerate(fields):
# look for time field, and pick one after that
# (can't use field[2] because that's an int in Python 3)
if len(field) == 5 and field[2:3] == b':':
path_index = (index + 1)
if not path_index:
raise IOError("Could not locate path in string %r" % line)
path = to_unicode(line.split(b' ', path_index)[-1])
# handle fully qualified URIs from newer versions of Hadoop ls
# (see Pull Request #577)
if is_uri(path):
yield path
else:
yield hdfs_prefix + path
def _cat_file(self, path):
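        """Stream a single file from HDFS via ``hadoop fs -cat``,
        decompressing it if *path* looks like a compressed file."""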
# stream from HDFS
cat_args = self.get_hadoop_bin() + ['fs', '-cat', path]
log.debug('> %s' % cmd_line(cat_args))
cat_proc = Popen(cat_args, stdout=PIPE, stderr=PIPE)
for chunk in decompress(cat_proc.stdout, path):
yield chunk
        # this does sometimes happen; see #1396
for line in cat_proc.stderr:
log.error('STDERR: ' + to_unicode(line.rstrip(b'\r\n')))
cat_proc.stdout.close()
cat_proc.stderr.close()
returncode = cat_proc.wait()
if returncode != 0:
raise IOError("Could not stream %s" % path)
def mkdir(self, path):
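        """Create the given directory (``hadoop fs -mkdir``, with ``-p``
        on YARN), tolerating "File exists" errors."""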
version = self.get_hadoop_version()
# use -p on Hadoop 2 (see #991, #845)
if uses_yarn(version):
args = ['fs', '-mkdir', '-p', path]
else:
args = ['fs', '-mkdir', path]
try:
self.invoke_hadoop(args, ok_stderr=[_HADOOP_FILE_EXISTS_RE])
except CalledProcessError:
raise IOError("Could not mkdir %s" % path)
def exists(self, path_glob):
"""Does the given path exist?
        If the path is a directory (ends with a "/"), we check whether
        there are any files starting with that path.
"""
try:
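            # -ls exits non-zero when nothing matches the path; treat that
            # as "doesn't exist" rather than raising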
return_code = self.invoke_hadoop(
['fs', '-ls', path_glob],
ok_returncodes=[0, -1, 255],
ok_stderr=[_HADOOP_LS_NO_SUCH_FILE])
return (return_code == 0)
except CalledProcessError:
raise IOError("Could not check path %s" % path_glob)
def put(self, src, path):
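        """Upload a local file to a URI via ``hadoop fs -put``.

        *path* must name the destination file itself, not a directory."""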
# don't inadvertently support cp syntax
if path.endswith('/'):
raise ValueError('put() destination may not be a directory')
self.invoke_hadoop(['fs', '-put', src, path])
def rm(self, path_glob):
if not is_uri(path_glob):
super(HadoopFilesystem, self).rm(path_glob)
version = self.get_hadoop_version()
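        # delete permanently (-skipTrash) rather than moving to HDFS trash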
if uses_yarn(version):
args = ['fs', '-rm', '-R', '-f', '-skipTrash', path_glob]
else:
args = ['fs', '-rmr', '-skipTrash', path_glob]
try:
self.invoke_hadoop(
args,
return_stdout=True, ok_stderr=[_HADOOP_RM_NO_SUCH_FILE])
except CalledProcessError:
raise IOError("Could not rm %s" % path_glob)
def touchz(self, path):
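        """Create an empty file at *path* (``hadoop fs -touchz``)."""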
try:
self.invoke_hadoop(['fs', '-touchz', path])
except CalledProcessError:
raise IOError("Could not touchz %s" % path)