# Copyright 2009-2012 Yelp and Contributors
# Copyright 2013 David Marin
# Copyright 2015-2018 Yelp
# Copyright 2019 Yelp
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import os.path
import re
from io import BytesIO
from subprocess import Popen
from subprocess import PIPE
from subprocess import CalledProcessError

from mrjob.cat import decompress
from mrjob.compat import uses_yarn
from mrjob.fs.base import Filesystem
from mrjob.py2 import to_unicode
from mrjob.parse import is_uri
from mrjob.parse import urlparse
from mrjob.util import cmd_line
from mrjob.util import unique
from mrjob.util import which


log = logging.getLogger(__name__)

# used by mkdir()
_HADOOP_FILE_EXISTS_RE = re.compile(br'.*File exists.*')

# used by ls() and exists()
_HADOOP_LS_NO_SUCH_FILE = re.compile(br'^lsr?: .*No such file.*$')

# used by rm() (see below)
_HADOOP_RM_NO_SUCH_FILE = re.compile(br'^rmr?: .*No such file.*$')

# find version string in "Hadoop 0.20.203" etc.
_HADOOP_VERSION_RE = re.compile(br'^.*?(?P<version>(\d|\.)+).*?$')


class HadoopFilesystem(Filesystem):
    """Filesystem for URIs accepted by ``hadoop fs``. Typically you will get
    one of these via ``HadoopJobRunner().fs``, composed with
    :py:class:`~mrjob.fs.local.LocalFilesystem`.

    This also helps with other invocations of the ``hadoop`` binary, such as
    ``hadoop version`` (see :py:meth:`invoke_hadoop`).
    """

    def __init__(self, hadoop_bin=None):
        """Create a Hadoop filesystem

        :param hadoop_bin: ``hadoop`` binary, as a list of args. If set to
                           ``None``, we'll auto-detect the Hadoop binary. If
                           set to ``[]``, this FS will be disabled until you
                           call :py:meth:`set_hadoop_bin`.
        """
        super(HadoopFilesystem, self).__init__()

        self._hadoop_bin = hadoop_bin
        self._hadoop_version = None  # cache for get_hadoop_version()

    def can_handle_path(self, path):
        if not (self._hadoop_bin or self._hadoop_bin is None):
            return False

        return is_uri(path)

    def get_hadoop_bin(self):
        """Return the hadoop binary, searching for it if need be."""
        if self._hadoop_bin is None:
            self._hadoop_bin = self._find_hadoop_bin()

        return self._hadoop_bin

    def set_hadoop_bin(self, hadoop_bin):
        """Manually set the hadoop binary, as a list of args."""
        self._hadoop_bin = hadoop_bin

    def _find_hadoop_bin(self):
        """Look for the hadoop binary in any plausible place. If all else
        fails, return ``['hadoop']``.
        """
        def yield_paths():
            for name in 'HADOOP_PREFIX', 'HADOOP_HOME', 'HADOOP_INSTALL':
                path = os.environ.get(name)
                if path:
                    yield os.path.join(path, 'bin')

            # They use $HADOOP_INSTALL/hadoop/bin here:
            # https://wiki.apache.org/hadoop/GettingStartedWithHadoop
            if os.environ.get('HADOOP_INSTALL'):
                yield os.path.join(
                    os.environ['HADOOP_INSTALL'], 'hadoop', 'bin')

            yield None  # use $PATH

            # Maybe it's in $HADOOP_MAPRED_HOME? $HADOOP_YARN_HOME? Don't
            # give up. Don't worry about duplicates; they're de-duplicated
            # below
            for name, path in sorted(os.environ.items()):
                if name.startswith('HADOOP_') and name.endswith('_HOME'):
                    yield os.path.join(path, 'bin')

        for path in unique(yield_paths()):
            log.info('Looking for hadoop binary in %s...' % (path or '$PATH'))

            hadoop_bin = which('hadoop', path=path)

            if hadoop_bin:
                log.info('Found hadoop binary: %s' % hadoop_bin)
                return [hadoop_bin]
        else:
            log.info("Falling back to 'hadoop'")
            return ['hadoop']
    def get_hadoop_version(self):
        """Invoke the hadoop executable to determine its version"""
        # mkdir() needs this
        if not self._hadoop_version:
            stdout = self.invoke_hadoop(['version'], return_stdout=True)
            if stdout:
                first_line = stdout.split(b'\n')[0]
                m = _HADOOP_VERSION_RE.match(first_line)
                if m:
                    self._hadoop_version = to_unicode(m.group('version'))
                    log.info("Using Hadoop version %s" %
                             self._hadoop_version)
                else:
                    raise Exception('Unable to determine Hadoop version.')

        return self._hadoop_version

    def invoke_hadoop(self, args, ok_returncodes=None, ok_stderr=None,
                      return_stdout=False):
        """Run the given hadoop command, raising an exception on non-zero
        return code. This only works for commands whose output we don't
        care about.

        Args:
        ok_returncodes -- a list/tuple/set of return codes we expect to
                          get back from hadoop (e.g. [0, 1]). By default,
                          we only expect 0. If we get an unexpected return
                          code, we raise a CalledProcessError.
        ok_stderr -- don't log STDERR or raise CalledProcessError if stderr
                     matches a regex in this list (even if the returncode
                     is bad)
        return_stdout -- return the stdout from the hadoop command rather
                         than logging it. If this is False, we return the
                         returncode instead.
        """
        args = self.get_hadoop_bin() + args

        log.debug('> %s' % cmd_line(args))

        proc = Popen(args, stdout=PIPE, stderr=PIPE)
        stdout, stderr = proc.communicate()

        log_func = log.debug if proc.returncode == 0 else log.error
        if not return_stdout:
            for line in BytesIO(stdout):
                log_func('STDOUT: ' + to_unicode(line.rstrip(b'\r\n')))

        # check if STDERR is okay
        stderr_is_ok = False
        if ok_stderr:
            for stderr_re in ok_stderr:
                if stderr_re.match(stderr):
                    stderr_is_ok = True
                    break

        if not stderr_is_ok:
            for line in BytesIO(stderr):
                log_func('STDERR: ' + to_unicode(line.rstrip(b'\r\n')))

        ok_returncodes = ok_returncodes or [0]

        if not stderr_is_ok and proc.returncode not in ok_returncodes:
            raise CalledProcessError(proc.returncode, args)

        if return_stdout:
            return stdout
        else:
            return proc.returncode

    def du(self, path_glob):
        """Get the size of a file or directory (recursively), or 0
        if it doesn't exist."""
        try:
            stdout = self.invoke_hadoop(['fs', '-du', path_glob],
                                        return_stdout=True,
                                        ok_returncodes=[0, 1, 255])
        except CalledProcessError:
            return 0

        try:
            return sum(int(line.split()[0])
                       for line in stdout.split(b'\n')
                       if line.strip())
        except (ValueError, TypeError, IndexError):
            raise IOError(
                'Unexpected output from hadoop fs -du: %r' % stdout)

    def ls(self, path_glob):
        components = urlparse(path_glob)
        hdfs_prefix = '%s://%s' % (components.scheme, components.netloc)

        version = self.get_hadoop_version()

        # use ls -R on Hadoop 2 (see #1152)
        if uses_yarn(version):
            args = ['fs', '-ls', '-R', path_glob]
        else:
            args = ['fs', '-lsr', path_glob]

        try:
            stdout = self.invoke_hadoop(args, return_stdout=True,
                                        ok_stderr=[_HADOOP_LS_NO_SUCH_FILE])
        except CalledProcessError:
            raise IOError("Could not ls %s" % path_glob)

        for line in BytesIO(stdout):
            line = line.rstrip(b'\r\n')

            # ignore total item count
            if line.startswith(b'Found '):
                continue

            fields = line.split(b' ')

            # Throw out directories
            if fields[0].startswith(b'd'):
                continue

            # Try to figure out which part of the line is the path
            # Expected lines:
            #
            # HDFS:
            # -rw-r--r--   3 dave users       3276 2010-01-13 14:00 /foo/bar
            #
            # S3:
            # -rwxrwxrwx   1          3276 2010-01-13 14:00 /foo/bar
            path_index = None
            for index, field in enumerate(fields):
                # look for the time field, and pick the one after that
                # (can't use field[2] because that's an int in Python 3)
                if len(field) == 5 and field[2:3] == b':':
                    path_index = (index + 1)
                    break

            if not path_index:
                raise IOError("Could not locate path in string %r" % line)

            path = to_unicode(line.split(b' ', path_index)[-1])

            # handle fully qualified URIs from newer versions of Hadoop ls
            # (see Pull Request #577)
            if is_uri(path):
                yield path
            else:
                yield hdfs_prefix + path
    def _cat_file(self, path):
        # stream from HDFS
        cat_args = self.get_hadoop_bin() + ['fs', '-cat', path]
        log.debug('> %s' % cmd_line(cat_args))

        cat_proc = Popen(cat_args, stdout=PIPE, stderr=PIPE)

        for chunk in decompress(cat_proc.stdout, path):
            yield chunk

        # this does sometimes happen; see #1396
        for line in cat_proc.stderr:
            log.error('STDERR: ' + to_unicode(line.rstrip(b'\r\n')))

        cat_proc.stdout.close()
        cat_proc.stderr.close()

        returncode = cat_proc.wait()

        if returncode != 0:
            raise IOError("Could not stream %s" % path)

    def mkdir(self, path):
        version = self.get_hadoop_version()

        # use -p on Hadoop 2 (see #991, #845)
        if uses_yarn(version):
            args = ['fs', '-mkdir', '-p', path]
        else:
            args = ['fs', '-mkdir', path]

        try:
            self.invoke_hadoop(args, ok_stderr=[_HADOOP_FILE_EXISTS_RE])
        except CalledProcessError:
            raise IOError("Could not mkdir %s" % path)

    def exists(self, path_glob):
        """Does the given path exist?

        If dest is a directory (ends with a "/"), we check if there are
        any files starting with that path.
        """
        try:
            return_code = self.invoke_hadoop(
                ['fs', '-ls', path_glob],
                ok_returncodes=[0, -1, 255],
                ok_stderr=[_HADOOP_LS_NO_SUCH_FILE])

            return (return_code == 0)
        except CalledProcessError:
            raise IOError("Could not check path %s" % path_glob)

    def put(self, src, path):
        # don't inadvertently support cp syntax
        if path.endswith('/'):
            raise ValueError('put() destination may not be a directory')

        self.invoke_hadoop(['fs', '-put', src, path])

    def rm(self, path_glob):
        if not is_uri(path_glob):
            # delegate non-URI (local) paths rather than passing them to
            # ``hadoop fs -rm``
            return super(HadoopFilesystem, self).rm(path_glob)

        version = self.get_hadoop_version()
        if uses_yarn(version):
            args = ['fs', '-rm', '-R', '-f', '-skipTrash', path_glob]
        else:
            args = ['fs', '-rmr', '-skipTrash', path_glob]

        try:
            self.invoke_hadoop(
                args,
                return_stdout=True, ok_stderr=[_HADOOP_RM_NO_SUCH_FILE])
        except CalledProcessError:
            raise IOError("Could not rm %s" % path_glob)

    def touchz(self, path):
        try:
            self.invoke_hadoop(['fs', '-touchz', path])
        except CalledProcessError:
            raise IOError("Could not touchz %s" % path)
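
# A minimal usage sketch (editor's addition, not part of the original
# module). It assumes a working ``hadoop`` binary is installed (or
# discoverable via $HADOOP_HOME and friends) and that
# hdfs:///tmp/mrjob-example is a path you may write to. Kept as comments so
# importing this module stays side-effect free.
#
#     from mrjob.fs.hadoop import HadoopFilesystem
#
#     fs = HadoopFilesystem()                 # auto-detects the hadoop binary
#     fs.mkdir('hdfs:///tmp/mrjob-example')
#     fs.touchz('hdfs:///tmp/mrjob-example/empty')
#     for uri in fs.ls('hdfs:///tmp/mrjob-example/*'):
#         print(uri, fs.du(uri))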