# -*- coding: utf-8 -*-
# Copyright 2017-2018 Yelp
# Copyright 2019 Yelp
# Copyright 2020 Affirm, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import os
import pipes
import socket
import random
import signal
import time

from copy import deepcopy
from subprocess import Popen
from subprocess import PIPE

from mrjob.bin import MRJobBinRunner
from mrjob.bin import _unarchive_cmd
from mrjob.conf import combine_dicts
from mrjob.py2 import integer_types
from mrjob.py2 import xrange
from mrjob.setup import WorkingDirManager
from mrjob.setup import parse_setup_cmd
from mrjob.util import cmd_line


log = logging.getLogger(__name__)

# don't try to bind SSH tunnel to more than this many local ports
_MAX_SSH_RETRIES = 20

# issue a warning if max_mins_idle is set to less than this
_DEFAULT_MAX_MINS_IDLE = 10.0

# default part size (so we can share with Spark runner)
_DEFAULT_CLOUD_PART_SIZE_MB = 100


class HadoopInTheCloudJobRunner(MRJobBinRunner):
    """Abstract base class for all Hadoop-in-the-cloud services."""

    alias = '_cloud'

    OPT_NAMES = MRJobBinRunner.OPT_NAMES | {
        'bootstrap',
        'bootstrap_python',
        'check_cluster_every',
        'cloud_fs_sync_secs',
        'cloud_part_size_mb',
        'cloud_tmp_dir',
        'cluster_id',
        'core_instance_type',
        'extra_cluster_params',
        'hadoop_streaming_jar',
        'image_id',
        'image_version',
        'instance_type',
        'master_instance_type',
        'max_mins_idle',
        'num_core_instances',
        'num_task_instances',
        'region',
        'ssh_bind_ports',
        'ssh_tunnel',
        'ssh_tunnel_is_open',
        'task_instance_type',
        'zone',
    }

    def __init__(self, **kwargs):
        super(HadoopInTheCloudJobRunner, self).__init__(**kwargs)

        # if *cluster_id* is not set, ``self._cluster_id`` will be
        # set when we create or join a cluster
        self._cluster_id = self._opts['cluster_id']

        # bootstrapping
        self._bootstrap = self._bootstrap_python() + self._parse_bootstrap()

        # add files to manager
        self._bootstrap_dir_mgr = WorkingDirManager()

        for cmd in self._bootstrap:
            for token in cmd:
                if isinstance(token, dict):
                    # convert dir archive tokens to archives
                    if token['type'] == 'dir':
                        token['path'] = self._dir_archive_path(token['path'])
                        token['type'] = 'archive'

                    self._bootstrap_dir_mgr.add(**token)

        # we'll create this script later, as needed
        self._master_bootstrap_script_path = None

        # ssh state

        # the process for the SSH tunnel
        self._ssh_proc = None

        # if this is true, stop trying to launch the SSH tunnel
        self._give_up_on_ssh_tunnel = False

        # store the (tunneled) URL of the job tracker/resource manager
        self._ssh_tunnel_url = None

    ### Options ###

    @classmethod
    def _default_opts(cls):
        return combine_dicts(
            super(HadoopInTheCloudJobRunner, cls)._default_opts(),
            dict(
                cloud_part_size_mb=_DEFAULT_CLOUD_PART_SIZE_MB,
                max_mins_idle=_DEFAULT_MAX_MINS_IDLE,
                # don't use a list because it makes it hard to read option
                # values when running in verbose mode. See #1284
                ssh_bind_ports=xrange(40001, 40841),
                ssh_tunnel=False,
                ssh_tunnel_is_open=False,
                # ssh_bin isn't included here. For example, the Dataproc
                # runner launches ssh through the gcloud util.
            ),
        )

    def _fix_opts(self, opts, source=None):
        opts = super(HadoopInTheCloudJobRunner, self)._fix_opts(
            opts, source=source)

        # cloud_part_size_mb should be a number
        if opts.get('cloud_part_size_mb') is not None:
            if not isinstance(opts['cloud_part_size_mb'],
                              (integer_types, float)):
                raise TypeError('cloud_part_size_mb must be a number')

        return opts

    def _combine_opts(self, opt_list):
        """Propagate *instance_type* to other instance type opts, if not
        already set.

        Also propagate core instance type to task instance type, if it's
        not already set.
        """
        opts = super(HadoopInTheCloudJobRunner, self)._combine_opts(opt_list)

        if opts['instance_type']:
            # figure out how late in the configs opt was set (setting
            # --instance_type on the command line overrides
            # core_instance_type set in configs)
            opt_priority = {k: -1 for k in opts}

            for i, sub_opts in enumerate(opt_list):
                for k, v in sub_opts.items():
                    if v == opts[k]:
                        opt_priority[k] = i

            # instance_type only affects master_instance_type if there are
            # no other instances
            if opts['num_core_instances'] or opts['num_task_instances']:
                propagate_to = ['core_instance_type', 'task_instance_type']
            else:
                propagate_to = ['master_instance_type']

            for k in propagate_to:
                if opts[k] is None or (
                        opt_priority[k] < opt_priority['instance_type']):
                    opts[k] = opts['instance_type']

        if not opts['task_instance_type']:
            opts['task_instance_type'] = opts['core_instance_type']

        return opts

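    # Illustrative sketch (not part of the original module): how
    # _combine_opts() propagates instance types. The option values below
    # are hypothetical.
    #
    #   with instance_type='m5.xlarge' and num_core_instances=4,
    #   core_instance_type and task_instance_type also become 'm5.xlarge'
    #   (unless they were set at a higher priority than instance_type);
    #   master_instance_type is left alone, since the master node won't
    #   run tasks.
    #
    #   with no core or task instances, only master_instance_type is
    #   propagated to, since the master is then the only node running tasks.
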
    ### Bootstrapping ###

    def _bootstrap_python(self):
        """Redefine this to return a (possibly empty) list of parsed commands
        (in the same format as returned by parse_setup_cmd()) to make sure a
        compatible version of Python is installed.

        If the *bootstrap_python* option is false, should always return
        ``[]``.
        """
        return []

    def _cp_to_local_cmd(self):
        """Command to copy files from the cloud to the local directory
        (usually via Hadoop). Redefine this as needed; for example, on EMR,
        we sometimes have to use ``aws s3 cp`` because ``hadoop`` isn't
        installed at bootstrap time."""
        return 'hadoop fs -copyToLocal'

    def _parse_bootstrap(self):
        """Parse the *bootstrap* option with
        :py:func:`mrjob.setup.parse_setup_cmd()`.
        """
        return [parse_setup_cmd(cmd) for cmd in self._opts['bootstrap']]

    def _create_master_bootstrap_script_if_needed(self):
        """Helper for :py:meth:`_add_bootstrap_files_for_upload`.

        Create the master bootstrap script and write it into our local
        temp directory. Set self._master_bootstrap_script_path.

        This will do nothing if there are no bootstrap scripts or commands,
        or if it has already been called."""
        if self._master_bootstrap_script_path:
            return

        # don't bother if we're not starting a cluster
        if self._cluster_id:
            return

        # also don't bother if we're not bootstrapping
        if not self._bootstrap:
            return

        path = os.path.join(self._get_local_tmp_dir(), 'b.sh')
        log.info('writing master bootstrap script to %s' % path)

        contents = self._master_bootstrap_script_content(self._bootstrap)

        self._write_script(contents, path, 'master bootstrap script')

        self._master_bootstrap_script_path = path

    def _master_bootstrap_script_content(self, bootstrap):
        """Return a list containing the lines of the master bootstrap script
        (without trailing newlines).
        """
        out = []

        # shebang, precommands
        out.extend(self._start_of_sh_script())
        out.append('')

        # for example, create a tmp dir and cd to it
        if self._bootstrap_pre_commands():
            out.extend(self._bootstrap_pre_commands())
            out.append('')

        # store $PWD
        out.append('# store $PWD')
        out.append('__mrjob_PWD=$PWD')
        out.append('')

        # special case for PWD being in /, which happens on Dataproc
        # (really we should cd to tmp or something)
        out.append('if [ $__mrjob_PWD = "/" ]; then')
        out.append('  __mrjob_PWD=""')
        out.append('fi')
        out.append('')

        # run commands in a block so we can redirect stdout to stderr
        # (e.g. to catch errors from compileall). See #370
        out.append('{')

        # download files
        out.append('  # download files and mark them executable')

        cp_to_local = self._cp_to_local_cmd()

        # TODO: why bother with $__mrjob_PWD here, since we're already in it?
        for name, path in sorted(
                self._bootstrap_dir_mgr.name_to_path('file').items()):
            uri = self._upload_mgr.uri(path)
            out.append('  %s %s $__mrjob_PWD/%s' %
                       (cp_to_local, pipes.quote(uri), pipes.quote(name)))
            # imitate Hadoop Distributed Cache (see #1602)
            out.append('  chmod u+rx $__mrjob_PWD/%s' % pipes.quote(name))

        out.append('')

        # download and unarchive archives
        archive_names_and_paths = sorted(
            self._bootstrap_dir_mgr.name_to_path('archive').items())
        if archive_names_and_paths:
            # make tmp dir if needed
            out.append('  # download and unpack archives')
            out.append('  __mrjob_TMP=$(mktemp -d)')
            out.append('')

            for name, path in archive_names_and_paths:
                uri = self._upload_mgr.uri(path)
                archive_file_name = self._bootstrap_dir_mgr.name(
                    'archive_file', path)

                # copy file to tmp dir
                quoted_archive_path = '$__mrjob_TMP/%s' % pipes.quote(
                    archive_file_name)
                out.append('  %s %s %s' % (
                    cp_to_local, pipes.quote(uri), quoted_archive_path))

                out.append('  ' + _unarchive_cmd(path) % dict(
                    file=quoted_archive_path,
                    dir='$__mrjob_PWD/' + pipes.quote(name)))

                # imitate Hadoop Distributed Cache (see #1602)
                out.append(
                    '  chmod u+rx -R $__mrjob_PWD/%s' % pipes.quote(name))

            out.append('')

        # run bootstrap commands
        out.append('  # bootstrap commands')
        for cmd in bootstrap:
            # reconstruct the command line, substituting $__mrjob_PWD/
            # for path dicts
            line = '  '

            for token in cmd:
                if isinstance(token, dict):
                    # it's a path dictionary
                    line += '$__mrjob_PWD/'
                    line += pipes.quote(self._bootstrap_dir_mgr.name(**token))
                else:
                    # it's raw script
                    line += token

            out.append(line)

        out.append('} 1>&2')  # stdout -> stderr for ease of error log parsing

        return out

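    # Illustrative sketch (not part of the original module): roughly the
    # script _master_bootstrap_script_content() produces for a job that
    # uploads one file and runs one bootstrap command. The shebang, URI,
    # filename, and command below are hypothetical.
    #
    #   #!/bin/sh -ex
    #
    #   # store $PWD
    #   __mrjob_PWD=$PWD
    #
    #   if [ $__mrjob_PWD = "/" ]; then
    #     __mrjob_PWD=""
    #   fi
    #
    #   {
    #     # download files and mark them executable
    #     hadoop fs -copyToLocal s3://bucket/tmp/setup.sh $__mrjob_PWD/setup.sh
    #     chmod u+rx $__mrjob_PWD/setup.sh
    #
    #     # bootstrap commands
    #     sudo pip install ujson
    #   } 1>&2
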
    def _bootstrap_pre_commands(self):
        """A list of hard-coded commands to run at the beginning of the
        bootstrap script. Currently used by dataproc to cd into a tmp dir."""
        return []

    def _start_of_sh_script(self):
        """Return a list of lines (without trailing newlines) containing the
        shell script shebang and pre-commands."""
        out = []

        # shebang
        sh_bin = self._sh_bin()
        if not sh_bin[0].startswith('/'):
            sh_bin = ['/usr/bin/env'] + sh_bin
        out.append('#!' + cmd_line(sh_bin))

        # hook for 'set -e', etc. (see #1549)
        out.extend(self._sh_pre_commands())

        return out

    ### Launching Clusters ###

    def _add_extra_cluster_params(self, params):
        """Return a dict with the *extra_cluster_params* opt patched into
        *params*, and ``None`` values removed."""
        params = deepcopy(params)

        for k, v in sorted(self._opts['extra_cluster_params'].items()):
            _patch_params(params, k, v)

        return params

    ### SSH Tunnel ###

    def _ssh_tunnel_args(self, bind_port):
        """Redefine this in your subclass. You will probably want to call
        :py:meth:`_ssh_tunnel_opts` somewhere in here.

        Should return the list of args used to run the command to open the
        SSH tunnel, bound to *bind_port* on your computer, or ``None`` if it
        isn't possible to set up an SSH tunnel.
        """
        return None

    def _ssh_tunnel_config(self):
        """Redefine this in your subclass. Should return a dict with the
        following keys:

        *localhost*: once we SSH in, is the web interface reachable at
                     ``localhost``?
        *name*: either ``'job tracker'`` or ``'resource manager'``
        *path*: path of main page on web interface (e.g. "/cluster")
        *port*: port number of the web interface
        """
        raise NotImplementedError

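    # Illustrative sketch (not part of the original module): on a YARN-based
    # cluster, a subclass's _ssh_tunnel_config() might return a dict like the
    # one below (values are hypothetical; 8088 is the usual port for the
    # resource manager's web UI).
    #
    #   dict(
    #       localhost=False,
    #       name='resource manager',
    #       path='/cluster',
    #       port=8088,
    #   )
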
    def _launch_ssh_proc(self, args):
        """The command used to create a :py:class:`subprocess.Popen` to run
        the SSH tunnel. You usually don't need to redefine this."""
        log.debug('> %s' % cmd_line(args))
        return Popen(args, stdin=PIPE, stdout=PIPE, stderr=PIPE)

    def _ssh_launch_wait_secs(self):
        """Wait this long after launching the SSH process before checking
        for failure (default 1 second). You may redefine this."""
        return 1.0

    def _set_up_ssh_tunnel(self):
        """Call this whenever you think it is possible to SSH to your
        cluster. This sets :py:attr:`_ssh_proc`. Does nothing if
        :mrjob-opt:`ssh_tunnel` is not set, or there is already a tunnel
        process running.
        """
        # did the user request an SSH tunnel?
        if not self._opts['ssh_tunnel']:
            return

        # no point in trying to launch a nonexistent command twice
        if self._give_up_on_ssh_tunnel:
            return

        # did we already launch the SSH tunnel process? is it still running?
        if self._ssh_proc:
            self._ssh_proc.poll()

            if self._ssh_proc.returncode is None:
                return
            else:
                log.warning('  Oops, ssh subprocess exited with return code'
                            ' %d, restarting...' % self._ssh_proc.returncode)
                self._ssh_proc = None

        tunnel_config = self._ssh_tunnel_config()

        bind_port = None
        popen_exception = None
        ssh_tunnel_args = []

        for bind_port in self._pick_ssh_bind_ports():
            ssh_proc = None
            ssh_tunnel_args = self._ssh_tunnel_args(bind_port)

            # can't launch SSH tunnel right now
            if not ssh_tunnel_args:
                return

            try:
                ssh_proc = self._launch_ssh_proc(ssh_tunnel_args)
            except OSError as ex:
                # e.g. OSError(2, 'File not found')
                popen_exception = ex  # warning handled below
                break

            if ssh_proc:
                time.sleep(self._ssh_launch_wait_secs())

                ssh_proc.poll()
                # still running? we are golden
                if ssh_proc.returncode is None:
                    self._ssh_proc = ssh_proc
                    break
                else:
                    ssh_proc.stdin.close()
                    ssh_proc.stdout.close()
                    ssh_proc.stderr.close()

        if self._ssh_proc:
            if self._opts['ssh_tunnel_is_open']:
                bind_host = socket.getfqdn()
            else:
                bind_host = 'localhost'

            self._ssh_tunnel_url = 'http://%s:%d%s' % (
                bind_host, bind_port, tunnel_config['path'])

            log.info('  Connect to %s at: %s' % (
                tunnel_config['name'], self._ssh_tunnel_url))
        else:
            if popen_exception:
                # this only happens if the ssh binary is not present
                # or not executable (so tunnel_config and the args to the
                # ssh binary don't matter)
                log.warning(
                    "  Couldn't open SSH tunnel: %s" % popen_exception)
                self._give_up_on_ssh_tunnel = True
                return
            else:
                log.warning(
                    '  Failed to open ssh tunnel to %s' %
                    tunnel_config['name'])

    def _kill_ssh_tunnel(self):
        """Send SIGKILL to SSH tunnel, if it's running."""
        if not self._ssh_proc:
            return

        self._ssh_proc.poll()
        if self._ssh_proc.returncode is None:
            log.info('Killing our SSH tunnel (pid %d)' % self._ssh_proc.pid)

            self._ssh_proc.stdin.close()
            self._ssh_proc.stdout.close()
            self._ssh_proc.stderr.close()

            try:
                if hasattr(signal, 'SIGKILL'):
                    os.kill(self._ssh_proc.pid, signal.SIGKILL)
                else:
                    # Windows doesn't have SIGKILL, see #1892
                    os.kill(self._ssh_proc.pid, signal.SIGABRT)
            except Exception as e:
                log.exception(e)

        self._ssh_proc = None
        self._ssh_tunnel_url = None

    def _ssh_tunnel_opts(self, bind_port):
        """Options to SSH related to setting up a tunnel (rather than
        SSHing in). Helper for :py:meth:`_ssh_tunnel_args`.
        """
        args = self._ssh_local_tunnel_opt(bind_port) + [
            '-N', '-n', '-q',
        ]
        if self._opts['ssh_tunnel_is_open']:
            args.extend(['-g', '-4'])  # -4: listen on IPv4 only

        return args

    def _ssh_local_tunnel_opt(self, bind_port):
        """Helper for :py:meth:`_ssh_tunnel_opts`."""
        tunnel_config = self._ssh_tunnel_config()

        return [
            '-L', '%d:%s:%d' % (
                bind_port,
                self._job_tracker_host(),
                tunnel_config['port'],
            ),
        ]

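    # Illustrative sketch (not part of the original module): with bind_port
    # 40001, a tunnel_config port of 8088, and a (hypothetical) master host
    # of ip-10-0-0-1.ec2.internal, _ssh_tunnel_opts(40001) would return:
    #
    #   ['-L', '40001:ip-10-0-0-1.ec2.internal:8088', '-N', '-n', '-q']
    #
    # with ['-g', '-4'] appended when ssh_tunnel_is_open is set.
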
    def _pick_ssh_bind_ports(self):
        """Pick a list of ports to try binding our SSH tunnel to.

        We will try to bind the same port for any given cluster (Issue #67).
        """
        # don't perturb the random number generator
        random_state = random.getstate()

        try:
            # seed random port selection on cluster ID
            random.seed(self._cluster_id)
            num_picks = min(_MAX_SSH_RETRIES,
                            len(self._opts['ssh_bind_ports']))
            return random.sample(self._opts['ssh_bind_ports'], num_picks)
        finally:
            random.setstate(random_state)


def _patch_params(params, name, value):
    """Helper method for _add_extra_cluster_params().

    Set *name* in *params* to *value*.

    If *name* has one or more dots in it, recursively set the value in
    successive nested dictionaries, creating them if necessary. For example,
    if *name* is ``Instances.EmrManagedMasterSecurityGroup``, set
    ``params['Instances']['EmrManagedMasterSecurityGroup']``.

    If *value* is ``None``, delete the value (if it exists), rather than
    setting it to ``None``.
    """
    if not isinstance(params, dict):
        raise TypeError('must be a dictionary')

    if '.' in name:
        head, rest = name.split('.', 1)
        _patch_params(params.setdefault(head, {}), rest, value)
    elif value is None:
        if name in params:
            del params[name]
    elif isinstance(value, dict) and isinstance(params.get(name), dict):
        # recursively patch dicts rather than clobbering them (see #2154)
        for k, v in value.items():
            _patch_params(params[name], k, v)
    else:
        params[name] = value

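# Illustrative sketch (not part of the original module): how _patch_params()
# maps dotted extra_cluster_params keys onto nested request dicts. The param
# names and values below are hypothetical.
#
#   params = {'Instances': {'InstanceCount': 2}}
#
#   _patch_params(params, 'Instances.Ec2KeyName', 'my-key')
#   # params == {'Instances': {'InstanceCount': 2, 'Ec2KeyName': 'my-key'}}
#
#   _patch_params(params, 'Instances.InstanceCount', None)
#   # a value of None deletes the key:
#   # params == {'Instances': {'Ec2KeyName': 'my-key'}}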