GildedRose-Refactoring-Kata/.venv/lib/python3.12/site-packages/mrjob/tools/emr/mrboss.py
2025-06-22 13:36:01 +05:30

150 lines
5.5 KiB
Python

# Copyright 2009-2012 Yelp
# Copyright 2015-2018 Yelp
# Copyright 2019 Yelp
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Run a command on every node of a cluster. Store stdout and stderr for
results in OUTPUT_DIR.
Usage::
mrjob boss CLUSTER_ID [options] "command string"
Options::
-c CONF_PATHS, --conf-path CONF_PATHS
Path to alternate mrjob.conf file to read from
--no-conf Don't load mrjob.conf even if it's available
--ec2-endpoint EC2_ENDPOINT
Force mrjob to connect to EC2 on this endpoint (e.g.
ec2.us-west-1.amazonaws.com). Default is to infer this
from region.
--ec2-key-pair-file EC2_KEY_PAIR_FILE
Path to file containing SSH key for EMR
--emr-endpoint EMR_ENDPOINT
Force mrjob to connect to EMR on this endpoint (e.g.
us-west-1.elasticmapreduce.amazonaws.com). Default is
to infer this from region.
-h, --help show this help message and exit
-o OUTPUT_DIR, --output-dir OUTPUT_DIR
Specify an output directory (default: CLUSTER_ID)
-q, --quiet Don't print anything to stderr
--region REGION GCE/AWS region to run Dataproc/EMR jobs in.
--s3-endpoint S3_ENDPOINT
Force mrjob to connect to S3 on this endpoint (e.g. s3
-us-west-1.amazonaws.com). You usually shouldn't set
this; by default mrjob will choose the correct
endpoint for each S3 bucket based on its location.
--ssh-bin SSH_BIN Name/path of ssh binary. Arguments are allowed (e.g.
--ssh-bin 'ssh -v')
-v, --verbose print more messages to stderr
"""
from __future__ import print_function
import os
from argparse import ArgumentParser
from mrjob.emr import EMRJobRunner
from mrjob.job import MRJob
from mrjob.options import _add_basic_args
from mrjob.options import _add_runner_args
from mrjob.options import _alphabetize_actions
from mrjob.options import _filter_by_role
from mrjob.py2 import to_unicode
from mrjob.util import shlex_split
def main(cl_args=None):
    """Entry point for ``mrjob boss``: parse command-line arguments and
    run a shell command on every node of the given EMR cluster, storing
    each node's output under an output directory.

    :param cl_args: list of argument strings, or ``None`` to read from
                    ``sys.argv[1:]``
    """
    usage = 'usage: %(prog)s boss CLUSTER_ID [options] "command string"'
    description = (
        'Run a command on the master and all worker nodes of an EMR cluster.'
        ' Store stdout/stderr for results in OUTPUT_DIR.')

    parser = ArgumentParser(usage=usage, description=description)

    parser.add_argument(
        '-o', '--output-dir', dest='output_dir', default=None,
        help="Specify an output directory (default: CLUSTER_ID)")
    parser.add_argument(
        dest='cluster_id', help='ID of cluster to run command on')
    parser.add_argument(
        dest='cmd_string', help='command to run, as a single string')

    _add_basic_args(parser)
    # expose ssh/key-pair options plus everything needed to connect to EMR
    _add_runner_args(
        parser,
        {'ec2_key_pair_file', 'ssh_bin'} | _filter_by_role(
            EMRJobRunner.OPT_NAMES, 'connect'))
    _alphabetize_actions(parser)

    options = parser.parse_args(cl_args)

    MRJob.set_up_logging(quiet=options.quiet, verbose=options.verbose)

    # everything except these tool-level options is forwarded to the runner
    tool_only = ('cluster_id', 'cmd_string', 'output_dir', 'quiet', 'verbose')
    runner_kwargs = {
        name: value for name, value in options.__dict__.items()
        if name not in tool_only
    }

    cmd_args = shlex_split(options.cmd_string)
    output_dir = os.path.abspath(options.output_dir or options.cluster_id)

    with EMRJobRunner(
            cluster_id=options.cluster_id, **runner_kwargs) as runner:
        _run_on_all_nodes(runner, output_dir, cmd_args)
def _run_on_all_nodes(runner, output_dir, cmd_args, print_stderr=True):
"""Given an :py:class:`EMRJobRunner`, run the command specified by
*cmd_args* on all nodes in the cluster and save the stdout and stderr of
each run to subdirectories of *output_dir*.
"""
master_addr = runner._address_of_master()
addresses = [master_addr]
worker_addrs = runner._ssh_worker_hosts()
if worker_addrs:
addresses += ['%s!%s' % (master_addr, worker_addr)
for worker_addr in worker_addrs]
for addr in addresses:
stdout, stderr = runner.fs.ssh._ssh_run(addr, cmd_args)
if print_stderr:
print('---')
print('Command completed on %s.' % addr)
print(to_unicode(stderr), end=' ')
if '!' in addr:
base_dir = os.path.join(output_dir, 'worker ' + addr.split('!')[1])
else:
base_dir = os.path.join(output_dir, 'master')
if not os.path.exists(base_dir):
os.makedirs(base_dir)
with open(os.path.join(base_dir, 'stdout'), 'wb') as f:
f.write(stdout)
with open(os.path.join(base_dir, 'stderr'), 'wb') as f:
f.write(stderr)
# allow this tool to be invoked directly as a script
if __name__ == '__main__':
    main()