# Copyright 2019 Yelp
# Copyright 2020 Yelp
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""A Spark script that can run an MRJob without Hadoop."""
import os
import sys
import json
from argparse import ArgumentParser
from collections import defaultdict
from importlib import import_module
from itertools import chain

from mrjob.parse import is_uri
from mrjob.util import shlex_split

from pyspark.accumulators import AccumulatorParam


# tuples of (args, kwargs) for ArgumentParser.add_argument()
#
# TODO: this is shared code with mr_spark_harness.py, which started out
# in this directory but has since been moved to tests/. Totally fine to
# inline this stuff and duplicate it in mr_spark_harness.py
_PASSTHRU_OPTIONS = [
    (['--job-args'], dict(
        default=None,
        dest='job_args',
        help=('The arguments to pass to the MRJob. Please quote all passthru'
              ' args so that they are passed as a single string'),
    )),
    (['--first-step-num'], dict(
        default=None,
        dest='first_step_num',
        type=int,
        help=('(0-indexed) first step in range of steps to run'),
    )),
    (['--last-step-num'], dict(
        default=None,
        dest='last_step_num',
        type=int,
        help=('(0-indexed) last step in range of steps to run'),
    )),
    (['--compression-codec'], dict(
        default=None,
        dest='compression_codec',
        help=('Java class path of a codec to use to compress output.'),
    )),
    (['--counter-output-dir'], dict(
        default=None,
        dest='counter_output_dir',
        help=('An empty directory to write counter output to. '
              'Can be a path or URI.'),
    )),
    (['--num-reducers'], dict(
        default=None,
        dest='num_reducers',
        type=int,
        help=('Set number of reducers (and thus number of output files)'),
    )),
    # switches to deal with jobs that can't be instantiated in the Spark
    # driver (e.g. because of problems with file upload args). See #2044
    (['--steps-desc'], dict(
        default=None,
        dest='steps_desc',
        help=("Description of job's steps, in JSON format (otherwise,"
              " we'll instantiate a job and ask it)"),
    )),
    (['--hadoop-input-format'], dict(
        default=None,
        dest='hadoop_input_format',
        help=("Hadoop input format class. Set to '' to indicate no"
              " format (otherwise we'll instantiate a job and ask it)"),
    )),
    (['--no-hadoop-input-format'], dict(
        action='store_const',
        const='',
        default=None,
        dest='hadoop_input_format',
        help=('Alternate way to specify no Hadoop input format class.'),
    )),
    (['--hadoop-output-format'], dict(
        default=None,
        dest='hadoop_output_format',
        help=("Hadoop output format class. Set to '' to indicate no"
              " format (otherwise we'll instantiate a job and ask it)"),
    )),
    (['--no-hadoop-output-format'], dict(
        action='store_const',
        const='',
        default=None,
        dest='hadoop_output_format',
        help=('Alternate way to specify no Hadoop output format class.'),
    )),
    (['--sort-values'], dict(
        action='store_true',
        default=None,
        dest='sort_values',
    )),
    (['--no-sort-values'], dict(
        action='store_false',
        default=None,
        dest='sort_values',
    )),
]


# Used to implement skip_internal_protocol: a patched internal_protocol()
# method returns this class in place of the job's usual protocol.
# pick_protocols() just expects a thing with *read* and *write* attributes,
# and a class is the simplest way to get it.
# The harness has special cases for when *read* or *write* is ``None``.
class _NO_INTERNAL_PROTOCOL(object):
    read = None
    write = None


class CounterAccumulator(AccumulatorParam):
    """Accumulate MRJob counters as nested dicts of
    {group: {counter: amount}}, summing amounts for matching keys."""

    def zero(self, value):
        return value

    def addInPlace(self, value1, value2):
        for group in value2:
            for key in value2[group]:
                if key not in value1[group]:
                    value1[group][key] = value2[group][key]
                else:
                    value1[group][key] += value2[group][key]
        return value1


def main(cmd_line_args=None):
    if cmd_line_args is None:
        cmd_line_args = sys.argv[1:]

    parser = _make_arg_parser()
    args = parser.parse_args(cmd_line_args)

    if args.num_reducers is not None and args.num_reducers <= 0:
        raise ValueError(
            'You can only set num_reducers to a positive number.')

    # get job_class
    job_module_name, job_class_name = args.job_class.rsplit('.', 1)
    job_module = import_module(job_module_name)
    job_class = getattr(job_module, job_class_name)

    # load initial data
    from pyspark import SparkContext

    if args.job_args:
        job_args = shlex_split(args.job_args)
    else:
        job_args = []

    # determine hadoop_*_format, steps
    # try to avoid instantiating a job in the driver; see #2044
    job = None

    if args.hadoop_input_format is None:
        job = job or job_class(job_args)
        hadoop_input_format = job.hadoop_input_format()
    else:
        hadoop_input_format = args.hadoop_input_format or None

    if args.hadoop_output_format is None:
        job = job or job_class(job_args)
        hadoop_output_format = job.hadoop_output_format()
    else:
        hadoop_output_format = args.hadoop_output_format or None

    if args.sort_values is None:
        job = job or job_class(job_args)
        sort_values = job.sort_values()
    else:
        sort_values = args.sort_values

    if args.steps_desc is None:
        job = job or job_class(job_args)
        steps = [step.description(step_num)
                 for step_num, step in enumerate(job.steps())]
    else:
        steps = json.loads(args.steps_desc)

    # pick steps
    start = args.first_step_num or 0
    end = None if args.last_step_num is None else args.last_step_num + 1

    steps_to_run = list(enumerate(steps))[start:end]

    sc = SparkContext()

    # keep track of one set of counters per job step
    counter_accumulators = [
        sc.accumulator(defaultdict(dict), CounterAccumulator())
        for _ in steps_to_run
    ]

    def make_increment_counter(step_num):
        counter_accumulator = counter_accumulators[step_num - start]

        def increment_counter(group, counter, amount=1):
            counter_accumulator.add({group: {counter: amount}})

        return increment_counter

    def make_mrc_job(mrc, step_num):
        j = job_class(job_args + [
            '--%s' % mrc, '--step-num=%d' % step_num
        ])

        # patch increment_counter() to update the accumulator for this step
        j.increment_counter = make_increment_counter(step_num)

        # if skip_internal_protocol is true, patch internal_protocol() to
        # return an object whose *read* and *write* attributes are ``None``
        if args.skip_internal_protocol:
            j.internal_protocol = lambda: _NO_INTERNAL_PROTOCOL

        return j

    # --emulate-map-input-file doesn't work with hadoop_input_format
    emulate_map_input_file = (
        args.emulate_map_input_file and not hadoop_input_format)

    try:
        if emulate_map_input_file:
            # load an rdd with pairs of (input_path, line). *path* here
            # has to be a single path, not a comma-separated list
            rdd = sc.union([_text_file_with_path(sc, path)
                            for path in args.input_path.split(',')])
        elif hadoop_input_format:
            rdd = sc.hadoopFile(
                args.input_path,
                inputFormatClass=hadoop_input_format,
                keyClass='org.apache.hadoop.io.Text',
                valueClass='org.apache.hadoop.io.Text')

            # hadoopFile loads each line as a key-value pair in which the
            # contents of the line are the key and the value is an empty
            # string.
            # Convert to an rdd of just lines, encoded as bytes.
            rdd = rdd.map(lambda kv: kv[0].encode('utf-8'))
        else:
            rdd = sc.textFile(args.input_path, use_unicode=False)

        # run steps
        for step_num, step in steps_to_run:
            rdd = _run_step(
                step, step_num, rdd,
                make_mrc_job,
                args.num_reducers, sort_values,
                emulate_map_input_file,
                args.skip_internal_protocol)

        # max_output_files: limit number of partitions
        if args.max_output_files:
            rdd = rdd.coalesce(args.max_output_files)

        # write the results
        if hadoop_output_format:
            # saveAsHadoopFile takes an rdd of key-value pairs, so convert to
            # that format
            rdd = rdd.map(lambda line: tuple(
                x.decode('utf-8') for x in line.split(b'\t', 1)))

            rdd.saveAsHadoopFile(
                args.output_path,
                outputFormatClass=hadoop_output_format,
                compressionCodecClass=args.compression_codec)
        else:
            rdd.saveAsTextFile(
                args.output_path,
                compressionCodecClass=args.compression_codec)
    finally:
        if args.counter_output_dir is not None:
            counters = [ca.value for ca in counter_accumulators]

            # if the output dir is a URI (e.g. on S3), write the counters
            # with sc.parallelize(); otherwise just write them directly to
            # the local dir
            if is_uri(args.counter_output_dir):
                sc.parallelize(
                    [json.dumps(counters)],
                    numSlices=1
                ).saveAsTextFile(
                    args.counter_output_dir
                )
            else:
                # use Python's built-in file writer to create the part-*
                # file ourselves
                path = os.path.join(args.counter_output_dir, 'part-00000')

                if not os.path.exists(args.counter_output_dir):
                    os.mkdir(args.counter_output_dir)

                with open(path, 'w') as f:
                    f.write(json.dumps(counters))


def _text_file_with_path(sc, path):
    """Return an RDD that yields (path, line) for each line in the file.

    *path* must be a single path, not a comma-separated list of paths
    """
    from pyspark.sql import SparkSession
    from pyspark.sql import functions as F

    spark = SparkSession(sc)

    df = spark.read.text(path).select([
        F.input_file_name().alias('input_file_name'),
        F.col('value'),
    ])

    return df.rdd.map(
        lambda row: (row.input_file_name,
                     (row.value if isinstance(row.value, bytes)
                      else row.value.encode('utf_8')))
    )


def _run_step(
        step, step_num, rdd, make_mrc_job,
        num_reducers=None, sort_values=None,
        emulate_map_input_file=False,
        skip_internal_protocol=False):
    """Run the given step on the RDD and return the transformed RDD."""
    _check_step(step, step_num)

    # we try to avoid initializing job instances here in the driver (see
    # #2044 for why). However, while we can get away with initializing one
    # instance per partition in the mapper and reducer, that would be too
    # inefficient for combiners, which run on *two* key-value pairs at a
    # time.
    #
    # but combiners are optional! if we can't initialize a combiner job
    # instance, we can just skip it!
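    #
    # skipping the combiner only affects efficiency, not correctness: the
    # reducer still sees every key-value pair, just without pre-aggregation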

    # mapper
    if step.get('mapper'):
        rdd_includes_input_path = (emulate_map_input_file and step_num == 0)

        rdd = _run_mapper(
            make_mrc_job, step_num, rdd, rdd_includes_input_path)

    # combiner/shuffle-and-sort
    combiner_job = None
    if step.get('combiner'):
        try:
            _check_substep(step, step_num, 'combiner')
            combiner_job = make_mrc_job('combiner', step_num)
        except Exception:
            # if the combiner needs to run subprocesses, or we can't
            # initialize a job instance, just skip combiners
            pass

    if combiner_job:
        # _run_combiner() includes shuffle-and-sort
        rdd = _run_combiner(
            combiner_job, rdd,
            sort_values=sort_values,
            num_reducers=num_reducers)
    elif step.get('reducer'):
        rdd = _shuffle_and_sort(
            rdd, sort_values=sort_values,
            num_reducers=num_reducers,
            skip_internal_protocol=skip_internal_protocol)

    # reducer
    if step.get('reducer'):
        rdd = _run_reducer(
            make_mrc_job, step_num, rdd,
            num_reducers=num_reducers)

    return rdd


def _run_mapper(make_mrc_job, step_num, rdd, rdd_includes_input_path):
    """Run our job's mapper.

    :param make_mrc_job: factory function that creates an instance of our
                         job, configured as the mapper for the step we wish
                         to run
    :param rdd: an RDD containing lines representing encoded key-value pairs
    :param rdd_includes_input_path: if true, *rdd* contains pairs of
                                    (input_file_path, line). set
                                    $mapreduce_map_input_file to
                                    *input_file_path*.
    :return: an RDD containing lines representing encoded key-value pairs
    """
    # initialize job class inside mapPartitions(). this deals with jobs that
    # can't be initialized in the Spark driver (see #2044)
    def map_lines(lines):
        job = make_mrc_job('mapper', step_num)

        read, write = job.pick_protocols(step_num, 'mapper')

        if rdd_includes_input_path:
            # rdd actually contains pairs of (input_path, line); convert it
            # back to plain lines below
            path_line_pairs = lines

            # emulate the mapreduce.map.input.file config property
            # set in Hadoop
            #
            # do this first so that mapper_init() etc. will work.
            # we can assume *rdd* contains at least one record.
            input_path, first_line = next(path_line_pairs)
            os.environ['mapreduce_map_input_file'] = input_path

            # reconstruct *lines* (without dumping to memory)
            lines = chain([first_line],
                          (line for _, line in path_line_pairs))

        # decode lines into key-value pairs (as a generator, not a list)
        #
        # line -> (k, v)
        if read:
            pairs = (read(line) for line in lines)
        else:
            pairs = lines  # was never encoded

        # map_pairs() runs key-value pairs through the mapper
        #
        # (k, v), ... -> (k, v), ...
        for k, v in job.map_pairs(pairs, step_num):
            # encode key-value pairs back into lines
            #
            # (k, v) -> line
            if write:
                yield write(k, v)
            else:
                yield k, v

    return rdd.mapPartitions(map_lines)


def _run_combiner(combiner_job, rdd, sort_values=False, num_reducers=None):
    """Run our job's combiner, and group lines with the same key together.

    :param combiner_job: an instance of our job, instantiated to be the
                         combiner for the step we wish to run
    :param rdd: an RDD containing lines representing encoded key-value pairs
    :param sort_values: if true, ensure all lines corresponding to a given
                        key are sorted (by their encoded value)
    :param num_reducers: limit the number of partitions of the output rdd,
                         which is similar to mrjob's limit on the number of
                         reducers.
    :return: an RDD containing "reducer ready" lines representing encoded
             key-value pairs, that is, where all lines with the same key are
             adjacent and in the same partition
    """
    step_num = combiner_job.options.step_num

    c_read, c_write = combiner_job.pick_protocols(step_num, 'combiner')

    # decode lines into key-value pairs
    #
    # line -> (k, v)
    if c_read:
        rdd = rdd.map(c_read)

    # The common case for MRJob combiners is to yield a single key-value
    # pair (for example, ``(key, sum(values))``). If the combiner does
    # something else, just build up a list of values, so we don't end up
    # running values through the MRJob's combiner multiple times.
    def combiner_helper(pairs1, pairs2):
        if len(pairs1) == len(pairs2) == 1:
            return list(
                combiner_job.combine_pairs(pairs1 + pairs2, step_num),
            )
        else:
            pairs1.extend(pairs2)
            return pairs1

    # include key in "value", so MRJob combiner can see it
    #
    # (k, v) -> (k, (k, v))
    rdd = rdd.map(lambda k_v: (k_v[0], k_v))

    # :py:meth:`pyspark.RDD.combineByKey()`, where the magic happens.
    #
    # (k, (k, v)), ... -> (k, [(k, v1), (k, v2), ...])
    #
    # Our "values" are key-value pairs, and our "combined values" are lists
    # of key-value pairs (single-item lists in the common case).
    #
    # note that unlike Hadoop combiners, combineByKey() sees *all* the
    # key-value pairs, essentially doing a shuffle-and-sort for free.
    rdd = rdd.combineByKey(
        createCombiner=lambda k_v: [k_v],
        mergeValue=lambda k_v_list, k_v: combiner_helper(k_v_list, [k_v]),
        mergeCombiners=combiner_helper,
        numPartitions=num_reducers
    )

    # encode lists of key-value pairs into lists of lines
    #
    # (k, [(k, v1), (k, v2), ...]) -> (k, [line1, line2, ...])
    if c_write:
        rdd = rdd.mapValues(
            lambda pairs: [c_write(*pair) for pair in pairs])

    # free the lines!
    #
    # (k, [line1, line2, ...]) -> line1, line2, ...
    rdd = _discard_key_and_flatten_values(rdd, sort_values=sort_values)

    return rdd


def _shuffle_and_sort(
        rdd, sort_values=False, num_reducers=None,
        skip_internal_protocol=False):
    """Simulate Hadoop's shuffle-and-sort step, so that data will be in the
    format the reducer expects.

    :param rdd: an RDD containing lines representing encoded key-value
                pairs, where the encoded key comes first and is followed by
                a TAB character (the encoded key may not contain TAB).
    :param sort_values: if true, ensure all lines corresponding to a given
                        key are sorted (by their encoded value)
    :param num_reducers: limit the number of partitions of the output rdd,
                         which is similar to mrjob's limit on the number of
                         reducers.
    :param skip_internal_protocol: if true, assume *rdd* contains key/value
                                   pairs, not lines
    :return: an RDD containing "reducer ready" lines representing encoded
             key-value pairs, that is, where all lines with the same key are
             adjacent and in the same partition
    """
    if skip_internal_protocol:
        def key_func(k_v):
            return k_v[0]
    else:
        def key_func(line):
            return line.split(b'\t')[0]

    rdd = rdd.groupBy(key_func, numPartitions=num_reducers)
    rdd = _discard_key_and_flatten_values(rdd, sort_values=sort_values)

    return rdd


def _run_reducer(make_mrc_job, step_num, rdd, num_reducers=None):
    """Run our job's reducer on lines already grouped by key.

    :param make_mrc_job: factory function that creates an instance of our
                         job, configured as the reducer for the step we wish
                         to run
    :param rdd: an RDD containing "reducer ready" lines representing encoded
                key-value pairs, that is, where all lines with the same key
                are adjacent and in the same partition
    :param num_reducers: limit the number of partitions of the output rdd,
                         which is similar to mrjob's limit on the number of
                         reducers.
    :return: an RDD containing encoded key-value pairs
    """
    # initialize job class inside mapPartitions(). this deals with jobs that
    # can't be initialized in the Spark driver (see #2044)
    def reduce_lines(lines):
        job = make_mrc_job('reducer', step_num)

        read, write = job.pick_protocols(step_num, 'reducer')

        # decode lines into key-value pairs (as a generator, not a list)
        #
        # line -> (k, v)
        if read:
            pairs = (read(line) for line in lines)
        else:
            pairs = lines  # pairs were never encoded

        # reduce_pairs() runs key-value pairs through the reducer
        #
        # (k, v), ... -> (k, v), ...
        for k, v in job.reduce_pairs(pairs, step_num):
            # encode key-value pairs back into lines
            #
            # (k, v) -> line
            if write:
                yield write(k, v)
            else:
                yield k, v

    # if *num_reducers* is set, preserve partitioning so we don't
    # re-partition; otherwise it doesn't matter
    return rdd.mapPartitions(
        reduce_lines, preservesPartitioning=bool(num_reducers))


def _discard_key_and_flatten_values(rdd, sort_values=False):
    """Helper function for :py:func:`_run_combiner` and
    :py:func:`_shuffle_and_sort`.

    Given an RDD containing (key, [line1, line2, ...]), discard *key* and
    return an RDD containing line1, line2, ...

    Guarantees that lines in the same list will end up in the same partition.

    If *sort_values* is true, sort each list of lines before flattening it.
    """
    if sort_values:
        def map_f(key_and_lines):
            return sorted(key_and_lines[1])
    else:
        def map_f(key_and_lines):
            return key_and_lines[1]

    return rdd.flatMap(map_f, preservesPartitioning=True)


def _check_step(step, step_num):
    """Check that the given step description is for an MRStep with no
    input manifest."""
    if step.get('type') != 'streaming':
        raise ValueError(
            'step %d has unexpected type: %r' % (
                step_num, step.get('type')))

    if step.get('input_manifest'):
        raise NotImplementedError(
            'step %d uses an input manifest, which is unsupported' %
            step_num)

    for mrc in ('mapper', 'reducer'):
        _check_substep(step, step_num, mrc)


def _check_substep(step, step_num, mrc):
    """Raise :py:class:`NotImplementedError` if the given substep
    (e.g. ``'mapper'``) runs subprocesses."""
    substep = step.get(mrc)
    if not substep:
        return

    if substep.get('type') != 'script':
        raise NotImplementedError(
            "step %d's %s has unexpected type: %r" % (
                step_num, mrc, substep.get('type')))

    if substep.get('pre_filter'):
        raise NotImplementedError(
            "step %d's %s has a pre-filter, which is unsupported" % (
                step_num, mrc))


def _make_arg_parser():
    parser = ArgumentParser()

    parser.add_argument(
        dest='job_class',
        help=('dot-separated module and name of MRJob class. For example:'
              ' mrjob.examples.mr_wc.MRWordCountUtility'))

    parser.add_argument(
        dest='input_path',
        help=('Where to read input from. Can be a path or a URI, or several'
              ' of these joined by commas'))

    parser.add_argument(
        dest='output_path',
        help=('An empty directory to write output to. Can be a path or'
              ' URI.'))

    # can't put this in _PASSTHRU_OPTIONS because it's also a runner opt
    parser.add_argument(
        '--max-output-files',
        dest='max_output_files',
        type=int,
        help='Directly limit number of output files, using coalesce()',
    )

    parser.add_argument(
        '--emulate-map-input-file',
        dest='emulate_map_input_file',
        action='store_true',
        help=('Set mapreduce_map_input_file to the input file path'
              ' in the first mapper function, so we can read it'
              ' with mrjob.compat.jobconf_from_env().'
              ' Ignored if the job has a Hadoop input format'),
    )

    parser.add_argument(
        '--skip-internal-protocol',
        dest='skip_internal_protocol',
        action='store_true',
        help=("Don't use the job's internal protocol to communicate"
              " between tasks internal to the job, instead relying"
              " on Spark to encode and decode raw data structures."),
    )

    for args, kwargs in _PASSTHRU_OPTIONS:
        parser.add_argument(*args, **kwargs)

    return parser


if __name__ == '__main__':
    main()
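

# Example invocation (a sketch: the harness script's filename and the
# input/output locations below are hypothetical, while the positional
# arguments and the --num-reducers switch are the ones defined by
# _make_arg_parser() above):
#
#   spark-submit mrjob_spark_harness.py \
#       mrjob.examples.mr_wc.MRWordCountUtility \
#       hdfs:///user/hadoop/input \
#       hdfs:///user/hadoop/output-dir \
#       --num-reducers 4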