# -*- coding: utf-8 -*-
# Copyright 2016 Google Inc. and Yelp
# Copyright 2017 Yelp
# Copyright 2018 Google Inc. and Yelp
# Copyright 2019 Yelp
# Copyright 2020 Affirm, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import logging
import re
import time
from io import BytesIO
from os import environ

try:
    import google.auth
    import google.cloud.dataproc_v1beta2
    import google.cloud.dataproc_v1beta2.types
    import google.cloud.logging
    import google.api_core.exceptions
    import google.api_core.grpc_helpers
except ImportError:
    # don't require the google-cloud libraries at import time; __init__()
    # raises a more helpful error if they're missing
    google = None

import mrjob
from mrjob.cloud import HadoopInTheCloudJobRunner
from mrjob.compat import map_version
from mrjob.conf import combine_dicts
from mrjob.fs.composite import CompositeFilesystem
from mrjob.fs.gcs import GCSFilesystem
from mrjob.fs.gcs import is_gcs_uri
from mrjob.fs.local import LocalFilesystem
from mrjob.logs.counters import _pick_counters
from mrjob.logs.errors import _log_probable_cause_of_failure
from mrjob.logs.mixin import LogInterpretationMixin
from mrjob.logs.task import _parse_task_stderr
from mrjob.logs.task import _parse_task_syslog_records
from mrjob.logs.step import _interpret_new_dataproc_step_stderr
from mrjob.parse import is_uri
from mrjob.py2 import PY2
from mrjob.py2 import string_types
from mrjob.py2 import to_unicode
from mrjob.runner import _blank_out_conflicting_opts
from mrjob.setup import UploadDirManager
from mrjob.step import StepFailedException
from mrjob.util import random_identifier

log = logging.getLogger(__name__)

_DEFAULT_GCE_REGION = 'us-west1'

_DEFAULT_ENDPOINT = 'dataproc.googleapis.com:443'

_DATAPROC_MIN_WORKERS = 2

_GCE_API_VERSION = 'v1'

_DEFAULT_INSTANCE_TYPE = 'n1-standard-1'

# default imageVersion to use on Dataproc. This may be updated with each
# version of mrjob
_DEFAULT_IMAGE_VERSION = '1.3'

_DEFAULT_CHECK_CLUSTER_EVERY = 10.0

_DEFAULT_CLOUD_FS_SYNC_SECS = 5.0

_DEFAULT_CLOUD_TMP_DIR_OBJECT_TTL_DAYS = 90

# job state matcher enum
# use this to only find active jobs. (2 for NON_ACTIVE, but we don't use that)
_STATE_MATCHER_ACTIVE = 1

# Dataproc images where Hadoop version changed (we use map_version() on this)
#
# This will need to be updated by hand if we want it to be fully accurate
# (it doesn't really matter to mrjob though, which only cares about
# major version)
#
# See https://cloud.google.com/dataproc/docs/concepts/dataproc-versions
# for the full list.
_DATAPROC_IMAGE_TO_HADOOP_VERSION = {
    '0.1': '2.7.1',
    '1.0': '2.7.2',
    '1.1': '2.7.7',
    '1.2': '2.8.5',
    '1.3': '2.9.2',
}

_HADOOP_STREAMING_JAR_URI = (
    'file:///usr/lib/hadoop-mapreduce/hadoop-streaming.jar')

# TODO - mtai @ davidmarin - Re-implement logs parsing? See dataproc metainfo and driver output - ask Dennis Huo for more details  # noqa
# 'gs://dataproc-801485be-0997-40e7-84a7-00926031747c-us/google-cloud-dataproc-metainfo/8b76d95e-ebdc-4b81-896d-b2c5009b3560/jobs/mr_most_used_word-taim-20160228-172000-034993---Step-2-of-2/driveroutput'  # noqa

_GCP_CLUSTER_NAME_REGEX = '(?:[a-z](?:[-a-z0-9]{0,53}[a-z0-9])?).'
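# for example, an auto-generated cluster name like
# 'mrjob-us-west1-0123456789abcdef' (hypothetical value; see
# _launch_cluster() below) is expected to match this pattern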

# on Dataproc, the resource manager is always at 8088. Tunnel to the master
# node's own hostname, not localhost.
_SSH_TUNNEL_CONFIG = dict(
    localhost=False,
    name='resource manager',
    path='/cluster',
    port=8088,
)

# used to match log entries that tell us if a container exited
_CONTAINER_EXECUTOR_CLASS_NAME = (
    'org.apache.hadoop.yarn.server.nodemanager.DefaultContainerExecutor')

# used to determine which containers exited with nonzero status
_CONTAINER_EXIT_RE = re.compile(
    r'Exit code from container (?P<container_id>\w+)'
    r' is ?: (?P<returncode>\d+)')

_TRACEBACK_EXCEPTION_RE = re.compile(r'\w+: .*$')

_STDERR_LOG4J_WARNING = re.compile(
    r'.*(No appenders could be found for logger'
    r'|Please initialize the log4j system'
    r'|See http://logging.apache.org/log4j)')

# this is equivalent to full permission
_FULL_SCOPE = 'https://www.googleapis.com/auth/cloud-platform'


# convert enum values to strings (e.g. 'RUNNING')

def _cluster_state_name(state_value):
    return google.cloud.dataproc_v1beta2.types.ClusterStatus.State.Name(
        state_value)


def _job_state_name(state_value):
    return google.cloud.dataproc_v1beta2.types.JobStatus.State.Name(
        state_value)


########## BEGIN - Helper fxns for _cluster_create_kwargs ##########

def _gcp_zone_uri(project, zone):
    return (
        'https://www.googleapis.com/compute/%(gce_api_version)s/projects/'
        '%(project)s/zones/%(zone)s' % dict(
            gce_api_version=_GCE_API_VERSION,
            project=project,
            zone=zone))


def _gcp_instance_group_config(
        project, zone, count, instance_type, is_preemptible=False):
    if zone:
        zone_uri = _gcp_zone_uri(project, zone)
        machine_type = "%(zone_uri)s/machineTypes/%(machine_type)s" % dict(
            zone_uri=zone_uri, machine_type=instance_type)
    else:
        machine_type = instance_type

    return dict(
        num_instances=count,
        machine_type_uri=machine_type,
        is_preemptible=is_preemptible,
    )

########## END - Helper fxns for _cluster_create_kwargs ##########


def _wait_for(msg, sleep_secs):
    log.info("Waiting for %s - sleeping %.1f second(s)", msg, sleep_secs)
    time.sleep(sleep_secs)


def _cleanse_gcp_job_id(job_id):
    return re.sub(r'[^a-zA-Z0-9_\-]', '-', job_id)


def _check_and_fix_fs_dir(gcs_uri):
    """Helper for __init__"""
    # TODO - mtai @ davidmarin - push this to fs/*.py
    if not is_gcs_uri(gcs_uri):
        raise ValueError('Invalid GCS URI: %r' % gcs_uri)
    if not gcs_uri.endswith('/'):
        gcs_uri += '/'

    return gcs_uri


def _zone_to_region(zone):
    """Convert a zone (like us-west1-b) to the corresponding region
    (like us-west1)."""
    # See https://cloud.google.com/compute/docs/regions-zones/#identifying_a_region_or_zone  # noqa
    return '-'.join(zone.split('-')[:-1])


class DataprocException(Exception):
    pass


class DataprocJobRunner(HadoopInTheCloudJobRunner, LogInterpretationMixin):
    """Runs an :py:class:`~mrjob.job.MRJob` on Google Cloud Dataproc.
    Invoked when you run your job with ``-r dataproc``.

    :py:class:`DataprocJobRunner` runs your job in a Dataproc cluster, which
    is basically a temporary Hadoop cluster.

    Input, support, and jar files can be either local or on GCS; use
    ``gs://...`` URLs to refer to files on GCS.

    This class has some useful utilities for talking directly to GCS and
    Dataproc, so you may find it useful to instantiate it without a script::

        from mrjob.dataproc import DataprocJobRunner
        ...
    """

    alias = 'dataproc'

    OPT_NAMES = HadoopInTheCloudJobRunner.OPT_NAMES | {
        'cluster_properties',
        'core_instance_config',
        'gcloud_bin',
        'master_instance_config',
        'network',
        'project_id',
        'service_account',
        'service_account_scopes',
        'subnet',
        'task_instance_config',
    }

    # no Spark support yet (see #1765)
    _STEP_TYPES = {'jar', 'streaming'}

    def __init__(self, **kwargs):
        """:py:class:`~mrjob.dataproc.DataprocJobRunner` takes the same
        arguments as :py:class:`~mrjob.runner.MRJobRunner`, plus some
        additional options which can be defaulted in
        :ref:`mrjob.conf <mrjob.conf>`.
        """
        super(DataprocJobRunner, self).__init__(**kwargs)

        # check for library support
        if google is None:
            raise ImportError(
                'You must install google-cloud-logging and '
                'google-cloud-storage to connect to Dataproc')

        # Dataproc requires a master and >= 2 core instances
        # num_core_instances refers ONLY to number of CORE instances and does
        # NOT include the required 1 instance for master
        # In other words, minimum cluster size is 3 machines, 1 master and 2
        # "num_core_instances" workers
        if self._opts['num_core_instances'] < _DATAPROC_MIN_WORKERS:
            raise DataprocException(
                'Dataproc expects at LEAST %d workers' %
                _DATAPROC_MIN_WORKERS)

        if (self._opts['core_instance_type'] !=
                self._opts['task_instance_type']):
            raise DataprocException(
                'Dataproc v1 expects core/task instance types to be'
                ' identical')

        # see #1820
        if self._opts['image_id']:
            log.warning('mrjob does not yet support custom machine images'
                        ' on Dataproc')

        # load credentials and project ID
        self._credentials, auth_project_id = google.auth.default(
            scopes=[_FULL_SCOPE])  # needed for $GOOGLE_APPLICATION_CREDENTIALS

        self._project_id = self._opts['project_id'] or auth_project_id

        if not self._project_id:
            raise DataprocException(
                'project_id must be set. Use --project_id or'
                ' set $GOOGLE_CLOUD_PROJECT')

        self._fix_zone_and_region_opts()

        if self._opts['service_account_scopes']:
            self._opts['service_account_scopes'] = [
                _fully_qualify_scope_uri(s)
                for s in self._opts['service_account_scopes']
            ]

        # cluster_id can be None here
        self._cluster_id = self._opts['cluster_id']

        self._api_client = None
        self._gcs_fs = None
        self._fs = None

        # BEGIN - setup directories
        base_tmpdir = self._get_tmpdir(self._opts['cloud_tmp_dir'])

        self._cloud_tmp_dir = _check_and_fix_fs_dir(base_tmpdir)

        # use job key to make a unique tmp dir
        self._job_tmpdir = self._cloud_tmp_dir + self._job_key + '/'

        # pick/validate output dir
        if self._output_dir:
            self._output_dir = _check_and_fix_fs_dir(self._output_dir)
        else:
            self._output_dir = self._job_tmpdir + 'output/'
        # END - setup directories

        # manage local files that we want to upload to GCS. We'll add them
        # to this manager just before we need them.
        fs_files_dir = self._job_tmpdir + 'files/'
        self._upload_mgr = UploadDirManager(fs_files_dir)

        # when did our particular task start?
        self._dataproc_job_start = None

        # init hadoop, image version caches
        self._image_version = None
        self._hadoop_version = None

        # map driver_output_uri to a dict with the keys:
        # log_uri: uri of file we're reading from
        # pos: position in file
        # buffer: bytes read from file already
        self._driver_output_state = {}

        # This will be filled by _run_steps()
        # NOTE - log_interpretations will be empty except job_id until we
        # parse task logs
        self._log_interpretations = []

    def _fix_zone_and_region_opts(self):
        """Ensure that exactly one of region and zone is set."""
        if self._opts['region'] and self._opts['zone']:
            log.warning('you do not need to set region if you set zone')
            self._opts['region'] = None
            return

        if not (self._opts['region'] or self._opts['zone']):
            if environ.get('CLOUDSDK_COMPUTE_ZONE'):
                self._opts['zone'] = environ['CLOUDSDK_COMPUTE_ZONE']
            elif environ.get('CLOUDSDK_COMPUTE_REGION'):
                self._opts['region'] = environ['CLOUDSDK_COMPUTE_REGION']
            else:
                self._opts['region'] = _DEFAULT_GCE_REGION

    @classmethod
    def _default_opts(cls):
        return combine_dicts(
            super(DataprocJobRunner, cls)._default_opts(),
            dict(
                bootstrap_python=True,
                check_cluster_every=_DEFAULT_CHECK_CLUSTER_EVERY,
                cleanup=['CLUSTER', 'JOB', 'LOCAL_TMP'],
                cloud_fs_sync_secs=_DEFAULT_CLOUD_FS_SYNC_SECS,
                image_version=_DEFAULT_IMAGE_VERSION,
                instance_type=_DEFAULT_INSTANCE_TYPE,
                master_instance_type=_DEFAULT_INSTANCE_TYPE,
                num_core_instances=_DATAPROC_MIN_WORKERS,
                num_task_instances=0,
            )
        )

    def _combine_opts(self, opt_list):
        """Blank out conflicts between *network*/*subnet* and
        *region*/*zone*."""
        opt_list = _blank_out_conflicting_opts(opt_list, ['region', 'zone'])
        opt_list = _blank_out_conflicting_opts(opt_list, ['network', 'subnet'])

        # now combine opts, with region/zone blanked out
        return super(DataprocJobRunner, self)._combine_opts(opt_list)

    @property
    def cluster_client(self):
        return google.cloud.dataproc_v1beta2.ClusterControllerClient(
            **self._client_create_kwargs())

    @property
    def job_client(self):
        return google.cloud.dataproc_v1beta2.JobControllerClient(
            **self._client_create_kwargs())

    @property
    def logging_client(self):
        return google.cloud.logging.Client(credentials=self._credentials,
                                           project=self._project_id)

    def _client_create_kwargs(self):
        if self._opts['region']:
            endpoint = '%s-%s' % (self._opts['region'], _DEFAULT_ENDPOINT)
            return dict(
                channel=google.api_core.grpc_helpers.create_channel(
                    endpoint, credentials=self._credentials))
        else:
            return dict(credentials=self._credentials)

    @property
    def api_client(self):
        raise NotImplementedError(
            '"api_client" was disabled in v0.6.2. Use "cluster_client"'
            ' or "job_client" instead.')

    @property
    def fs(self):
        """:py:class:`~mrjob.fs.base.Filesystem` object for GCS and the
        local filesystem.
        """
        if self._fs is None:
            self._fs = CompositeFilesystem()

            location = self._opts['region'] or _zone_to_region(
                self._opts['zone'])

            self._fs.add_fs('gcs', GCSFilesystem(
                credentials=self._credentials,
                project_id=self._project_id,
                part_size=self._upload_part_size(),
                location=location,
                object_ttl_days=_DEFAULT_CLOUD_TMP_DIR_OBJECT_TTL_DAYS,
            ))

            self._fs.add_fs('local', LocalFilesystem())

        return self._fs

    def _get_tmpdir(self, given_tmpdir):
        """Helper for __init__: pick a tmp dir on GCS."""
        if given_tmpdir:
            return given_tmpdir

        # Loop over buckets until we find one that matches region
        # NOTE - because this is a tmpdir, we look for a GCS bucket in the
        # same GCE region
        chosen_bucket_name = None

        # determine region for bucket
        region = self._region()

        for tmp_bucket_name in self.fs.gcs.get_all_bucket_names(
                prefix='mrjob-'):
            tmp_bucket = self.fs.gcs.get_bucket(tmp_bucket_name)

            # NOTE - GCP ambiguous Behavior - Bucket location is being
            # returned as UPPERCASE, ticket filed as of Apr 23, 2016 as docs
            # suggest lowercase. (As of Feb. 12, 2018, this is still true,
            # observed on google-cloud-sdk)
            if tmp_bucket.location.lower() == region:
                # Regions are both specified and match
                log.info("using existing temp bucket %s" % tmp_bucket_name)
                chosen_bucket_name = tmp_bucket_name
                break

        # Example default - "mrjob-us-central1-RANDOMHEX"
        if not chosen_bucket_name:
            chosen_bucket_name = '-'.join(
                ['mrjob', region, random_identifier()])

        return 'gs://%s/tmp/' % chosen_bucket_name

    def _region(self):
        # region of cluster, which is either the region set by the user,
        # or the region derived from the zone they set.
        # used to pick bucket location and name cluster
        return self._opts['region'] or _zone_to_region(self._opts['zone'])

    def _run(self):
        self._launch()
        self._run_steps()

    def _launch(self):
        self._prepare_for_launch()
        self._launch_cluster()

    def _prepare_for_launch(self):
        self._check_output_not_exists()
        self._create_setup_wrapper_scripts()
        self._add_bootstrap_files_for_upload()
        self._add_job_files_for_upload()
        self._upload_local_files()
        self._wait_for_fs_sync()

    def _check_output_not_exists(self):
        """Verify the output path does not already exist. This avoids
        provisioning a cluster only to have Hadoop refuse to launch.
        """
        if self.fs.exists(self._output_dir):
            raise IOError(
                'Output path %s already exists!' % (self._output_dir,))

    def _add_bootstrap_files_for_upload(self):
        """Add files needed by the bootstrap script to self._upload_mgr.

        Create the master bootstrap script if necessary.
        """
        # all other files needed by the script are already in
        # _bootstrap_dir_mgr
        for path in self._bootstrap_dir_mgr.paths():
            self._upload_mgr.add(path)

        # now that we know where the above files live, we can create
        # the master bootstrap script
        self._create_master_bootstrap_script_if_needed()
        if self._master_bootstrap_script_path:
            self._upload_mgr.add(self._master_bootstrap_script_path)

    def _add_job_files_for_upload(self):
        """Add files needed for running the job (setup and input) to
        self._upload_mgr."""
        if self._opts['hadoop_streaming_jar']:
            self._upload_mgr.add(self._opts['hadoop_streaming_jar'])

        for step in self._get_steps():
            if step.get('jar'):
                self._upload_mgr.add(step['jar'])

    ### Running the job ###

    def cleanup(self, mode=None):
        super(DataprocJobRunner, self).cleanup(mode=mode)

        # close our SSH tunnel, if any
        self._kill_ssh_tunnel()

        # stop the cluster if it belongs to us (it may have stopped on its
        # own already, but that's fine)
        if self._cluster_id and not self._opts['cluster_id']:
            self._cleanup_cluster()

    def _cleanup_cloud_tmp(self):
        # delete all the files we created
        if not self._job_tmpdir:
            return

        try:
            log.info('Removing all files in %s' % self._job_tmpdir)
            self.fs.rm(self._job_tmpdir)
            self._job_tmpdir = None
        except Exception as e:
            log.exception(e)

    # TODO - mtai @ davidmarin - Re-enable log support and supporting cleanup
    def _cleanup_logs(self):
        super(DataprocJobRunner, self)._cleanup_logs()

    def _cleanup_job(self):
        job_prefix = self._dataproc_job_prefix()
        for job in self._list_jobs(
                cluster_name=self._cluster_id,
                state_matcher=_STATE_MATCHER_ACTIVE):
            # Kill all active jobs with the same job_prefix as this job
            job_id = job.reference.job_id

            if not job_id.startswith(job_prefix):
                continue

            self._cancel_job(job_id)
            self._wait_for_api('job cancellation')

    def _cleanup_cluster(self):
        if not self._cluster_id:
            # If we don't have a cluster, then we can't terminate it.
            return

        try:
            log.info("Attempting to terminate cluster")
            self._delete_cluster(self._cluster_id)
        except Exception as e:
            log.exception(e)
            return

        log.info('cluster %s successfully terminated' % self._cluster_id)

    def _wait_for_api(self, msg):
        _wait_for(msg, self._opts['check_cluster_every'])

    def _wait_for_fs_sync(self):
        """Sleep for a little while, to give FS a chance to sync up."""
        _wait_for('GCS sync (eventual consistency)',
                  self._opts['cloud_fs_sync_secs'])

    def _streaming_step_job_kwarg(self, step_num):
        """Returns a map from ``'hadoop_job'`` to a dict representing a
        hadoop streaming job.
        """
        return dict(
            hadoop_job=dict(
                args=self._hadoop_streaming_jar_args(step_num),
                main_jar_file_uri=self._hadoop_streaming_jar_uri(),
            )
        )

    def _jar_step_job_kwarg(self, step_num):
        """Returns a map from ``'hadoop_job'`` to a dict representing a
        Hadoop job that runs a JAR"""
        step = self._get_step(step_num)

        hadoop_job = {}

        hadoop_job['args'] = (
            self._interpolate_jar_step_args(step['args'], step_num))

        jar_uri = self._upload_mgr.uri(step['jar'])

        # can't specify main_class and main_jar_file_uri; see
        # https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.jobs#HadoopJob  # noqa
        if step.get('main_class'):
            hadoop_job['jar_file_uris'] = [jar_uri]
            hadoop_job['main_class'] = step['main_class']
        else:
            hadoop_job['main_jar_file_uri'] = jar_uri

        return dict(hadoop_job=hadoop_job)

    def _hadoop_streaming_jar_uri(self):
        if self._opts['hadoop_streaming_jar']:
            return self._upload_mgr.uri(self._opts['hadoop_streaming_jar'])
        else:
            return _HADOOP_STREAMING_JAR_URI

    def _launch_cluster(self):
        """Create an empty cluster on Dataproc, and set self._cluster_id to
        its ID."""
        self.fs.mkdir(self._job_tmpdir)

        # clusterName must be a match of
        # regex '(?:[a-z](?:[-a-z0-9]{0,53}[a-z0-9])?).'
        # as documented in an API error message
        # (not currently documented in the Dataproc docs)
        if not self._cluster_id:
            self._cluster_id = '-'.join(
                ['mrjob', self._region(), random_identifier()])

        # Create the cluster if it's missing, otherwise join an existing one
        try:
            self._get_cluster(self._cluster_id)
            log.info('Adding job to existing cluster - %s' % self._cluster_id)
        except google.api_core.exceptions.NotFound:
            log.info(
                'Creating Dataproc Hadoop cluster - %s' % self._cluster_id)

            cluster_data = self._cluster_create_kwargs()

            self._create_cluster(cluster_data)

        self._wait_for_cluster_ready(self._cluster_id)

        self._set_up_ssh_tunnel()

        # keep track of when we launched our job
        self._dataproc_job_start = time.time()
        return self._cluster_id

    def _wait_for_cluster_ready(self, cluster_id):
        # See https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.clusters#State  # noqa
        cluster_state = None

        # Poll until cluster is ready
        while cluster_state not in ('RUNNING', 'UPDATING'):
            cluster = self._get_cluster(cluster_id)
            cluster_state = cluster.status.State.Name(cluster.status.state)

            if cluster_state in ('ERROR', 'DELETING'):
                raise DataprocException(cluster)

            self._wait_for_api('cluster to accept jobs')

        return cluster_id

    def _dataproc_job_prefix(self):
        return _cleanse_gcp_job_id(self._job_key)

    def _run_steps(self):
        """Wait for every step of the job to complete, one by one."""
        total_steps = self._num_steps()

        # run our steps, in order
        for step_num in range(total_steps):
            job_id = self._launch_step(step_num)

            self._wait_for_step_to_complete(
                job_id, step_num=step_num, num_steps=total_steps)

            log.info('Completed Dataproc Hadoop Job - %s', job_id)

        # After all steps completed, wait for the last output (which is
        # usually written to GCS) to sync
        self._wait_for_fs_sync()

    def _launch_step(self, step_num):
        step = self._get_step(step_num)

        # Clean-up step name
        step_name = '%s---step-%05d-of-%05d' % (
            self._dataproc_job_prefix(), step_num + 1, self._num_steps())

        # Build step

        # job_kwarg is a single-item dict, where the key is 'hadoop_job',
        # 'spark_job', etc., and the value is the corresponding job
        # description dict (e.g. _streaming_step_job_kwarg() above returns
        # {'hadoop_job': {'args': ..., 'main_jar_file_uri': ...}})
        if step['type'] == 'streaming':
            job_kwarg = self._streaming_step_job_kwarg(step_num)
        elif step['type'] == 'jar':
            job_kwarg = self._jar_step_job_kwarg(step_num)
        else:
            raise NotImplementedError(
                'Unsupported step type: %r' % step['type'])

        # Submit it
        log.info('Submitting Dataproc Hadoop Job - %s', step_name)
        result = self._submit_job(step_name, job_kwarg)
        log.info('Submitted Dataproc Hadoop Job - %s', step_name)

        job_id = result.reference.job_id
        assert job_id == step_name

        return job_id

    def _wait_for_step_to_complete(self, job_id, step_num, num_steps):
        """Wait for the step with the given ID to complete, and fetch
        counters. If it fails, attempt to diagnose the error, and raise an
        exception.

        This also adds an item to self._log_interpretations
        """
        log_interpretation = dict(job_id=job_id)
        self._log_interpretations.append(log_interpretation)

        log_interpretation['step'] = {}
        step_type = self._get_step(step_num)['type']

        while True:
            # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs#JobStatus  # noqa
            job = self._get_job(job_id)

            job_state = job.status.State.Name(job.status.state)

            log.info('%s => %s' % (job_id, job_state))

            log_interpretation['step']['driver_output_uri'] = (
                job.driver_output_resource_uri)

            self._interpret_step_logs(log_interpretation, step_type)

            progress = log_interpretation['step'].get('progress')
            if progress:
                log.info(' ' + progress['message'])

            # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs#State  # noqa
            # these are the states covered by the ACTIVE job state matcher,
            # plus SETUP_DONE
            if job_state in ('PENDING', 'RUNNING', 'CANCEL_PENDING',
                             'SETUP_DONE'):
                self._wait_for_api('job completion')
                continue

            # print counters if job wasn't CANCELLED
            if job_state != 'CANCELLED':
                self._log_counters(log_interpretation, step_num)

            if job_state == 'ERROR':
                error = self._pick_error(log_interpretation, step_type)
                if error:
                    _log_probable_cause_of_failure(log, error)

            # we're done, will return at the end of this
            if job_state == 'DONE':
                break
            else:
                raise StepFailedException(
                    step_num=step_num, num_steps=num_steps)

    def _default_step_output_dir(self):
        # put intermediate data in HDFS
        return 'hdfs:///tmp/mrjob/%s/step-output' % self._job_key

    ### log interpretation ###

    # step

    def _interpret_step_logs(self, log_interpretation, step_type):
        """Hook for interpreting step logs. Unlike with most runners, you may
        call this multiple times and it will continue to parse the step log
        incrementally, which is useful for getting job progress."""
        # don't turn this off even if read_logs opt is false; it's
        # the only way this runner can track job progress
        driver_output_uri = log_interpretation.get(
            'step', {}).get('driver_output_uri')

        if driver_output_uri:
            self._update_step_interpretation(
                log_interpretation['step'], driver_output_uri)

    def _update_step_interpretation(
            self, step_interpretation, driver_output_uri):
        new_lines = self._get_new_driver_output_lines(driver_output_uri)
        _interpret_new_dataproc_step_stderr(step_interpretation, new_lines)

    def _get_new_driver_output_lines(self, driver_output_uri):
        """Get a list of complete job driver output lines that are
        new since the last time we checked.
        """
        state = self._driver_output_state.setdefault(
            driver_output_uri,
            dict(log_uri=None, pos=0, buffer=b''))

        # driver output is in logs with names like driveroutput.000000000
        log_uris = sorted(self.fs.ls(driver_output_uri + '*'))

        for log_uri in log_uris:
            # initialize log_uri with first URI we see
            if state['log_uri'] is None:
                # log the location of job driver output just once
                log.info(
                    ' Parsing job driver output from %s*' %
                    driver_output_uri)
                state['log_uri'] = log_uri

            # skip log files already parsed
            if log_uri < state['log_uri']:
                continue

            # when parsing the next file, reset *pos*
            elif log_uri > state['log_uri']:
                state['pos'] = 0
                state['log_uri'] = log_uri

            log_blob = self.fs.gcs._get_blob(log_uri)

            try:
                new_data = log_blob.download_as_string(start=state['pos'])
            except (google.api_core.exceptions.NotFound,
                    google.api_core.exceptions.RequestRangeNotSatisfiable):
                # blob was just created, or no more data is available
                break

            state['buffer'] += new_data
            state['pos'] += len(new_data)

        # convert buffer into lines, saving leftovers for next time
        stream = BytesIO(state['buffer'])

        state['buffer'] = b''
        lines = []

        for line_bytes in stream:
            if line_bytes.endswith(b'\n'):
                lines.append(to_unicode(line_bytes))
            else:
                # leave final partial line (if any) in buffer
                state['buffer'] = line_bytes

        return lines

    # history

    def _interpret_history_log(self, log_interpretation):
        """Does nothing. We can't get the history logs, and we don't need
        them."""
        if not self._read_logs():
            return

        log_interpretation.setdefault('history', {})

    # task

    def _interpret_task_logs(self, log_interpretation, step_type,
                             error_attempt_ids=(), partial=True):
        """Scan node manager log to find failed container IDs of failed
        tasks, and then scan the corresponding stderr and syslogs."""
        if 'task' in log_interpretation and (
                partial or not log_interpretation['task'].get('partial')):
            return  # already interpreted

        if not self._read_logs():
            return

        step_interpretation = log_interpretation.get('step') or {}

        application_id = step_interpretation.get('application_id')
        if not application_id:
            log.warning(
                "Can't parse node manager logs; missing application ID")
            return

        log_interpretation['task'] = self._task_log_interpretation(
            application_id, step_type, partial)

    def _task_log_interpretation(
            self, application_id, step_type, partial=True):
        """Helper for :py:meth:`_interpret_task_logs`"""
        # not bothering with _read_logs() since this is a helper method
        result = {}

        for container_id in self._failed_task_container_ids(application_id):
            error = _parse_task_syslog_records(
                self._task_syslog_records(
                    application_id, container_id, step_type))

            if not error.get('hadoop_error'):
                # not sure if this ever happens, since we already know
                # which containers failed
                continue

            error['container_id'] = container_id

            # fix weird munging of java stacktrace
            error['hadoop_error']['message'] = _fix_java_stack_trace(
                error['hadoop_error']['message'])

            task_error = _parse_task_stderr(
                self._task_stderr_lines(
                    application_id, container_id, step_type))

            if task_error:
                task_error['message'] = _fix_traceback(task_error['message'])
                error['task_error'] = task_error

            result.setdefault('errors', []).append(error)

            # if partial is true, bail out when we find the first task error
            if task_error and partial:
                result['partial'] = True
                return result

        return result

    def _failed_task_container_ids(self, application_id):
        """Stream container IDs of failed tasks, in reverse order."""
        container_id_prefix = 'container' + application_id[11:]

        log_filter = self._make_log_filter(
            'yarn-yarn-nodemanager', {'jsonPayload.class':
                                      _CONTAINER_EXECUTOR_CLASS_NAME})

        log.info('Scanning node manager logs for IDs of failed tasks...')

        # it doesn't seem to work to do self.logging_client.logger();
        # there's some RPC dispute about whether the log name should
        # be qualified by project name or not
        entries = self.logging_client.list_entries(
            filter_=log_filter, order_by=google.cloud.logging.DESCENDING)

        for entry in entries:
            message = entry.payload.get('message')
            if not message:
                continue

            m = _CONTAINER_EXIT_RE.match(message)
            if not m:
                continue

            returncode = int(m.group('returncode'))
            if not returncode:
                continue

            container_id = m.group('container_id')

            # matches some other step
            if not container_id.startswith(container_id_prefix):
                continue

            log.debug(' %s' % container_id)
            yield container_id

    def _task_stderr_lines(self, application_id, container_id, step_type):
        """Yield lines from a specific stderr log."""
        log_filter = self._make_log_filter(
            'yarn-userlogs', {
                'jsonPayload.application': application_id,
                'jsonPayload.container': container_id,
                # TODO: pick based on step_type
                'jsonPayload.container_logname': 'stderr',
            })

        log.info(' reading stderr log...')
        entries = self.logging_client.list_entries(filter_=log_filter)

        # use log4j parsing to handle tab -> newline conversion
        for record in _log_entries_to_log4j(entries):
            for line in record['message'].split('\n'):
                yield line

    def _task_syslog_records(self, application_id, container_id, step_type):
        """Yield log4j records from a specific syslog."""
        log_filter = self._make_log_filter(
            'yarn-userlogs', {
                'jsonPayload.application': application_id,
                'jsonPayload.container': container_id,
                # TODO: pick based on step_type
                'jsonPayload.container_logname': 'syslog',
            })

        log.info(' reading syslog...')
        entries = self.logging_client.list_entries(filter_=log_filter)

        return _log_entries_to_log4j(entries)

    # misc

    def _make_log_filter(self, log_name=None, extra_values=None):
        # we only want logs from this project, cluster, and region
        d = {}

        d['resource.labels.cluster_name'] = self._cluster_id
        d['resource.labels.project_id'] = self._project_id
        d['resource.labels.region'] = self._region()
        d['resource.type'] = 'cloud_dataproc_cluster'

        if log_name:
            d['logName'] = 'projects/%s/logs/%s' % (
                self._project_id, log_name)

        if extra_values:
            d.update(extra_values)

        return _log_filter_str(d)

    def counters(self):
        return [_pick_counters(log_interpretation)
                for log_interpretation in self._log_interpretations]

    ### Bootstrapping ###

    def get_hadoop_version(self):
        if self._hadoop_version is None:
            self._store_cluster_info()
        return self._hadoop_version

    def get_image_version(self):
        """Get the version that our cluster is running."""
        if self._image_version is None:
            self._store_cluster_info()
        return self._image_version

    def _store_cluster_info(self):
        """Set self._image_version and self._hadoop_version."""
        if not self._cluster_id:
            raise ValueError('cluster has not yet been created')

        cluster = self._get_cluster(self._cluster_id)

        self._image_version = (
            cluster.config.software_config.image_version)

        # protect against new versions, including patch versions
        # we didn't explicitly request.
        # See #1428
        self._hadoop_version = map_version(
            self._image_version, _DATAPROC_IMAGE_TO_HADOOP_VERSION)

    def _bootstrap_pre_commands(self):
        # don't run the bootstrap script in / (see #1601)
        return [
            'mkdir /tmp/mrjob',
            'cd /tmp/mrjob',
        ]

    ### Bootstrapping ###

    def _bootstrap_python(self):
        """Return a (possibly empty) list of parsed commands (in the same
        format as returned by parse_setup_cmd())"""
        if not self._opts['bootstrap_python']:
            return []

        if PY2:
            # Python 2 is already installed; install pip and dev packages
            return [
                ['sudo apt-get install -y python-pip python-dev'],
            ]
        else:
            return [
                ['sudo apt-get install -y python3 python3-pip python3-dev'],
            ]

    def get_cluster_id(self):
        return self._cluster_id

    def _cluster_create_kwargs(self):
        gcs_init_script_uris = []
        if self._master_bootstrap_script_path:
            gcs_init_script_uris.append(
                self._upload_mgr.uri(self._master_bootstrap_script_path))

        cluster_metadata = dict()
        cluster_metadata['mrjob-version'] = mrjob.__version__
        # TODO: remove mrjob-max-secs-idle once lifecycle_config is visible
        # through the gcloud utility and the Google Cloud Console
        cluster_metadata['mrjob-max-secs-idle'] = str(int(
            self._opts['max_mins_idle'] * 60))

        gce_cluster_config = dict(
            metadata=cluster_metadata,
            service_account_scopes=self._opts['service_account_scopes'],
        )

        if self._opts['network']:
            gce_cluster_config['network_uri'] = self._opts['network']

        if self._opts['subnet']:
            gce_cluster_config['subnetwork_uri'] = self._opts['subnet']

        if self._opts['service_account']:
            gce_cluster_config['service_account'] = (
                self._opts['service_account'])

        if self._opts['service_account_scopes']:
            gce_cluster_config['service_account_scopes'] = (
                self._opts['service_account_scopes'])

        if self._opts['zone']:
            gce_cluster_config['zone_uri'] = _gcp_zone_uri(
                project=self._project_id, zone=self._opts['zone'])

        cluster_config = dict(
            gce_cluster_config=gce_cluster_config,
            initialization_actions=[
                dict(executable_file=init_script_uri)
                for init_script_uri in gcs_init_script_uris
            ]
        )

        # Task tracker
        master_conf = _gcp_instance_group_config(
            project=self._project_id, zone=self._opts['zone'],
            count=1, instance_type=self._opts['master_instance_type'],
        )
        if self._opts['master_instance_config']:
            master_conf.update(self._opts['master_instance_config'])

        # Compute + storage
        worker_conf = _gcp_instance_group_config(
            project=self._project_id, zone=self._opts['zone'],
            count=self._opts['num_core_instances'],
            instance_type=self._opts['core_instance_type'],
        )
        if self._opts['core_instance_config']:
            worker_conf.update(self._opts['core_instance_config'])

        # Compute ONLY
        secondary_worker_conf = _gcp_instance_group_config(
            project=self._project_id, zone=self._opts['zone'],
            count=self._opts['num_task_instances'],
            instance_type=self._opts['task_instance_type'],
            is_preemptible=True,
        )
        if self._opts['task_instance_config']:
            secondary_worker_conf.update(self._opts['task_instance_config'])

        cluster_config['master_config'] = master_conf
        cluster_config['worker_config'] = worker_conf
        if secondary_worker_conf.get('num_instances'):
            cluster_config['secondary_worker_config'] = secondary_worker_conf

        cluster_config['lifecycle_config'] = dict(
            idle_delete_ttl=dict(
                seconds=int(self._opts['max_mins_idle'] * 60)))

        software_config = {}

        if self._opts['cluster_properties']:
            software_config['properties'] = _values_to_text(
                self._opts['cluster_properties'])

        # See - https://cloud.google.com/dataproc/dataproc-versions
        if self._opts['image_version']:
            software_config['image_version'] = self._opts['image_version']

        if software_config:
            cluster_config['software_config'] = software_config

        # in Python 2, dict keys loaded from JSON will be unicode, which
        # the Google protobuf objects don't like
        if PY2:
            cluster_config = _clean_json_dict_keys(cluster_config)

        kwargs = dict(project_id=self._project_id,
                      cluster_name=self._cluster_id,
                      config=cluster_config)

        return self._add_extra_cluster_params(kwargs)

    ### Dataproc-specific Stuff ###

    def _get_cluster(self, cluster_id):
        return self.cluster_client.get_cluster(
            cluster_name=cluster_id,
            **self._project_id_and_region()
        )

    def _create_cluster(self, cluster_data):
        # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.clusters/create  # noqa
        # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.clusters/get  # noqa
        self.cluster_client.create_cluster(
            cluster=cluster_data,
            **self._project_id_and_region()
        )

    def _delete_cluster(self, cluster_id):
        return self.cluster_client.delete_cluster(
            cluster_name=cluster_id,
            **self._project_id_and_region()
        )

    def _list_jobs(self, cluster_name=None, state_matcher=None):
        # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs/list#JobStateMatcher  # noqa
        list_kwargs = self._project_id_and_region()

        if cluster_name:
            list_kwargs['cluster_name'] = cluster_name

        if state_matcher:
            list_kwargs['job_state_matcher'] = state_matcher

        return self.job_client.list_jobs(**list_kwargs)

    def _get_job(self, job_id):
        return self.job_client.get_job(
            job_id=job_id,
            **self._project_id_and_region()
        )

    def _cancel_job(self, job_id):
        return self.job_client.cancel_job(
            job_id=job_id,
            **self._project_id_and_region()
        )

    def _submit_job(self, step_name, job_kwarg):
        # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs/submit  # noqa
        # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs#HadoopJob  # noqa
        # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs#JobReference  # noqa
        submit_job_kwargs = dict(
            job=dict(
                reference=dict(project_id=self._project_id,
                               job_id=step_name),
                placement=dict(cluster_name=self._cluster_id),
                **job_kwarg
            ),
            **self._project_id_and_region()
        )

        log.debug(' submit_job(%s)' % ', '.join(
            '%s=%r' % (k, v)
            for k, v in sorted(submit_job_kwargs.items())))

        return self.job_client.submit_job(**submit_job_kwargs)

    def _project_id_and_region(self):
        return dict(
            project_id=self._project_id,
            region=(self._opts['region'] or 'global'),
        )

    def _manifest_download_commands(self):
        return [
            # TODO: SSH in and figure out how to use gsutil or similar
            # ('gs://*', 'gsutil cp'),
            ('*://*', 'hadoop fs -copyToLocal'),
        ]

    ### SSH hooks ###

    def _job_tracker_host(self):
        return '%s-m' % self._cluster_id

    def _ssh_tunnel_config(self):
        return _SSH_TUNNEL_CONFIG

    def _launch_ssh_proc(self, args):
        ssh_proc = super(DataprocJobRunner, self)._launch_ssh_proc(args)

        # enter an empty passphrase if creating a key for the first time
        ssh_proc.stdin.write(b'\n\n')

        return ssh_proc

    def _ssh_launch_wait_secs(self):
        """Wait 20 seconds because gcloud has to update project metadata
        (unless we were going to check the cluster sooner anyway)."""
        return min(20.0, self._opts['check_cluster_every'])

    def _ssh_tunnel_args(self, bind_port):
        if not self._cluster_id:
            return

        gcloud_bin = self._opts['gcloud_bin'] or ['gcloud']

        cluster = self._get_cluster(self._cluster_id)
        zone = cluster.config.gce_cluster_config.zone_uri.split('/')[-1]

        return gcloud_bin + [
            'compute', 'ssh',
            '--zone', zone,
            self._job_tracker_host(),
            '--',
        ] + self._ssh_tunnel_opts(bind_port)


def _log_filter_str(name_to_value):
    """Convert a map from name to
    value into a log filter query that requires each name to equal the
    given value."""
    return ' AND '.join(
        '%s = %s' % (name, _quote_filter_value(value))
        for name, value in sorted(name_to_value.items()))


def _quote_filter_value(s):
    """Put a string in double quotes, escaping double quote characters."""
    return '"%s"' % s.replace('"', r'\"')


def _log_entries_to_log4j(entries):
    """Convert log entries from a single log file to log4j format,
    tracking line number. See
    :py:meth:`mrjob.logs.log4j._parse_hadoop_log4j_records` for format.
    """
    line_num = 0

    for entry in entries:
        message = entry.payload.get('message') or ''
        # NOTE: currently, google.cloud.logging seems to strip newlines :(
        num_lines = len(message.split('\n'))

        yield dict(
            caller_location='',
            level=(entry.severity or ''),
            logger=(entry.payload.get('class') or ''),
            message=message,
            num_lines=num_lines,
            start_line=line_num,
            thread='',
            timestamp=(entry.timestamp or ''),
        )

        line_num += num_lines


def _fix_java_stack_trace(s):
    # this is what we get from `gcloud logging`
    if '\n' in s:
        return s
    else:
        return s.replace('\t', '\n\t')


def _fix_traceback(s):
    lines = s.split('\n')

    # strip log4j warnings (which do have proper linebreaks)
    lines = [
        line for line in lines
        if line and not _STDERR_LOG4J_WARNING.match(line)
    ]

    s = '\n'.join(lines)

    if '\n' in s:
        return s  # traceback does have newlines

    # restore newlines before indented traceback lines and the final
    # exception (the whitespace patterns below assume standard Python
    # traceback indentation)
    s = s.replace('  File', '\n  File')
    s = s.replace('    ', '\n    ')
    s = _TRACEBACK_EXCEPTION_RE.sub(lambda m: '\n' + m.group(0), s)

    return s


def _clean_json_dict_keys(x):
    """Cast any dictionary keys in the given JSON object to str.

    We can assume that x isn't a recursive data structure, and that
    this is only called in Python 2."""
    if isinstance(x, dict):
        return {str(k): _clean_json_dict_keys(v) for k, v in x.items()}
    elif isinstance(x, list):
        return [_clean_json_dict_keys(item) for item in x]
    else:
        return x


def _values_to_text(d):
    """Return a dictionary with the same keys as *d*, but where the
    non-string, non-bytes values have been JSON-encoded. Used to encode
    cluster properties."""
    result = {}

    for k, v in d.items():
        if not isinstance(v, (string_types, bytes)):
            v = json.dumps(v)
        result[k] = v

    return result


def _fully_qualify_scope_uri(uri):
    if is_uri(uri):
        return uri
    else:
        return 'https://www.googleapis.com/auth/%s' % uri
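

# A rough, illustrative sketch of how the module-level helpers above behave;
# the input values are hypothetical, not part of mrjob's API:
#
#   _fully_qualify_scope_uri('logging.write')
#   # -> 'https://www.googleapis.com/auth/logging.write'
#
#   _values_to_text({'dataproc:dataproc.allow.zero.workers': True})
#   # -> {'dataproc:dataproc.allow.zero.workers': 'true'}
#
#   _log_filter_str({'resource.type': 'cloud_dataproc_cluster',
#                    'resource.labels.cluster_name': 'mrjob-us-west1-abcd'})
#   # -> 'resource.labels.cluster_name = "mrjob-us-west1-abcd"
#   #     AND resource.type = "cloud_dataproc_cluster"'
#   # (a single string, wrapped here for readability)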