# -*- coding: utf-8 -*-
# Copyright 2016 Google Inc. and Yelp
# Copyright 2017 Yelp
# Copyright 2018 Google Inc. and Yelp
# Copyright 2019 Yelp
# Copyright 2020 Affirm, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import logging
import time
import re
from io import BytesIO
from os import environ

try:
    import google.auth
    import google.cloud.dataproc_v1beta2
    import google.cloud.dataproc_v1beta2.types
    import google.cloud.logging
    import google.api_core.exceptions
    import google.api_core.grpc_helpers
except ImportError:
    google = None

import mrjob
from mrjob.cloud import HadoopInTheCloudJobRunner
from mrjob.compat import map_version
from mrjob.conf import combine_dicts
from mrjob.fs.composite import CompositeFilesystem
from mrjob.fs.gcs import GCSFilesystem
from mrjob.fs.gcs import is_gcs_uri
from mrjob.fs.local import LocalFilesystem
from mrjob.logs.counters import _pick_counters
from mrjob.logs.errors import _log_probable_cause_of_failure
from mrjob.logs.mixin import LogInterpretationMixin
from mrjob.logs.task import _parse_task_stderr
from mrjob.logs.task import _parse_task_syslog_records
from mrjob.logs.step import _interpret_new_dataproc_step_stderr
from mrjob.parse import is_uri
from mrjob.py2 import PY2
from mrjob.py2 import string_types
from mrjob.py2 import to_unicode
from mrjob.runner import _blank_out_conflicting_opts
from mrjob.setup import UploadDirManager
from mrjob.step import StepFailedException
from mrjob.util import random_identifier


log = logging.getLogger(__name__)
|
|
|
|
_DEFAULT_GCE_REGION = 'us-west1'
|
|
|
|
_DEFAULT_ENDPOINT = 'dataproc.googleapis.com:443'
|
|
|
|
_DATAPROC_MIN_WORKERS = 2
|
|
_GCE_API_VERSION = 'v1'
|
|
|
|
_DEFAULT_INSTANCE_TYPE = 'n1-standard-1'
|
|
|
|
# default imageVersion to use on Dataproc. This may be updated with each
|
|
# version of mrjob
|
|
_DEFAULT_IMAGE_VERSION = '1.3'
|
|
_DEFAULT_CHECK_CLUSTER_EVERY = 10.0
|
|
_DEFAULT_CLOUD_FS_SYNC_SECS = 5.0
|
|
_DEFAULT_CLOUD_TMP_DIR_OBJECT_TTL_DAYS = 90
|
|
|
|
# job state matcher enum
|
|
# use this to only find active jobs. (2 for NON_ACTIVE, but we don't use that)
|
|
_STATE_MATCHER_ACTIVE = 1
|
|
|
|
# Dataproc images where Hadoop version changed (we use map_version() on this)
|
|
#
|
|
# This will need to be updated by hand if we want it to be fully accurate
|
|
# (it doesn't really matter to mrjob though, which only cares about
|
|
# major version)
|
|
#
|
|
# See https://cloud.google.com/dataproc/docs/concepts/dataproc-versions
|
|
# for the full list.
|
|
_DATAPROC_IMAGE_TO_HADOOP_VERSION = {
|
|
'0.1': '2.7.1',
|
|
'1.0': '2.7.2',
|
|
'1.1': '2.7.7',
|
|
'1.2': '2.8.5',
|
|
'1.3': '2.9.2',
|
|
}
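# illustrative only (not from the source): map_version() resolves a full
# image version to the closest entry above, so something like '1.3.5'
# should map to the '1.3' entry, i.e. Hadoop '2.9.2':
#
#   >>> map_version('1.3.5', _DATAPROC_IMAGE_TO_HADOOP_VERSION)
#   '2.9.2'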
|
|
|
|
_HADOOP_STREAMING_JAR_URI = (
|
|
'file:///usr/lib/hadoop-mapreduce/hadoop-streaming.jar')
|
|
|
|
# TODO - mtai @ davidmarin - Re-implement logs parsing? See dataproc metainfo and driver output - ask Dennis Huo for more details # noqa
|
|
# 'gs://dataproc-801485be-0997-40e7-84a7-00926031747c-us/google-cloud-dataproc-metainfo/8b76d95e-ebdc-4b81-896d-b2c5009b3560/jobs/mr_most_used_word-taim-20160228-172000-034993---Step-2-of-2/driveroutput' # noqa
|
|
|
|
_GCP_CLUSTER_NAME_REGEX = '(?:[a-z](?:[-a-z0-9]{0,53}[a-z0-9])?).'
|
|
|
|
# on Dataproc, the resource manager is always at 8088. Tunnel to the master
|
|
# node's own hostname, not localhost.
|
|
_SSH_TUNNEL_CONFIG = dict(
|
|
localhost=False,
|
|
name='resource manager',
|
|
path='/cluster',
|
|
port=8088,
|
|
)
|
|
|
|
# used to match log entries that tell us if a container exited
|
|
_CONTAINER_EXECUTOR_CLASS_NAME = (
|
|
'org.apache.hadoop.yarn.server.nodemanager.DefaultContainerExecutor')
|
|
|
|
# used to determine which containers exited with nonzero status
|
|
_CONTAINER_EXIT_RE = re.compile(
|
|
r'Exit code from container (?P<container_id>\w+)'
|
|
r' is ?: (?P<returncode>\d+)')
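# a made-up example of the kind of node manager message this is meant to
# match:
#
#   'Exit code from container container_1234567890123_0001_01_000002 is : 143'
#
# which would yield container_id='container_1234567890123_0001_01_000002'
# and returncode='143'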
|
|
|
|
_TRACEBACK_EXCEPTION_RE = re.compile(r'\w+: .*$')
|
|
|
|
_STDERR_LOG4J_WARNING = re.compile(
|
|
r'.*(No appenders could be found for logger'
|
|
r'|Please initialize the log4j system'
|
|
r'|See http://logging.apache.org/log4j)')
|
|
|
|
# this is equivalent to full permission
|
|
_FULL_SCOPE = 'https://www.googleapis.com/auth/cloud-platform'
|
|
|
|
|
|
# convert enum values to strings (e.g. 'RUNNING')
|
|
|
|
def _cluster_state_name(state_value):
|
|
return google.cloud.dataproc_v1beta2.types.ClusterStatus.State.Name(
|
|
state_value)
|
|
|
|
|
|
def _job_state_name(state_value):
|
|
return google.cloud.dataproc_v1beta2.types.JobStatus.State.Name(
|
|
state_value)
|
|
|
|
|
|
########## BEGIN - Helper fxns for _cluster_create_kwargs ##########
|
|
def _gcp_zone_uri(project, zone):
|
|
return (
|
|
'https://www.googleapis.com/compute/%(gce_api_version)s/projects/'
|
|
'%(project)s/zones/%(zone)s' % dict(
|
|
gce_api_version=_GCE_API_VERSION, project=project, zone=zone))
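# e.g. (made-up project ID):
#
#   >>> _gcp_zone_uri('my-project', 'us-west1-a')
#   'https://www.googleapis.com/compute/v1/projects/my-project/zones/us-west1-a'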
|
|
|
|
|
|
def _gcp_instance_group_config(
|
|
project, zone, count, instance_type, is_preemptible=False):
|
|
if zone:
|
|
zone_uri = _gcp_zone_uri(project, zone)
|
|
machine_type = "%(zone_uri)s/machineTypes/%(machine_type)s" % dict(
|
|
zone_uri=zone_uri, machine_type=instance_type)
|
|
else:
|
|
machine_type = instance_type
|
|
|
|
return dict(
|
|
num_instances=count,
|
|
machine_type_uri=machine_type,
|
|
is_preemptible=is_preemptible
|
|
)
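# roughly, with a zone set, the returned config looks something like this
# (the count and machine type here are purely illustrative):
#
#   dict(num_instances=2,
#        machine_type_uri='<zone URI>/machineTypes/n1-standard-1',
#        is_preemptible=False)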
|
|
########## END - Helper fxns for _cluster_create_kwargs ###########
|
|
|
|
|
|
def _wait_for(msg, sleep_secs):
|
|
log.info("Waiting for %s - sleeping %.1f second(s)", msg, sleep_secs)
|
|
time.sleep(sleep_secs)
|
|
|
|
|
|
def _cleanse_gcp_job_id(job_id):
|
|
return re.sub(r'[^a-zA-Z0-9_\-]', '-', job_id)
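# e.g. (made-up job key): characters outside [a-zA-Z0-9_-] become dashes:
#
#   >>> _cleanse_gcp_job_id('mr_word_count.someuser.20200101.123456.789012')
#   'mr_word_count-someuser-20200101-123456-789012'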
|
|
|
|
|
|
def _check_and_fix_fs_dir(gcs_uri):
|
|
"""Helper for __init__"""
|
|
# TODO - mtai @ davidmarin - push this to fs/*.py
|
|
if not is_gcs_uri(gcs_uri):
|
|
raise ValueError('Invalid GCS URI: %r' % gcs_uri)
|
|
if not gcs_uri.endswith('/'):
|
|
gcs_uri += '/'
|
|
|
|
return gcs_uri
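# e.g. (made-up bucket): a trailing slash is appended if missing:
#
#   >>> _check_and_fix_fs_dir('gs://my-bucket/tmp')
#   'gs://my-bucket/tmp/'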
|
|
|
|
|
|
def _zone_to_region(zone):
|
|
"""Convert a zone (like us-west1-b) to the corresponding region
|
|
(like us-west1)."""
|
|
# See https://cloud.google.com/compute/docs/regions-zones/#identifying_a_region_or_zone # noqa
|
|
return '-'.join(zone.split('-')[:-1])
|
|
|
|
|
|
class DataprocException(Exception):
|
|
pass
|
|
|
|
|
|
class DataprocJobRunner(HadoopInTheCloudJobRunner, LogInterpretationMixin):
|
|
"""Runs an :py:class:`~mrjob.job.MRJob` on Google Cloud Dataproc.
|
|
Invoked when you run your job with ``-r dataproc``.
|
|
|
|
    :py:class:`DataprocJobRunner` runs your job in a Dataproc cluster, which
|
|
is basically a temporary Hadoop cluster.
|
|
|
|
Input, support, and jar files can be either local or on GCS; use
|
|
``gs://...`` URLs to refer to files on GCS.
|
|
|
|
This class has some useful utilities for talking directly to GCS and
|
|
Dataproc, so you may find it useful to instantiate it without a script::
|
|
|
|
from mrjob.dataproc import DataprocJobRunner
|
|
...
|
|
"""
|
|
alias = 'dataproc'
|
|
|
|
OPT_NAMES = HadoopInTheCloudJobRunner.OPT_NAMES | {
|
|
'cluster_properties',
|
|
'core_instance_config',
|
|
'gcloud_bin',
|
|
'master_instance_config',
|
|
'network',
|
|
'project_id',
|
|
'service_account',
|
|
'service_account_scopes',
|
|
'subnet',
|
|
'task_instance_config',
|
|
}
|
|
|
|
# no Spark support yet (see #1765)
|
|
_STEP_TYPES = {'jar', 'streaming'}
|
|
|
|
def __init__(self, **kwargs):
|
|
""":py:class:`~mrjob.dataproc.DataprocJobRunner` takes the same
|
|
arguments as
|
|
:py:class:`~mrjob.runner.MRJobRunner`, plus some additional options
|
|
which can be defaulted in :ref:`mrjob.conf <mrjob.conf>`.
|
|
"""
|
|
super(DataprocJobRunner, self).__init__(**kwargs)
|
|
|
|
# check for library support
|
|
if google is None:
|
|
raise ImportError(
|
|
'You must install google-cloud-logging and '
|
|
'google-cloud-storage to connect to Dataproc')
|
|
|
|
# Dataproc requires a master and >= 2 core instances
|
|
# num_core_instances refers ONLY to number of CORE instances and does
|
|
# NOT include the required 1 instance for master
|
|
# In other words, minimum cluster size is 3 machines, 1 master and 2
|
|
# "num_core_instances" workers
|
|
if self._opts['num_core_instances'] < _DATAPROC_MIN_WORKERS:
|
|
raise DataprocException(
|
|
'Dataproc expects at LEAST %d workers' % _DATAPROC_MIN_WORKERS)
|
|
|
|
if (self._opts['core_instance_type'] !=
|
|
self._opts['task_instance_type']):
|
|
raise DataprocException(
|
|
'Dataproc v1 expects core/task instance types to be identical')
|
|
|
|
# see #1820
|
|
if self._opts['image_id']:
|
|
log.warning('mrjob does not yet support custom machine images'
|
|
' on Dataproc')
|
|
|
|
# load credentials and project ID
|
|
self._credentials, auth_project_id = google.auth.default(
|
|
scopes=[_FULL_SCOPE]) # needed for $GOOGLE_APPLICATION_CREDENTIALS
|
|
|
|
self._project_id = self._opts['project_id'] or auth_project_id
|
|
|
|
if not self._project_id:
|
|
raise DataprocException(
|
|
'project_id must be set. Use --project_id or'
|
|
' set $GOOGLE_CLOUD_PROJECT')
|
|
|
|
self._fix_zone_and_region_opts()
|
|
|
|
if self._opts['service_account_scopes']:
|
|
self._opts['service_account_scopes'] = [
|
|
_fully_qualify_scope_uri(s)
|
|
for s in self._opts['service_account_scopes']
|
|
]
|
|
|
|
# cluster_id can be None here
|
|
self._cluster_id = self._opts['cluster_id']
|
|
|
|
self._api_client = None
|
|
self._gcs_fs = None
|
|
self._fs = None
|
|
|
|
# BEGIN - setup directories
|
|
base_tmpdir = self._get_tmpdir(self._opts['cloud_tmp_dir'])
|
|
|
|
self._cloud_tmp_dir = _check_and_fix_fs_dir(base_tmpdir)
|
|
|
|
# use job key to make a unique tmp dir
|
|
self._job_tmpdir = self._cloud_tmp_dir + self._job_key + '/'
|
|
|
|
# pick/validate output dir
|
|
if self._output_dir:
|
|
self._output_dir = _check_and_fix_fs_dir(self._output_dir)
|
|
else:
|
|
self._output_dir = self._job_tmpdir + 'output/'
|
|
# END - setup directories
|
|
|
|
# manage local files that we want to upload to GCS. We'll add them
|
|
# to this manager just before we need them.
|
|
fs_files_dir = self._job_tmpdir + 'files/'
|
|
self._upload_mgr = UploadDirManager(fs_files_dir)
|
|
|
|
# when did our particular task start?
|
|
self._dataproc_job_start = None
|
|
|
|
# init hadoop, ami version caches
|
|
self._image_version = None
|
|
self._hadoop_version = None
|
|
|
|
# map driver_output_uri to a dict with the keys:
|
|
# log_uri: uri of file we're reading from
|
|
# pos: position in file
|
|
# buffer: bytes read from file already
|
|
self._driver_output_state = {}
|
|
|
|
# This will be filled by _run_steps()
|
|
# NOTE - log_interpretations will be empty except job_id until we
|
|
# parse task logs
|
|
self._log_interpretations = []
|
|
|
|
def _fix_zone_and_region_opts(self):
|
|
"""Ensure that exactly one of region and zone is set."""
|
|
if self._opts['region'] and self._opts['zone']:
|
|
log.warning('you do not need to set region if you set zone')
|
|
self._opts['region'] = None
|
|
return
|
|
|
|
if not (self._opts['region'] or self._opts['zone']):
|
|
if environ.get('CLOUDSDK_COMPUTE_ZONE'):
|
|
self._opts['zone'] = environ['CLOUDSDK_COMPUTE_ZONE']
|
|
elif environ.get('CLOUDSDK_COMPUTE_REGION'):
|
|
self._opts['region'] = environ['CLOUDSDK_COMPUTE_REGION']
|
|
else:
|
|
self._opts['region'] = _DEFAULT_GCE_REGION
|
|
|
|
@classmethod
|
|
def _default_opts(cls):
|
|
return combine_dicts(
|
|
super(DataprocJobRunner, cls)._default_opts(),
|
|
dict(
|
|
bootstrap_python=True,
|
|
check_cluster_every=_DEFAULT_CHECK_CLUSTER_EVERY,
|
|
cleanup=['CLUSTER', 'JOB', 'LOCAL_TMP'],
|
|
cloud_fs_sync_secs=_DEFAULT_CLOUD_FS_SYNC_SECS,
|
|
image_version=_DEFAULT_IMAGE_VERSION,
|
|
instance_type=_DEFAULT_INSTANCE_TYPE,
|
|
master_instance_type=_DEFAULT_INSTANCE_TYPE,
|
|
num_core_instances=_DATAPROC_MIN_WORKERS,
|
|
num_task_instances=0,
|
|
)
|
|
)
|
|
|
|
def _combine_opts(self, opt_list):
|
|
"""Blank out conflicts between *network*/*subnet* and
|
|
*region*/*zone*."""
|
|
opt_list = _blank_out_conflicting_opts(opt_list, ['region', 'zone'])
|
|
opt_list = _blank_out_conflicting_opts(opt_list, ['network', 'subnet'])
|
|
|
|
# now combine opts, with region/zone blanked out
|
|
return super(DataprocJobRunner, self)._combine_opts(opt_list)
|
|
|
|
@property
|
|
def cluster_client(self):
|
|
return google.cloud.dataproc_v1beta2.ClusterControllerClient(
|
|
**self._client_create_kwargs())
|
|
|
|
@property
|
|
def job_client(self):
|
|
return google.cloud.dataproc_v1beta2.JobControllerClient(
|
|
**self._client_create_kwargs())
|
|
|
|
@property
|
|
def logging_client(self):
|
|
return google.cloud.logging.Client(credentials=self._credentials,
|
|
project=self._project_id)
|
|
|
|
def _client_create_kwargs(self):
|
|
if self._opts['region']:
|
|
endpoint = '%s-%s' % (self._opts['region'], _DEFAULT_ENDPOINT)
|
|
return dict(
|
|
channel=google.api_core.grpc_helpers.create_channel(
|
|
endpoint, credentials=self._credentials))
|
|
else:
|
|
return dict(credentials=self._credentials)
|
|
|
|
@property
|
|
def api_client(self):
|
|
raise NotImplementedError(
|
|
'"api_client" was disabled in v0.6.2. Use "cluster_client"'
|
|
' or "job_client" instead.')
|
|
|
|
@property
|
|
def fs(self):
|
|
""":py:class:`~mrjob.fs.base.Filesystem` object for SSH, S3, GCS, and
|
|
the local filesystem.
|
|
"""
|
|
if self._fs is None:
|
|
self._fs = CompositeFilesystem()
|
|
|
|
location = self._opts['region'] or _zone_to_region(
|
|
self._opts['zone'])
|
|
|
|
self._fs.add_fs('gcs', GCSFilesystem(
|
|
credentials=self._credentials,
|
|
project_id=self._project_id,
|
|
part_size=self._upload_part_size(),
|
|
location=location,
|
|
object_ttl_days=_DEFAULT_CLOUD_TMP_DIR_OBJECT_TTL_DAYS,
|
|
))
|
|
|
|
self._fs.add_fs('local', LocalFilesystem())
|
|
|
|
return self._fs
|
|
|
|
def _get_tmpdir(self, given_tmpdir):
|
|
"""Helper for _fix_tmpdir"""
|
|
if given_tmpdir:
|
|
return given_tmpdir
|
|
|
|
# Loop over buckets until we find one that matches region
|
|
# NOTE - because this is a tmpdir, we look for a GCS bucket in the
|
|
# same GCE region
|
|
chosen_bucket_name = None
|
|
|
|
# determine region for bucket
|
|
region = self._region()
|
|
|
|
for tmp_bucket_name in self.fs.gcs.get_all_bucket_names(
|
|
prefix='mrjob-'):
|
|
tmp_bucket = self.fs.gcs.get_bucket(tmp_bucket_name)
|
|
|
|
# NOTE - GCP ambiguous Behavior - Bucket location is being
|
|
# returned as UPPERCASE, ticket filed as of Apr 23, 2016 as docs
|
|
# suggest lowercase. (As of Feb. 12, 2018, this is still true,
|
|
# observed on google-cloud-sdk)
|
|
if tmp_bucket.location.lower() == region:
|
|
# Regions are both specified and match
|
|
log.info("using existing temp bucket %s" % tmp_bucket_name)
|
|
chosen_bucket_name = tmp_bucket_name
|
|
break
|
|
|
|
# Example default - "mrjob-us-central1-RANDOMHEX"
|
|
if not chosen_bucket_name:
|
|
chosen_bucket_name = '-'.join(
|
|
['mrjob', region, random_identifier()])
|
|
|
|
return 'gs://%s/tmp/' % chosen_bucket_name
|
|
|
|
def _region(self):
|
|
# region of cluster, which is either the region set by the user,
|
|
# or the region derived from the zone they set.
|
|
# used to pick bucket location and name cluster
|
|
return self._opts['region'] or _zone_to_region(self._opts['zone'])
|
|
|
|
def _run(self):
|
|
self._launch()
|
|
self._run_steps()
|
|
|
|
def _launch(self):
|
|
self._prepare_for_launch()
|
|
self._launch_cluster()
|
|
|
|
def _prepare_for_launch(self):
|
|
self._check_output_not_exists()
|
|
self._create_setup_wrapper_scripts()
|
|
self._add_bootstrap_files_for_upload()
|
|
self._add_job_files_for_upload()
|
|
self._upload_local_files()
|
|
self._wait_for_fs_sync()
|
|
|
|
def _check_output_not_exists(self):
|
|
"""Verify the output path does not already exist. This avoids
|
|
provisioning a cluster only to have Hadoop refuse to launch.
|
|
"""
|
|
if self.fs.exists(self._output_dir):
|
|
raise IOError(
|
|
'Output path %s already exists!' % (self._output_dir,))
|
|
|
|
def _add_bootstrap_files_for_upload(self):
|
|
"""Add files needed by the bootstrap script to self._upload_mgr.
|
|
|
|
Create the master bootstrap script if necessary.
|
|
|
|
"""
|
|
# all other files needed by the script are already in
|
|
# _bootstrap_dir_mgr
|
|
for path in self._bootstrap_dir_mgr.paths():
|
|
self._upload_mgr.add(path)
|
|
|
|
# now that we know where the above files live, we can create
|
|
# the master bootstrap script
|
|
self._create_master_bootstrap_script_if_needed()
|
|
if self._master_bootstrap_script_path:
|
|
self._upload_mgr.add(self._master_bootstrap_script_path)
|
|
|
|
def _add_job_files_for_upload(self):
|
|
"""Add files needed for running the job (setup and input)
|
|
to self._upload_mgr."""
|
|
if self._opts['hadoop_streaming_jar']:
|
|
self._upload_mgr.add(self._opts['hadoop_streaming_jar'])
|
|
|
|
for step in self._get_steps():
|
|
if step.get('jar'):
|
|
self._upload_mgr.add(step['jar'])
|
|
|
|
### Running the job ###
|
|
|
|
def cleanup(self, mode=None):
|
|
super(DataprocJobRunner, self).cleanup(mode=mode)
|
|
|
|
# close our SSH tunnel, if any
|
|
self._kill_ssh_tunnel()
|
|
|
|
# stop the cluster if it belongs to us (it may have stopped on its
|
|
# own already, but that's fine)
|
|
if self._cluster_id and not self._opts['cluster_id']:
|
|
self._cleanup_cluster()
|
|
|
|
def _cleanup_cloud_tmp(self):
|
|
# delete all the files we created
|
|
if not self._job_tmpdir:
|
|
return
|
|
|
|
try:
|
|
log.info('Removing all files in %s' % self._job_tmpdir)
|
|
self.fs.rm(self._job_tmpdir)
|
|
self._job_tmpdir = None
|
|
except Exception as e:
|
|
log.exception(e)
|
|
|
|
# TODO - mtai @ davidmarin - Re-enable log support and supporting cleanup
|
|
def _cleanup_logs(self):
|
|
super(DataprocJobRunner, self)._cleanup_logs()
|
|
|
|
def _cleanup_job(self):
|
|
job_prefix = self._dataproc_job_prefix()
|
|
for job in self._list_jobs(
|
|
cluster_name=self._cluster_id,
|
|
state_matcher=_STATE_MATCHER_ACTIVE):
|
|
# Kill all active jobs with the same job_prefix as this job
|
|
job_id = job.reference.job_id
|
|
|
|
if not job_id.startswith(job_prefix):
|
|
continue
|
|
|
|
self._cancel_job(job_id)
|
|
self._wait_for_api('job cancellation')
|
|
|
|
def _cleanup_cluster(self):
|
|
if not self._cluster_id:
|
|
# If we don't have a cluster, then we can't terminate it.
|
|
return
|
|
|
|
try:
|
|
log.info("Attempting to terminate cluster")
|
|
self._delete_cluster(self._cluster_id)
|
|
except Exception as e:
|
|
log.exception(e)
|
|
return
|
|
log.info('cluster %s successfully terminated' % self._cluster_id)
|
|
|
|
def _wait_for_api(self, msg):
|
|
_wait_for(msg, self._opts['check_cluster_every'])
|
|
|
|
def _wait_for_fs_sync(self):
|
|
"""Sleep for a little while, to give FS a chance to sync up.
|
|
"""
|
|
_wait_for('GCS sync (eventual consistency)',
|
|
self._opts['cloud_fs_sync_secs'])
|
|
|
|
def _streaming_step_job_kwarg(self, step_num):
|
|
"""Returns a map from ``'hadoop_job'`` to a dict representing
|
|
a hadoop streaming job.
|
|
"""
|
|
return dict(
|
|
hadoop_job=dict(
|
|
args=self._hadoop_streaming_jar_args(step_num),
|
|
main_jar_file_uri=self._hadoop_streaming_jar_uri(),
|
|
)
|
|
)
|
|
|
|
def _jar_step_job_kwarg(self, step_num):
|
|
"""Returns a map from ``'hadoop_job'`` to a dict representing
|
|
a Hadoop job that runs a JAR"""
|
|
step = self._get_step(step_num)
|
|
|
|
hadoop_job = {}
|
|
|
|
hadoop_job['args'] = (
|
|
self._interpolate_jar_step_args(step['args'], step_num))
|
|
|
|
jar_uri = self._upload_mgr.uri(step['jar'])
|
|
|
|
# can't specify main_class and main_jar_file_uri; see
|
|
# https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.jobs#HadoopJob # noqa
|
|
if step.get('main_class'):
|
|
hadoop_job['jar_file_uris'] = [jar_uri]
|
|
hadoop_job['main_class'] = step['main_class']
|
|
else:
|
|
hadoop_job['main_jar_file_uri'] = jar_uri
|
|
|
|
return dict(hadoop_job=hadoop_job)
|
|
|
|
def _hadoop_streaming_jar_uri(self):
|
|
if self._opts['hadoop_streaming_jar']:
|
|
return self._upload_mgr.uri(self._opts['hadoop_streaming_jar'])
|
|
else:
|
|
return _HADOOP_STREAMING_JAR_URI
|
|
|
|
def _launch_cluster(self):
|
|
"""Create an empty cluster on Dataproc, and set self._cluster_id to
|
|
its ID."""
|
|
self.fs.mkdir(self._job_tmpdir)
|
|
|
|
# clusterName must be a match of
|
|
# regex '(?:[a-z](?:[-a-z0-9]{0,53}[a-z0-9])?).'
|
|
# as documented in an API error message
|
|
# (not currently documented in the Dataproc docs)
|
|
if not self._cluster_id:
|
|
self._cluster_id = '-'.join(
|
|
['mrjob', self._region(), random_identifier()])
|
|
|
|
# Create the cluster if it's missing, otherwise join an existing one
|
|
try:
|
|
self._get_cluster(self._cluster_id)
|
|
log.info('Adding job to existing cluster - %s' % self._cluster_id)
|
|
except google.api_core.exceptions.NotFound:
|
|
log.info(
|
|
'Creating Dataproc Hadoop cluster - %s' % self._cluster_id)
|
|
|
|
cluster_data = self._cluster_create_kwargs()
|
|
self._create_cluster(cluster_data)
|
|
|
|
self._wait_for_cluster_ready(self._cluster_id)
|
|
|
|
self._set_up_ssh_tunnel()
|
|
|
|
# keep track of when we launched our job
|
|
self._dataproc_job_start = time.time()
|
|
return self._cluster_id
|
|
|
|
def _wait_for_cluster_ready(self, cluster_id):
|
|
# See https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.clusters#State # noqa
|
|
cluster_state = None
|
|
|
|
# Poll until cluster is ready
|
|
while cluster_state not in ('RUNNING', 'UPDATING'):
|
|
cluster = self._get_cluster(cluster_id)
|
|
cluster_state = cluster.status.State.Name(cluster.status.state)
|
|
|
|
if cluster_state in ('ERROR', 'DELETING'):
|
|
raise DataprocException(cluster)
|
|
|
|
self._wait_for_api('cluster to accept jobs')
|
|
|
|
return cluster_id
|
|
|
|
def _dataproc_job_prefix(self):
|
|
return _cleanse_gcp_job_id(self._job_key)
|
|
|
|
def _run_steps(self):
|
|
"""Wait for every step of the job to complete, one by one."""
|
|
total_steps = self._num_steps()
|
|
        # launch each step in turn and wait for it to complete
|
|
for step_num in range(total_steps):
|
|
job_id = self._launch_step(step_num)
|
|
|
|
self._wait_for_step_to_complete(
|
|
job_id, step_num=step_num, num_steps=total_steps)
|
|
|
|
log.info('Completed Dataproc Hadoop Job - %s', job_id)
|
|
|
|
# After all steps completed, wait for the last output (which is
|
|
# usually written to GCS) to sync
|
|
self._wait_for_fs_sync()
|
|
|
|
def _launch_step(self, step_num):
|
|
step = self._get_step(step_num)
|
|
|
|
# Clean-up step name
|
|
step_name = '%s---step-%05d-of-%05d' % (
|
|
self._dataproc_job_prefix(), step_num + 1, self._num_steps())
|
|
|
|
# Build step
|
|
|
|
# job_kwarg is a single-item dict, where the key is 'hadoop_job',
|
|
# 'spark_job', etc.
|
|
if step['type'] == 'streaming':
|
|
job_kwarg = self._streaming_step_job_kwarg(step_num)
|
|
elif step['type'] == 'jar':
|
|
job_kwarg = self._jar_step_job_kwarg(step_num)
|
|
else:
|
|
raise NotImplementedError(
|
|
'Unsupported step type: %r' % step['type'])
|
|
|
|
# Submit it
|
|
log.info('Submitting Dataproc Hadoop Job - %s', step_name)
|
|
result = self._submit_job(step_name, job_kwarg)
|
|
log.info('Submitted Dataproc Hadoop Job - %s', step_name)
|
|
|
|
job_id = result.reference.job_id
|
|
assert job_id == step_name
|
|
|
|
return job_id
|
|
|
|
def _wait_for_step_to_complete(self, job_id, step_num, num_steps):
|
|
"""Helper for _wait_for_step_to_complete(). Wait for
|
|
step with the given ID to complete, and fetch counters.
|
|
If it fails, attempt to diagnose the error, and raise an
|
|
exception.
|
|
|
|
This also adds an item to self._log_interpretations
|
|
"""
|
|
log_interpretation = dict(job_id=job_id)
|
|
self._log_interpretations.append(log_interpretation)
|
|
|
|
log_interpretation['step'] = {}
|
|
step_type = self._get_step(step_num)['type']
|
|
|
|
while True:
|
|
# https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs#JobStatus # noqa
|
|
job = self._get_job(job_id)
|
|
|
|
job_state = job.status.State.Name(job.status.state)
|
|
|
|
log.info('%s => %s' % (job_id, job_state))
|
|
|
|
log_interpretation['step']['driver_output_uri'] = (
|
|
job.driver_output_resource_uri)
|
|
|
|
self._interpret_step_logs(log_interpretation, step_type)
|
|
|
|
progress = log_interpretation['step'].get('progress')
|
|
if progress:
|
|
log.info(' ' + progress['message'])
|
|
|
|
# https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs#State # noqa
|
|
# these are the states covered by the ACTIVE job state matcher,
|
|
# plus SETUP_DONE
|
|
if job_state in ('PENDING', 'RUNNING',
|
|
'CANCEL_PENDING', 'SETUP_DONE'):
|
|
self._wait_for_api('job completion')
|
|
continue
|
|
|
|
# print counters if job wasn't CANCELLED
|
|
if job_state != 'CANCELLED':
|
|
self._log_counters(log_interpretation, step_num)
|
|
|
|
if job_state == 'ERROR':
|
|
error = self._pick_error(log_interpretation, step_type)
|
|
if error:
|
|
_log_probable_cause_of_failure(log, error)
|
|
|
|
# we're done, will return at the end of this
|
|
if job_state == 'DONE':
|
|
break
|
|
else:
|
|
raise StepFailedException(
|
|
step_num=step_num, num_steps=num_steps)
|
|
|
|
def _default_step_output_dir(self):
|
|
# put intermediate data in HDFS
|
|
return 'hdfs:///tmp/mrjob/%s/step-output' % self._job_key
|
|
|
|
    ### log interpretation ###
|
|
|
|
# step
|
|
|
|
def _interpret_step_logs(self, log_interpretation, step_type):
|
|
"""Hook for interpreting step logs.
|
|
|
|
Unlike with most runners, you may call this multiple times and it
|
|
will continue to parse the step log incrementally, which is useful
|
|
for getting job progress."""
|
|
# don't turn this off even if read_logs opt is false; it's
|
|
# the only way this runner can track job progress
|
|
|
|
driver_output_uri = log_interpretation.get(
|
|
'step', {}).get('driver_output_uri')
|
|
|
|
if driver_output_uri:
|
|
self._update_step_interpretation(
|
|
log_interpretation['step'], driver_output_uri)
|
|
|
|
def _update_step_interpretation(
|
|
self, step_interpretation, driver_output_uri):
|
|
new_lines = self._get_new_driver_output_lines(driver_output_uri)
|
|
_interpret_new_dataproc_step_stderr(step_interpretation, new_lines)
|
|
|
|
def _get_new_driver_output_lines(self, driver_output_uri):
|
|
"""Get a list of complete job driver output lines that are
|
|
new since the last time we checked.
|
|
"""
|
|
state = self._driver_output_state.setdefault(
|
|
driver_output_uri,
|
|
dict(log_uri=None, pos=0, buffer=b''))
|
|
|
|
# driver output is in logs with names like driveroutput.000000000
|
|
log_uris = sorted(self.fs.ls(driver_output_uri + '*'))
|
|
|
|
for log_uri in log_uris:
|
|
# initialize log_uri with first URI we see
|
|
if state['log_uri'] is None:
|
|
# log the location of job driver output just once
|
|
log.info(
|
|
' Parsing job driver output from %s*' % driver_output_uri)
|
|
state['log_uri'] = log_uri
|
|
|
|
# skip log files already parsed
|
|
if log_uri < state['log_uri']:
|
|
continue
|
|
|
|
# when parsing the next file, reset *pos*
|
|
elif log_uri > state['log_uri']:
|
|
state['pos'] = 0
|
|
state['log_uri'] = log_uri
|
|
|
|
log_blob = self.fs.gcs._get_blob(log_uri)
|
|
|
|
try:
|
|
new_data = log_blob.download_as_string(start=state['pos'])
|
|
except (google.api_core.exceptions.NotFound,
|
|
google.api_core.exceptions.RequestRangeNotSatisfiable):
|
|
# blob was just created, or no more data is available
|
|
break
|
|
|
|
state['buffer'] += new_data
|
|
state['pos'] += len(new_data)
|
|
|
|
# convert buffer into lines, saving leftovers for next time
|
|
stream = BytesIO(state['buffer'])
|
|
state['buffer'] = b''
|
|
|
|
lines = []
|
|
|
|
for line_bytes in stream:
|
|
if line_bytes.endswith(b'\n'):
|
|
lines.append(to_unicode(line_bytes))
|
|
else:
|
|
# leave final partial line (if any) in buffer
|
|
state['buffer'] = line_bytes
|
|
|
|
return lines
|
|
|
|
# history
|
|
|
|
def _interpret_history_log(self, log_interpretation):
|
|
"""Does nothing. We can't get the history logs, and we don't need
|
|
them."""
|
|
if not self._read_logs():
|
|
return
|
|
|
|
log_interpretation.setdefault('history', {})
|
|
|
|
# task
|
|
|
|
def _interpret_task_logs(self, log_interpretation, step_type,
|
|
error_attempt_ids=(), partial=True):
|
|
"""Scan node manager log to find failed container IDs of failed
|
|
tasks, and then scan the corresponding stderr and syslogs."""
|
|
if 'task' in log_interpretation and (
|
|
partial or not log_interpretation['task'].get('partial')):
|
|
return # already interpreted
|
|
|
|
if not self._read_logs():
|
|
return
|
|
|
|
step_interpretation = log_interpretation.get('step') or {}
|
|
|
|
application_id = step_interpretation.get('application_id')
|
|
if not application_id:
|
|
log.warning(
|
|
"Can't parse node manager logs; missing application ID")
|
|
return
|
|
|
|
log_interpretation['task'] = self._task_log_interpretation(
|
|
application_id, step_type, partial)
|
|
|
|
def _task_log_interpretation(
|
|
self, application_id, step_type, partial=True):
|
|
"""Helper for :py:meth:`_interpret_task_logs`"""
|
|
# not bothering with _read_logs() since this is a helper method
|
|
result = {}
|
|
|
|
for container_id in self._failed_task_container_ids(application_id):
|
|
error = _parse_task_syslog_records(
|
|
self._task_syslog_records(
|
|
application_id, container_id, step_type))
|
|
|
|
if not error.get('hadoop_error'):
|
|
# not sure if this ever happens, since we already know
|
|
# which containers failed
|
|
continue
|
|
|
|
error['container_id'] = container_id
|
|
|
|
# fix weird munging of java stacktrace
|
|
error['hadoop_error']['message'] = _fix_java_stack_trace(
|
|
error['hadoop_error']['message'])
|
|
|
|
task_error = _parse_task_stderr(
|
|
self._task_stderr_lines(
|
|
application_id, container_id, step_type))
|
|
|
|
if task_error:
|
|
task_error['message'] = _fix_traceback(task_error['message'])
|
|
error['task_error'] = task_error
|
|
|
|
result.setdefault('errors', []).append(error)
|
|
|
|
# if partial is true, bail out when we find the first task error
|
|
if task_error and partial:
|
|
result['partial'] = True
|
|
return result
|
|
|
|
return result
|
|
|
|
def _failed_task_container_ids(self, application_id):
|
|
"""Stream container IDs of failed tasks, in reverse order."""
|
|
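        # YARN application IDs look like 'application_<timestamp>_<id>', and
        # the corresponding container IDs swap the 'application' prefix for
        # 'container' (the slicing below assumes that classic format)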
container_id_prefix = 'container' + application_id[11:]
|
|
|
|
log_filter = self._make_log_filter(
|
|
'yarn-yarn-nodemanager',
|
|
{'jsonPayload.class': _CONTAINER_EXECUTOR_CLASS_NAME})
|
|
|
|
log.info('Scanning node manager logs for IDs of failed tasks...')
|
|
|
|
# it doesn't seem to work to do self.logging_client.logger();
|
|
# there's some RPC dispute about whether the log name should
|
|
# be qualified by project name or not
|
|
entries = self.logging_client.list_entries(
|
|
filter_=log_filter, order_by=google.cloud.logging.DESCENDING)
|
|
|
|
for entry in entries:
|
|
message = entry.payload.get('message')
|
|
if not message:
|
|
continue
|
|
|
|
m = _CONTAINER_EXIT_RE.match(message)
|
|
if not m:
|
|
continue
|
|
|
|
returncode = int(m.group('returncode'))
|
|
if not returncode:
|
|
continue
|
|
|
|
container_id = m.group('container_id')
|
|
# matches some other step
|
|
if not container_id.startswith(container_id_prefix):
|
|
continue
|
|
|
|
log.debug(' %s' % container_id)
|
|
yield container_id
|
|
|
|
def _task_stderr_lines(self, application_id, container_id, step_type):
|
|
"""Yield lines from a specific stderr log."""
|
|
log_filter = self._make_log_filter(
|
|
'yarn-userlogs', {
|
|
'jsonPayload.application': application_id,
|
|
'jsonPayload.container': container_id,
|
|
# TODO: pick based on step_type
|
|
'jsonPayload.container_logname': 'stderr',
|
|
})
|
|
|
|
log.info(' reading stderr log...')
|
|
entries = self.logging_client.list_entries(filter_=log_filter)
|
|
|
|
# use log4j parsing to handle tab -> newline conversion
|
|
for record in _log_entries_to_log4j(entries):
|
|
for line in record['message'].split('\n'):
|
|
yield line
|
|
|
|
def _task_syslog_records(self, application_id, container_id, step_type):
|
|
"""Yield log4j records from a specific syslog.
|
|
"""
|
|
log_filter = self._make_log_filter(
|
|
'yarn-userlogs', {
|
|
'jsonPayload.application': application_id,
|
|
'jsonPayload.container': container_id,
|
|
# TODO: pick based on step_type
|
|
'jsonPayload.container_logname': 'syslog',
|
|
})
|
|
|
|
log.info(' reading syslog...')
|
|
entries = self.logging_client.list_entries(filter_=log_filter)
|
|
|
|
return _log_entries_to_log4j(entries)
|
|
|
|
# misc
|
|
|
|
def _make_log_filter(self, log_name=None, extra_values=None):
|
|
# we only want logs from this project, cluster, and region
|
|
d = {}
|
|
|
|
d['resource.labels.cluster_name'] = self._cluster_id
|
|
d['resource.labels.project_id'] = self._project_id
|
|
d['resource.labels.region'] = self._region()
|
|
d['resource.type'] = 'cloud_dataproc_cluster'
|
|
|
|
if log_name:
|
|
d['logName'] = 'projects/%s/logs/%s' % (
|
|
self._project_id, log_name)
|
|
|
|
if extra_values:
|
|
d.update(extra_values)
|
|
|
|
return _log_filter_str(d)
|
|
|
|
def counters(self):
|
|
return [_pick_counters(log_interpretation)
|
|
for log_interpretation in self._log_interpretations]
|
|
|
|
### Bootstrapping ###
|
|
|
|
def get_hadoop_version(self):
|
|
if self._hadoop_version is None:
|
|
self._store_cluster_info()
|
|
return self._hadoop_version
|
|
|
|
def get_image_version(self):
|
|
"""Get the version that our cluster is running.
|
|
"""
|
|
if self._image_version is None:
|
|
self._store_cluster_info()
|
|
return self._image_version
|
|
|
|
def _store_cluster_info(self):
|
|
"""Set self._image_version and self._hadoop_version."""
|
|
if not self._cluster_id:
|
|
raise ValueError('cluster has not yet been created')
|
|
|
|
cluster = self._get_cluster(self._cluster_id)
|
|
self._image_version = (
|
|
cluster.config.software_config.image_version)
|
|
# protect against new versions, including patch versions
|
|
# we didn't explicitly request. See #1428
|
|
self._hadoop_version = map_version(
|
|
self._image_version, _DATAPROC_IMAGE_TO_HADOOP_VERSION)
|
|
|
|
def _bootstrap_pre_commands(self):
|
|
# don't run the bootstrap script in / (see #1601)
|
|
return [
|
|
'mkdir /tmp/mrjob',
|
|
'cd /tmp/mrjob',
|
|
]
|
|
|
|
### Bootstrapping ###
|
|
|
|
def _bootstrap_python(self):
|
|
"""Return a (possibly empty) list of parsed commands (in the same
|
|
        format as returned by parse_setup_cmd())."""
|
|
if not self._opts['bootstrap_python']:
|
|
return []
|
|
|
|
if PY2:
|
|
# Python 2 is already installed; install pip and dev packages
|
|
return [
|
|
['sudo apt-get install -y python-pip python-dev'],
|
|
]
|
|
else:
|
|
return [
|
|
['sudo apt-get install -y python3 python3-pip python3-dev'],
|
|
]
|
|
|
|
def get_cluster_id(self):
|
|
return self._cluster_id
|
|
|
|
def _cluster_create_kwargs(self):
|
|
gcs_init_script_uris = []
|
|
if self._master_bootstrap_script_path:
|
|
gcs_init_script_uris.append(
|
|
self._upload_mgr.uri(self._master_bootstrap_script_path))
|
|
|
|
cluster_metadata = dict()
|
|
cluster_metadata['mrjob-version'] = mrjob.__version__
|
|
|
|
# TODO: remove mrjob-max-secs-idle once lifecycle_config is visible
|
|
# through the gcloud utility and the Google Cloud Console
|
|
cluster_metadata['mrjob-max-secs-idle'] = str(int(
|
|
self._opts['max_mins_idle'] * 60))
|
|
|
|
gce_cluster_config = dict(
|
|
metadata=cluster_metadata,
|
|
service_account_scopes=self._opts['service_account_scopes'],
|
|
)
|
|
|
|
if self._opts['network']:
|
|
gce_cluster_config['network_uri'] = self._opts['network']
|
|
|
|
if self._opts['subnet']:
|
|
gce_cluster_config['subnetwork_uri'] = self._opts['subnet']
|
|
|
|
if self._opts['service_account']:
|
|
gce_cluster_config['service_account'] = (
|
|
self._opts['service_account'])
|
|
|
|
if self._opts['service_account_scopes']:
|
|
gce_cluster_config['service_account_scopes'] = (
|
|
self._opts['service_account_scopes'])
|
|
|
|
if self._opts['zone']:
|
|
gce_cluster_config['zone_uri'] = _gcp_zone_uri(
|
|
project=self._project_id, zone=self._opts['zone'])
|
|
|
|
cluster_config = dict(
|
|
gce_cluster_config=gce_cluster_config,
|
|
initialization_actions=[
|
|
dict(executable_file=init_script_uri)
|
|
for init_script_uri in gcs_init_script_uris
|
|
]
|
|
)
|
|
|
|
        # Master node (resource manager)
|
|
master_conf = _gcp_instance_group_config(
|
|
project=self._project_id, zone=self._opts['zone'],
|
|
count=1, instance_type=self._opts['master_instance_type'],
|
|
)
|
|
if self._opts['master_instance_config']:
|
|
master_conf.update(self._opts['master_instance_config'])
|
|
|
|
# Compute + storage
|
|
worker_conf = _gcp_instance_group_config(
|
|
project=self._project_id, zone=self._opts['zone'],
|
|
count=self._opts['num_core_instances'],
|
|
instance_type=self._opts['core_instance_type']
|
|
)
|
|
if self._opts['core_instance_config']:
|
|
worker_conf.update(self._opts['core_instance_config'])
|
|
|
|
# Compute ONLY
|
|
secondary_worker_conf = _gcp_instance_group_config(
|
|
project=self._project_id, zone=self._opts['zone'],
|
|
count=self._opts['num_task_instances'],
|
|
instance_type=self._opts['task_instance_type'],
|
|
is_preemptible=True
|
|
)
|
|
if self._opts['task_instance_config']:
|
|
secondary_worker_conf.update(self._opts['task_instance_config'])
|
|
|
|
cluster_config['master_config'] = master_conf
|
|
cluster_config['worker_config'] = worker_conf
|
|
if secondary_worker_conf.get('num_instances'):
|
|
cluster_config['secondary_worker_config'] = secondary_worker_conf
|
|
|
|
cluster_config['lifecycle_config'] = dict(
|
|
idle_delete_ttl=dict(
|
|
seconds=int(self._opts['max_mins_idle'] * 60)))
|
|
|
|
software_config = {}
|
|
|
|
if self._opts['cluster_properties']:
|
|
software_config['properties'] = _values_to_text(
|
|
self._opts['cluster_properties'])
|
|
|
|
# See - https://cloud.google.com/dataproc/dataproc-versions
|
|
if self._opts['image_version']:
|
|
software_config['image_version'] = self._opts['image_version']
|
|
|
|
if software_config:
|
|
cluster_config['software_config'] = software_config
|
|
|
|
# in Python 2, dict keys loaded from JSON will be unicode, which
|
|
# the Google protobuf objects don't like
|
|
if PY2:
|
|
cluster_config = _clean_json_dict_keys(cluster_config)
|
|
|
|
kwargs = dict(project_id=self._project_id,
|
|
cluster_name=self._cluster_id,
|
|
config=cluster_config)
|
|
|
|
return self._add_extra_cluster_params(kwargs)
|
|
|
|
### Dataproc-specific Stuff ###
|
|
|
|
def _get_cluster(self, cluster_id):
|
|
return self.cluster_client.get_cluster(
|
|
cluster_name=cluster_id,
|
|
**self._project_id_and_region()
|
|
)
|
|
|
|
def _create_cluster(self, cluster_data):
|
|
# https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.clusters/create # noqa
|
|
# https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.clusters/get # noqa
|
|
|
|
self.cluster_client.create_cluster(
|
|
cluster=cluster_data,
|
|
**self._project_id_and_region()
|
|
)
|
|
|
|
def _delete_cluster(self, cluster_id):
|
|
return self.cluster_client.delete_cluster(
|
|
cluster_name=cluster_id,
|
|
**self._project_id_and_region()
|
|
)
|
|
|
|
def _list_jobs(self, cluster_name=None, state_matcher=None):
|
|
# https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs/list#JobStateMatcher # noqa
|
|
list_kwargs = self._project_id_and_region()
|
|
|
|
if cluster_name:
|
|
list_kwargs['cluster_name'] = cluster_name
|
|
|
|
if state_matcher:
|
|
list_kwargs['job_state_matcher'] = state_matcher
|
|
|
|
return self.job_client.list_jobs(**list_kwargs)
|
|
|
|
def _get_job(self, job_id):
|
|
return self.job_client.get_job(
|
|
job_id=job_id,
|
|
**self._project_id_and_region()
|
|
)
|
|
|
|
def _cancel_job(self, job_id):
|
|
return self.job_client.cancel_job(
|
|
job_id=job_id,
|
|
**self._project_id_and_region()
|
|
)
|
|
|
|
def _submit_job(self, step_name, job_kwarg):
|
|
# https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs/submit # noqa
|
|
# https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs#HadoopJob # noqa
|
|
# https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs#JobReference # noqa
|
|
|
|
submit_job_kwargs = dict(
|
|
job=dict(
|
|
reference=dict(project_id=self._project_id, job_id=step_name),
|
|
placement=dict(cluster_name=self._cluster_id),
|
|
**job_kwarg
|
|
),
|
|
**self._project_id_and_region()
|
|
)
|
|
|
|
log.debug(' submit_job(%s)' % ', '.join(
|
|
'%s=%r' % (k, v) for k, v in sorted(submit_job_kwargs.items())))
|
|
|
|
return self.job_client.submit_job(**submit_job_kwargs)
|
|
|
|
def _project_id_and_region(self):
|
|
return dict(
|
|
project_id=self._project_id,
|
|
region=(self._opts['region'] or 'global'),
|
|
)
|
|
|
|
def _manifest_download_commands(self):
|
|
return [
|
|
# TODO: SSH in and figure out how to use gsutil or similar
|
|
# ('gs://*', 'gsutil cp'),
|
|
('*://*', 'hadoop fs -copyToLocal'),
|
|
]
|
|
|
|
### SSH hooks ###
|
|
|
|
def _job_tracker_host(self):
|
|
return '%s-m' % self._cluster_id
|
|
|
|
def _ssh_tunnel_config(self):
|
|
return _SSH_TUNNEL_CONFIG
|
|
|
|
def _launch_ssh_proc(self, args):
|
|
ssh_proc = super(DataprocJobRunner, self)._launch_ssh_proc(args)
|
|
|
|
# enter an empty passphrase if creating a key for the first time
|
|
ssh_proc.stdin.write(b'\n\n')
|
|
|
|
return ssh_proc
|
|
|
|
def _ssh_launch_wait_secs(self):
|
|
"""Wait 20 seconds because gcloud has to update project metadata
|
|
(unless we were going to check the cluster sooner anyway)."""
|
|
return min(20.0, self._opts['check_cluster_every'])
|
|
|
|
def _ssh_tunnel_args(self, bind_port):
|
|
if not self._cluster_id:
|
|
return
|
|
|
|
gcloud_bin = self._opts['gcloud_bin'] or ['gcloud']
|
|
|
|
cluster = self._get_cluster(self._cluster_id)
|
|
zone = cluster.config.gce_cluster_config.zone_uri.split('/')[-1]
|
|
|
|
return gcloud_bin + [
|
|
'compute', 'ssh',
|
|
'--zone', zone,
|
|
self._job_tracker_host(),
|
|
'--',
|
|
] + self._ssh_tunnel_opts(bind_port)
|
|
|
|
|
|
def _log_filter_str(name_to_value):
|
|
"""return a map from name to value into a log filter query that requires
|
|
each name to equal the given value."""
|
|
return ' AND '.join(
|
|
'%s = %s' % (name, _quote_filter_value(value))
|
|
for name, value in sorted(name_to_value.items()))
|
|
|
|
|
|
def _quote_filter_value(s):
|
|
"""Put a string in double quotes, escaping double quote characters"""
|
|
return '"%s"' % s.replace('"', r'\"')
|
|
|
|
|
|
def _log_entries_to_log4j(entries):
|
|
"""Convert log entries from a single log file to log4j format, tracking
|
|
line number.
|
|
|
|
See :py:meth:`mrjob.logs.log4j._parse_hadoop_log4j_records`
|
|
for format.
|
|
"""
|
|
line_num = 0
|
|
|
|
for entry in entries:
|
|
message = entry.payload.get('message') or ''
|
|
|
|
        # NOTE: currently, google.cloud.logging seems to strip newlines :(
|
|
num_lines = len(message.split('\n'))
|
|
|
|
yield dict(
|
|
caller_location='',
|
|
level=(entry.severity or ''),
|
|
logger=(entry.payload.get('class') or ''),
|
|
message=message,
|
|
num_lines=num_lines,
|
|
start_line=line_num,
|
|
thread='',
|
|
timestamp=(entry.timestamp or ''),
|
|
)
|
|
|
|
line_num += num_lines
|
|
|
|
|
|
def _fix_java_stack_trace(s):
|
|
# this is what we get from `gcloud logging`
|
|
if '\n' in s:
|
|
return s
|
|
else:
|
|
return s.replace('\t', '\n\t')
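# e.g. (made-up stack trace): tabs become newline + tab when the message
# arrived as a single line:
#
#   >>> _fix_java_stack_trace('java.io.IOException: boom\tat Foo.bar(Foo.java:1)')
#   'java.io.IOException: boom\n\tat Foo.bar(Foo.java:1)'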
|
|
|
|
|
|
def _fix_traceback(s):
|
|
lines = s.split('\n')
|
|
|
|
# strip log4j warnings (which do have proper linebreaks)
|
|
lines = [
|
|
line for line in lines
|
|
if line and not _STDERR_LOG4J_WARNING.match(line)
|
|
]
|
|
|
|
s = '\n'.join(lines)
|
|
|
|
if '\n' in s:
|
|
return s # traceback does have newlines
|
|
|
|
s = s.replace(' File', '\n File')
|
|
s = s.replace(' ', '\n ')
|
|
s = _TRACEBACK_EXCEPTION_RE.sub(lambda m: '\n' + m.group(0), s)
|
|
|
|
return s
|
|
|
|
|
|
def _clean_json_dict_keys(x):
|
|
"""Cast any dictionary keys in the given JSON object to str.
|
|
We can assume that x isn't a recursive data structure, and that
|
|
this is only called in Python 2."""
|
|
if isinstance(x, dict):
|
|
return {str(k): _clean_json_dict_keys(v) for k, v in x.items()}
|
|
elif isinstance(x, list):
|
|
return [_clean_json_dict_keys(item) for item in x]
|
|
else:
|
|
return x
|
|
|
|
|
|
def _values_to_text(d):
|
|
"""Return a dictionary with the same keys as *d*, but where the
|
|
non-string, non-bytes values have been JSON-encoded.
|
|
|
|
Used to encode cluster properties.
|
|
"""
|
|
result = {}
|
|
|
|
for k, v in d.items():
|
|
if not isinstance(v, (string_types, bytes)):
|
|
v = json.dumps(v)
|
|
|
|
result[k] = v
|
|
|
|
return result
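# e.g. (illustrative cluster property): non-string values are JSON-encoded:
#
#   >>> _values_to_text({'dataproc:dataproc.allow.zero.workers': True})
#   {'dataproc:dataproc.allow.zero.workers': 'true'}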
|
|
|
|
|
|
def _fully_qualify_scope_uri(uri):
|
|
if is_uri(uri):
|
|
return uri
|
|
else:
|
|
return 'https://www.googleapis.com/auth/%s' % uri
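# e.g. a bare scope name gets the standard prefix:
#
#   >>> _fully_qualify_scope_uri('devstorage.read_write')
#   'https://www.googleapis.com/auth/devstorage.read_write'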