mirror of
https://github.com/emilybache/GildedRose-Refactoring-Kata.git
synced 2026-02-09 03:31:28 +00:00
591 lines
18 KiB
Python
591 lines
18 KiB
Python
# Copyright 2009-2012 Yelp
|
|
# Copyright 2013 David Marin
|
|
# Copyright 2015-2018 Yelp
|
|
# Copyright 2019 Yelp
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
""""mrjob.conf" is the name of both this module, and the global config file
|
|
for :py:mod:`mrjob`.
|
|
"""
|
|
import glob
|
|
import json
|
|
import logging
|
|
import os
|
|
import os.path
|
|
|
|
# yaml is nice to have, but we can fall back on JSON if need be
|
|
try:
|
|
import yaml
|
|
yaml # quiet "redefinition of unused ..." warning from pyflakes
|
|
except ImportError:
|
|
yaml = None
|
|
|
|
from mrjob.py2 import string_types
|
|
from mrjob.util import expand_path
|
|
from mrjob.util import shlex_split
|
|
|
|
# module-level logger, named after this module (used by the config-finding
# and config-loading functions below)
log = logging.getLogger(__name__)
|
|
|
|
|
|
### finding config files ###
|
|
|
|
def find_mrjob_conf():
    """Locate :file:`mrjob.conf` and return its path.

    Candidate locations are tried in this order, and the first one that
    exists on disk is returned:

    - The location specified by :envvar:`MRJOB_CONF` (only if it is set)
    - :file:`~/.mrjob.conf`
    - :file:`/etc/mrjob.conf`

    Return ``None`` (after logging a message) if none of them exist.
    """
    def _candidate_paths():
        # generated lazily, so later candidates are only expanded if the
        # earlier ones turn out not to exist
        env_path = os.environ.get('MRJOB_CONF')
        if env_path is not None:
            yield expand_path(env_path)

        # $HOME isn't necessarily set on Windows, but ~ works.
        # os.path.join() keeps us from mixing \ and /
        yield expand_path(os.path.join('~', '.mrjob.conf'))

        # Unix-only location, so a hard-coded path is fine
        yield '/etc/mrjob.conf'

    for candidate in _candidate_paths():
        log.debug('Looking for configs in %s' % candidate)

        if not os.path.exists(candidate):
            continue

        log.info('Using configs in %s' % candidate)
        return candidate

    # every candidate was tried and none existed
    log.info('No configs found; falling back on auto-configuration')
    return None
|
|
|
|
|
|
def _expanded_mrjob_conf_path(conf_path=None):
|
|
"""Return the path of a single conf file. If *conf_path* is ``False``,
|
|
return ``None``, and if it's ``None``, return :py:func:`find_mrjob_conf`.
|
|
Otherwise, expand environment variables and ``~`` in *conf_path* and
|
|
return it.
|
|
|
|
Confusingly, this function doesn't actually return a "real" path according
|
|
to ``os.path.realpath()``; it just resolves environment variables and
|
|
``~``.
|
|
"""
|
|
if conf_path is False:
|
|
return None
|
|
elif conf_path is None:
|
|
return find_mrjob_conf()
|
|
else:
|
|
return expand_path(conf_path)
|
|
|
|
|
|
### !clear tag ###
|
|
|
|
class ClearedValue(object):
    """Wrapper marking a value that was tagged with ``!clear`` in
    mrjob.conf.

    Two wrappers are equal when the values they wrap are equal; a wrapper
    never equals a non-wrapper. The hash is that of the wrapped value, so
    wrapping an unhashable value makes the wrapper unhashable too.
    """

    def __init__(self, value):
        # the wrapped (un-tagged) value
        self.value = value

    def __eq__(self, other):
        return isinstance(other, ClearedValue) and self.value == other.value

    def __hash__(self):
        return hash(self.value)

    def __repr__(self):
        return '%s(%r)' % (type(self).__name__, self.value)
|
|
|
|
|
|
def _cleared_value_constructor(loader, node):
    """YAML constructor for the ``!clear`` tag.

    Builds the Python value for *node* according to its node type and
    returns it wrapped in :py:class:`ClearedValue`. Raises ``TypeError``
    if *node* is not a scalar, sequence, or mapping node.
    """
    # (tried construct_object(), got an unconstructable recursive node
    # warning, hence the per-node-type dispatch)
    if isinstance(node, yaml.ScalarNode):
        # re-parse the scalar's text so that e.g. null becomes None
        # rather than u'null'
        return ClearedValue(yaml.safe_load(node.value))

    if isinstance(node, yaml.SequenceNode):
        return ClearedValue(loader.construct_sequence(node))

    if isinstance(node, yaml.MappingNode):
        return ClearedValue(loader.construct_mapping(node))

    raise TypeError
|
|
|
|
|
|
def _load_yaml_with_clear_tag(stream):
    """Like ``yaml.safe_load()``, but everything with a ``!clear`` tag
    before it will be wrapped in :py:class:`ClearedValue`.

    :param stream: YAML source to parse (anything ``yaml.SafeLoader``
                   accepts, e.g. an open file or a string)

    Returns the single document in *stream* as Python data.
    """
    # add_constructor() is a classmethod, so calling it on a SafeLoader
    # *instance* would register !clear on yaml.SafeLoader itself, for the
    # whole process. Register it on a private subclass instead (same
    # approach as the dumper in _dump_yaml_with_clear_tags()). The class is
    # defined here rather than at module level because yaml may be None.
    class ClearedValueSafeLoader(yaml.SafeLoader):
        pass

    ClearedValueSafeLoader.add_constructor(
        '!clear', _cleared_value_constructor)

    loader = ClearedValueSafeLoader(stream)
    try:
        return loader.get_single_data()
    finally:
        if hasattr(loader, 'dispose'):  # it doesn't in PyYAML 3.09
            loader.dispose()
|
|
|
|
|
|
def _cleared_value_representer(dumper, data):
    """YAML representer for :py:class:`ClearedValue`.

    Represents the wrapped value with *dumper*, then re-tags the resulting
    node as ``!clear``. Raises ``TypeError`` if *data* is not a
    :py:class:`ClearedValue`.
    """
    if isinstance(data, ClearedValue):
        tagged_node = dumper.represent_data(data.value)
        tagged_node.tag = '!clear'
        return tagged_node

    raise TypeError
|
|
|
|
|
|
def _dump_yaml_with_clear_tags(data, stream=None, **kwds):
    """Dump *data* as a single YAML document, emitting
    :py:class:`ClearedValue` wrappers as ``!clear`` tags.

    *stream* and any extra keyword arguments are passed straight through
    to ``yaml.dump_all()``, whose return value we return.
    """
    # register the representer on a throwaway subclass, so that
    # yaml.SafeDumper itself is left untouched
    class ClearedValueSafeDumper(yaml.SafeDumper):
        pass

    dumper_cls = ClearedValueSafeDumper
    dumper_cls.add_representer(ClearedValue, _cleared_value_representer)

    documents = [data]
    return yaml.dump_all(documents, stream, Dumper=dumper_cls, **kwds)
|
|
|
|
|
|
def _fix_clear_tags(x):
|
|
"""Recursively resolve :py:class:`ClearedValue` wrappers so that
|
|
``ClearedValue(...)`` can only wrap values in dicts (and in the top-level
|
|
value we return).
|
|
|
|
In dicts, we treat ``ClearedValue(k): v`` or
|
|
``ClearedValue(k): ClearedValue(v)`` as equivalent to
|
|
``k: ClearedValue(v)``. ``ClearedValue(k): v1`` overrides ``k: v2``.
|
|
|
|
In lists, any ClearedValue wrappers are simply stripped.
|
|
"""
|
|
_fix = _fix_clear_tags
|
|
|
|
if isinstance(x, list):
|
|
return [_fix(_strip_clear_tag(item)) for item in x]
|
|
|
|
elif isinstance(x, dict):
|
|
d = dict((_fix(k), _fix(v)) for k, v in x.items())
|
|
|
|
# handle cleared keys
|
|
for k, v in list(d.items()):
|
|
if isinstance(k, ClearedValue):
|
|
del d[k]
|
|
d[_strip_clear_tag(k)] = ClearedValue(_strip_clear_tag(v))
|
|
|
|
return d
|
|
|
|
elif isinstance(x, ClearedValue):
|
|
return ClearedValue(_fix(x.value))
|
|
|
|
else:
|
|
return x
|
|
|
|
|
|
def _resolve_clear_tags_in_list(items):
|
|
"""Create a list from *items*. If we encounter a :py:class:`ClearedValue`,
|
|
unwrap it and ignore previous values. Used by ``combine_*()`` functions
|
|
to combine lists of values.
|
|
"""
|
|
result = []
|
|
|
|
for item in items:
|
|
if isinstance(item, ClearedValue):
|
|
result = [item.value]
|
|
else:
|
|
result.append(item)
|
|
|
|
return result
|
|
|
|
|
|
def _strip_clear_tag(v):
    """Return the value wrapped by *v* if it's a :py:class:`ClearedValue`;
    otherwise return *v* unchanged."""
    return v.value if isinstance(v, ClearedValue) else v
|
|
|
|
|
|
### reading mrjob.conf ###
|
|
|
|
def _conf_object_at_path(conf_path):
|
|
if conf_path is None:
|
|
return None
|
|
|
|
with open(conf_path) as f:
|
|
if yaml:
|
|
return _fix_clear_tags(_load_yaml_with_clear_tag(f))
|
|
else:
|
|
try:
|
|
return json.load(f)
|
|
except ValueError as e:
|
|
raise ValueError(
|
|
'Could not read JSON from %s\n %s\n\n'
|
|
'If your conf file is in YAML, you need to'
|
|
' `pip install PyYAML` to read it' % (conf_path, str(e)))
|
|
|
|
|
|
def load_opts_from_mrjob_conf(runner_alias, conf_path=None,
                              already_loaded=None):
    """Load the options for one runner from a single mrjob.conf, resolving
    includes. Returns ``[(path, values), ...]``. If *conf_path* is not
    found, return ``[(None, {})]``.

    :type runner_alias: str
    :param runner_alias: String identifier of the runner type, e.g. ``emr``,
                         ``local``, etc.
    :type conf_path: str
    :param conf_path: location of the file to load. ``None`` means search
                      the default locations (see
                      :py:func:`find_mrjob_conf`); ``False`` means don't
                      load any file.
    :type already_loaded: list
    :param already_loaded: list of real (according to
                           ``os.path.realpath()``) conf paths that have
                           already been loaded; it is appended to in
                           place (used by
                           :py:func:`load_opts_from_mrjob_confs`).

    Relative ``include:`` paths are relative to the real (after resolving
    symlinks) path of the including conf file.

    This will only load each config file once, even if it's referenced
    from multiple paths due to symlinks.
    """
    loaded_so_far = [] if already_loaded is None else already_loaded
    expanded_path = _expanded_mrjob_conf_path(conf_path)

    return _load_opts_from_mrjob_conf(
        runner_alias, expanded_path, loaded_so_far)
|
|
|
|
|
|
def _load_opts_from_mrjob_conf(runner_alias, conf_path, already_loaded):
    """Recursive helper for :py:func:`load_opts_from_mrjob_conf`.

    Unlike that function, this doesn't expand or default *conf_path*.

    Returns ``[(None, {})]`` if there is no conf to read, ``[]`` if the
    file (by real path) is already in *already_loaded*, and otherwise the
    results for every included file followed by ``(conf_path, values)``
    for this one. *already_loaded* is appended to in place.
    """
    conf = _conf_object_at_path(conf_path)
    if conf is None:
        return [(None, {})]

    # don't load same conf file twice
    real_conf_path = os.path.realpath(conf_path)
    if real_conf_path in already_loaded:
        return []
    already_loaded.append(real_conf_path)

    # get configs for our runner out of conf file, tolerating a missing or
    # malformed "runners" section
    try:
        values = conf['runners'][runner_alias] or {}
    except (KeyError, TypeError, ValueError):
        values = {}

    includes = conf.get('include', None) or []
    if isinstance(includes, string_types):
        includes = [includes]

    # includes are relative to the (real) location of this file (see #1166)
    conf_dir = os.path.dirname(real_conf_path)

    # handle includes in reverse order so that include order takes
    # precedence over inheritance
    inherited = []
    for include in reversed(includes):
        # expand ~ *before* joining to dir of including file (see #1308)
        include_path = os.path.join(conf_dir, expand_path(include))

        loaded = _load_opts_from_mrjob_conf(
            runner_alias, include_path, already_loaded)
        inherited = loaded + inherited

    return inherited + [(conf_path, values)]
|
|
|
|
|
|
def load_opts_from_mrjob_confs(runner_alias, conf_paths=None):
    """Load the options for one runner from a list of mrjob config files.
    Returns ``[(path, values), ...]``. If a path is not found, use
    ``(None, {})`` as its value.

    If *conf_paths* is ``None``, look for a config file in the default
    locations (see :py:func:`find_mrjob_conf`).

    :type runner_alias: str
    :param runner_alias: String identifier of the runner type, e.g. ``emr``,
                         ``local``, etc.
    :type conf_paths: list or ``None``
    :param conf_paths: locations of the files to load

    This will only load each config file once, even if it's referenced
    from multiple paths due to symlinks.

    Logs a warning if *runner_alias* is set but none of the loaded files
    contained any options for it.
    """
    if conf_paths is None:
        results = load_opts_from_mrjob_conf(runner_alias)
    else:
        # shared across calls, so that we don't include conf files that
        # were loaded earlier in conf_paths
        seen_paths = []
        results = []

        # load configs in reversed order so that order of conf paths takes
        # precedence over inheritance
        for path in reversed(conf_paths):
            loaded = load_opts_from_mrjob_conf(
                runner_alias, path, already_loaded=seen_paths)
            results = loaded + results

    if runner_alias:
        if not any(conf for path, conf in results):
            log.warning('No configs specified for %s runner' % runner_alias)

    return results
|
|
|
|
|
|
### writing mrjob.conf ###
|
|
|
|
def dump_mrjob_conf(conf, f):
    """Write out configuration options to a file.

    Useful if you don't want to bother to figure out YAML.

    *conf* should look something like this::

        {'runners': {
            'local': {'OPTION': VALUE, ...},
            'emr': {'OPTION': VALUE, ...},
            'hadoop': {'OPTION': VALUE, ...},
        }}

    The output is YAML if PyYAML is available, and JSON otherwise. *f* is
    flushed after writing.

    :param f: a file object to write to (e.g. ``open('mrjob.conf', 'w')``)
    """
    if not yaml:
        json.dump(conf, f, indent=2)
    else:
        _dump_yaml_with_clear_tags(conf, f, default_flow_style=False)

    f.flush()
|
|
|
|
|
|
### COMBINING OPTIONS ###
|
|
|
|
# combiners generally consider earlier values to be defaults, and later
|
|
# options to override or add on to them.
|
|
|
|
# combiners assume that the list of values passed to them has already been
|
|
# passed through _fix_clear_tags() (that is, the only place ClearedValue
|
|
# appears is values in dicts).
|
|
|
|
|
|
def combine_values(*values):
    """Return the last value in *values* that is not ``None``, or ``None``
    if there is no such value.

    The default combiner; good for simple values (booleans, strings,
    numbers). Note that falsy values other than ``None`` (``0``, ``''``,
    ``False``) count as set.
    """
    return next((v for v in reversed(values) if v is not None), None)
|
|
|
|
|
|
def combine_lists(*seqs):
    """Concatenate the given sequences into a list. Ignore ``None`` values.

    Generally this is used for a list of commands we want to run; the
    "default" commands get run before any commands specific to your job.

    Strings, bytes, dicts, and non-iterable objects (e.g. numbers) are
    treated as single-item lists.

    Errors raised while iterating over one of *seqs* are propagated to
    the caller.
    """
    result = []

    for seq in seqs:
        if seq is None:
            continue

        # these are iterable, but we want them kept whole
        if isinstance(seq, (bytes, string_types, dict)):
            result.append(seq)
            continue

        # only "not iterable" means "single item". Checking with iter()
        # up front (rather than wrapping extend() in a bare except) keeps
        # us from swallowing KeyboardInterrupt/SystemExit or errors raised
        # mid-iteration, and from appending seq after a partial extend.
        try:
            seq_items = iter(seq)
        except TypeError:
            result.append(seq)
        else:
            result.extend(seq_items)

    return result
|
|
|
|
|
|
def combine_cmds(*cmds):
    """Take zero or more commands to run on the command line, and return
    the last one that is not ``None``, as a list.

    Each command should either be a list containing the command plus
    switches, or a string, which will be split into arguments with
    ``shlex_split()``. The string must either be a byte string or a
    unicode string containing no non-ASCII characters.

    Returns either ``None`` or a list containing the command plus arguments.
    """
    chosen = combine_values(*cmds)

    if chosen is None:
        return None

    if isinstance(chosen, string_types):
        return shlex_split(chosen)

    # already a sequence of args; hand back a fresh list
    return list(chosen)
|
|
|
|
|
|
def combine_dicts(*dicts):
    """Combine zero or more dictionaries. Values from dicts later in the
    list take precedence over values earlier in the list.

    A value of ``ClearedValue(None)`` deletes that key from the result so
    far; any other :py:class:`ClearedValue` is simply unwrapped.

    If you pass in ``None`` (or an empty dict) in place of a dictionary,
    it will be ignored.
    """
    merged = {}

    for mapping in dicts:
        if not mapping:
            continue

        for key, val in mapping.items():
            if isinstance(val, ClearedValue) and val.value is None:
                # delete cleared key
                merged.pop(key, None)
            else:
                # just set the value
                merged[key] = _strip_clear_tag(val)

    return merged
|
|
|
|
|
|
def combine_envs(*envs):
    """Combine zero or more dictionaries containing environment variables.
    Environment variable values may be wrapped in :py:class:`ClearedValue`.

    Environment variables from dictionaries later in the list take
    priority over those earlier in the list.

    For variables ending with ``PATH``, we prepend the later value to the
    earlier one (joined with a colon) rather than overwriting. Wrapping a
    path value in :py:class:`ClearedValue` disables this behavior.

    Environment variables set to ``ClearedValue(None)`` will *delete*
    environment variables earlier in the list, rather than setting them to
    ``None``.

    If you pass in ``None`` in place of a dictionary in **envs**, it will be
    ignored.
    """
    use_local_pathsep = False
    return _combine_envs_helper(envs, local=use_local_pathsep)
|
|
|
|
|
|
def combine_local_envs(*envs):
    """Same as :py:func:`combine_envs`, except that paths are combined
    using the local path separator, ``os.pathsep`` (e.g ``;`` on Windows
    rather than ``:``).
    """
    use_local_pathsep = True
    return _combine_envs_helper(envs, local=use_local_pathsep)
|
|
|
|
|
|
def _combine_envs_helper(envs, local):
|
|
if local:
|
|
pathsep = os.pathsep
|
|
else:
|
|
pathsep = ':'
|
|
|
|
result = {}
|
|
for env in envs:
|
|
if env:
|
|
for k, v in env.items():
|
|
# delete cleared keys
|
|
if isinstance(v, ClearedValue) and v.value is None:
|
|
result.pop(k, None)
|
|
|
|
# append paths
|
|
elif (k.endswith('PATH') and result.get(k) and
|
|
not isinstance(v, ClearedValue)):
|
|
result[k] = v + pathsep + result[k]
|
|
|
|
# just set the value
|
|
else:
|
|
result[k] = _strip_clear_tag(v)
|
|
|
|
return result
|
|
|
|
|
|
def combine_jobconfs(*jobconfs):
    """Like :py:func:`combine_dicts`, but non-string values are converted
    to Java-readable strings (e.g. ``True`` becomes ``'true'``). Keys
    whose combined value is ``None`` are left out of the result."""
    merged = combine_dicts(*jobconfs)

    java_conf = {}
    for name, val in merged.items():
        if val is None:
            continue
        java_conf[name] = _to_java_str(val)

    return java_conf
|
|
|
|
|
|
def combine_paths(*paths):
    """Pick the last value in *paths* that is not ``None`` and return it
    passed through ``expand_path()`` (resolving ``~`` (home dir) and
    environment variables)."""
    chosen = combine_values(*paths)
    return expand_path(chosen)
|
|
|
|
|
|
def combine_path_lists(*path_seqs):
    """Concatenate the given sequences into a list. Ignore None values.
    Resolve ``~`` (home dir) and environment variables, and expand globs
    that refer to the local filesystem (matches are added in sorted
    order).

    Can take single strings as well as lists.
    """
    results = []

    for path in combine_lists(*path_seqs):
        expanded = expand_path(path)
        matches = glob.glob(expanded)

        if matches:
            results.extend(sorted(matches))
        else:
            # if we can't expand a glob, leave as-is (maybe it refers to
            # S3 or HDFS)
            results.append(expanded)

    return results
|
|
|
|
|
|
def combine_opts(combiners, *opts_list):
    """The master combiner, used to combine dictionaries of options with
    appropriate sub-combiners.

    :param combiners: a map from option name to a combine_*() function to
                      combine options by that name. By default, we combine
                      options using :py:func:`combine_values`.
    :param opts_list: one or more dictionaries to combine (``None`` and
                      empty dicts are skipped)

    The dicts in *opts_list* may not be wrapped in :py:class:`ClearedValue`
    (we raise ``TypeError`` if one is), but their values may, in which
    case values of that key from previous opt dicts will be ignored.
    """
    # gather every option name that appears anywhere
    all_keys = set()
    for opts in opts_list:
        if isinstance(opts, ClearedValue):
            raise TypeError
        if opts:
            all_keys.update(opts)

    combined = {}
    for key in all_keys:
        # this option's values, earliest first, from the dicts that set it
        values_for_key = _resolve_clear_tags_in_list(
            opts[key] for opts in opts_list if opts and key in opts)

        combiner = combiners.get(key) or combine_values
        combined[key] = combiner(*values_for_key)

    return combined
|
|
|
|
|
|
def _to_java_str(x):
    """Convert a value (usually for a configuration property) into its
    Java string representation, falling back on Python's ``str()`` when
    there is no special Java form for it.

    Strings are returned unchanged, ``None`` becomes ``'null'``, and
    booleans become ``'true'``/``'false'`` (see #323).
    """
    if isinstance(x, string_types):
        return x

    if x is None:
        # Note: combine_jobconfs() blanks out keys with None values
        return 'null'

    if isinstance(x, bool):
        if x:
            return 'true'
        return 'false'

    return str(x)
|