GildedRose-Refactoring-Kata/.venv/lib/python3.12/site-packages/mrjob/conf.py
2025-06-22 13:36:01 +05:30

591 lines
18 KiB
Python

# Copyright 2009-2012 Yelp
# Copyright 2013 David Marin
# Copyright 2015-2018 Yelp
# Copyright 2019 Yelp
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""""mrjob.conf" is the name of both this module, and the global config file
for :py:mod:`mrjob`.
"""
import glob
import json
import logging
import os
import os.path
# yaml is nice to have, but we can fall back on JSON if need be
try:
import yaml
yaml # quiet "redefinition of unused ..." warning from pyflakes
except ImportError:
yaml = None
from mrjob.py2 import string_types
from mrjob.util import expand_path
from mrjob.util import shlex_split
log = logging.getLogger(__name__)
### finding config files ###
def find_mrjob_conf():
    """Locate :file:`mrjob.conf` and return its path.

    Candidate locations are tried in this order, and the first one that
    exists on disk is returned:

    - the path named by :envvar:`MRJOB_CONF` (only if it is set)
    - :file:`~/.mrjob.conf`
    - :file:`/etc/mrjob.conf`

    Returns ``None`` when none of the candidates exists.
    """
    def _search_order():
        # candidates are produced lazily, so later paths are only expanded
        # if the earlier ones turned out not to exist
        env_path = os.environ.get('MRJOB_CONF')
        if env_path is not None:
            yield expand_path(env_path)

        # $HOME isn't necessarily set on Windows, but ~ works.
        # os.path.join() keeps us from mixing \ and /
        yield expand_path(os.path.join('~', '.mrjob.conf'))

        # Unix-only location, so a hard-coded separator is fine here
        yield '/etc/mrjob.conf'

    for candidate in _search_order():
        log.debug('Looking for configs in %s' % candidate)
        if not os.path.exists(candidate):
            continue

        log.info('Using configs in %s' % candidate)
        return candidate

    # ran out of candidates without finding anything
    log.info('No configs found; falling back on auto-configuration')
    return None
def _expanded_mrjob_conf_path(conf_path=None):
    """Work out which single conf file *conf_path* refers to.

    - ``False`` means "no conf file": return ``None``.
    - ``None`` means "use the default": return whatever
      :py:func:`find_mrjob_conf` finds.
    - anything else is treated as a path; environment variables and ``~``
      in it are expanded and the result is returned.

    Note that the result is *not* a "real" path in the
    ``os.path.realpath()`` sense; only environment variables and ``~`` are
    resolved.
    """
    # identity checks on purpose: only the literal False/None are special
    if conf_path is False:
        return None

    if conf_path is None:
        return find_mrjob_conf()

    return expand_path(conf_path)
### !clear tag ###
class ClearedValue(object):
    """Marker wrapper for a value that carried a ``!clear`` tag in
    mrjob.conf. The wrapped value is available as ``self.value``."""

    def __init__(self, value):
        self.value = value

    def __eq__(self, other):
        # only another ClearedValue wrapping an equal value compares equal;
        # a bare value is never equal to its wrapped form
        return isinstance(other, ClearedValue) and self.value == other.value

    def __hash__(self):
        # hash like the wrapped value, consistent with __eq__
        return hash(self.value)

    def __repr__(self):
        return '%s(%s)' % (type(self).__name__, repr(self.value))
def _cleared_value_constructor(loader, node):
    """YAML constructor for the ``!clear`` tag.

    Builds the Python value for *node* (a mapping, scalar, or sequence
    node) and returns it wrapped in :py:class:`ClearedValue`. Raises
    :py:class:`TypeError` for any other kind of node.
    """
    # tried construct_object(), got an unconstructable recursive node warning
    if isinstance(node, yaml.MappingNode):
        return ClearedValue(loader.construct_mapping(node))

    if isinstance(node, yaml.ScalarNode):
        # re-parse the scalar text so that null resolves to None,
        # not u'null'
        return ClearedValue(yaml.safe_load(node.value))

    if isinstance(node, yaml.SequenceNode):
        return ClearedValue(loader.construct_sequence(node))

    raise TypeError
def _load_yaml_with_clear_tag(stream):
    """Parse a single YAML document from *stream* with a ``SafeLoader``,
    wrapping every node tagged ``!clear`` in :py:class:`ClearedValue`.

    Behaves like ``yaml.safe_load()`` apart from the extra tag.
    """
    clear_loader = yaml.SafeLoader(stream)
    # NOTE(review): add_constructor() looks like it may register the tag on
    # the loader class rather than only on this instance -- confirm against
    # the PyYAML docs before relying on either behavior
    clear_loader.add_constructor('!clear', _cleared_value_constructor)

    try:
        document = clear_loader.get_single_data()
    finally:
        # always release the loader; dispose() doesn't exist in PyYAML 3.09
        if hasattr(clear_loader, 'dispose'):
            clear_loader.dispose()

    return document
def _cleared_value_representer(dumper, data):
    """YAML representer for :py:class:`ClearedValue`.

    Represents the wrapped value with *dumper*, then tags the resulting
    node with ``!clear``. Raises :py:class:`TypeError` if *data* is not a
    :py:class:`ClearedValue`.
    """
    if isinstance(data, ClearedValue):
        tagged_node = dumper.represent_data(data.value)
        tagged_node.tag = '!clear'
        return tagged_node

    raise TypeError
def _dump_yaml_with_clear_tags(data, stream=None, **kwds):
    """Dump *data* as one YAML document, writing :py:class:`ClearedValue`
    wrappers back out as ``!clear`` tags.

    *stream* and any extra keyword arguments are handed straight to
    ``yaml.dump_all()``, whose return value is returned.
    """
    # register the representer on a throwaway subclass, not on
    # yaml.SafeDumper itself
    class ClearedValueSafeDumper(yaml.SafeDumper):
        pass

    dumper_cls = ClearedValueSafeDumper
    dumper_cls.add_representer(ClearedValue, _cleared_value_representer)

    documents = [data]
    return yaml.dump_all(documents, stream, Dumper=dumper_cls, **kwds)
def _fix_clear_tags(x):
    """Recursively normalize :py:class:`ClearedValue` wrappers inside *x*.

    After this, ``ClearedValue(...)`` only ever wraps dict values (or the
    top-level value that is returned).

    - In dicts, ``ClearedValue(k): v`` and
      ``ClearedValue(k): ClearedValue(v)`` both become
      ``k: ClearedValue(v)``, and ``ClearedValue(k): v1`` wins over
      ``k: v2``.
    - In lists, wrappers around items are simply removed.
    - Anything that isn't a list, dict, or ClearedValue is returned as-is.
    """
    if isinstance(x, list):
        return [_fix_clear_tags(_strip_clear_tag(item)) for item in x]

    if isinstance(x, dict):
        fixed = {_fix_clear_tags(key): _fix_clear_tags(value)
                 for key, value in x.items()}

        # snapshot the cleared keys first, since we mutate *fixed* below
        cleared_items = [(key, value) for key, value in fixed.items()
                         if isinstance(key, ClearedValue)]

        for key, value in cleared_items:
            # move the tag from the key onto the value
            del fixed[key]
            fixed[_strip_clear_tag(key)] = ClearedValue(
                _strip_clear_tag(value))

        return fixed

    if isinstance(x, ClearedValue):
        return ClearedValue(_fix_clear_tags(x.value))

    return x
def _resolve_clear_tags_in_list(items):
    """Build a list out of *items*, honoring :py:class:`ClearedValue`.

    Whenever an item is a :py:class:`ClearedValue`, everything collected
    before it is thrown away and the list restarts with the unwrapped
    value. The ``combine_*()`` functions use this to combine lists of
    values.
    """
    resolved = []

    for entry in items:
        if not isinstance(entry, ClearedValue):
            resolved.append(entry)
            continue

        # cleared: forget what came before, keep only the unwrapped value
        resolved = [entry.value]

    return resolved
def _strip_clear_tag(v):
    """Return the value wrapped by *v* if it is a
    :py:class:`ClearedValue`; otherwise return *v* unchanged."""
    return v.value if isinstance(v, ClearedValue) else v
### reading mrjob.conf ###
def _conf_object_at_path(conf_path):
    """Read and parse the conf file at *conf_path*.

    Returns ``None`` if *conf_path* is ``None``. When PyYAML is available
    the file is parsed as YAML (with ``!clear`` tags normalized);
    otherwise it is parsed as JSON.

    :raises ValueError: if PyYAML is unavailable and the file is not valid
                        JSON
    """
    if conf_path is None:
        return None

    with open(conf_path) as conf_file:
        if yaml:
            return _fix_clear_tags(_load_yaml_with_clear_tag(conf_file))

        # no PyYAML; the best we can do is hope the file is JSON
        try:
            return json.load(conf_file)
        except ValueError as err:
            raise ValueError(
                'Could not read JSON from %s\n %s\n\n'
                'If your conf file is in YAML, you need to'
                ' `pip install PyYAML` to read it' % (conf_path, str(err)))
def load_opts_from_mrjob_conf(runner_alias, conf_path=None,
                              already_loaded=None):
    """Load the options for one runner from a mrjob.conf, following any
    ``include:`` directives.

    Returns ``[(path, values), ...]``. If *conf_path* is not found, returns
    ``[(None, {})]``.

    :type runner_alias: str
    :param runner_alias: String identifier of the runner type, e.g.
                         ``emr``, ``local``, etc.
    :type conf_path: str
    :param conf_path: location of the file to load. ``None`` means look in
                      the default locations; ``False`` means load nothing.
    :type already_loaded: list
    :param already_loaded: list of real (according to
                           ``os.path.realpath()``) conf paths that have
                           already been loaded (used by
                           :py:func:`load_opts_from_mrjob_confs`). Newly
                           loaded paths are appended to it.

    Relative ``include:`` paths are resolved against the real
    (symlink-resolved) path of the including conf file.

    Each config file is loaded only once, even if it's reachable through
    several paths because of symlinks.
    """
    # a fresh list per call unless the caller is tracking loads itself
    loaded_so_far = [] if already_loaded is None else already_loaded

    return _load_opts_from_mrjob_conf(
        runner_alias, _expanded_mrjob_conf_path(conf_path), loaded_so_far)
def _load_opts_from_mrjob_conf(runner_alias, conf_path, already_loaded):
    """Recursive worker for :py:func:`load_opts_from_mrjob_conf`.

    Unlike the public function, *conf_path* is used exactly as given (no
    expansion, no default lookup). *already_loaded* is mutated: the real
    path of each newly loaded file is appended to it.
    """
    conf = _conf_object_at_path(conf_path)
    if conf is None:
        return [(None, {})]

    # don't load same conf file twice
    real_conf_path = os.path.realpath(conf_path)
    if real_conf_path in already_loaded:
        return []
    already_loaded.append(real_conf_path)

    # pull out the section for our runner; a missing or malformed section
    # just means "no options"
    try:
        values = conf['runners'][runner_alias] or {}
    except (KeyError, TypeError, ValueError):
        values = {}

    own_opts = [(conf_path, values)]

    includes = conf.get('include', None)
    if not includes:
        return own_opts

    # a single include may be given as a bare string
    if isinstance(includes, string_types):
        includes = [includes]

    # includes are relative to the (real) dir of this conf file (see #1166)
    including_dir = os.path.dirname(real_conf_path)

    inherited = []
    # walk includes in reverse order so that include order takes
    # precedence over inheritance
    for include in reversed(includes):
        # expand ~ *before* joining to dir of including file (see #1308)
        include_path = os.path.join(including_dir, expand_path(include))
        included_opts = _load_opts_from_mrjob_conf(
            runner_alias, include_path, already_loaded)
        inherited = included_opts + inherited

    return inherited + own_opts
def load_opts_from_mrjob_confs(runner_alias, conf_paths=None):
    """Load the options for one runner from a list of mrjob config files.

    Returns ``[(path, values), ...]``. A path that is not found contributes
    ``(None, {})``.

    If *conf_paths* is ``None``, look for a config file in the default
    locations (see :py:func:`find_mrjob_conf`).

    :type runner_alias: str
    :param runner_alias: String identifier of the runner type, e.g.
                         ``emr``, ``local``, etc.
    :type conf_paths: list or ``None``
    :param conf_paths: locations of the files to load

    Each config file is loaded only once, even if it's reachable through
    several paths because of symlinks. A warning is logged if
    *runner_alias* is set but no file supplied any options for it.
    """
    if conf_paths is None:
        results = load_opts_from_mrjob_conf(runner_alias)
    else:
        # shared across files, so a conf loaded for one path is skipped
        # when another path reaches it again
        seen_paths = []
        results = []

        # go through conf paths in reverse so that their order takes
        # precedence over inheritance
        for conf_path in reversed(conf_paths):
            loaded = load_opts_from_mrjob_conf(
                runner_alias, conf_path, already_loaded=seen_paths)
            results = loaded + results

    found_any_opts = any(opts for _, opts in results)
    if runner_alias and not found_any_opts:
        log.warning('No configs specified for %s runner' % runner_alias)

    return results
### writing mrjob.conf ###
def dump_mrjob_conf(conf, f):
    """Write configuration options out to a file.

    Handy if you'd rather not work out the YAML by hand. *conf* should look
    something like this::

        {'runners': {
            'local': {'OPTION': VALUE, ...},
            'emr': {'OPTION': VALUE, ...},
            'hadoop': {'OPTION': VALUE, ...},
        }}

    The output is YAML when PyYAML is available, and JSON otherwise. *f* is
    flushed after writing.

    :param f: a file object to write to (e.g. ``open('mrjob.conf', 'w')``)
    """
    if not yaml:
        # no PyYAML available; fall back on JSON
        json.dump(conf, f, indent=2)
    else:
        _dump_yaml_with_clear_tags(conf, f, default_flow_style=False)

    f.flush()
### COMBINING OPTIONS ###
# combiners generally consider earlier values to be defaults, and later
# options to override or add on to them.
# combiners assume that the list of values passed to them has already been
# passed through _fix_clear_tags() (that is, the only place ClearedValue
# appears is values in dicts).
def combine_values(*values):
    """Pick the last of *values* that isn't ``None``, or ``None`` if there
    is no such value.

    This is the default combiner; it suits simple values (booleans,
    strings, numbers).
    """
    # scan from the end, so later values override earlier ones
    not_none = (value for value in reversed(values) if value is not None)
    return next(not_none, None)
def combine_lists(*seqs):
    """Concatenate the given sequences into a list. Ignore ``None`` values.

    Generally this is used for a list of commands we want to run; the
    "default" commands get run before any commands specific to your job.

    Strings, bytes, dicts, and non-iterable objects (e.g. numbers) are
    treated as single-item lists.

    Only "this object isn't iterable" is handled as the single-item case.
    An error raised while iterating over one of *seqs* propagates to the
    caller instead of being swallowed.
    """
    result = []

    for seq in seqs:
        if seq is None:
            continue

        # these are iterable, but we want them kept whole
        if isinstance(seq, (bytes, string_types, dict)):
            result.append(seq)
            continue

        # keep the try body minimal: only the iterability probe may fail
        # with TypeError; a failure midway through an iterable is a real
        # error, not a sign that *seq* is a scalar
        try:
            items = iter(seq)
        except TypeError:
            result.append(seq)
        else:
            result.extend(items)

    return result
def combine_cmds(*cmds):
    """Pick the last of *cmds* that isn't ``None`` and return it as a list
    of command-line arguments.

    Each command should be either a list containing the command plus
    switches, or a string, which is split with
    :py:func:`~mrjob.util.shlex_split`. A string must be either a byte
    string or a unicode string containing no non-ASCII characters.

    Returns ``None`` if every command is ``None`` (or none were given);
    otherwise a list containing the command plus arguments.
    """
    chosen = combine_values(*cmds)

    if chosen is None:
        return None

    if isinstance(chosen, string_types):
        return shlex_split(chosen)

    # already a sequence of args; hand back a fresh list
    return list(chosen)
def combine_dicts(*dicts):
    """Merge zero or more dictionaries into a new one. When a key appears
    more than once, the value from the later dictionary wins.

    ``None`` (or an empty dict) in place of a dictionary is skipped.

    A value of ``ClearedValue(None)`` removes the key from the result;
    any other :py:class:`ClearedValue` is unwrapped and stored.
    """
    merged = {}

    for current in dicts:
        if not current:
            continue

        for key, value in current.items():
            if isinstance(value, ClearedValue) and value.value is None:
                # cleared with nothing to replace it: drop the key
                merged.pop(key, None)
            else:
                merged[key] = _strip_clear_tag(value)

    return merged
def combine_envs(*envs):
    """Merge zero or more dictionaries of environment variables.

    Variables from dictionaries later in the list take priority over those
    from earlier ones. Values may be wrapped in :py:class:`ClearedValue`.

    For variables whose name ends with ``PATH``, the later value is
    prepended to the earlier one (joined with a colon) rather than
    replacing it. Wrapping the value in :py:class:`ClearedValue` turns
    this off, so the value simply replaces what was there.

    A value of ``ClearedValue(None)`` *deletes* the variable set by earlier
    dictionaries, rather than setting it to ``None``.

    ``None`` in place of a dictionary in *envs* is ignored.
    """
    # local=False: always join paths with ':'
    combined = _combine_envs_helper(envs, local=False)
    return combined
def combine_local_envs(*envs):
    """Like :py:func:`combine_envs`, but ``PATH``-style variables are
    joined with the local path separator, ``os.pathsep`` (e.g. ``;`` on
    Windows rather than ``:``).
    """
    # local=True: join paths with os.pathsep
    combined = _combine_envs_helper(envs, local=True)
    return combined
def _combine_envs_helper(envs, local):
    """Shared implementation of :py:func:`combine_envs` and
    :py:func:`combine_local_envs`.

    :param envs: sequence of environment dicts (``None`` entries are
                 skipped)
    :param local: if true, join paths with ``os.pathsep``; otherwise
                  always use ``':'``
    """
    pathsep = os.pathsep if local else ':'

    combined = {}

    for env in envs:
        if not env:
            continue

        for name, value in env.items():
            is_cleared = isinstance(value, ClearedValue)

            if is_cleared and value.value is None:
                # ClearedValue(None) deletes the variable
                combined.pop(name, None)
            elif (name.endswith('PATH') and combined.get(name) and
                    not is_cleared):
                # path-like variable that already has a value: put the
                # newer value in front of the existing one
                combined[name] = value + pathsep + combined[name]
            else:
                # plain assignment (also used for cleared path values)
                combined[name] = _strip_clear_tag(value)

    return combined
def combine_jobconfs(*jobconfs):
    """Like :py:func:`combine_dicts`, except that the resulting values are
    converted to Java-readable strings (e.g. ``True`` becomes ``'true'``),
    and keys whose combined value is ``None`` are left out of the result.
    """
    combined = combine_dicts(*jobconfs)

    java_conf = {}
    for name, value in combined.items():
        if value is None:
            # blank out keys explicitly set to None
            continue
        java_conf[name] = _to_java_str(value)

    return java_conf
def combine_paths(*paths):
    """Pick the last of *paths* that isn't ``None`` (see
    :py:func:`combine_values`), then resolve ``~`` (home dir) and
    environment variables in it."""
    chosen = combine_values(*paths)
    return expand_path(chosen)
def combine_path_lists(*path_seqs):
    """Concatenate the given sequences of paths into one list, ignoring
    ``None`` values.

    Each path has ``~`` (home dir) and environment variables resolved, and
    is then expanded as a glob against the local filesystem; matches are
    added in sorted order.

    Accepts single strings as well as lists.
    """
    results = []

    for path in combine_lists(*path_seqs):
        expanded = expand_path(path)
        matches = glob.glob(expanded)

        if matches:
            results.extend(sorted(matches))
        else:
            # nothing matched locally; keep the path as-is (maybe it
            # refers to S3 or HDFS)
            results.append(expanded)

    return results
def combine_opts(combiners, *opts_list):
    """The master combiner: merges dictionaries of options, combining the
    values for each option with the sub-combiner registered for it.

    :param combiners: a map from option name to the ``combine_*()``
                      function used to combine values for that option.
                      Options with no (or a false) entry are combined with
                      :py:func:`combine_values`.
    :param opts_list: one or more dictionaries to combine; false entries
                      (e.g. ``None``) are skipped

    The dicts in *opts_list* may not themselves be wrapped in
    :py:class:`ClearedValue` (that raises :py:class:`TypeError`), but their
    values may be, in which case values for that key from earlier dicts
    are ignored.
    """
    # first pass: validate, and gather every option name that appears
    keys = set()
    for opts in opts_list:
        if isinstance(opts, ClearedValue):
            raise TypeError
        if opts:
            keys.update(opts)

    # second pass: combine the values for each option, earliest first
    final_opts = {}
    for key in keys:
        present = [opts[key] for opts in opts_list if opts and key in opts]
        values = _resolve_clear_tags_in_list(present)

        combine_func = combiners.get(key) or combine_values
        final_opts[key] = combine_func(*values)

    return final_opts
def _to_java_str(x):
    """Render a value (usually a configuration property) the way Java
    would write it as a string, falling back on Python's ``str()`` when
    there is no special Java form.

    Strings are returned unchanged, ``None`` becomes ``'null'``, and
    booleans become ``'true'``/``'false'``. See #323
    """
    if isinstance(x, string_types):
        return x

    if x is None:
        # Note: combine_jobconfs() blanks out keys with None values
        return 'null'

    if isinstance(x, bool):
        return 'true' if x else 'false'

    return str(x)