mirror of
https://github.com/emilybache/GildedRose-Refactoring-Kata.git
synced 2026-02-08 19:21:28 +00:00
503 lines
16 KiB
Python
503 lines
16 KiB
Python
# Copyright 2009-2013 Yelp and Contributors
|
|
# Copyright 2015-2017 Yelp
|
|
# Copyright 2019 Yelp
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
|
|
"""Protocols translate raw bytes into key, value pairs.
|
|
|
|
Typically, protocols encode a key and value into bytes, and join them together
|
|
with a tab character.
|
|
|
|
However, protocols with ``Value`` in their name ignore
|
|
keys and simply read/write values (with key read in as ``None``), allowing
|
|
you to read and write data in arbitrary formats.
|
|
|
|
For more information, see :ref:`job-protocols` and :ref:`writing-protocols`.
|
|
"""
|
|
# This is one of the few places where efficiency really matters; to that end,
|
|
# we maintain separate code for Python 2 and 3 where necessary. Tests of
|
|
# protocols should *not* have different code for different versions of Python.
|
|
|
|
# don't add imports here that aren't part of the standard Python library,
|
|
# since MRJobs need to run in Amazon's generic EMR environment
|
|
import json
|
|
|
|
try:
|
|
import cPickle as pickle # Python 2 only
|
|
except ImportError:
|
|
import pickle
|
|
|
|
from mrjob.py2 import PY2
|
|
from mrjob.util import safeeval
|
|
|
|
|
|
try:
|
|
import rapidjson
|
|
rapidjson
|
|
except ImportError:
|
|
rapidjson = None
|
|
|
|
try:
|
|
import simplejson
|
|
simplejson # quiet "redefinition of unused ..." warning from pyflakes
|
|
except ImportError:
|
|
simplejson = None
|
|
|
|
try:
|
|
import ujson
|
|
ujson # quiet "redefinition of unused ..." warning from pyflakes
|
|
except ImportError:
|
|
ujson = None
|
|
|
|
|
|
class _KeyCachingProtocol(object):
    """Protocol that caches the last decoded key.

    We're not currently exposing this class; inheriting from this class
    will result in almost as much code as simply writing your own read/write
    methods. You should probably cache keys, but in a way that makes sense for
    your use case.

    Subclasses supply :py:meth:`_loads` and :py:meth:`_dumps`; this class
    builds tab-separated :py:meth:`read` and :py:meth:`write` on top of them.
    """
    # Raw (encoded) form of the most recently decoded key, and the object it
    # decoded to. The two are only ever updated together.
    _last_key_encoded = None
    _last_key_decoded = None

    def _loads(self, value):
        """Decode a single key/value, and return it."""
        raise NotImplementedError

    def _dumps(self, value):
        """Encode a single key/value, and return it."""
        raise NotImplementedError

    def read(self, line):
        """Decode a line of input.

        :type line: bytes (``str`` on Python 2)
        :param line: A line of raw input to the job, without trailing newline.

        :return: A tuple of ``(key, value)``.

        :raises ValueError: if *line* contains no tab (the split below yields
                            a single field, which can't be unpacked).

        If the encoded key is identical to the one on the previous call, the
        previously decoded key object is returned rather than decoding again.
        """
        raw_key, raw_value = line.split(b'\t', 1)

        if raw_key != self._last_key_encoded:
            # Decode *before* touching the cache. If _loads() raises, the
            # cache must keep describing the last key that decoded
            # successfully; otherwise a later line with the same (bad) raw
            # key would match the cache and be handed a stale decoded key.
            decoded_key = self._loads(raw_key)
            self._last_key_encoded = raw_key
            self._last_key_decoded = decoded_key

        return (self._last_key_decoded, self._loads(raw_value))

    def write(self, key, value):
        """Encode a key and value.

        :param key: A key (of any type) yielded by a mapper/reducer
        :param value: A value (of any type) yielded by a mapper/reducer

        :rtype: bytes (``str`` on Python 2)
        :return: A line, without trailing newline.
        """
        return self._dumps(key) + b'\t' + self._dumps(value)
|
|
|
|
|
|
# JSONProtocol (below) is just an alias, but we treat it as a class for the
|
|
# purpose of documentation. It encodes key and value as two JSONs separated
|
|
# by a tab.
|
|
#
|
|
# Same for JSONValueProtocol, except it encodes only the value (key
|
|
# is read as ``None``).
|
|
|
|
class StandardJSONProtocol(_KeyCachingProtocol):
    """Implements :py:class:`JSONProtocol` using Python's built-in JSON
    library.

    .. note::

        The built-in ``json`` library is (appropriately) strict about the JSON
        standard; it won't accept dictionaries with non-string keys, sets, or
        (on Python 3) bytestrings.
    """
    # The Python 2 or Python 3 pair of methods is picked once, when the class
    # body runs, so there is no version check per call.
    if PY2:
        def _loads(self, value):
            """Parse one JSON-encoded field."""
            return json.loads(value)

        def _dumps(self, value):
            """Serialize one field to JSON."""
            return json.dumps(value)
    else:
        def _loads(self, value):
            """Parse one UTF-8, JSON-encoded field."""
            # Decode explicitly rather than handing bytes to json.loads()
            # (which only accepts bytes on Python 3.6+).
            text = value.decode('utf_8')
            return json.loads(text)

        def _dumps(self, value):
            """Serialize one field to UTF-8 encoded JSON."""
            text = json.dumps(value)
            return text.encode('utf_8')
|
|
|
|
|
|
class StandardJSONValueProtocol(object):
    """Implements :py:class:`JSONValueProtocol` using Python's built-in JSON
    library.

    Only the value is encoded; ``read()`` always reports the key as ``None``
    and ``write()`` ignores whatever key it is given.
    """
    if PY2:
        def read(self, line):
            """Return ``(None, value)`` parsed from a JSON line."""
            decoded = json.loads(line)
            return (None, decoded)

        def write(self, key, value):
            """Return *value* as JSON; *key* is discarded."""
            return json.dumps(value)
    else:
        def read(self, line):
            """Return ``(None, value)`` parsed from a UTF-8 JSON line."""
            # Decode explicitly rather than handing bytes to json.loads()
            # (which only accepts bytes on Python 3.6+).
            decoded = json.loads(line.decode('utf_8'))
            return (None, decoded)

        def write(self, key, value):
            """Return *value* as UTF-8 encoded JSON; *key* is discarded."""
            text = json.dumps(value)
            return text.encode('utf_8')
|
|
|
|
|
|
class RapidJSONProtocol(_KeyCachingProtocol):
    """Implements :py:class:`JSONProtocol` using the :py:mod:`rapidjson`
    library.
    """
    # rapidjson only exists in Python 3, so there are no special cases for
    # Python 2 here.

    def _loads(self, value):
        """Parse one JSON-encoded field with rapidjson."""
        return rapidjson.loads(value)

    def _dumps(self, value):
        """Serialize one field to UTF-8 encoded JSON with rapidjson."""
        text = rapidjson.dumps(value)
        return text.encode('utf_8')
|
|
|
|
|
|
class RapidJSONValueProtocol(object):
    """Implements :py:class:`JSONValueProtocol` using the :py:mod:`rapidjson`
    library.

    Only the value is encoded; ``read()`` always reports the key as ``None``
    and ``write()`` ignores whatever key it is given.
    """
    # rapidjson only exists in Python 3, so there are no special cases for
    # Python 2 here.

    def read(self, line):
        """Return ``(None, value)`` parsed from a JSON line."""
        decoded = rapidjson.loads(line)
        return (None, decoded)

    def write(self, key, value):
        """Return *value* as UTF-8 encoded JSON; *key* is discarded."""
        text = rapidjson.dumps(value)
        return text.encode('utf_8')
|
|
|
|
|
|
class SimpleJSONProtocol(_KeyCachingProtocol):
    """Implements :py:class:`JSONProtocol` using the :py:mod:`simplejson`
    library.
    """

    def _loads(self, value):
        """Parse one JSON-encoded field with simplejson."""
        # no decode step needed: simplejson can handle bytes even in Python 3
        return simplejson.loads(value)

    # Only encoding differs between Python versions; the variant is chosen
    # once, when the class body runs.
    if PY2:
        def _dumps(self, value):
            """Serialize one field to JSON."""
            return simplejson.dumps(value)
    else:
        def _dumps(self, value):
            """Serialize one field to UTF-8 encoded JSON."""
            text = simplejson.dumps(value)
            return text.encode('utf_8')
|
|
|
|
|
|
class SimpleJSONValueProtocol(object):
    """Implements :py:class:`JSONValueProtocol` using the :py:mod:`simplejson`
    library.

    Only the value is encoded; ``read()`` always reports the key as ``None``
    and ``write()`` ignores whatever key it is given.
    """

    def read(self, line):
        """Return ``(None, value)`` parsed from a JSON line."""
        # no decode step needed: simplejson can handle bytes even in Python 3
        decoded = simplejson.loads(line)
        return (None, decoded)

    if PY2:
        def write(self, key, value):
            """Return *value* as JSON; *key* is discarded."""
            return simplejson.dumps(value)
    else:
        def write(self, key, value):
            """Return *value* as UTF-8 encoded JSON; *key* is discarded."""
            text = simplejson.dumps(value)
            return text.encode('utf_8')
|
|
|
|
|
|
class UltraJSONProtocol(_KeyCachingProtocol):
    """Implements :py:class:`JSONProtocol` using the :py:mod:`ujson` library.

    .. warning::

        :py:mod:`ujson` is about five times faster than the standard
        implementation, but is more willing to encode things that aren't
        strictly JSON-encodable, including sets, dictionaries with
        tuples as keys, UTF-8 encoded bytes, and objects (!). Relying on this
        behavior won't stop your job from working, but it can
        make your job *dependent* on :py:mod:`ujson`, rather than just using
        it as a speedup.

    .. note::

        :py:mod:`ujson` also differs from the standard implementation in that
        it doesn't add spaces to its JSONs (``{"foo":"bar"}`` versus
        ``{"foo": "bar"}``). This probably won't affect anything but test
        cases and readability.
    """

    def _loads(self, value):
        """Parse one JSON-encoded field with ujson."""
        # no decode step needed: ujson can handle bytes even in Python 3
        return ujson.loads(value)

    # Only encoding differs between Python versions; the variant is chosen
    # once, when the class body runs.
    if PY2:
        def _dumps(self, value):
            """Serialize one field to JSON."""
            return ujson.dumps(value)
    else:
        def _dumps(self, value):
            """Serialize one field to UTF-8 encoded JSON."""
            text = ujson.dumps(value)
            return text.encode('utf_8')
|
|
|
|
|
|
class UltraJSONValueProtocol(object):
    """Implements :py:class:`JSONValueProtocol` using the :py:mod:`ujson`
    library.

    Only the value is encoded; ``read()`` always reports the key as ``None``
    and ``write()`` ignores whatever key it is given.
    """

    def read(self, line):
        """Return ``(None, value)`` parsed from a JSON line."""
        # no decode step needed: ujson can handle bytes even in Python 3
        decoded = ujson.loads(line)
        return (None, decoded)

    if PY2:
        def write(self, key, value):
            """Return *value* as JSON; *key* is discarded."""
            return ujson.dumps(value)
    else:
        def write(self, key, value):
            """Return *value* as UTF-8 encoded JSON; *key* is discarded."""
            text = ujson.dumps(value)
            return text.encode('utf_8')
|
|
|
|
|
|
# Bind JSONProtocol / JSONValueProtocol to the first implementation whose
# library was importable, in order of preference.
if ujson:
    # ujson is the default whenever it is available
    JSONProtocol, JSONValueProtocol = (
        UltraJSONProtocol, UltraJSONValueProtocol)
elif rapidjson and not PY2:
    # rapidjson is supposed to be Python 3+ only, so it is never used on
    # Python 2
    JSONProtocol, JSONValueProtocol = (
        RapidJSONProtocol, RapidJSONValueProtocol)
elif simplejson:
    # next choice: simplejson
    JSONProtocol, JSONValueProtocol = (
        SimpleJSONProtocol, SimpleJSONValueProtocol)
else:
    # nothing faster was importable; use the built-in json module
    JSONProtocol, JSONValueProtocol = (
        StandardJSONProtocol, StandardJSONValueProtocol)
|
|
|
|
|
|
class PickleProtocol(_KeyCachingProtocol):
    """Encode ``(key, value)`` as two string-escaped pickles separated
    by a tab.

    We string-escape the pickles to avoid having to deal with stray
    ``\\t`` and ``\\n`` characters, which would confuse Hadoop
    Streaming.

    Ugly, but should work for any type.

    .. warning::

        Pickling is only *backwards*-compatible across Python versions. If your
        job uses this as an output protocol, you should use at least the same
        version of Python to parse the job's output. Vice versa for using this
        as an input protocol.

    .. warning::

        Unpickling can execute arbitrary code, so only use this protocol to
        read data you trust.
    """

    # The ``string_escape`` codec doesn't exist on Python 3, so there we go
    # through latin-1 and ``unicode_escape`` instead. Since efficiency matters
    # for protocols, Python 2 and 3 each get their own code.
    if PY2:
        def _loads(self, value):
            """Unescape and unpickle one field."""
            raw = value.decode('string_escape')
            return pickle.loads(raw)

        def _dumps(self, value):
            """Pickle and string-escape one field."""
            pickled = pickle.dumps(value)
            return pickled.encode('string_escape')
    else:
        def _loads(self, value):
            """Unescape and unpickle one field."""
            raw = value.decode('unicode_escape').encode('latin_1')
            return pickle.loads(raw)

        def _dumps(self, value):
            """Pickle and escape one field."""
            pickled = pickle.dumps(value)
            return pickled.decode('latin_1').encode('unicode_escape')
|
|
|
|
|
|
class PickleValueProtocol(object):
    """Encode ``value`` as a string-escaped pickle and discard ``key``
    (``key`` is read in as ``None``).

    See :py:class:`PickleProtocol` for details.

    .. warning::

        Unpickling can execute arbitrary code, so only use this protocol to
        read data you trust.
    """
    # Python 2 uses the ``string_escape`` codec; Python 3 goes through
    # latin-1 and ``unicode_escape``.
    if PY2:
        def read(self, line):
            """Return ``(None, value)`` unpickled from an escaped line."""
            raw = line.decode('string_escape')
            return (None, pickle.loads(raw))

        def write(self, key, value):
            """Return *value* pickled and escaped; *key* is discarded."""
            pickled = pickle.dumps(value)
            return pickled.encode('string_escape')
    else:
        def read(self, line):
            """Return ``(None, value)`` unpickled from an escaped line."""
            raw = line.decode('unicode_escape').encode('latin_1')
            return (None, pickle.loads(raw))

        def write(self, key, value):
            """Return *value* pickled and escaped; *key* is discarded."""
            pickled = pickle.dumps(value)
            return pickled.decode('latin_1').encode('unicode_escape')
|
|
|
|
|
|
# RawValueProtocol (below) is just an alias, but we treat it as a class for the
|
|
# purpose of documentation. All it does is output the value (key is read as
|
|
# ``None``).
|
|
#
|
|
# Same for RawProtocol, except it encodes key and value, separated by a tab.
|
|
|
|
class BytesProtocol(object):
    """Encode ``(key, value)`` (bytestrings) as ``key`` and ``value``
    separated by a tab.

    If ``key`` or ``value`` is ``None``, don't include a tab. When decoding a
    line with no tab in it, ``value`` will be ``None``.

    When reading from a line with multiple tabs, we break on the first one.

    Your key should probably not be ``None`` or have tab characters in it, but
    we don't check.
    """

    def read(self, line):
        """Split *line* on its first tab into ``(key, value)``.

        ``value`` is ``None`` when *line* has no tab at all.
        """
        key, tab, value = line.partition(b'\t')
        # an empty separator means no tab was found
        return (key, value if tab else None)

    def write(self, key, value):
        """Join whichever of *key* and *value* is not ``None`` with a tab."""
        fields = [field for field in (key, value) if field is not None]
        return b'\t'.join(fields)
|
|
|
|
|
|
class BytesValueProtocol(object):
    """Read line (without trailing newline) directly into ``value`` (``key``
    is always ``None``). Output ``value`` (bytes) directly, discarding ``key``.

    **This is the default protocol used by jobs to read input on Python 2.**

    .. warning::

        Typical usage on Python 2 is to have your mapper parse (byte) strings
        out of your input files, and then include them in the output to the
        reducer. Since this output is then (by default) JSON-encoded, encoding
        will fail if the bytestrings are not UTF-8 decodable. If this is an
        issue, consider using :py:class:`TextValueProtocol` instead.
    """

    def read(self, line):
        """Return ``(None, line)``; the line is passed through untouched."""
        return None, line

    def write(self, key, value):
        """Return *value* unchanged; *key* is discarded."""
        return value
|
|
|
|
|
|
class TextProtocol(object):
    """UTF-8 encode ``key`` and ``value`` (unicode strings) and join them
    with a tab character. When reading input, we fall back to latin-1 if
    we can't UTF-8 decode the line.

    If ``key`` or ``value`` is ``None``, don't include a tab. When decoding a
    line with no tab in it, ``value`` will be ``None``.

    When reading from a line with multiple tabs, we break on the first one.

    Your key should probably not be ``None`` or have tab characters in it, but
    we don't check.
    """

    def read(self, line):
        """Decode *line* and split it on its first tab into ``(key, value)``.

        The whole line is decoded as UTF-8, or as latin-1 if that fails.
        ``value`` is ``None`` when the line has no tab at all.
        """
        try:
            text = line.decode('utf_8')
        except UnicodeDecodeError:
            text = line.decode('latin_1')

        key, tab, value = text.partition(u'\t')
        # an empty separator means no tab was found
        return (key, value if tab else None)

    def write(self, key, value):
        """UTF-8 encode whichever of *key* and *value* is not ``None`` and
        join the results with a tab.
        """
        encoded = [s.encode('utf_8') for s in (key, value) if s is not None]
        return b'\t'.join(encoded)
|
|
|
|
|
|
class TextValueProtocol(object):
    """Attempt to UTF-8 decode line (without trailing newline) into ``value``,
    falling back to latin-1. (``key`` is always ``None``). Output ``value``
    UTF-8 encoded, discarding ``key``.

    **This is the default protocol used by jobs to read input on Python 3.**

    This is a good solution for reading text files which are mostly ASCII but
    may have some other bytes of unknown encoding (e.g. logs).

    If you wish to enforce a particular encoding, use
    :py:class:`BytesValueProtocol` instead::

        class MREncodingEnforcer(MRJob):

            INPUT_PROTOCOL = BytesValueProtocol

            def mapper(self, _, value):
                value = value.decode('utf_8')
                ...
    """

    def read(self, line):
        """Return ``(None, text)``, decoding *line* as UTF-8 or, failing
        that, as latin-1.
        """
        try:
            text = line.decode('utf_8')
        except UnicodeDecodeError:
            text = line.decode('latin_1')
        return (None, text)

    def write(self, key, value):
        """Return *value* UTF-8 encoded; *key* is discarded."""
        return value.encode('utf_8')
|
|
|
|
|
|
# RawValueProtocol is the default way of reading input. Historically
|
|
# (in Python 2), it's always read raw bytes, but Python 3 is pickier about
|
|
# bytes, so we use TextValueProtocol (unicode) instead.
|
|
# Python 2 gets the bytes-based implementations, Python 3 the text-based ones.
if PY2:
    RawProtocol, RawValueProtocol = BytesProtocol, BytesValueProtocol
else:
    RawProtocol, RawValueProtocol = TextProtocol, TextValueProtocol
|
|
|
|
|
|
class ReprProtocol(_KeyCachingProtocol):
    """Encode ``(key, value)`` as two reprs separated by a tab.

    This only works for basic types (we use :py:func:`mrjob.util.safeeval`).

    .. warning::

        The repr format changes between different versions of Python (for
        example, braces for sets in Python 2.7, and different string constants
        in Python 3). Plan accordingly.
    """

    def _loads(self, value):
        """Evaluate one repr'd field with ``safeeval``."""
        return safeeval(value)

    # Only encoding differs between Python versions; the variant is chosen
    # once, when the class body runs.
    if PY2:
        def _dumps(self, value):
            """Return the repr of one field."""
            return repr(value)
    else:
        def _dumps(self, value):
            """Return the repr of one field, UTF-8 encoded."""
            text = repr(value)
            return text.encode('utf_8')
|
|
|
|
|
|
class ReprValueProtocol(object):
    """Encode ``value`` as a repr and discard ``key`` (``key`` is read
    in as None).

    See :py:class:`ReprProtocol` for details.
    """

    def read(self, line):
        """Return ``(None, value)``, evaluating *line* with ``safeeval``."""
        decoded = safeeval(line)
        return (None, decoded)

    if PY2:
        def write(self, key, value):
            """Return the repr of *value*; *key* is discarded."""
            return repr(value)
    else:
        def write(self, key, value):
            """Return the repr of *value*, UTF-8 encoded; *key* is
            discarded.
            """
            text = repr(value)
            return text.encode('utf_8')
|