GildedRose-Refactoring-Kata/.venv/lib/python3.12/site-packages/mrjob/protocol.py
2025-06-22 13:36:01 +05:30

503 lines
16 KiB
Python

# Copyright 2009-2013 Yelp and Contributors
# Copyright 2015-2017 Yelp
# Copyright 2019 Yelp
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Protocols translate raw bytes into key, value pairs.
Typically, protocols encode a key and value into bytes, and join them together
with a tab character.
However, protocols with ``Value`` in their name ignore
keys and simply read/write values (with key read in as ``None``), allowing
you to read and write data in arbitrary formats.
For more information, see :ref:`job-protocols` and :ref:`writing-protocols`.
"""
# This is one of the few places where efficiency really matters; to that end,
# we maintain separate code for Python 2 and 3 where necessary. Tests of
# protocols should *not* have different code for different versions of Python.
# don't add imports here that aren't part of the standard Python library,
# since MRJobs need to run in Amazon's generic EMR environment
import json
try:
import cPickle as pickle # Python 2 only
except ImportError:
import pickle
from mrjob.py2 import PY2
from mrjob.util import safeeval
try:
import rapidjson
rapidjson
except ImportError:
rapidjson = None
try:
import simplejson
simplejson # quiet "redefinition of unused ..." warning from pyflakes
except ImportError:
simplejson = None
try:
import ujson
ujson # quiet "redefinition of unused ..." warning from pyflakes
except ImportError:
ujson = None
class _KeyCachingProtocol(object):
"""Protocol that caches the last decoded key.
We're not currently exposing this class; inheriting from this class
will result in almost as much code as simply writing your own read/write
methods. You should probably cache keys, but in a way that makes sense for
your use case.
"""
_last_key_encoded = None
_last_key_decoded = None
def _loads(self, value):
"""Decode a single key/value, and return it."""
raise NotImplementedError
def _dumps(self, value):
"""Encode a single key/value, and return it."""
raise NotImplementedError
def read(self, line):
"""Decode a line of input.
:type line: str
:param line: A line of raw input to the job, without trailing newline.
:return: A tuple of ``(key, value)``."""
raw_key, raw_value = line.split(b'\t', 1)
if raw_key != self._last_key_encoded:
self._last_key_encoded = raw_key
self._last_key_decoded = self._loads(raw_key)
return (self._last_key_decoded, self._loads(raw_value))
def write(self, key, value):
"""Encode a key and value.
:param key: A key (of any type) yielded by a mapper/reducer
:param value: A value (of any type) yielded by a mapper/reducer
:rtype: str
:return: A line, without trailing newline."""
return self._dumps(key) + b'\t' + self._dumps(value)
# JSONProtocol (below) is just an alias, but we treat it as a class for the
# purpose of documentation. It encodes key and value as two JSONs separated
# by a tab.
#
# Same for JSONValueProtocol, except it encodes only the value (key
# is read as ``None``).
class StandardJSONProtocol(_KeyCachingProtocol):
    """Implements :py:class:`JSONProtocol` using Python's built-in JSON
    library.

    Key and value are each serialized as one JSON document.

    .. note::

        The built-in ``json`` library is (appropriately) strict about the
        JSON standard; it won't accept dictionaries with non-string keys,
        sets, or (on Python 3) bytestrings.
    """
    # The Python 2/3 variant is picked once, while the class body runs, so
    # no version check is paid per record.
    if PY2:

        def _loads(self, value):
            return json.loads(value)

        def _dumps(self, value):
            return json.dumps(value)

    else:

        def _loads(self, value):
            # hand json text rather than bytes
            text = value.decode('utf_8')
            return json.loads(text)

        def _dumps(self, value):
            # json.dumps() returns text; lines are bytes
            text = json.dumps(value)
            return text.encode('utf_8')
class StandardJSONValueProtocol(object):
    """Implements :py:class:`JSONValueProtocol` using Python's built-in JSON
    library.

    The whole line is one JSON document holding the value; the key is read
    as ``None`` and ignored when writing.
    """
    # The Python 2/3 variant is picked once, while the class body runs.
    if PY2:

        def read(self, line):
            value = json.loads(line)
            return (None, value)

        def write(self, key, value):
            return json.dumps(value)

    else:

        def read(self, line):
            # hand json text rather than bytes
            value = json.loads(line.decode('utf_8'))
            return (None, value)

        def write(self, key, value):
            # json.dumps() returns text; lines are bytes
            text = json.dumps(value)
            return text.encode('utf_8')
class RapidJSONProtocol(_KeyCachingProtocol):
    """Implements :py:class:`JSONProtocol` using the :py:mod:`rapidjson`
    library.
    """
    # rapidjson only exists in Python 3, so no special cases for Python 2

    def _loads(self, value):
        # the raw bytes are passed to rapidjson as-is
        return rapidjson.loads(value)

    def _dumps(self, value):
        # rapidjson.dumps() output is UTF-8 encoded to get a bytes line
        text = rapidjson.dumps(value)
        return text.encode('utf_8')
class RapidJSONValueProtocol(object):
    """Implements :py:class:`JSONValueProtocol` using the :py:mod:`rapidjson`
    library.

    The key is read as ``None`` and ignored when writing.
    """
    # rapidjson only exists in Python 3, so no special cases for Python 2

    def read(self, line):
        # the raw bytes are passed to rapidjson as-is
        value = rapidjson.loads(line)
        return (None, value)

    def write(self, key, value):
        # rapidjson.dumps() output is UTF-8 encoded to get a bytes line
        text = rapidjson.dumps(value)
        return text.encode('utf_8')
class SimpleJSONProtocol(_KeyCachingProtocol):
    """Implements :py:class:`JSONProtocol` using the :py:mod:`simplejson`
    library.
    """

    def _loads(self, value):
        # simplejson can handle bytes even in Python 3
        return simplejson.loads(value)

    # Only encoding differs between Python versions: on Python 3 the
    # dumped text still has to be turned into bytes.
    if not PY2:

        def _dumps(self, value):
            text = simplejson.dumps(value)
            return text.encode('utf_8')

    else:

        def _dumps(self, value):
            return simplejson.dumps(value)
class SimpleJSONValueProtocol(object):
    """Implements :py:class:`JSONValueProtocol` using the :py:mod:`simplejson`
    library.

    The key is read as ``None`` and ignored when writing.
    """

    def read(self, line):
        # simplejson can handle bytes even in Python 3
        value = simplejson.loads(line)
        return (None, value)

    # Only encoding differs between Python versions: on Python 3 the
    # dumped text still has to be turned into bytes.
    if not PY2:

        def write(self, key, value):
            text = simplejson.dumps(value)
            return text.encode('utf_8')

    else:

        def write(self, key, value):
            return simplejson.dumps(value)
class UltraJSONProtocol(_KeyCachingProtocol):
    """Implements :py:class:`JSONProtocol` using the :py:mod:`ujson` library.

    .. warning::

        :py:mod:`ujson` is about five times faster than the standard
        implementation, but is more willing to encode things that aren't
        strictly JSON-encodable, including sets, dictionaries with
        tuples as keys, UTF-8 encoded bytes, and objects (!). Relying on
        this behavior won't stop your job from working, but it can
        make your job *dependent* on :py:mod:`ujson`, rather than just
        using it as a speedup.

    .. note::

        :py:mod:`ujson` also differs from the standard implementation in
        that it doesn't add spaces to its JSONs (``{"foo":"bar"}`` versus
        ``{"foo": "bar"}``). This probably won't affect anything but test
        cases and readability.
    """

    def _loads(self, value):
        # ujson can handle bytes even in Python 3
        return ujson.loads(value)

    # Only encoding differs between Python versions: on Python 3 the
    # dumped text still has to be turned into bytes.
    if not PY2:

        def _dumps(self, value):
            text = ujson.dumps(value)
            return text.encode('utf_8')

    else:

        def _dumps(self, value):
            return ujson.dumps(value)
class UltraJSONValueProtocol(object):
    """Implements :py:class:`JSONValueProtocol` using the :py:mod:`ujson`
    library.

    The key is read as ``None`` and ignored when writing.
    """

    def read(self, line):
        # ujson can handle bytes even in Python 3
        value = ujson.loads(line)
        return (None, value)

    # Only encoding differs between Python versions: on Python 3 the
    # dumped text still has to be turned into bytes.
    if not PY2:

        def write(self, key, value):
            text = ujson.dumps(value)
            return text.encode('utf_8')

    else:

        def write(self, key, value):
            return ujson.dumps(value)
# Pick the implementation behind the JSONProtocol / JSONValueProtocol
# aliases, in order of preference.
if ujson:
    # use ujson by default if available
    JSONProtocol, JSONValueProtocol = (
        UltraJSONProtocol, UltraJSONValueProtocol)
elif rapidjson and not PY2:
    # otherwise, try rapidjson. This library is supposed to be Python 3+
    # only, so don't try to use it on Python 2
    JSONProtocol, JSONValueProtocol = (
        RapidJSONProtocol, RapidJSONValueProtocol)
elif simplejson:
    # otherwise, try simplejson
    JSONProtocol, JSONValueProtocol = (
        SimpleJSONProtocol, SimpleJSONValueProtocol)
else:
    # fall back to the built-in JSON module
    JSONProtocol, JSONValueProtocol = (
        StandardJSONProtocol, StandardJSONValueProtocol)
class PickleProtocol(_KeyCachingProtocol):
    """Encode ``(key, value)`` as two string-escaped pickles separated
    by a tab.

    We string-escape the pickles to avoid having to deal with stray
    ``\\t`` and ``\\n`` characters, which would confuse Hadoop
    Streaming.

    Ugly, but should work for any type.

    .. warning::

        Pickling is only *backwards*-compatible across Python versions. If
        your job uses this as an output protocol, you should use at least
        the same version of Python to parse the job's output. Vice versa
        for using this as an input protocol.

    .. warning::

        Reading calls :py:func:`pickle.loads` on the input line, and
        unpickling can execute arbitrary code. Only use this protocol on
        data you trust.
    """
    # string_escape doesn't exist on Python 3 (you can't .decode() bytes).
    # Since efficiency matters for protocols, keeping separate code
    # for Python 2 and 3
    if PY2:

        def _loads(self, value):
            unescaped = value.decode('string_escape')
            return pickle.loads(unescaped)

        def _dumps(self, value):
            pickled = pickle.dumps(value)
            return pickled.encode('string_escape')

    else:

        def _loads(self, value):
            # undo the escaping, then map code points back to single bytes
            unescaped = value.decode('unicode_escape')
            return pickle.loads(unescaped.encode('latin_1'))

        def _dumps(self, value):
            # view each pickle byte as one code point, then escape it
            pickled = pickle.dumps(value)
            return pickled.decode('latin_1').encode('unicode_escape')
class PickleValueProtocol(object):
    """Encode ``value`` as a string-escaped pickle and discard ``key``
    (``key`` is read in as ``None``).

    See :py:class:`PickleProtocol` for details.

    .. warning::

        Reading calls :py:func:`pickle.loads` on the input line, and
        unpickling can execute arbitrary code. Only use this protocol on
        data you trust.
    """
    # The Python 2/3 variant is picked once, while the class body runs.
    if PY2:

        def read(self, line):
            unescaped = line.decode('string_escape')
            return (None, pickle.loads(unescaped))

        def write(self, key, value):
            pickled = pickle.dumps(value)
            return pickled.encode('string_escape')

    else:

        def read(self, line):
            # undo the escaping, then map code points back to single bytes
            unescaped = line.decode('unicode_escape')
            return (None, pickle.loads(unescaped.encode('latin_1')))

        def write(self, key, value):
            # view each pickle byte as one code point, then escape it
            pickled = pickle.dumps(value)
            return pickled.decode('latin_1').encode('unicode_escape')
# RawValueProtocol (below) is just an alias, but we treat it as a class for the
# purpose of documentation. All it does is output the value (key is read as
# ``None``).
#
# Same for RawProtocol, except it encodes key and value, separated by a tab.
class BytesProtocol(object):
    """Encode ``(key, value)`` (bytestrings) as ``key`` and ``value``
    separated by a tab.

    If ``key`` or ``value`` is ``None``, don't include a tab. When decoding a
    line with no tab in it, ``value`` will be ``None``.

    When reading from a line with multiple tabs, we break on the first one.

    Your key should probably not be ``None`` or have tab characters in it, but
    we don't check.
    """

    def read(self, line):
        """Split *line* at its first tab into ``(key, value)``.

        ``value`` is ``None`` when the line contains no tab at all.
        """
        key, tab, value = line.partition(b'\t')
        # an empty separator means no tab was found
        return (key, value if tab else None)

    def write(self, key, value):
        """Join whichever of *key* and *value* is not ``None`` with a tab."""
        present = [field for field in (key, value) if field is not None]
        return b'\t'.join(present)
class BytesValueProtocol(object):
    """Read line (without trailing newline) directly into ``value`` (``key``
    is always ``None``). Output ``value`` (bytes) directly, discarding ``key``.

    **This is the default protocol used by jobs to read input on Python 2.**

    .. warning::

        Typical usage on Python 2 is to have your mapper parse (byte)
        strings out of your input files, and then include them in the
        output to the reducer. Since this output is then (by default)
        JSON-encoded, encoding will fail if the bytestrings are not UTF-8
        decodable. If this is an issue, consider using
        :py:class:`TextValueProtocol` instead.
    """

    def read(self, line):
        """Return ``(None, line)``; the line is passed through untouched."""
        key = None
        return (key, line)

    def write(self, key, value):
        """Return *value* unchanged; *key* is ignored."""
        return value
class TextProtocol(object):
    """UTF-8 encode ``key`` and ``value`` (unicode strings) and join them
    with a tab character. When reading input, we fall back to latin-1 if
    we can't UTF-8 decode the line.

    If ``key`` or ``value`` is ``None``, don't include a tab. When decoding a
    line with no tab in it, ``value`` will be ``None``.

    When reading from a line with multiple tabs, we break on the first one.

    Your key should probably not be ``None`` or have tab characters in it, but
    we don't check.
    """

    def read(self, line):
        """Decode *line* and split it at its first tab into ``(key, value)``.

        ``value`` is ``None`` when the line contains no tab at all.
        """
        try:
            text = line.decode('utf_8')
        except UnicodeDecodeError:
            # not valid UTF-8; fall back to latin-1
            text = line.decode('latin_1')

        key, tab, value = text.partition(u'\t')
        # an empty separator means no tab was found
        return (key, value if tab else None)

    def write(self, key, value):
        """UTF-8 encode whichever of *key* and *value* is not ``None`` and
        join the results with a tab.
        """
        encoded = [
            field.encode('utf_8')
            for field in (key, value)
            if field is not None
        ]
        return b'\t'.join(encoded)
class TextValueProtocol(object):
    """Attempt to UTF-8 decode line (without trailing newline) into ``value``,
    falling back to latin-1. (``key`` is always ``None``). Output ``value``
    UTF-8 encoded, discarding ``key``.

    **This is the default protocol used by jobs to read input on Python 3.**

    This is a good solution for reading text files which are mostly ASCII but
    may have some other bytes of unknown encoding (e.g. logs).

    If you wish to enforce a particular encoding, use
    :py:class:`BytesValueProtocol` instead::

        class MREncodingEnforcer(MRJob):

            INPUT_PROTOCOL = BytesValueProtocol

            def mapper(self, _, value):
                value = value.decode('utf_8')
                ...
    """

    def read(self, line):
        """Return ``(None, text)`` where *text* is the decoded line."""
        try:
            text = line.decode('utf_8')
        except UnicodeDecodeError:
            # not valid UTF-8; fall back to latin-1
            text = line.decode('latin_1')

        return (None, text)

    def write(self, key, value):
        """Return *value* UTF-8 encoded; *key* is ignored."""
        return value.encode('utf_8')
# RawValueProtocol is the default way of reading input. Historically
# (in Python 2), it's always read raw bytes, but Python 3 is pickier about
# bytes, so we use TextValueProtocol (unicode) instead.
if PY2:
    # Python 2: the raw protocols work on bytes
    RawProtocol, RawValueProtocol = BytesProtocol, BytesValueProtocol
else:
    # Python 3: the raw protocols work on decoded text
    RawProtocol, RawValueProtocol = TextProtocol, TextValueProtocol
class ReprProtocol(_KeyCachingProtocol):
    """Encode ``(key, value)`` as two reprs separated by a tab.

    This only works for basic types (we use :py:func:`mrjob.util.safeeval`).

    .. warning::

        The repr format changes between different versions of Python (for
        example, braces for sets in Python 2.7, and different string
        constants in Python 3). Plan accordingly.
    """

    def _loads(self, value):
        # the repr is evaluated with mrjob.util.safeeval, not plain eval()
        return safeeval(value)

    # Only encoding differs between Python versions: on Python 3 the
    # repr text still has to be turned into bytes.
    if not PY2:

        def _dumps(self, value):
            text = repr(value)
            return text.encode('utf_8')

    else:

        def _dumps(self, value):
            return repr(value)
class ReprValueProtocol(object):
    """Encode ``value`` as a repr and discard ``key`` (``key`` is read
    in as None).

    See :py:class:`ReprProtocol` for details.
    """

    def read(self, line):
        # the repr is evaluated with mrjob.util.safeeval, not plain eval()
        value = safeeval(line)
        return (None, value)

    # Only encoding differs between Python versions: on Python 3 the
    # repr text still has to be turned into bytes.
    if not PY2:

        def write(self, key, value):
            text = repr(value)
            return text.encode('utf_8')

    else:

        def write(self, key, value):
            return repr(value)