GildedRose-Refactoring-Kata/.venv/lib/python3.12/site-packages/mrjob/cat.py
2025-06-22 13:36:01 +05:30

125 lines
3.9 KiB
Python

# Copyright 2017 Yelp
# Copyright 2018 Google, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Emulating the way Hadoop handles input files, decompressing compressed
files based on their file extension.
This module also functions as a :command:`cat` substitute that can handle
compressed files. It is used by :py:mod:`local <mrjob.local>` mode and can
function without the rest of the mrjob library.
"""
import zlib
try:
import bz2
bz2 # redefine bz2 for pepflakes
except ImportError:
bz2 = None
def bunzip2_stream(fileobj, bufsize=1024):
    """Decompress bzip2-compressed data on the fly.

    :param fileobj: object supporting ``read()`` (or an iterable of
                    ``bytes`` chunks; see :py:func:`to_chunks`)
    :param bufsize: number of bytes to read from *fileobj* at a time.

    :raises Exception: if the :py:mod:`bz2` module could not be imported.
                       Because this is a generator, the error is raised
                       when iteration starts, not when the function is
                       called.

    .. warning::

        This yields decompressed chunks; it does *not* split on lines. To get
        lines, wrap this in :py:func:`to_lines`.
    """
    if bz2 is None:
        raise Exception(
            'bz2 module was not successfully imported (likely not installed).')

    d = bz2.BZ2Decompressor()

    # honor the caller's bufsize rather than silently falling back to
    # to_chunks()'s default
    for chunk in to_chunks(fileobj, bufsize):
        part = d.decompress(chunk)
        # the decompressor may buffer input and return nothing for a chunk;
        # only yield non-empty output
        if part:
            yield part
def gunzip_stream(fileobj, bufsize=1024):
    """Lazily decompress gzip-format data read from *fileobj*.

    :param fileobj: object supporting ``read()``
    :param bufsize: number of bytes to read from *fileobj* at a time. The
                    default is the same as in :py:mod:`gzip`.

    .. warning::

        The generator yields raw decompressed chunks, not lines. To get
        lines, wrap this in :py:func:`to_lines`.
    """
    # See Issue #601. zlib defines no named constant for "expect a gzip
    # header and trailer" (as opposed to a raw zlib stream); setting the
    # 16 bit on top of the window size is what selects that mode.
    gzip_wbits = zlib.MAX_WBITS | 16
    inflater = zlib.decompressobj(gzip_wbits)

    # NOTE(review): a single decompress object is used for the whole input,
    # so anything following the first gzip member is presumably not
    # yielded -- confirm if concatenated gzip input needs to be supported.
    for compressed in to_chunks(fileobj, bufsize):
        plain = inflater.decompress(compressed)
        if not plain:
            # nothing produced for this chunk (output may still be buffered)
            continue
        yield plain
def decompress(readable, path, bufsize=1024):
    """Return an iterator yielding chunks of bytes from *readable*,
    decompressing them if the extension of *path* says they are compressed.

    :param readable: the data corresponding to *path*; either an object
                     supporting ``read()`` or an iterable that yields
                     chunks of bytes.
    :param path: path the data came from. Only its extension is inspected:
                 ``.gz`` selects gzip, ``.bz2`` selects bzip2, anything
                 else means the data is not compressed.
    :param bufsize: number of bytes to read from *readable* at a time when
                    this function has to do the reading itself.

    :raises Exception: if *path* ends with ``.bz2`` but the :py:mod:`bz2`
                       module could not be imported.

    For uncompressed data: if *readable* is iterable (has ``__iter__``), it
    is passed through as-is. Otherwise it is assumed to support ``read()``
    and is wrapped with :py:func:`to_chunks`.
    """
    if path.endswith('.gz'):
        return gunzip_stream(readable, bufsize)

    if path.endswith('.bz2'):
        # bunzip2_stream() is a generator, so its own check would not fire
        # until iteration begins; check here to fail at call time instead.
        if bz2 is None:
            raise Exception('bz2 module was not successfully imported'
                            ' (likely not installed).')
        return bunzip2_stream(readable, bufsize)

    if hasattr(readable, '__iter__'):
        return readable

    # not a real readable (e.g. boto3 StreamingBody)
    return to_chunks(readable, bufsize=bufsize)
def is_compressed(path):
    """Return true if *path* has an extension that this module treats as
    compressed (``.bz2`` or ``.gz``)."""
    compressed_suffixes = ('.bz2', '.gz')
    return path.endswith(compressed_suffixes)
def to_chunks(readable, bufsize=1024):
    """Turn *readable* into a generator of chunks.

    If *readable* has an ``__iter__`` method but no ``read`` method, its
    items are yielded unchanged. Otherwise ``readable.read(bufsize)`` is
    called repeatedly, yielding each non-empty result and stopping at the
    first empty one.

    :param readable: object supporting ``read()`` (e.g. a fileobj), or an
                     iterable of chunks.
    :param bufsize: number of bytes to request per ``read()`` call.
    """
    is_plain_iterable = (
        hasattr(readable, '__iter__') and not hasattr(readable, 'read'))

    if is_plain_iterable:
        for piece in readable:
            yield piece
        return

    # read until the first empty (falsy) result, which signals end of data
    piece = readable.read(bufsize)
    while piece:
        yield piece
        piece = readable.read(bufsize)