mirror of
https://github.com/emilybache/GildedRose-Refactoring-Kata.git
synced 2026-02-09 03:31:28 +00:00
125 lines
3.9 KiB
Python
125 lines
3.9 KiB
Python
# Copyright 2017 Yelp
|
|
# Copyright 2018 Google, Inc.
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
"""Emulating the way Hadoop handles input files, decompressing compressed
|
|
files based on their file extension.
|
|
|
|
This module also functions as a :command:`cat` substitute that can handle
|
|
compressed files. It is used by :py:mod:`local <mrjob.local>` mode and can
|
|
function without the rest of the mrjob library.
|
|
"""
|
|
import zlib
|
|
|
|
try:
|
|
import bz2
|
|
bz2 # redefine bz2 for pepflakes
|
|
except ImportError:
|
|
bz2 = None
|
|
|
|
|
|
def bunzip2_stream(fileobj, bufsize=1024):
    """Decompress bzip2-compressed data on the fly.

    :param fileobj: object supporting ``read()``
    :param bufsize: number of bytes to read from *fileobj* at a time.

    :raises Exception: if the :py:mod:`bz2` module could not be imported.
                       Since this is a generator, the error surfaces when
                       the result is first iterated, not at call time.

    .. warning::

        This yields decompressed chunks; it does *not* split on lines. To get
        lines, wrap this in :py:func:`to_lines`.
    """
    if bz2 is None:
        raise Exception(
            'bz2 module was not successfully imported (likely not installed).')

    d = bz2.BZ2Decompressor()

    # pass *bufsize* through so the caller's read size is actually honored
    # (rather than silently falling back to to_chunks()'s default)
    for chunk in to_chunks(fileobj, bufsize):
        part = d.decompress(chunk)
        # the decompressor may buffer input and return nothing for a while;
        # only yield non-empty output
        if part:
            yield part
|
|
|
|
|
|
def gunzip_stream(fileobj, bufsize=1024):
    """Yield the decompressed contents of a gzipped stream, piece by piece.

    :param fileobj: object supporting ``read()``
    :param bufsize: number of bytes to read from *fileobj* at a time. The
                    default is the same as in :py:mod:`gzip`.

    .. warning::

        This yields decompressed chunks; it does *not* split on lines. To get
        lines, wrap this in :py:func:`to_lines`.
    """
    # See Issue #601 for why we decompress with zlib directly.
    #
    # Adding 16 to the window size tells zlib to expect gzip framing rather
    # than a raw zlib stream. zlib doesn't export a name for that flag, so
    # it's spelled out here.
    gzip_wbits = 16 | zlib.MAX_WBITS
    inflater = zlib.decompressobj(gzip_wbits)

    for compressed in to_chunks(fileobj, bufsize):
        inflated = inflater.decompress(compressed)
        # skip reads that didn't produce any output yet
        if not inflated:
            continue
        yield inflated
|
|
|
|
|
|
def decompress(readable, path, bufsize=1024):
    """Take a *readable* which supports the ``.read()`` method corresponding
    to the given path and return an iterator that yields chunks of bytes,
    possibly decompressing based on *path*.

    :param readable: object supporting ``read()``, or an iterable that
                     yields chunks of bytes
    :param path: path that *readable* corresponds to; only its file
                 extension is examined
    :param bufsize: number of bytes to read from *readable* at a time
                    whenever this function has to do the reading

    :raises Exception: if *path* ends with ``.bz2`` but the :py:mod:`bz2`
                       module could not be imported

    If *path* ends with ``.gz`` or ``.bz2``, decompress accordingly.

    Otherwise, if *readable* has an ``__iter__`` method (for example, a
    fileobj or a generator that yields chunks of bytes), pass it through
    as-is.

    Otherwise, read *bufsize* bytes at a time from *readable*.
    """
    if path.endswith('.gz'):
        # forward bufsize so it isn't silently replaced by the default
        return gunzip_stream(readable, bufsize)
    elif path.endswith('.bz2'):
        # check eagerly: bunzip2_stream() is a generator, so its own check
        # wouldn't fire until the caller starts iterating
        if bz2 is None:
            raise Exception('bz2 module was not successfully imported'
                            ' (likely not installed).')

        return bunzip2_stream(readable, bufsize)
    elif hasattr(readable, '__iter__'):
        return readable
    else:
        # not a real readable (e.g. boto3 StreamingBody)
        return to_chunks(readable, bufsize=bufsize)
|
|
|
|
|
|
def is_compressed(path):
    """Return true if *path* ends with a compressed-file extension
    (``.bz2`` or ``.gz``)."""
    return path.endswith(('.bz2', '.gz'))
|
|
|
|
|
|
def to_chunks(readable, bufsize=1024):
    """Turn *readable* into a stream of chunks.

    :param readable: object supporting ``read()`` (e.g. a fileobj), or an
                     iterable without a ``read()`` method
    :param bufsize: number of bytes to request per ``read()`` call

    If *readable* has a ``read`` method, call ``read(bufsize)`` repeatedly
    and yield each result, stopping at the first empty one.

    If *readable* has an ``__iter__`` method but no ``read`` method, its
    items are yielded unchanged.
    """
    iterable_only = (
        hasattr(readable, '__iter__') and not hasattr(readable, 'read'))

    if iterable_only:
        for piece in readable:
            yield piece
    else:
        # keep reading until read() comes back empty (end of stream)
        piece = readable.read(bufsize)
        while piece:
            yield piece
            piece = readable.read(bufsize)
|