# -*- coding: utf-8 -*-
# Copyright 2016 Google Inc.
# Copyright 2017 Yelp
# Copyright 2018 Google Inc.
# Copyright 2019 Yelp
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import binascii
import fnmatch
import logging
from base64 import b64decode

from mrjob.cat import decompress
from mrjob.fs.base import Filesystem
from mrjob.parse import urlparse
from mrjob.runner import GLOB_RE

try:
    import google.api_core.exceptions
    import google.auth.exceptions
    import google.cloud.storage.client
except ImportError:
    google = None

log = logging.getLogger(__name__)

# download this many bytes at once from cat()
_CAT_CHUNK_SIZE = 8192


def _path_glob_to_parsed_gcs_uri(path_glob):
    # support globs
    glob_match = GLOB_RE.match(path_glob)

    # we're going to search for all keys starting with base_uri
    if glob_match:
        # cut it off at first wildcard
        base_uri = glob_match.group(1)
    else:
        base_uri = path_glob

    bucket_name, base_name = parse_gcs_uri(base_uri)
    return bucket_name, base_name
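
# A sketch of the intended behavior (bucket and key names are made up):
#
#   _path_glob_to_parsed_gcs_uri('gs://walrus/data/part-*')
#   # -> ('walrus', 'data/part-'), cutting the key at the first wildcard
#
#   _path_glob_to_parsed_gcs_uri('gs://walrus/data/input.txt')
#   # -> ('walrus', 'data/input.txt')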


class GCSFilesystem(Filesystem):
    """Filesystem for Google Cloud Storage (GCS) URIs.

    :param credentials: an optional
                        :py:class:`google.auth.credentials.Credentials`,
                        used to initialize the storage client
    :param project_id: an optional project ID, used to initialize the
                       storage client
    :param part_size: Part size for multi-part uploading, in bytes, or
                      ``None``
    :param location: Default location to use when creating a bucket
    :param object_ttl_days: Default object expiry for newly created buckets

    .. versionchanged:: 0.7.0
       removed *local_tmp_dir*

    .. versionchanged:: 0.6.8
       deprecated *local_tmp_dir*; added *part_size*, *location*, and
       *object_ttl_days*
    """

    def __init__(self, credentials=None, project_id=None,
                 part_size=None, location=None, object_ttl_days=None):
        self._credentials = credentials
        self._project_id = project_id
        self._part_size = part_size
        self._location = location
        self._object_ttl_days = object_ttl_days
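
    # A minimal usage sketch; the project ID, bucket, and paths below are
    # hypothetical:
    #
    #   fs = GCSFilesystem(project_id='my-project')
    #   fs.mkdir('gs://my-bucket/tmp/')
    #   fs.put('/tmp/input.txt', 'gs://my-bucket/tmp/input.txt')
    #   uris = list(fs.ls('gs://my-bucket/tmp/*'))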

    @property
    def client(self):
        return google.cloud.storage.client.Client(
            project=self._project_id, credentials=self._credentials)

    @property
    def api_client(self):
        raise NotImplementedError(
            '"api_client" was disabled in v0.6.2. use "client" instead')

    def can_handle_path(self, path):
        return is_gcs_uri(path)

    def du(self, path_glob):
        """Get the size of all files matching *path_glob*."""
        return sum(blob.size for uri, blob in self._ls(path_glob))

    def ls(self, path_glob):
        for uri, blob in self._ls(path_glob):
            # don't return directory "blobs"
            if uri.endswith('/'):
                continue

            yield uri

    def _ls(self, path_glob):
        """Helper method for :py:meth:`ls`; yields tuples of
        ``(uri, blob)`` where *blob* is the corresponding
        :py:class:`google.cloud.storage.blob.Blob`.

        Unlike :py:meth:`ls`, this *will* yield empty "directory" blobs.
        """
        # support globs
        glob_match = GLOB_RE.match(path_glob)

        # we're going to search for all keys starting with base_uri
        if glob_match:
            # cut it off at first wildcard
            base_uri = glob_match.group(1)
        else:
            base_uri = path_glob

        bucket_name, base_name = parse_gcs_uri(base_uri)

        # allow subdirectories of the path/glob
        if path_glob and not path_glob.endswith('/'):
            dir_glob = path_glob + '/*'
        else:
            dir_glob = path_glob + '*'

        try:
            bucket = self.get_bucket(bucket_name)
        except google.api_core.exceptions.NotFound:
            return  # treat nonexistent buckets as empty

        for blob in bucket.list_blobs(prefix=base_name):
            uri = "gs://%s/%s" % (bucket_name, blob.name)

            # enforce globbing
            if not (fnmatch.fnmatchcase(uri, path_glob) or
                    fnmatch.fnmatchcase(uri, dir_glob)):
                continue

            yield uri, blob
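
    # E.g. (hypothetical names) _ls('gs://walrus/data') yields the object
    # gs://walrus/data itself *and* anything under gs://walrus/data/,
    # because each URI is also matched against the derived glob
    # 'gs://walrus/data/*'.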

    def md5sum(self, path):
        blob = self._get_blob(path)
        if not blob:
            raise IOError('Object %r does not exist' % (path,))

        return binascii.hexlify(b64decode(blob.md5_hash)).decode('ascii')
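
    # GCS reports MD5 hashes base64-encoded, so md5sum() converts them to
    # the conventional hex digest; e.g. the empty-file hash
    # '1B2M2Y8AsgTpgAmY7PhCfg==' becomes
    # 'd41d8cd98f00b204e9800998ecf8427e'.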

    def _cat_file(self, path):
        return decompress(self._cat_blob(path), path)

    def _cat_blob(self, gcs_uri):
        """:py:meth:`cat_file`, minus decompression."""
        blob = self._get_blob(gcs_uri)
        if not blob:
            return  # don't cat nonexistent files

        start = 0

        while True:
            # *end* is inclusive in the GCS range API, so subtract 1 to
            # download exactly _CAT_CHUNK_SIZE bytes
            end = start + _CAT_CHUNK_SIZE - 1

            try:
                chunk = blob.download_as_string(start=start, end=end)
            except google.api_core.exceptions.RequestRangeNotSatisfiable:
                return  # *start* is past the end of the file

            yield chunk

            if len(chunk) < _CAT_CHUNK_SIZE:
                return

            start += _CAT_CHUNK_SIZE
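
    # E.g. (hypothetical URI) _cat_file('gs://walrus/out/part-00000.gz')
    # yields decompressed chunks; decompress() picks gunzip based on the
    # .gz extension of the path.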

    def mkdir(self, path):
        """Does not actually create a directory on GCS (because GCS doesn't
        have directories), but creates the underlying bucket if it does not
        exist already.
        """
        bucket_name, base_name = parse_gcs_uri(path)

        try:
            self.get_bucket(bucket_name)
        except google.api_core.exceptions.NotFound:
            self.create_bucket(bucket_name)

    def exists(self, path_glob):
        """Does the given path exist?

        If dest is a directory (ends with a "/"), we check if there are
        any files starting with that path.
        """
        # ls() is a generator, so it won't raise until we iterate over it;
        # any() has to happen inside the try block
        try:
            return any(self.ls(path_glob))
        except Exception:  # e.g. invalid URI, inaccessible bucket
            return False
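
    # E.g. (hypothetical names) exists('gs://my-bucket/output/') is True
    # if any object's key starts with 'output/'.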

    def rm(self, path_glob):
        """Remove all files matching the given glob."""
        for uri, blob in self._ls(path_glob):
            blob.delete()

    def touchz(self, path):
        # it's only an error if a *non-empty* file already exists (same
        # semantics as hadoop fs -touchz)
        old_blob = self._get_blob(path)
        if old_blob and old_blob.size:
            raise IOError('Non-empty file %r already exists!' % (path,))

        self._blob(path).upload_from_string(b'')

    def put(self, src, path):
        """Upload a local file to a specific destination.

        .. versionchanged:: 0.7.0
           removed *chunk_size* arg (use *part_size* in the constructor)

        .. versionchanged:: 0.6.8
           deprecated *chunk_size*
        """
        part_size = self._part_size

        old_blob = self._get_blob(path)
        if old_blob:
            raise IOError('File already exists: %s' % path)

        self._blob(path, chunk_size=part_size).upload_from_filename(src)
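
    # E.g. (hypothetical paths) fs.put('/tmp/results.csv',
    # 'gs://my-bucket/data/results.csv') uploads in *part_size* chunks and
    # raises IOError rather than overwrite an existing object.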

    def get_all_bucket_names(self, prefix=None):
        """Yield the names of all buckets associated with this client.

        :param prefix: optional prefix to search under (e.g. ``'mrjob-'``)

        .. versionadded:: 0.6.2
        """
        for b in self.client.list_buckets(prefix=prefix):
            yield b.name

    def list_buckets(self, project, prefix=None):
        """List buckets on GCS."""
        raise NotImplementedError(
            'list_buckets() was disabled in v0.6.2. Use'
            ' get_all_bucket_names() and get_bucket()')

    def get_bucket(self, bucket_name):
        """Return a :py:class:`google.cloud.storage.bucket.Bucket`.

        Raises an exception if the bucket does not exist.
        """
        return self.client.get_bucket(bucket_name)

    def create_bucket(self, name, location=None, object_ttl_days=None):
        """Create a bucket on GCS, optionally setting a location constraint
        and time-to-live."""
        bucket = self.client.bucket(name)

        if location is None:
            location = self._location
        elif not location:
            location = None  # leave a way to use the API default

        bucket.create(location=location)

        if object_ttl_days is None:
            object_ttl_days = self._object_ttl_days

        if object_ttl_days:
            bucket.lifecycle_rules = [
                dict(
                    action=dict(type='Delete'),
                    condition=dict(age=object_ttl_days),
                )
            ]
            # setting lifecycle_rules only updates local state; push the
            # new rules to the server
            bucket.patch()
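
    # E.g. (hypothetical bucket) create a scratch bucket whose objects
    # auto-delete after four weeks:
    #
    #   fs.create_bucket('mrjob-scratch', location='us-west1',
    #                    object_ttl_days=28)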

    def delete_bucket(self, bucket):
        raise NotImplementedError(
            'delete_bucket() was disabled in v0.6.2. Use'
            ' fs.bucket(name).delete()')

    def _get_blob(self, uri, chunk_size=None):
        # NOTE: chunk_size seems not to work well with downloading
        bucket_name, blob_name = parse_gcs_uri(uri)
        bucket = self.client.get_bucket(bucket_name)

        return bucket.get_blob(blob_name, chunk_size=chunk_size)

    def _blob(self, uri, chunk_size=None):
        # NOTE: chunk_size seems not to work well with downloading
        bucket_name, blob_name = parse_gcs_uri(uri)
        bucket = self.client.get_bucket(bucket_name)

        return bucket.blob(blob_name, chunk_size=chunk_size)
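
    # Note the distinction above: bucket.get_blob() makes an API request
    # and returns None if the object doesn't exist, while bucket.blob()
    # just constructs a local reference (suitable for uploads).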


# The equivalent S3 methods are in parse.py, but it's cleaner to keep them
# in the filesystem module; let's do that going forward

def is_gcs_uri(uri):
    """Return True if *uri* can be parsed into a GCS URI, False otherwise.
    """
    try:
        parse_gcs_uri(uri)
        return True
    except ValueError:
        return False
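
# E.g. is_gcs_uri('gs://walrus/tmp/') is True, while
# is_gcs_uri('s3://walrus/tmp/') and is_gcs_uri('/local/path') are False.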


def parse_gcs_uri(uri):
    """Parse a GCS URI into ``(bucket, key)``.

    >>> parse_gcs_uri("gs://walrus/tmp/")
    ('walrus', 'tmp/')

    If *uri* is not a GCS URI, raise a :py:class:`ValueError`.
    """
    components = urlparse(uri)

    if components.scheme != "gs" or '/' not in components.path:
        raise ValueError('Invalid GCS URI: %s' % uri)

    return components.netloc, components.path[1:]


def _is_permanent_google_error(ex):
    # this is a module-level helper, so it takes no *self*; errors like
    # bad credentials won't be fixed by retrying, unlike transient
    # network problems
    return isinstance(ex, google.auth.exceptions.DefaultCredentialsError)