# Copyright 2009-2016 Yelp and Contributors
# Copyright 2017-2019 Yelp
# Copyright 2020 Affirm, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""S3 Filesystem.
Also the place for common code used to establish and wrap AWS connections."""
import fnmatch
import logging
try:
    import botocore.client
    botocore  # quiet "redefinition of unused ..." warning from pyflakes
except ImportError:
    botocore = None

try:
    import boto3
    import boto3.s3.transfer
    boto3  # quiet "redefinition of unused ..." warning from pyflakes
except ImportError:
    boto3 = None

from mrjob.aws import _client_error_status
from mrjob.aws import _S3_REGION_WITH_NO_LOCATION_CONSTRAINT
from mrjob.aws import _wrap_aws_client
from mrjob.cat import decompress
from mrjob.fs.base import Filesystem
from mrjob.parse import is_uri
from mrjob.parse import is_s3_uri
from mrjob.parse import parse_s3_uri
from mrjob.parse import urlparse
from mrjob.runner import GLOB_RE

log = logging.getLogger(__name__)

_CHUNK_SIZE = 8192

# used to disable multipart upload
_HUGE_PART_SIZE = 2 ** 256


def _endpoint_url(host_or_uri):
    """If *host_or_uri* is non-empty and isn't a URI, prepend ``'https://'``.

    Otherwise, pass through as-is.
    """
    if not host_or_uri:
        return host_or_uri
    elif is_uri(host_or_uri):
        return host_or_uri
    else:
        return 'https://' + host_or_uri
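
# A quick sketch of _endpoint_url()'s behavior (the hostnames are arbitrary
# examples, not endpoints this module requires):
#
#     _endpoint_url('')                            # -> ''
#     _endpoint_url('s3.us-west-2.amazonaws.com')  # -> 'https://s3.us-west-2.amazonaws.com'
#     _endpoint_url('http://localhost:4566')       # -> unchanged (already a URI)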


def _get_bucket_region(client, bucket_name):
    """Look up the given bucket's location constraint and translate
    it to a region name."""
    resp = client.get_bucket_location(Bucket=bucket_name)

    return resp['LocationConstraint'] or _S3_REGION_WITH_NO_LOCATION_CONSTRAINT
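
# GetBucketLocation reports a null LocationConstraint for buckets in the
# classic region, hence the fallback above. A hedged sketch of both cases
# (bucket names are hypothetical):
#
#     client.get_bucket_location(Bucket='uswest2-bucket')
#     # -> {'LocationConstraint': 'us-west-2', ...}
#     client.get_bucket_location(Bucket='classic-bucket')
#     # -> {'LocationConstraint': None, ...}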


class S3Filesystem(Filesystem):
    """Filesystem for Amazon S3 URIs. Typically you will get one of these via
    ``EMRJobRunner().fs``, composed with
    :py:class:`~mrjob.fs.ssh.SSHFilesystem` and
    :py:class:`~mrjob.fs.local.LocalFilesystem`.

    :param aws_access_key_id: Your AWS access key ID
    :param aws_secret_access_key: Your AWS secret access key
    :param aws_session_token: session token for use with temporary
                              AWS credentials
    :param s3_endpoint: If set, always use this endpoint
    :param s3_region: Default region for connections to the S3 API and
                      newly created buckets.
    :param part_size: Part size for multi-part uploading, in bytes, or
                      ``None``

    .. versionchanged:: 0.6.8 added *part_size*
    """
    def __init__(self, aws_access_key_id=None, aws_secret_access_key=None,
                 aws_session_token=None, s3_endpoint=None, s3_region=None,
                 part_size=None):
        super(S3Filesystem, self).__init__()

        self._s3_endpoint_url = _endpoint_url(s3_endpoint)
        self._s3_region = s3_region

        self._aws_access_key_id = aws_access_key_id
        self._aws_secret_access_key = aws_secret_access_key
        self._aws_session_token = aws_session_token

        self._part_size = part_size

    def can_handle_path(self, path):
        return is_s3_uri(path)

    def du(self, path_glob):
        """Get the size of all files matching path_glob."""
        return sum(key.size for uri, key in self._ls(path_glob))

    def ls(self, path_glob):
        """Recursively yield the URIs of S3 keys matching the given glob.

        *path_glob* can include ``?`` to match single characters or
        ``*`` to match 0 or more characters. Both ``?`` and ``*`` can match
        ``/``.
        """
        for uri, key in self._ls(path_glob):
            yield uri
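
    # A hedged usage sketch (bucket and keys below are hypothetical):
    #
    #     fs = S3Filesystem()
    #     for uri in fs.ls('s3://walrus/data/*.gz'):
    #         print(uri)  # e.g. s3://walrus/data/part-00000.gz
    #
    # Since ``*`` can match ``/``, the glob above also matches keys under
    # "subdirectories" of data/.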

    def _ls(self, path_glob):
        """Helper method for :py:meth:`ls`; yields tuples of
        ``(uri, key)`` where *key* is the corresponding boto3 s3.ObjectSummary.
        """
        # grab the scheme so we can yield URIs matching the input
        # (s3://, s3n://, etc.)
        scheme = urlparse(path_glob).scheme

        # support globs
        glob_match = GLOB_RE.match(path_glob)

        # we're going to search for all keys starting with base_uri
        if glob_match:
            # cut it off at first wildcard
            base_uri = glob_match.group(1)
        else:
            base_uri = path_glob

        bucket_name, base_name = parse_s3_uri(base_uri)

        # allow subdirectories of the path/glob
        if path_glob and not path_glob.endswith('/'):
            dir_glob = path_glob + '/*'
        else:
            dir_glob = path_glob + '*'

        try:
            bucket = self.get_bucket(bucket_name)
        except botocore.exceptions.ClientError as ex:
            if _client_error_status(ex) == 404:  # treat nonexistent as empty
                return
            raise

        for key in bucket.objects.filter(Prefix=base_name):
            uri = "%s://%s/%s" % (scheme, bucket_name, key.key)

            # enforce globbing
            if not (fnmatch.fnmatchcase(uri, path_glob) or
                    fnmatch.fnmatchcase(uri, dir_glob)):
                continue

            yield uri, key

    def md5sum(self, path):
        k = self._get_s3_key(path)

        if not k:
            raise IOError('Key %r does not exist' % (path,))

        return k.e_tag.strip('"')
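
    # Note that S3 ETags (returned above) equal the hex MD5 digest only for
    # objects that weren't uploaded via multipart upload; that's an S3 fact,
    # not something this method checks. A hedged sketch (URI hypothetical):
    #
    #     fs = S3Filesystem()
    #     fs.md5sum('s3://walrus/empty-file')
    #     # -> 'd41d8cd98f00b204e9800998ecf8427e' (MD5 of the empty string)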

    def _cat_file(self, path):
        # stream lines from the s3 key
        s3_key = self._get_s3_key(path)
        body = s3_key.get()['Body']

        return decompress(body, path)

    def exists(self, path_glob):
        """Does the given path exist?

        If the path is a directory (ends with a ``/``), we check if there
        are any files starting with that path.
        """
        # just fall back on _ls(); it's smart
        return any(self._ls(path_glob))

    def mkdir(self, path):
        """Make a directory. This doesn't actually create directories on S3
        (because there is no such thing), but it will create the corresponding
        bucket if it doesn't exist.
        """
        bucket_name, key_name = parse_s3_uri(path)

        client = self.make_s3_client()
        try:
            client.head_bucket(Bucket=bucket_name)
        except botocore.exceptions.ClientError as ex:
            if _client_error_status(ex) != 404:
                raise

            self.create_bucket(bucket_name)

    def put(self, src, path):
        """Uploads a local file to a specific destination."""
        s3_key = self._get_s3_key(path)

        # if part_size is None or 0, disable multipart upload
        part_size = self._part_size or _HUGE_PART_SIZE

        s3_key.upload_file(
            src,
            Config=boto3.s3.transfer.TransferConfig(
                multipart_chunksize=part_size,
                multipart_threshold=part_size,
            ),
        )
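
    # A hedged usage sketch (paths are hypothetical). With part_size unset,
    # the threshold above is so large that boto3 never splits the upload
    # into parts:
    #
    #     fs = S3Filesystem(part_size=100 * 1024 * 1024)  # 100 MiB parts
    #     fs.put('/tmp/local-file.gz', 's3://walrus/data/file.gz')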

    def rm(self, path_glob):
        """Remove all files matching the given glob."""
        for uri, key in self._ls(path_glob):
            log.debug('deleting ' + uri)
            key.delete()

    def touchz(self, path):
        """Make an empty file in the given location. Raises an error if
        a non-empty file already exists in that location."""
        key = self._get_s3_key(path)

        data = None
        try:
            data = key.get()
        except botocore.exceptions.ClientError as ex:
            # okay if key doesn't exist
            if _client_error_status(ex) != 404:
                raise

        if data and data['ContentLength'] != 0:
            raise OSError('Non-empty file %r already exists!' % (path,))

        key.put(Body=b'')

    # Utilities for interacting with S3 using S3 URIs.

    # Try to use the more general filesystem interface unless you really
    # need to do something S3-specific (e.g. setting file permissions)

    # sadly, resources aren't as smart as we'd like; they provide a Bucket
    # abstraction, but don't automatically connect to buckets on the
    # correct region

    def make_s3_resource(self, region_name=None):
        """Create a :py:mod:`boto3` S3 resource, with its client
        wrapped in a :py:class:`mrjob.retry.RetryWrapper`

        :param region_name: region to use to choose S3 endpoint

        It's best to use :py:meth:`get_bucket` because it chooses the
        appropriate S3 endpoint automatically. If you are trying to get
        bucket metadata, use :py:meth:`make_s3_client`.
        """
        # give a non-cryptic error message if boto3 isn't installed
        if boto3 is None:
            raise ImportError('You must install boto3 to connect to S3')

        kwargs = self._client_kwargs(region_name)

        s3_resource = boto3.resource('s3', **kwargs)
        s3_resource.meta.client = _wrap_aws_client(s3_resource.meta.client)

        return s3_resource

    def make_s3_client(self, region_name=None):
        """Create a :py:mod:`boto3` S3 client,
        wrapped in a :py:class:`mrjob.retry.RetryWrapper`

        :param region_name: region to use to choose S3 endpoint.
        """
        # give a non-cryptic error message if boto3 isn't installed
        if boto3 is None:
            raise ImportError('You must install boto3 to connect to S3')

        kwargs = self._client_kwargs(region_name or self._s3_region)

        return _wrap_aws_client(boto3.client('s3', **kwargs))
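
    # A hedged usage sketch (region and bucket name are hypothetical):
    #
    #     fs = S3Filesystem()
    #     client = fs.make_s3_client(region_name='us-west-2')
    #     client.head_bucket(Bucket='walrus')  # raises ClientError on 404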

    def _client_kwargs(self, region_name):
        """Keyword args for creating resources or clients."""
        return dict(
            aws_access_key_id=self._aws_access_key_id,
            aws_secret_access_key=self._aws_secret_access_key,
            aws_session_token=self._aws_session_token,
            endpoint_url=self._s3_endpoint_url,
            region_name=(region_name or self._s3_region),
        )

    def get_bucket(self, bucket_name):
        """Get the (:py:mod:`boto3`) bucket, connecting through the
        appropriate endpoint."""
        client = self.make_s3_client()

        try:
            region_name = _get_bucket_region(client, bucket_name)
        except botocore.exceptions.ClientError as ex:
            # it's possible to have access to a bucket but not access
            # to its location metadata. This happens on the 'elasticmapreduce'
            # bucket, for example (see #1170)
            if _client_error_status(ex) != 403:
                raise

            log.warning('Could not infer endpoint for bucket %s; '
                        'assuming defaults', bucket_name)
            region_name = None

        resource = self.make_s3_resource(region_name)

        return resource.Bucket(bucket_name)
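
    # A hedged usage sketch (bucket name and prefix are hypothetical):
    #
    #     fs = S3Filesystem()
    #     bucket = fs.get_bucket('walrus')
    #     for obj in bucket.objects.filter(Prefix='logs/'):
    #         print(obj.key, obj.size)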

    def _get_s3_key(self, uri):
        """Get the boto3 s3.Object matching the given S3 URI. Note that
        s3.Objects are lazy, so this doesn't check that the key exists.

        *uri* is an S3 URI: ``s3://foo/bar``
        """
        bucket_name, key_name = parse_s3_uri(uri)

        return self.get_bucket(bucket_name).Object(key_name)

    def get_all_bucket_names(self):
        """Get a list of the names of all buckets owned by this user
        on S3.
        """
        c = self.make_s3_client()

        return [b['Name'] for b in c.list_buckets()['Buckets']]

    def create_bucket(self, bucket_name, region=None):
        """Create a bucket on S3 with a location constraint
        matching the given region.
        """
        client = self.make_s3_client()

        params = dict(Bucket=bucket_name)

        if region is None:
            region = self._s3_region

        # CreateBucketConfiguration can't be empty, so don't set it
        # unless there's a location constraint (see #1927)
        if region and region != _S3_REGION_WITH_NO_LOCATION_CONSTRAINT:
            params['CreateBucketConfiguration'] = dict(
                LocationConstraint=region)

        client.create_bucket(**params)
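
    # A hedged sketch of the two request shapes this produces (bucket names
    # are hypothetical; the classic region takes no location constraint):
    #
    #     fs.create_bucket('uswest2-bucket', region='us-west-2')
    #     # -> CreateBucket with CreateBucketConfiguration=
    #     #    {'LocationConstraint': 'us-west-2'}
    #     fs.create_bucket('classic-bucket', region='us-east-1')
    #     # -> CreateBucket with no CreateBucketConfiguration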


def _is_permanent_boto3_error(ex):
    """Used to disable S3Filesystem when boto3 is installed but
    credentials aren't set up."""
    return isinstance(ex, botocore.exceptions.NoCredentialsError)