# Copyright 2015-2018 Yelp # Copyright 2019 Yelp # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """Minimal utilities to make Python work for 2.7+ and 3.4+ Strategies for making `mrjob` work across Python versions: Bytes vs. Unicode ----------------- It's tempting to use `from __future__ import unicode_literals` and require that all non-byte strings be unicode. But that doesn't really make sense for Python 2, where str (bytes) and unicode can be used interchangeably. So really our string datatypes fall into two categories, bytes, and "strings", which means either ``unicode``\\s or ASCII ``str``\\s in Python 2, and ``str``\\s (i.e. unicode) in Python 3. These things should always be bytes: - input data files - use ``'b'`` when opening files: ``open(..., 'rb')`` - read data from ``sys.stdin.buffer`` in Python 3, not ``sys.stdin`` - data encoded by protocols - data from subprocesses (this already happens by default) - log files parsed by mrjob - file content from our filesystem interfaces. Instead of using ``StringIO`` to deal with these, use ``io.BytesIO``. Note that both Python 2.6+ and Python 3.3+ have the ``bytes`` type and ``b''`` constants built-in. These things should always be strings: - streams that you print() to (e.g. ``sys.stdout`` if you mock it out) - streams that you log to - paths on filesystem - URIs - arguments to commands - option names - Hadoop counter names and groups - Hadoop status messages - anything else we parse out of log files These things are strings because it makes for simpler code: - contents of config files - contents of scripts output by mrjob (e.g. the setup wrapper script) - contents of empty files Use the ``StringIO`` from this module to deal with strings (it's ``StringIO.StringIO`` in Python 2 and ``io.StringIO`` in Python 3). Please use ``%`` for format strings and not ``format()``, which is much more picky about mixing unicode and bytes. We don't provide a ``unicode`` type: - Use ``isinstance(..., string_types)`` to check if something is a string - Use ``not isinstance(..., bytes)`` to check if a string is Unicode - To convert ``bytes`` to ``unicode``, use ``.decode('utf_8')`. - Python 3.3+ has ``u''`` literals; please use sparingly If you need to convert bytes of unknown encoding to a string (e.g. to ``print()`` or log them), use ``to_unicode()`` from this module. Iterables --------- Using ``.iteritems()`` or ``.itervalues()`` in Python 2 to iterate over a dictionary when you don't need a list is best practice, but it's also (in most cases) an over-optimization. We'd prefer clean code; just use ``.items()`` and ``.values()``. If you *do* have concerns about memory usage, ``for k in some_dict`` does not create a list in either version of Python. Same goes for ``xrange``; plain-old `range`` is almost always fine. Miscellany ---------- We provide an ``integer_types`` tuple so you can check if something is an integer: ``isinstance(..., integer_types)``. Any standard library function that deals with URLs (e.g. ``urlparse()``) should probably be imported from this module. You *usually* want to do ``from __future__ import print_function`` in modules where you use ``print()``. ``print(...)`` works fine, but ``print(..., file=...)`` doesn't, and ``print()`` prints ``()`` on Python 2. You shouldn't need any other ``__future__`` imports. """ import sys # use this to check if we're in Python 2 PY2 = (sys.version_info[0] == 2) # ``string_types``, for ``isinstance(..., string_types)`` if PY2: string_types = (basestring,) else: string_types = (str,) string_types # ``integer_types``, for ``isinstance(..., integer_types)`` if PY2: integer_types = (int, long) else: integer_types = (int,) integer_types # ``StringIO``. Useful for mocking out ``sys.stdout``, etc. if PY2: from StringIO import StringIO else: from io import StringIO StringIO # quiet, pyflakes # ``xrange``. Plain old ``range`` is almost always fine if PY2: xrange = xrange else: xrange = range xrange # quiet, pyflakes # urllib stuff # in most cases you should use ``mrjob.parse.urlparse()`` if PY2: from urlparse import ParseResult from urllib import pathname2url from urlparse import urljoin from urllib2 import urlopen from urlparse import urlparse else: from urllib.parse import ParseResult from urllib.request import pathname2url from urllib.parse import urljoin from urllib.request import urlopen from urllib.parse import urlparse ParseResult pathname2url urljoin urlopen urlparse def to_unicode(s): """Convert ``bytes`` to unicode. Use this if you need to ``print()`` or log bytes of an unknown encoding, or to parse strings out of bytes of unknown encoding (e.g. a log file). This hopes that your bytes are UTF-8 decodable, but if not, falls back to latin-1, which always works. """ if isinstance(s, bytes): try: return s.decode('utf_8') except UnicodeDecodeError: return s.decode('latin_1') elif isinstance(s, string_types): # e.g. is unicode return s else: raise TypeError