feat: add blob.open() for file-like I/O (#385)

Fixes #29
googleapis · Mar 24, 2021 · 440a0a4 · 440a0a4
1 parent 1a2734b
commit 440a0a4
Show file tree

Hide file tree

Showing 5 changed files with 1,228 additions and 1 deletion.
diff --git a/google/cloud/storage/blob.py b/google/cloud/storage/blob.py
@@ -30,6 +30,7 @@
 import copy
 import hashlib
 from io import BytesIO
+from io import TextIOWrapper
 import logging
 import mimetypes
 import os
@@ -78,6 +79,8 @@
 from google.cloud.storage.retry import DEFAULT_RETRY
 from google.cloud.storage.retry import DEFAULT_RETRY_IF_ETAG_IN_JSON
 from google.cloud.storage.retry import DEFAULT_RETRY_IF_GENERATION_SPECIFIED
+from google.cloud.storage.fileio import BlobReader
+from google.cloud.storage.fileio import BlobWriter
 
 
 _API_ACCESS_ENDPOINT = "https://storage.googleapis.com"
@@ -144,7 +147,9 @@ class Blob(_PropertyMixin):
     :type chunk_size: int
     :param chunk_size:
         (Optional) The size of a chunk of data whenever iterating (in bytes).
-        This must be a multiple of 256 KB per the API specification.
+        This must be a multiple of 256 KB per the API specification. If not
+        specified, the chunk_size of the blob itself is used. If that is not
+        specified, a default value of 40 MB is used.
 
     :type encryption_key: bytes
     :param encryption_key:
@@ -3407,6 +3412,126 @@ def update_storage_class(
                 retry=retry,
             )
 
+    def open(
+        self,
+        mode="r",
+        chunk_size=None,
+        encoding=None,
+        errors=None,
+        newline=None,
+        **kwargs
+    ):
+        r"""Create a file handler for file-like I/O to or from this blob.
+
+        This method can be used as a context manager, just like Python's
+        built-in 'open()' function.
+
+        While reading, as with other read methods, if blob.generation is not set
+        the most recent blob generation will be used. Because the file-like IO
+        reader downloads progressively in chunks, this could result in data from
+        multiple versions being mixed together. If this is a concern, use
+        either bucket.get_blob(), or blob.reload(), which will download the
+        latest generation number and set it; or, if the generation is known, set
+        it manually, for instance with bucket.blob(generation=123456).
+
+        :type mode: str
+        :param mode:
+            (Optional) A mode string, as per standard Python `open()` semantics. The
+            first character must be 'r', to open the blob for reading, or 'w' to
+            open it for writing. The second character, if present, must be 't'
+            for (unicode) text mode, or 'b' for bytes mode. If the second
+            character is omitted, text mode is the default.
+
+        :type chunk_size: int
+        :param chunk_size:
+            (Optional) For reads, the minimum number of bytes to read at a time.
+            If fewer bytes than the chunk_size are requested, the remainder is
+            buffered. For writes, the maximum number of bytes to buffer before
+            sending data to the server, and the size of each request when data
+            is sent. Writes are implemented as a "resumable upload", so
+            chunk_size for writes must be exactly a multiple of 256KiB as with
+            other resumable uploads. The default is 40 MiB.
+
+        :type encoding: str
+        :param encoding:
+            (Optional) For text mode only, the name of the encoding that the stream will
+            be decoded or encoded with. If omitted, it defaults to
+            locale.getpreferredencoding(False).
+
+        :type errors: str
+        :param errors:
+            (Optional) For text mode only, an optional string that specifies how encoding
+            and decoding errors are to be handled. Pass 'strict' to raise a
+            ValueError exception if there is an encoding error (the default of
+            None has the same effect), or pass 'ignore' to ignore errors. (Note
+            that ignoring encoding errors can lead to data loss.) Other more
+            rarely-used options are also available; see the Python 'io' module
+            documentation for 'io.TextIOWrapper' for a complete list.
+
+        :type newline: str
+        :param newline:
+            (Optional) For text mode only, controls how line endings are handled. It can
+            be None, '', '\n', '\r', and '\r\n'. If None, reads use "universal
+            newline mode" and writes use the system default. See the Python
+            'io' module documentation for 'io.TextIOWrapper' for details.
+
+        :param kwargs: Keyword arguments to pass to the underlying API calls.
+            For both uploads and downloads, the following arguments are
+            supported: "if_generation_match", "if_generation_not_match",
+            "if_metageneration_match", "if_metageneration_not_match", "timeout".
+            For uploads only, the following additional arguments are supported:
+            "content_type", "num_retries", "predefined_acl", "checksum".
+
+        :returns: A 'BlobReader' or 'BlobWriter' from
+            'google.cloud.storage.fileio', or an 'io.TextIOWrapper' around one
+            of those classes, depending on the 'mode' argument.
+
+        :raises ValueError: if 'encoding', 'errors' or 'newline' is given
+            (i.e. is not None) together with a bytes mode ('rb' or 'wb').
+        :raises NotImplementedError: if 'mode' is not one of 'r', 'rb', 'rt',
+            'w', 'wb' or 'wt'.
+
+        Example:
+            Read from a text blob by using open() as context manager.
+
+            Using bucket.get_blob() fetches metadata such as the generation,
+            which prevents race conditions in case the blob is modified.
+
+            >>> from google.cloud import storage
+            >>> client = storage.Client()
+            >>> bucket = client.bucket("bucket-name")
+
+            >>> blob = bucket.get_blob("blob-name.txt")
+            >>> with blob.open("rt") as f:
+            ...     print(f.read())
+
+        """
+        if mode in ("rb", "wb"):
+            # Compare against None rather than relying on truthiness: an
+            # explicit newline="" (a valid text-mode value) or encoding=""
+            # is falsy and would otherwise slip through unnoticed.
+            text_only_args = (encoding, errors, newline)
+            if any(arg is not None for arg in text_only_args):
+                raise ValueError(
+                    "encoding, errors and newline arguments are for text mode only"
+                )
+            if mode == "rb":
+                return BlobReader(self, chunk_size=chunk_size, **kwargs)
+            return BlobWriter(self, chunk_size=chunk_size, **kwargs)
+        elif mode in ("r", "rt"):
+            return TextIOWrapper(
+                BlobReader(self, chunk_size=chunk_size, **kwargs),
+                encoding=encoding,
+                errors=errors,
+                newline=newline,
+            )
+        elif mode in ("w", "wt"):
+            # text_mode=True is forwarded to the writer only on the text path.
+            return TextIOWrapper(
+                BlobWriter(self, chunk_size=chunk_size, text_mode=True, **kwargs),
+                encoding=encoding,
+                errors=errors,
+                newline=newline,
+            )
+        else:
+            raise NotImplementedError(
+                "Supported modes strings are 'r', 'rb', 'rt', 'w', 'wb', and 'wt' only."
+            )
+
     cache_control = _scalar_property("cacheControl")
     """HTTP 'Cache-Control' header for this object.