Skip to content

Commit

Permalink
feat: add blob.open() for file-like I/O (#385)
Browse files Browse the repository at this point in the history
Fixes #29
  • Loading branch information
andrewsg authored Mar 24, 2021
1 parent 1a2734b commit 440a0a4
Show file tree
Hide file tree
Showing 5 changed files with 1,228 additions and 1 deletion.
127 changes: 126 additions & 1 deletion google/cloud/storage/blob.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
import copy
import hashlib
from io import BytesIO
from io import TextIOWrapper
import logging
import mimetypes
import os
Expand Down Expand Up @@ -78,6 +79,8 @@
from google.cloud.storage.retry import DEFAULT_RETRY
from google.cloud.storage.retry import DEFAULT_RETRY_IF_ETAG_IN_JSON
from google.cloud.storage.retry import DEFAULT_RETRY_IF_GENERATION_SPECIFIED
from google.cloud.storage.fileio import BlobReader
from google.cloud.storage.fileio import BlobWriter


_API_ACCESS_ENDPOINT = "https://storage.googleapis.com"
Expand Down Expand Up @@ -144,7 +147,9 @@ class Blob(_PropertyMixin):
:type chunk_size: int
:param chunk_size:
(Optional) The size of a chunk of data whenever iterating (in bytes).
This must be a multiple of 256 KB per the API specification.
This must be a multiple of 256 KB per the API specification. If not
specified, the chunk_size of the blob itself is used. If that is not
specified, a default value of 40 MB is used.
:type encryption_key: bytes
:param encryption_key:
Expand Down Expand Up @@ -3407,6 +3412,126 @@ def update_storage_class(
retry=retry,
)

def open(
self,
mode="r",
chunk_size=None,
encoding=None,
errors=None,
newline=None,
**kwargs
):
r"""Create a file handler for file-like I/O to or from this blob.
This method can be used as a context manager, just like Python's
built-in 'open()' function.
While reading, as with other read methods, if blob.generation is not set
the most recent blob generation will be used. Because the file-like IO
reader downloads progressively in chunks, this could result in data from
multiple versions being mixed together. If this is a concern, use
either bucket.get_blob(), or blob.reload(), which will download the
latest generation number and set it; or, if the generation is known, set
it manually, for instance with bucket.blob(generation=123456).
:type mode: str
:param mode:
(Optional) A mode string, as per standard Python `open()` semantics.The first
character must be 'r', to open the blob for reading, or 'w' to open
it for writing. The second character, if present, must be 't' for
(unicode) text mode, or 'b' for bytes mode. If the second character
is omitted, text mode is the default.
:type chunk_size: long
:param chunk_size:
(Optional) For reads, the minimum number of bytes to read at a time.
If fewer bytes than the chunk_size are requested, the remainder is
buffered. For writes, the maximum number of bytes to buffer before
sending data to the server, and the size of each request when data
is sent. Writes are implemented as a "resumable upload", so
chunk_size for writes must be exactly a multiple of 256KiB as with
other resumable uploads. The default is 40 MiB.
:type encoding: str
:param encoding:
(Optional) For text mode only, the name of the encoding that the stream will
be decoded or encoded with. If omitted, it defaults to
locale.getpreferredencoding(False).
:type errors: str
:param errors:
(Optional) For text mode only, an optional string that specifies how encoding
and decoding errors are to be handled. Pass 'strict' to raise a
ValueError exception if there is an encoding error (the default of
None has the same effect), or pass 'ignore' to ignore errors. (Note
that ignoring encoding errors can lead to data loss.) Other more
rarely-used options are also available; see the Python 'io' module
documentation for 'io.TextIOWrapper' for a complete list.
:type newline: str
:param newline:
(Optional) For text mode only, controls how line endings are handled. It can
be None, '', '\n', '\r', and '\r\n'. If None, reads use "universal
newline mode" and writes use the system default. See the Python
'io' module documentation for 'io.TextIOWrapper' for details.
:param kwargs: Keyword arguments to pass to the underlying API calls.
For both uploads and downloads, the following arguments are
supported: "if_generation_match", "if_generation_not_match",
"if_metageneration_match", "if_metageneration_not_match", "timeout".
For uploads only, the following additional arguments are supported:
"content_type", "num_retries", "predefined_acl", "checksum".
:returns: A 'BlobReader' or 'BlobWriter' from
'google.cloud.storage.fileio', or an 'io.TextIOWrapper' around one
of those classes, depending on the 'mode' argument.
Example:
Read from a text blob by using open() as context manager.
Using bucket.get_blob() fetches metadata such as the generation,
which prevents race conditions in case the blob is modified.
>>> from google.cloud import storage
>>> client = storage.Client()
>>> bucket = client.bucket("bucket-name")
>>> blob = bucket.get_blob("blob-name.txt")
>>> with blob.open("rt") as f:
>>> print(f.read())
"""
if mode == "rb":
if encoding or errors or newline:
raise ValueError(
"encoding, errors and newline arguments are for text mode only"
)
return BlobReader(self, chunk_size=chunk_size, **kwargs)
elif mode == "wb":
if encoding or errors or newline:
raise ValueError(
"encoding, errors and newline arguments are for text mode only"
)
return BlobWriter(self, chunk_size=chunk_size, **kwargs)
elif mode in ("r", "rt"):
return TextIOWrapper(
BlobReader(self, chunk_size=chunk_size, **kwargs),
encoding=encoding,
errors=errors,
newline=newline,
)
elif mode in ("w", "wt"):
return TextIOWrapper(
BlobWriter(self, chunk_size=chunk_size, text_mode=True, **kwargs),
encoding=encoding,
errors=errors,
newline=newline,
)
else:
raise NotImplementedError(
"Supported modes strings are 'r', 'rb', 'rt', 'w', 'wb', and 'wt' only."
)

cache_control = _scalar_property("cacheControl")
"""HTTP 'Cache-Control' header for this object.
Expand Down
Loading

0 comments on commit 440a0a4

Please sign in to comment.