Second Review: Enhanced Schunk Examples

Blosc · Sep 12, 2024 · d972e0b · d972e0b
1 parent 36d36aa
commit d972e0b
Show file tree

Hide file tree

Showing 3 changed files with 55 additions and 63 deletions.
diff --git a/src/blosc2/blosc2_ext.pyx b/src/blosc2/blosc2_ext.pyx
@@ -1044,13 +1044,6 @@ cdef class SChunk:
     @property
     def blocksize(self):
         """The block size (in bytes).
-
-        Examples
-        --------
-        >>> import blosc2
-        >>> schunk = blosc2.SChunk(cparams={'blocksize': 100})
-        >>> print(schunk.blocksize)
-        100
         """
         return self.schunk.blocksize
 

diff --git a/src/blosc2/core.py b/src/blosc2/core.py
@@ -1500,6 +1500,27 @@ def schunk_from_cframe(cframe, copy=False):
     --------
     :func:`~blosc2.schunk.SChunk.to_cframe`
 
+    Examples
+    --------
+    >>> import numpy as np
+    >>> import blosc2
+    >>> nchunks = 4
+    >>> chunk_size = 200 * 1000 * 4
+    >>> data = np.arange(nchunks * chunk_size // 4, dtype=np.int32)
+    >>> schunk = blosc2.SChunk(data=data, cparams={"typesize": 4})
+    >>> serialized_schunk = schunk.to_cframe()
+    >>> print(f"Serialized SChunk length: {len(serialized_schunk)} bytes")
+    Serialized SChunk length: 14129 bytes
+    >>> deserialized_schunk = blosc2.schunk_from_cframe(serialized_schunk)
+    >>> start = 1000
+    >>> stop = 1005
+    >>> sl_bytes = deserialized_schunk[start:stop]
+    >>> sl = np.frombuffer(sl_bytes, dtype=np.int32)
+    >>> print("Slice from deserialized SChunk:", sl)
+    Slice from deserialized SChunk: [1000 1001 1002 1003 1004]
+    >>> expected_slice = data[start:stop]
+    >>> print("Expected slice:", expected_slice)
+    Expected slice: [1000 1001 1002 1003 1004]
     """
     return blosc2_ext.schunk_from_cframe(cframe, copy)
 

diff --git a/src/blosc2/schunk.py b/src/blosc2/schunk.py
@@ -429,8 +429,7 @@ def fill_special(self, nitems, special_value):
         Examples
         --------
         >>> import blosc2
-        >>> import numpy as np
-        >>> schunk = blosc2.SChunk(chunksize=200*1000*4)
+        >>> schunk = blosc2.SChunk()
         >>> # Fill the SChunk with the special value
         >>> nitems = 200 * 1000
         >>> print(f"Initial number of chunks: {len(schunk)}")
@@ -479,7 +478,7 @@ def decompress_chunk(self, nchunk, dst=None):
         Examples
         --------
         >>> import blosc2
-        >>> schunk = blosc2.SChunk(chunksize=11, cparams = {'typesize': 1})
+        >>> schunk = blosc2.SChunk(cparams = {'typesize': 1})
         >>> buffer = b"wermqeoir23"
         >>> print(schunk.append_data(buffer))
         1
@@ -518,14 +517,14 @@ def get_chunk(self, nchunk):
         >>> # Create an SChunk with 3 chunks
         >>> nchunks = 3
         >>> data = np.arange(200 * 1000 * nchunks, dtype=np.int32)
-        >>> schunk = blosc2.SChunk(chunksize=200 * 1000 * 4, data=data, cparams={"typesize": 4})
+        >>> schunk = blosc2.SChunk(data=data, cparams={"typesize": 4})
         >>> # Retrieve the first chunk (index 0)
         >>> chunk = schunk.get_chunk(0)
         >>> # Check the type and length of the compressed chunk
         >>> print(type(chunk))
         <class 'bytes'>
         >>> print(len(chunk))
-        3742 # The compressed size is smaller than the chunk size (200*1000*4)
+        10552
         """
         return super().get_chunk(nchunk)
 
@@ -772,20 +771,18 @@ def get_slice(self, start=0, stop=None, out=None):
         >>> nchunks = 4
         >>> chunk_size = 200 * 1000 * 4
         >>> data = np.arange(nchunks * chunk_size // 4, dtype=np.int32)
-        >>> schunk = blosc2.SChunk(chunksize=chunk_size, data=data, cparams={"typesize": 4})
+        >>> schunk = blosc2.SChunk(data=data, cparams={"typesize": 4})
         >>> # Define the slice parameters
         >>> start_index = 200 * 1000
         >>> stop_index = 2 * 200 * 1000
         >>> # Prepare an output buffer
         >>> slice_size = stop_index - start_index
         >>> out_buffer = bytearray(slice_size * 4)  # Ensure the buffer is large enough
         >>> result = schunk.get_slice(start=start_index, stop=stop_index, out=out_buffer)
-        >>> # Assert that the output buffer matches the expected slice from the original data
-        >>> assert out_buffer == data[start_index:stop_index].tobytes(), "Slice data does not match expected values."
         >>> # Convert bytearray to NumPy array for easier inspection
         >>> slice_array = np.frombuffer(out_buffer, dtype=np.int32)
         >>> print(f"Slice data: {slice_array[:10]} ...")  # Print the first 10 elements
-        Slice data: [100000 100001 100002 100003 100004 100005 100006 100007 100008 100009] ...
+        Slice data: [200000 200001 200002 200003 200004 200005 200006 200007 200008 200009] ...
         """
         return super().get_slice(start, stop, out)
 
@@ -828,9 +825,6 @@ def __getitem__(self, item):
         >>> sl = data[150:155]
         >>> # Use __getitem__ to retrieve the same slice of data from the SChunk
         >>> res = schunk[150:155]
-        >>> # Compare the result of __getitem__ with the original slice converted to bytes
-        >>> assert res == sl.tobytes(), "The result from __getitem__ does not match the original data."
-        >>> # Display the first few elements of the retrieved slice
         >>> print(f"Slice data: {np.frombuffer(res, dtype=np.int32)}")
         Slice data: [150 151 152 153 154]
         """
@@ -882,7 +876,7 @@ def __setitem__(self, key, value):
         >>> nchunks = 4
         >>> chunk_size = 200 * 1000 * 4
         >>> data = np.arange(nchunks * chunk_size // 4, dtype=np.int32)
-        >>> schunk = blosc2.SChunk(chunksize=chunk_size, data=data, cparams={"typesize": 4})
+        >>> schunk = blosc2.SChunk(data=data, cparams={"typesize": 4})
         >>> # Create a new array of values to update the slice (values from 1000 to 1999 multiplied by 2)
         >>> start_ = 1000
         >>> stop = 2000
@@ -891,10 +885,10 @@ def __setitem__(self, key, value):
         >>> schunk[start_:stop] = new_values
         >>> # Retrieve the updated slice using the slicing syntax
         >>> retrieved_slice = np.frombuffer(schunk[start_:stop], dtype=np.int32)
-        >>> # Compare the retrieved slice with the new values to ensure they match
-        >>> assert np.array_equal(retrieved_slice, new_values), "The updated slice does not match the new values."
-        >>> print("The slice comparison is successful!")
-        The slice comparison is successful!
+        >>> print("First 10 values of the updated slice:", retrieved_slice[:10])
+        First 10 values of the updated slice: [2000 2002 2004 2006 2008 2010 2012 2014 2016 2018]
+        >>> print("Last 10 values of the updated slice:", retrieved_slice[-10:])
+        Last 10 values of the updated slice: [3980 3982 3984 3986 3988 3990 3992 3994 3996 3998]
         """
         if key.step is not None and key.step != 1:
             raise IndexError("`step` must be 1")
@@ -920,19 +914,22 @@ def to_cframe(self):
         >>> nchunks = 4
         >>> chunk_size = 200 * 1000 * 4
         >>> data = np.arange(nchunks * chunk_size // 4, dtype=np.int32)
-        >>> schunk = blosc2.SChunk(chunksize=chunk_size, data=data, cparams={"typesize": 4})
+        >>> schunk = blosc2.SChunk(data=data, cparams={"typesize": 4})
         >>> # Serialize the SChunk instance to a bytes object
         >>> serialized_schunk = schunk.to_cframe()
         >>> print(f"Serialized SChunk length: {len(serialized_schunk)} bytes")
-        Serialized SChunk length: 15545 bytes
+        Serialized SChunk length: 14129 bytes
         >>> # Create a new SChunk from the serialized data
         >>> deserialized_schunk = blosc2.schunk_from_cframe(serialized_schunk)
-        >>> # Print a slice of the deserialized SChunk to verify
-        >>> start = 1000
-        >>> stop = 1005
-        >>> sl = deserialized_schunk[start:stop]
-        >>> res = schunk.get_slice(start, stop)
-        >>> assert res == sl
+        >>> start = 500
+        >>> stop = 505
+        >>> sl_bytes = deserialized_schunk[start:stop]
+        >>> sl = np.frombuffer(sl_bytes, dtype=np.int32)
+        >>> res = data[start:stop]
+        >>> print(f"Original slice: {res}")
+        Original slice: [500 501 502 503 504]
+        >>> print(f"Deserialized slice: {sl}")
+        Deserialized slice: [500 501 502 503 504]
         """
         return super().to_cframe()
 
@@ -955,17 +952,14 @@ def iterchunks(self, dtype):
         >>> import blosc2
         >>> import numpy as np
         >>> # Create sample data and an SChunk
-        >>> nchunks = 2     # Total data for 2 chunks
         >>> data = np.arange(400 * 1000, dtype=np.int32)
-        >>> schunk = blosc2.SChunk(chunksize=200*1000*4, data=data, cparams={"typesize": 4})
+        >>> schunk = blosc2.SChunk(data=data, cparams={"typesize": 4})
         >>> # Iterate over chunks using the iterchunks method
         >>> for chunk in schunk.iterchunks(dtype=np.int32):
         ...     print("Chunk shape:", chunk.shape)
         ...     print("First 5 elements of chunk:", chunk[:5])
-        Chunk shape: (200000,)
+        Chunk shape: (400000,)
         First 5 elements of chunk: [0 1 2 3 4]
-        Chunk shape: (200000,)
-        First 5 elements of chunk: [200000 200001 200002 200003 200004]
         """
         out = np.empty(self.chunkshape, dtype)
         for i in range(0, len(self), self.chunkshape):
@@ -998,20 +992,15 @@ def iterchunks_info(self):
         >>> import numpy as np
         >>> # Create sample data and an SChunk
         >>> data = np.arange(400 * 1000, dtype=np.int32)
-        >>> nchunks = 2 # Total data for 2 chunks
-        >>> schunk = blosc2.SChunk(chunksize=200*1000*4, data=data, cparams={"typesize": 4})
+        >>> schunk = blosc2.SChunk(data=data, cparams={"typesize": 4})
         >>> # Iterate over chunks and print detailed information
         >>> for chunk_info in schunk.iterchunks_info():
         ...     print(f"Chunk index: {chunk_info.nchunk}")
         ...     print(f"Compression ratio: {chunk_info.cratio:.2f}")
         ...     print(f"Special value: {chunk_info.special.name}")
         ...     print(f"Repeated value: {chunk_info.repeated_value[:10] if chunk_info.repeated_value else None}")
         Chunk index: 0
-        Compression ratio: 213.79
-        Special value: NOT_SPECIAL
-        Repeated value: None
-        Chunk index: 1
-        Compression ratio: 206.88
+        Compression ratio: 223.56
         Special value: NOT_SPECIAL
         Repeated value: None
         """
@@ -1110,12 +1099,11 @@ def remove_postfilter(self, func_name, _new_ctx=True):
         >>> import blosc2
         >>> import numpy as np
         >>> dtype = np.dtype(np.int32)
-        >>> chunk_size = 20_000 * input_dtype.itemsize
-        >>> storage = {"cparams": {"typesize": input_dtype.itemsize}, "dparams": {"nthreads": 1}}
+        >>> storage = {"cparams": {"typesize": dtype.itemsize}, "dparams": {"nthreads": 1}}
         >>> data = np.arange(500, dtype=np.int32)
-        >>> schunk = blosc2.SChunk(chunksize=chunk_size, data=data, **storage)
+        >>> schunk = blosc2.SChunk(data=data, **storage)
         >>> # Define the postfilter function
-        >>> @schunk.postfilter(input_dtype)
+        >>> @schunk.postfilter(dtype)
         ... def postfilter(input, output, offset):
         ...     output[:] = input + offset + np.arange(input.size)
         >>> out = np.empty(data.size, dtype=dtype)
@@ -1125,8 +1113,6 @@ def remove_postfilter(self, func_name, _new_ctx=True):
         >>> schunk.remove_postfilter('postfilter')
         >>> retrieved_data = np.empty(data.size, dtype=dtype)
         >>> schunk.get_slice(out=retrieved_data)
-        >>> # Compare the retrieved data to the original data
-        >>> assert np.array_equal(retrieved_data, data), "The data without postfilter does not match the original data."
         >>> print("Original data (first 8 elements):", data[:8])
         Original data (first 8 elements): [0 1 2 3 4 5 6 7]
         """
@@ -1293,33 +1279,25 @@ def remove_prefilter(self, func_name, _new_ctx=True):
         >>> import blosc2
         >>> import numpy as np
         >>> dtype = np.dtype(np.int32)
-        >>> chunk_size = 20_000 * dtype.itemsize
         >>> cparams = {"typesize": dtype.itemsize, "nthreads": 1}
         >>> data = np.arange(1000, dtype=np.int32)
         >>> output_dtype = np.float32
-        >>> schunk = blosc2.SChunk(chunksize=chunk_size, cparams=cparams)
+        >>> schunk = blosc2.SChunk(cparams=cparams)
         >>> # Define the prefilter function
-        >>> @schunk.prefilter(input_dtype, output_dtype)
+        >>> @schunk.prefilter(dtype, output_dtype)
         ... def prefilter(input, output, offset):
         ...     output[:] = input - np.pi
         >>> schunk[:1000] = data
-        >>> # Retrieve compressed data with prefilter applied
-        >>> compressed_data_with_filter = schunk.get_slice()
-        >>> # Convert the bytes to NumPy array for comparison
-        >>> compressed_array_with_filter = np.frombuffer(compressed_data_with_filter, dtype=output_dtype)
+        >>> # Retrieve and convert compressed data with the prefilter to a NumPy array.
+        >>> compressed_array_with_filter = np.frombuffer(schunk.get_slice(), dtype=output_dtype)
         >>> print("Compressed data with prefilter applied (first 8 elements):", compressed_array_with_filter[:8])
         Compressed data with prefilter applied (first 8 elements): [-3.1415927  -2.1415927  -1.1415926  -0.14159265  0.8584073   1.8584074
          2.8584073   3.8584073 ]
         >>> schunk.remove_prefilter('prefilter')
         >>> schunk[:1000] = data
-        >>> compressed_data_without_filter = schunk.get_slice()
-        >>> compressed_array_without_filter = np.frombuffer(compressed_data_without_filter, dtype=output_dtype)
+        >>> compressed_array_without_filter = np.frombuffer(schunk.get_slice(), dtype=dtype)
         >>> print("Compressed data without prefilter (first 8 elements):", compressed_array_without_filter[:8])
         Compressed data without prefilter (first 8 elements): [0 1 2 3 4 5 6 7]
-        >>> # Compare the decompressed data to the original data
-        >>> assert np.array_equal(compressed_array_without_filter, data), "The data without prefilter does not match the original data."
-        >>> print("The slice comparison is successful!")
-        The slice comparison is successful!
         """
         return super().remove_prefilter(func_name)