
Commit 051fe97

[SPARK-54305][SQL][PYTHON] Add admission control support to Python DataSource streaming API
This change adds admission control capabilities to the Python DataSource streaming API, bringing it to feature parity with the Scala SupportsAdmissionControl interface.

Changes include:
- Modified PythonMicroBatchStream to implement SupportsAdmissionControl
- Updated PythonStreamingSourceRunner to serialize ReadLimit to Python
- Enhanced python_streaming_source_runner.py to deserialize and pass parameters
- Extended DataSourceStreamReader.latestOffset() to accept start_offset and read_limit
- Added reportLatestOffset() method for monitoring
- Full backward compatibility maintained
- Added comprehensive unit tests
- Added example demonstrating admission control

This enables Python streaming sources to:
- Control microbatch sizes via the maxRecordsPerBatch option
- Implement rate limiting and backpressure
- Match capabilities of built-in Scala sources (Kafka, Delta)

JIRA: https://issues.apache.org/jira/browse/SPARK-54305
1 parent 551b922 commit 051fe97
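For orientation, the sketch below shows the reader-side surface this commit describes: latestOffset() accepting start_offset and read_limit, plus reportLatestOffset() for monitoring. It is illustrative only (the class name and the _poll_source() helper are made up for this sketch, and partitions()/read() are omitted); the committed example and documentation follow in the file changes below.

from pyspark.sql.datasource import DataSourceStreamReader

class CappedReader(DataSourceStreamReader):
    """Illustrative reader showing the extended admission-control hooks."""

    def initialOffset(self):
        return {"offset": 0}

    def latestOffset(self, start_offset=None, read_limit=None):
        # start_offset is None for the first batch; read_limit carries the
        # configured limit, e.g. {"type": "maxRows", "maxRows": 100}.
        start = 0 if start_offset is None else start_offset["offset"]
        available = self._poll_source()
        if read_limit and read_limit.get("type") == "maxRows":
            return {"offset": min(start + read_limit["maxRows"], available)}
        return {"offset": available}

    def reportLatestOffset(self):
        # True latest offset for monitoring, ignoring any configured limit.
        return {"offset": self._poll_source()}

    def _poll_source(self):
        # Stand-in for querying the real external system.
        return 1000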

8 files changed, +904 -69 lines changed

examples/src/main/python/sql/streaming/structured_blockchain_admission_control.py

Lines changed: 165 additions & 0 deletions
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""
Demonstrates admission control in Python streaming data sources.

This example implements a simple blockchain-like streaming source that generates
sequential blocks and shows how to use admission control to limit batch sizes.

Usage: structured_blockchain_admission_control.py [<max-blocks-per-batch>]
    <max-blocks-per-batch> Maximum number of blocks to process per microbatch (default: 10)

Run the example:
    `$ bin/spark-submit examples/src/main/python/sql/streaming/structured_blockchain_admission_control.py 5`

The example will process blocks in controlled batches of 5, demonstrating admission control.
"""
import sys
import time

from pyspark.sql import SparkSession
from pyspark.sql.datasource import DataSource, DataSourceStreamReader, InputPartition


class SimpleBlockchainReader(DataSourceStreamReader):
    """A simple streaming source that generates sequential blockchain blocks."""

    def __init__(self, max_block=1000):
        self.max_block = max_block
        self.current_block = 0

    def initialOffset(self):
        """Start from block 0."""
        return {"block": self.current_block}

    def latestOffset(self, start_offset=None, read_limit=None):
        """
        Return the latest offset, respecting admission control limits.

        This demonstrates the key admission control pattern:
        - Without a limit: process all available blocks
        - With a maxRows limit: cap the end block to respect the batch size
        """
        # Determine where we are now
        if start_offset is None:
            start_block = self.current_block
        else:
            start_block = start_offset["block"]

        # Simulate blockchain growth - advance by 20 blocks each time
        latest_available = min(start_block + 20, self.max_block)

        # Apply admission control if configured
        if read_limit and read_limit.get("type") == "maxRows":
            max_blocks = read_limit["maxRows"]
            # Cap at the configured limit
            end_block = min(start_block + max_blocks, latest_available)
            print(f"  [Admission Control] Start: {start_block}, Available: {latest_available}, "
                  f"Capped: {end_block} (limit: {max_blocks})")
        else:
            # No limit - process all available blocks
            end_block = latest_available
            print(f"  [No Limit] Start: {start_block}, End: {end_block}")

        return {"block": end_block}

    def reportLatestOffset(self):
        """Report the true latest block for monitoring."""
        # In a real implementation, this would query the actual blockchain
        return {"block": min(self.current_block + 20, self.max_block)}

    def partitions(self, start, end):
        """Create a single partition for the block range."""
        start_block = start["block"]
        end_block = end["block"]
        return [InputPartition(f"{start_block}:{end_block}".encode())]

    def read(self, partition):
        """Generate block data for the partition."""
        # Parse the block range
        range_str = partition.value.decode()
        start_block, end_block = map(int, range_str.split(":"))

        # Generate block data
        for block_num in range(start_block, end_block):
            # Simulate block data: block number, timestamp, simple hash
            yield (
                block_num,
                int(time.time() * 1000),
                f"0x{'0' * 60}{block_num:04x}"
            )


class SimpleBlockchainSource(DataSource):
    """Data source for simple blockchain streaming."""

    @classmethod
    def name(cls):
        return "simple_blockchain"

    def schema(self):
        return "block_number INT, timestamp LONG, block_hash STRING"

    def streamReader(self, schema):
        return SimpleBlockchainReader(max_block=1000)


if __name__ == "__main__":
    max_blocks_per_batch = int(sys.argv[1]) if len(sys.argv) > 1 else 10

    print(f"""
=================================================================
Blockchain Streaming with Admission Control
=================================================================
Configuration:
  - Max blocks per batch: {max_blocks_per_batch}
  - Total blocks to generate: 1000

Watch how admission control limits each microbatch to process
only {max_blocks_per_batch} blocks at a time, even when more data is available.
=================================================================
""")

    spark = SparkSession \
        .builder \
        .appName("StructuredBlockchainAdmissionControl") \
        .getOrCreate()

    # Register the custom data source
    spark.dataSource.register(SimpleBlockchainSource)

    # Create a streaming DataFrame with admission control
    blocks = spark \
        .readStream \
        .format("simple_blockchain") \
        .option("maxRecordsPerBatch", str(max_blocks_per_batch)) \
        .load()

    # Write each microbatch of blocks to the console
    query = blocks \
        .writeStream \
        .outputMode("append") \
        .format("console") \
        .option("numRows", "20") \
        .option("truncate", "false") \
        .trigger(processingTime="3 seconds") \
        .start()

    query.awaitTermination()

python/docs/source/tutorial/sql/python_data_source.rst

Lines changed: 162 additions & 0 deletions
        for i in range(start, end):
            yield (i, str(i))

Implementing Admission Control for Streaming Sources
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Admission control allows streaming sources to control batch sizes, ensuring predictable resource usage
and preventing overload. This is essential for production systems that need to respect API rate limits,
control memory usage, or maintain consistent processing times.

**Understanding the Problem**

Without admission control, a streaming source must process all available data in each microbatch,
which can lead to:

- Unpredictable batch sizes
- Memory exhaustion during data bursts
- Inability to honor external API rate limits
- Difficult failure recovery

**The Solution: Parameters in latestOffset()**

The :meth:`DataSourceStreamReader.latestOffset` method now accepts optional parameters:

- ``start_offset`` (dict): The current stream position
- ``read_limit`` (dict): Configured batch size limits

These parameters enable sources to return capped offsets that respect the configured limits.

**Example: Blockchain Source with Admission Control**

Here is a complete example showing how to implement admission control for a blockchain-like streaming source:

.. code-block:: python

    from pyspark.sql.datasource import DataSource, DataSourceStreamReader, InputPartition

    class BlockchainStreamReader(DataSourceStreamReader):
        def __init__(self, initial_block=0, max_block=10000):
            self.initial_block = initial_block
            self.max_block = max_block

        def initialOffset(self) -> dict:
            return {"block": self.initial_block}

        def latestOffset(self, start_offset=None, read_limit=None) -> dict:
            """
            Return a capped offset respecting admission control limits.
            """
            # Determine the current position
            if start_offset is None:
                start_block = self.initial_block
            else:
                start_block = start_offset["block"]

            # Get the latest available block from the blockchain
            latest_available = self.get_chain_head()  # e.g., returns 5000

            # Apply admission control if configured
            if read_limit and read_limit.get("type") == "maxRows":
                max_blocks = read_limit["maxRows"]
                # Cap the end block to respect the batch size limit
                end_block = min(start_block + max_blocks, latest_available)
            else:
                # No limit - process all available blocks
                end_block = latest_available

            return {"block": end_block}

        def reportLatestOffset(self) -> dict:
            """
            Report the true latest offset for monitoring (without limits applied).
            """
            return {"block": self.get_chain_head()}

        def partitions(self, start: dict, end: dict):
            start_block = start["block"]
            end_block = end["block"]
            # Create a partition for the block range
            return [InputPartition(f"{start_block}:{end_block}".encode())]

        def read(self, partition):
            # Fetch and yield block data for the range
            range_str = partition.value.decode()
            start_block, end_block = map(int, range_str.split(":"))
            for block_num in range(start_block, end_block):
                yield self.fetch_block(block_num)  # Returns a block tuple

**Configuring Admission Control**

Use the ``maxRecordsPerBatch`` option when reading from the stream:

.. code-block:: python

    # Process at most 50 blocks per microbatch
    df = spark.readStream \
        .format("blockchain") \
        .option("maxRecordsPerBatch", "50") \
        .load()

    query = df.writeStream \
        .format("console") \
        .start()

**Read Limit Types**

The ``read_limit`` parameter supports several limit types:

.. code-block:: python

    # Maximum rows/records
    {"type": "maxRows", "maxRows": 1000}

    # Maximum files (for file-based sources)
    {"type": "maxFiles", "maxFiles": 10}

    # Maximum data size in bytes
    {"type": "maxBytes", "maxBytes": 10485760}

    # Minimum rows with timeout (for low-throughput sources)
    {"type": "minRows", "minRows": 100, "maxTriggerDelayMs": 30000}

    # Process all available data
    {"type": "allAvailable"}

    # Composite limits (multiple constraints)
    {"type": "composite", "limits": [
        {"type": "minRows", "minRows": 100, "maxTriggerDelayMs": 30000},
        {"type": "maxRows", "maxRows": 1000}
    ]}

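A reader should only act on the limit types it actually supports (see the best practices below). As an illustrative sketch of that validation pattern for the blockchain reader above (the fallback policy for unsupported types is just one possible choice), ``latestOffset`` could dispatch on the ``type`` field:

.. code-block:: python

    def latestOffset(self, start_offset=None, read_limit=None) -> dict:
        start_block = self.initial_block if start_offset is None else start_offset["block"]
        latest_available = self.get_chain_head()

        limit_type = read_limit.get("type") if read_limit else None
        if limit_type == "maxRows":
            # Row-based cap: never move past what is actually available.
            end_block = min(start_block + read_limit["maxRows"], latest_available)
        elif limit_type == "composite":
            # Honor the tightest row-based constraint among the nested limits.
            row_caps = [lim["maxRows"] for lim in read_limit["limits"] if lim.get("type") == "maxRows"]
            end_block = min(start_block + min(row_caps), latest_available) if row_caps else latest_available
        else:
            # allAvailable, no limit, or a type this source does not support:
            # fall back to processing everything available.
            end_block = latest_available
        return {"block": end_block}
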
**Best Practices**

1. **Always handle None parameters**: For the first batch, ``start_offset`` will be None
2. **Use min() for capping**: Ensure you don't exceed the available data
3. **Implement reportLatestOffset()**: Helps with monitoring when rate limiting is active
4. **Validate limit types**: Check that the limit type is one your source supports
5. **Test both modes**: Verify your source works with and without limits (see the sketch after this list)

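Because ``latestOffset`` is a plain Python method operating on dictionaries, both modes can be exercised directly in a unit test. A minimal sketch, assuming the ``BlockchainStreamReader`` from the example above with ``get_chain_head()`` stubbed to a fixed value:

.. code-block:: python

    class FakeChainReader(BlockchainStreamReader):
        # Test double: a fixed chain head makes the capping logic deterministic.
        def get_chain_head(self):
            return 500

    reader = FakeChainReader(initial_block=0)

    # Without a limit, the reader advances to everything available.
    assert reader.latestOffset({"block": 0}, None) == {"block": 500}

    # With a maxRows limit, the end offset is capped.
    limited = reader.latestOffset({"block": 0}, {"type": "maxRows", "maxRows": 50})
    assert limited == {"block": 50}
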
**Backward Compatibility**

For backward compatibility, the parameters are optional. Existing sources that don't implement
admission control can use the old signature and will continue to work. The framework automatically
detects which signature is implemented:

.. code-block:: python

    def latestOffset(self):
        # Old signature still works - the framework detects this automatically
        return {"offset": self.get_latest()}

New sources can opt in to admission control by accepting the parameters:

.. code-block:: python

    def latestOffset(self, start_offset=None, read_limit=None):
        # New signature with admission control
        if read_limit and read_limit.get("type") == "maxRows":
            # Apply the limit
            ...
        return {"offset": ...}

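As an aside, signature detection of this kind can be done with :mod:`inspect`; the sketch below is purely illustrative and not necessarily how Spark implements it:

.. code-block:: python

    import inspect

    def accepts_admission_control(reader):
        # True if latestOffset() declares the start_offset / read_limit parameters.
        params = inspect.signature(reader.latestOffset).parameters
        return {"start_offset", "read_limit"} <= set(params)
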
Note: ``SimpleDataSourceStreamReader`` does not support admission control. If you need
admission control, use ``DataSourceStreamReader`` instead.

Alternative: Implement a Simple Streaming Reader
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
