This is a dummy streaming data reader that generates 2 rows in every microbatch.

            for i in range(start, end):
                yield (i, str(i))

Implementing Admission Control for Streaming Sources
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Admission control allows streaming sources to control batch sizes, ensuring predictable resource usage
and preventing overload. This is essential for production systems that need to respect API rate limits,
control memory usage, or maintain consistent processing times.

**Understanding the Problem**

Without admission control, a streaming source must process all available data in each microbatch,
which can lead to:

- Unpredictable batch sizes
- Memory exhaustion during data bursts
- Inability to honor external API rate limits
- Difficult failure recovery

**The Solution: Parameters in latestOffset()**

The :meth:`DataSourceStreamReader.latestOffset` method now accepts optional parameters:

- ``start_offset`` (dict): The current stream position
- ``read_limit`` (dict): Configured batch size limits

These parameters enable sources to return capped offsets that respect the configured limits.

**Example: Blockchain Source with Admission Control**

Here's a complete example showing how to implement admission control for a blockchain-like streaming source:

.. code-block:: python

    from pyspark.sql.datasource import DataSource, DataSourceStreamReader, InputPartition

    class BlockchainStreamReader(DataSourceStreamReader):
        def __init__(self, initial_block=0, max_block=10000):
            self.initial_block = initial_block
            self.max_block = max_block

        def initialOffset(self) -> dict:
            return {"block": self.initial_block}

        def latestOffset(self, start_offset=None, read_limit=None) -> dict:
            """
            Return a capped offset respecting admission control limits.
            """
            # Determine the current position
            if start_offset is None:
                start_block = self.initial_block
            else:
                start_block = start_offset["block"]

            # Get the latest available block from the blockchain
            latest_available = self.get_chain_head()  # e.g., returns 5000

            # Apply admission control if configured
            if read_limit and read_limit.get("type") == "maxRows":
                max_blocks = read_limit["maxRows"]
                # Cap the end block to respect the batch size limit
                end_block = min(start_block + max_blocks, latest_available)
            else:
                # No limit - process all available blocks
                end_block = latest_available

            return {"block": end_block}

        def reportLatestOffset(self) -> dict:
            """
            Report the true latest offset for monitoring (without limits applied).
            """
            return {"block": self.get_chain_head()}

        def partitions(self, start: dict, end: dict):
            start_block = start["block"]
            end_block = end["block"]
            # Create partitions for the block range
            return [InputPartition(f"{start_block}:{end_block}".encode())]

        def read(self, partition):
            # Fetch and yield block data
            range_str = partition.value.decode()
            start_block, end_block = map(int, range_str.split(":"))
            for block_num in range(start_block, end_block):
                yield self.fetch_block(block_num)  # Returns a block tuple
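
Before ``format("blockchain")`` in the next snippet can resolve to this reader, it has to be wrapped
in a :class:`DataSource` and registered. A minimal sketch of that wiring, assuming a hypothetical
two-column block schema (the ``initialBlock`` option and the field names are illustrative):

.. code-block:: python

    class BlockchainDataSource(DataSource):
        @classmethod
        def name(cls):
            # Short name used with spark.readStream.format(...)
            return "blockchain"

        def schema(self):
            # Hypothetical schema for the tuples yielded by fetch_block()
            return "block_number int, block_hash string"

        def streamReader(self, schema):
            # Options arrive as strings; "initialBlock" is illustrative, not a built-in
            initial = int(self.options.get("initialBlock", "0"))
            return BlockchainStreamReader(initial_block=initial)

    # Register the source on the active session
    spark.dataSource.register(BlockchainDataSource)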

**Configuring Admission Control**

Use the ``maxRecordsPerBatch`` option when reading from the stream:

.. code-block:: python

    # Process at most 50 blocks per microbatch
    df = spark.readStream \
        .format("blockchain") \
        .option("maxRecordsPerBatch", "50") \
        .load()

    query = df.writeStream \
        .format("console") \
        .start()
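
The record cap composes with the usual trigger settings; for example, a processing-time trigger also
bounds how often a capped microbatch can fire (the 10-second interval here is illustrative):

.. code-block:: python

    # At most 50 blocks per microbatch, and a new microbatch at most every 10 seconds
    query = df.writeStream \
        .format("console") \
        .trigger(processingTime="10 seconds") \
        .start()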

**Read Limit Types**

The ``read_limit`` parameter supports several limit types:

.. code-block:: python

    # Maximum rows/records
    {"type": "maxRows", "maxRows": 1000}

    # Maximum files (for file-based sources)
    {"type": "maxFiles", "maxFiles": 10}

    # Maximum data size in bytes
    {"type": "maxBytes", "maxBytes": 10485760}

    # Minimum rows with timeout (for low-throughput sources)
    {"type": "minRows", "minRows": 100, "maxTriggerDelayMs": 30000}

    # Process all available data
    {"type": "allAvailable"}

    # Composite limits (multiple constraints)
    {"type": "composite", "limits": [
        {"type": "minRows", "minRows": 100, "maxTriggerDelayMs": 30000},
        {"type": "maxRows", "maxRows": 1000},
    ]}
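
The blockchain example above only handles ``maxRows``. A source that wants to honor several of these
shapes can normalize the dict into a single row cap first. A sketch of such a helper
(``resolve_row_cap`` is an illustrative name, not part of the API):

.. code-block:: python

    def resolve_row_cap(read_limit):
        """Translate a read_limit dict into a row cap, or None for unbounded."""
        if read_limit is None or read_limit.get("type") == "allAvailable":
            return None  # no cap - admit everything available
        if read_limit.get("type") == "maxRows":
            return read_limit["maxRows"]
        if read_limit.get("type") == "composite":
            # The tightest row cap among the nested limits wins
            caps = [resolve_row_cap(limit) for limit in read_limit["limits"]]
            caps = [cap for cap in caps if cap is not None]
            return min(caps) if caps else None
        # minRows and other types are not row caps; handle them separately
        return None

    # Usage inside latestOffset(), replacing the maxRows-only branch:
    #     cap = resolve_row_cap(read_limit)
    #     if cap is not None:
    #         end_block = min(start_block + cap, latest_available)
    #     else:
    #         end_block = latest_available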

**Best Practices**

1. **Always handle None parameters**: For the first batch, ``start_offset`` will be ``None``
2. **Use min() for capping**: Ensure you don't exceed the available data
3. **Implement reportLatestOffset()**: Helps with monitoring when rate limiting is active
4. **Validate limit types**: Check that the limit type is one your source supports
5. **Test both modes**: Verify your source works with and without limits, as sketched below
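
Because ``latestOffset`` is a plain Python method, both modes can be unit tested without starting a
streaming query. A sketch against the ``BlockchainStreamReader`` above, with the (hypothetical)
``get_chain_head`` helper stubbed out:

.. code-block:: python

    reader = BlockchainStreamReader(initial_block=0)
    reader.get_chain_head = lambda: 5000  # stub the chain head for the test

    # With a limit: the returned offset is capped at start + maxRows
    capped = reader.latestOffset({"block": 100}, {"type": "maxRows", "maxRows": 50})
    assert capped == {"block": 150}

    # Without a limit, and from a None start: everything available is admitted
    assert reader.latestOffset(None, None) == {"block": 5000}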

**Backward Compatibility**

For backward compatibility, the parameters are optional. Existing sources that don't implement
admission control can use the old signature and will continue to work. The framework automatically
detects which signature is implemented:

.. code-block:: python

    def latestOffset(self):
        # Old signature still works - the framework detects this automatically
        return {"offset": self.get_latest()}

New sources can opt in to admission control by accepting the parameters:

.. code-block:: python

    def latestOffset(self, start_offset=None, read_limit=None):
        # New signature with admission control
        if read_limit and read_limit.get("type") == "maxRows":
            # Apply the limit
            ...
        return {"offset": ...}
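
Signature detection like this is typically done by inspecting the method's parameters. A sketch of
one plausible mechanism (illustrative only, not necessarily how Spark implements it):

.. code-block:: python

    import inspect

    def supports_admission_control(reader):
        # On a bound method, inspect.signature() already excludes 'self':
        # an old-style latestOffset() has zero parameters, the new-style one has two.
        return len(inspect.signature(reader.latestOffset).parameters) >= 2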

Note: ``SimpleDataSourceStreamReader`` does not support admission control. If you need
admission control, use ``DataSourceStreamReader`` instead.

Alternative: Implement a Simple Streaming Reader
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~