-
Notifications
You must be signed in to change notification settings - Fork 170
[runtime/buffer] avoid write-locking the Append buffer during blob writes #2679
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
+161
−39
Merged
Changes from 2 commits
Commits
Show all changes
4 commits
Select commit
Hold shift + click to select a range
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
|
|
@@ -7,10 +7,20 @@ use std::{num::NonZeroUsize, sync::Arc}; | |||||
|
|
||||||
| /// A [Blob] wrapper that supports appending new data that is both read and write cached, and | ||||||
| /// provides buffer-pool managed read caching of older data. | ||||||
| /// | ||||||
| /// # Concurrent Access | ||||||
| /// | ||||||
| /// This implementation allows readers to proceed while flush I/O is in progress, as long as they | ||||||
| /// are reading from the write buffer or the pool cache. Readers that need to access data from the | ||||||
| /// underlying blob (cache miss) will wait for any in-progress write to complete. | ||||||
| /// | ||||||
| /// The implementation involves two locks: one for the write buffer (and blob size metadata), and | ||||||
| /// one for the underlying blob itself. To avoid deadlocks, the buffer lock is always acquired | ||||||
| /// before the blob lock. | ||||||
| #[derive(Clone)] | ||||||
| pub struct Append<B: Blob> { | ||||||
| /// The underlying blob being wrapped. | ||||||
| blob: B, | ||||||
| /// The underlying blob being wrapped, protected by a lock for I/O coordination. | ||||||
| blob: Arc<RwLock<B>>, | ||||||
|
|
||||||
| /// Unique id assigned by the buffer pool. | ||||||
| id: u64, | ||||||
|
|
@@ -58,7 +68,7 @@ impl<B: Blob> Append<B> { | |||||
| } | ||||||
|
|
||||||
| Ok(Self { | ||||||
| blob, | ||||||
| blob: Arc::new(RwLock::new(blob)), | ||||||
| id: pool_ref.next_id().await, | ||||||
| pool_ref, | ||||||
| buffer: Arc::new(RwLock::new((buffer, size))), | ||||||
|
|
@@ -71,7 +81,8 @@ impl<B: Blob> Append<B> { | |||||
| let buf = buf.into(); | ||||||
|
|
||||||
| // Acquire a write lock on the buffer and blob_size. | ||||||
| let (buffer, blob_size) = &mut *self.buffer.write().await; | ||||||
| let mut guard = self.buffer.write().await; | ||||||
| let (buffer, _) = &mut *guard; | ||||||
|
|
||||||
| // Ensure the write doesn't overflow. | ||||||
| buffer | ||||||
|
|
@@ -81,7 +92,7 @@ impl<B: Blob> Append<B> { | |||||
|
|
||||||
| if buffer.append(buf.as_ref()) { | ||||||
| // Buffer is over capacity, flush it to the underlying blob. | ||||||
| return self.flush(buffer, blob_size).await; | ||||||
| return self.flush_internal(guard).await; | ||||||
| } | ||||||
|
|
||||||
| Ok(()) | ||||||
|
|
@@ -97,16 +108,54 @@ impl<B: Blob> Append<B> { | |||||
|
|
||||||
| /// Flush the append buffer to the underlying blob, caching each page worth of written data in | ||||||
| /// the buffer pool. | ||||||
| async fn flush(&self, buffer: &mut Buffer, blob_size: &mut u64) -> Result<(), Error> { | ||||||
| // Take the buffered data, if any. | ||||||
| let Some((mut buf, offset)) = buffer.take() else { | ||||||
| /// | ||||||
| /// This method acquires the blob write lock before releasing the buffer lock, ensuring readers | ||||||
| /// that need blob access will wait for the write to complete. | ||||||
| async fn flush_internal( | ||||||
| &self, | ||||||
| mut guard: crate::RwLockWriteGuard<'_, (Buffer, u64)>, | ||||||
| ) -> Result<(), Error> { | ||||||
| let (buffer, blob_size) = &mut *guard; | ||||||
|
|
||||||
| // Prepare the data to be written. | ||||||
| let Some((buf, write_offset)) = self.prepare_flush_data(buffer, *blob_size).await else { | ||||||
| return Ok(()); | ||||||
| }; | ||||||
|
|
||||||
| // Insert the flushed data into the buffer pool. This step isn't just to ensure recently | ||||||
| // written data remains cached for future reads, but is in fact required to purge | ||||||
| // potentially stale cache data which might result from the edge case of rewinding a | ||||||
| // blob across a page boundary. | ||||||
| // Update blob_size *before* releasing the lock. We do this optimistically; if the write | ||||||
| // fails below, the program will return an error and likely abort/panic, so maintaining | ||||||
| // exact consistency on error isn't strictly required. | ||||||
| *blob_size = write_offset + buf.len() as u64; | ||||||
|
|
||||||
| // Acquire blob write lock BEFORE releasing buffer lock. This ensures no reader can access | ||||||
| // the blob until the write completes. | ||||||
| let blob_guard = self.blob.write().await; | ||||||
|
|
||||||
| // Release buffer lock, allowing concurrent buffered reads while the write is in progress. | ||||||
| // Any attempts to read from the blob will block until the write completes. | ||||||
| drop(guard); | ||||||
|
|
||||||
| // Perform the write while holding only blob lock. | ||||||
| blob_guard.write_at(buf, write_offset).await?; | ||||||
|
|
||||||
| Ok(()) | ||||||
| } | ||||||
|
|
||||||
| /// Prepares data from the buffer to be flushed to the blob. | ||||||
| /// | ||||||
| /// This method: | ||||||
| /// 1. Takes the data from the write buffer. | ||||||
| /// 2. Caches it in the buffer pool. | ||||||
| /// 3. Returns the data to be written and the offset to write it at (if any). | ||||||
| async fn prepare_flush_data( | ||||||
| &self, | ||||||
| buffer: &mut Buffer, | ||||||
| blob_size: u64, | ||||||
| ) -> Option<(Vec<u8>, u64)> { | ||||||
| // Take the buffered data, if any. | ||||||
| let (mut buf, offset) = buffer.take()?; | ||||||
|
|
||||||
| // Insert the flushed data into the buffer pool. | ||||||
| let remaining = self.pool_ref.cache(self.id, &buf, offset).await; | ||||||
|
|
||||||
| // If there's any data left over that doesn't constitute an entire page, re-buffer it into | ||||||
|
|
@@ -121,22 +170,20 @@ impl<B: Blob> Append<B> { | |||||
|
|
||||||
| // Early exit if there's no new data to write. | ||||||
| if new_data_start >= buf.len() { | ||||||
| return Ok(()); | ||||||
| return None; | ||||||
| } | ||||||
|
|
||||||
| if new_data_start > 0 { | ||||||
| buf.drain(0..new_data_start); | ||||||
| } | ||||||
| let new_data_len = buf.len() as u64; | ||||||
| self.blob.write_at(buf, *blob_size).await?; | ||||||
| *blob_size += new_data_len; | ||||||
|
|
||||||
| Ok(()) | ||||||
| // Return the data to write, and the offset where to write it within the blob. | ||||||
| Some((buf, blob_size)) | ||||||
roberto-bayardo marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||||||
| } | ||||||
|
|
||||||
| /// Clones and returns the underlying blob. | ||||||
| pub fn clone_blob(&self) -> B { | ||||||
| self.blob.clone() | ||||||
| pub async fn clone_blob(&self) -> B { | ||||||
| self.blob.read().await.clone() | ||||||
| } | ||||||
| } | ||||||
|
|
||||||
|
|
@@ -155,7 +202,8 @@ impl<B: Blob> Blob for Append<B> { | |||||
| .ok_or(Error::OffsetOverflow)?; | ||||||
|
|
||||||
| // Acquire a read lock on the buffer. | ||||||
| let (buffer, _) = &*self.buffer.read().await; | ||||||
| let guard = self.buffer.read().await; | ||||||
| let (buffer, _) = &*guard; | ||||||
|
|
||||||
| // If the data required is beyond the size of the blob, return an error. | ||||||
| if end_offset > buffer.size() { | ||||||
|
|
@@ -164,13 +212,40 @@ impl<B: Blob> Blob for Append<B> { | |||||
|
|
||||||
| // Extract any bytes from the buffer that overlap with the requested range. | ||||||
| let remaining = buffer.extract(buf.as_mut(), offset); | ||||||
|
|
||||||
| // Release buffer lock before potential I/O. | ||||||
| drop(guard); | ||||||
|
|
||||||
| if remaining == 0 { | ||||||
| return Ok(buf); | ||||||
| } | ||||||
|
|
||||||
| // If there are bytes remaining to be read, use the buffer pool to get them. | ||||||
| // Fast path: try to read only from pool cache without acquiring blob lock. This allows | ||||||
roberto-bayardo marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||||||
| // concurrent reads even while a flush is in progress. | ||||||
| let cached = self | ||||||
| .pool_ref | ||||||
| .read_cached(self.id, &mut buf.as_mut()[..remaining], offset) | ||||||
| .await; | ||||||
|
|
||||||
| if cached == remaining { | ||||||
| // All bytes found in cache. | ||||||
| return Ok(buf); | ||||||
| } | ||||||
|
|
||||||
| // Slow path: cache miss (partial or full), acquire blob read lock to ensure any in-flight | ||||||
| // write completes before we read from the blob. | ||||||
| let blob_guard = self.blob.read().await; | ||||||
|
|
||||||
| // Read remaining bytes that were not already obtained from the earlier cache read. | ||||||
| let uncached_offset = offset + cached as u64; | ||||||
| let uncached_len = remaining - cached; | ||||||
| self.pool_ref | ||||||
| .read(&self.blob, self.id, &mut buf.as_mut()[..remaining], offset) | ||||||
| .read( | ||||||
| &*blob_guard, | ||||||
| self.id, | ||||||
| &mut buf.as_mut()[cached..cached + uncached_len], | ||||||
| uncached_offset, | ||||||
| ) | ||||||
roberto-bayardo marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||
| .await?; | ||||||
|
|
||||||
| Ok(buf) | ||||||
|
|
@@ -184,11 +259,15 @@ impl<B: Blob> Blob for Append<B> { | |||||
| } | ||||||
|
|
||||||
| async fn sync(&self) -> Result<(), Error> { | ||||||
| // Flush any buffered data. When flush_internal returns, the write_at has completed | ||||||
| // and data is in the OS buffer. | ||||||
| { | ||||||
| let (buffer, blob_size) = &mut *self.buffer.write().await; | ||||||
| self.flush(buffer, blob_size).await?; | ||||||
| let guard = self.buffer.write().await; | ||||||
| self.flush_internal(guard).await?; | ||||||
| } | ||||||
| self.blob.sync().await | ||||||
| // Sync the OS buffer to disk. We need the blob read lock here since sync() requires | ||||||
| // access to the blob, but only a read lock since we're not modifying blob state. | ||||||
| self.blob.read().await.sync().await | ||||||
| } | ||||||
|
|
||||||
| /// Resize the blob to the provided `size`. | ||||||
|
|
@@ -199,17 +278,30 @@ impl<B: Blob> Blob for Append<B> { | |||||
| // always updated should the blob grow back to the point where we have new data for the same | ||||||
| // page, if any old data hasn't expired naturally by then. | ||||||
|
|
||||||
| // Acquire a write lock on the buffer. | ||||||
| let (buffer, blob_size) = &mut *self.buffer.write().await; | ||||||
| // Acquire buffer lock first. | ||||||
| // NOTE: We MUST acquire the buffer lock before the blob lock to avoid deadlocks with | ||||||
| // `append`, which acquires buffer then blob (via `flush_internal`). | ||||||
| let mut guard = self.buffer.write().await; | ||||||
|
||||||
| let mut guard = self.buffer.write().await; | |
| let mut buf_guard = self.buffer.write().await; |
Collaborator
Author
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
done
roberto-bayardo marked this conversation as resolved.
Show resolved
Hide resolved
roberto-bayardo marked this conversation as resolved.
Show resolved
Hide resolved
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.