-
Notifications
You must be signed in to change notification settings - Fork 265
even faster unlock in contention #462
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
|
|
@@ -22,6 +22,9 @@ pub(crate) const TOKEN_NORMAL: UnparkToken = UnparkToken(0); | |||||
| // thread directly without unlocking it. | ||||||
| pub(crate) const TOKEN_HANDOFF: UnparkToken = UnparkToken(1); | ||||||
|
|
||||||
| // UnparkToken used to indicate that the waiter should restore PARKED_BIT. | ||||||
|
Owner
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||
| pub(crate) const TOKEN_RESTORE_PARKED_BIT: UnparkToken = UnparkToken(2); | ||||||
|
|
||||||
| /// This bit is set in the `state` of a `RawMutex` when that mutex is locked by some thread. | ||||||
| const LOCKED_BIT: u8 = 0b01; | ||||||
| /// This bit is set in the `state` of a `RawMutex` just before parking a thread. A thread is being | ||||||
|
|
@@ -69,7 +72,7 @@ unsafe impl lock_api::RawMutex for RawMutex { | |||||
| .compare_exchange_weak(0, LOCKED_BIT, Ordering::Acquire, Ordering::Relaxed) | ||||||
| .is_err() | ||||||
| { | ||||||
| self.lock_slow(None); | ||||||
| self.lock_slow(None, false); | ||||||
| } | ||||||
| unsafe { deadlock::acquire_resource(self as *const _ as usize) }; | ||||||
| } | ||||||
|
|
@@ -99,11 +102,8 @@ unsafe impl lock_api::RawMutex for RawMutex { | |||||
| #[inline] | ||||||
| unsafe fn unlock(&self) { | ||||||
| deadlock::release_resource(self as *const _ as usize); | ||||||
| if self | ||||||
| .state | ||||||
| .compare_exchange(LOCKED_BIT, 0, Ordering::Release, Ordering::Relaxed) | ||||||
| .is_ok() | ||||||
| { | ||||||
| let prev = self.state.swap(0, Ordering::Release); | ||||||
| if prev == LOCKED_BIT { | ||||||
| return; | ||||||
| } | ||||||
| self.unlock_slow(false); | ||||||
|
|
@@ -151,7 +151,7 @@ unsafe impl lock_api::RawMutexTimed for RawMutex { | |||||
| { | ||||||
| true | ||||||
| } else { | ||||||
| self.lock_slow(Some(timeout)) | ||||||
| self.lock_slow(Some(timeout), false) | ||||||
| }; | ||||||
| if result { | ||||||
| unsafe { deadlock::acquire_resource(self as *const _ as usize) }; | ||||||
|
|
@@ -168,7 +168,7 @@ unsafe impl lock_api::RawMutexTimed for RawMutex { | |||||
| { | ||||||
| true | ||||||
| } else { | ||||||
| self.lock_slow(util::to_deadline(timeout)) | ||||||
| self.lock_slow(util::to_deadline(timeout), false) | ||||||
| }; | ||||||
| if result { | ||||||
| unsafe { deadlock::acquire_resource(self as *const _ as usize) }; | ||||||
|
|
@@ -199,23 +199,27 @@ impl RawMutex { | |||||
| } | ||||||
| } | ||||||
|
|
||||||
| // Used by Condvar when requeuing threads to us, must be called while | ||||||
| // holding the queue lock. | ||||||
| #[inline] | ||||||
| pub(crate) fn mark_parked(&self) { | ||||||
| self.state.fetch_or(PARKED_BIT, Ordering::Relaxed); | ||||||
| pub(crate) fn lock_contention(&self) { | ||||||
| self.lock_slow(None, true); | ||||||
| } | ||||||
|
|
||||||
| #[cold] | ||||||
| fn lock_slow(&self, timeout: Option<Instant>) -> bool { | ||||||
| fn lock_slow(&self, timeout: Option<Instant>, in_contention: bool) -> bool { | ||||||
|
Owner
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Suggested change
I think |
||||||
| let mut spinwait = SpinWait::new(); | ||||||
| let mut state = self.state.load(Ordering::Relaxed); | ||||||
| let mut extra_flags; | ||||||
| if in_contention { | ||||||
| extra_flags = PARKED_BIT; | ||||||
| } else { | ||||||
| extra_flags = 0; | ||||||
| } | ||||||
| loop { | ||||||
| // Grab the lock if it isn't locked, even if there is a queue on it | ||||||
| if state & LOCKED_BIT == 0 { | ||||||
| match self.state.compare_exchange_weak( | ||||||
| state, | ||||||
| state | LOCKED_BIT, | ||||||
| state | LOCKED_BIT | extra_flags, | ||||||
| Ordering::Acquire, | ||||||
| Ordering::Relaxed, | ||||||
| ) { | ||||||
|
Owner
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The spin loop on line 233 should be disabled if `in_contention` is true.
||||||
|
|
@@ -254,6 +258,7 @@ impl RawMutex { | |||||
| self.state.fetch_and(!PARKED_BIT, Ordering::Relaxed); | ||||||
| } | ||||||
| }; | ||||||
| extra_flags = 0; | ||||||
| // SAFETY: | ||||||
| // * `addr` is an address we control. | ||||||
| // * `validate`/`timed_out` does not panic or call into any function of `parking_lot`. | ||||||
|
|
@@ -271,12 +276,16 @@ impl RawMutex { | |||||
| // The thread that unparked us passed the lock on to us | ||||||
| // directly without unlocking it. | ||||||
| ParkResult::Unparked(TOKEN_HANDOFF) => return true, | ||||||
| ParkResult::Unparked(TOKEN_RESTORE_PARKED_BIT) => extra_flags = PARKED_BIT, | ||||||
|
|
||||||
| // We were unparked normally, try acquiring the lock again | ||||||
| ParkResult::Unparked(_) => (), | ||||||
|
|
||||||
| // The validation function failed, try locking again | ||||||
| ParkResult::Invalid => (), | ||||||
| // This thread doesn't sleep, so it's not sure whether it's the last thread | ||||||
| // in queue. Setting PARKED_BIT can lead to false wake up. But false wake up | ||||||
| // is good for throughput during high contention. | ||||||
| ParkResult::Invalid => extra_flags = PARKED_BIT, | ||||||
|
Owner
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I don't think this is correct: we should only set `PARKED_BIT` just before actually parking a thread. What numbers do you get on the benchmark without this?
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The point here is to ask for more wake-ups. During high contention, random wake-ups may keep more threads on the CPU. Because high contention means the lock is acquired and released very frequently, more on-CPU time means a higher chance of acquiring the lock. Leaving the CPU and then being scheduled back one by one is very slow; we should do that only when there is probably no way to make progress anytime soon. This is also why I named the new arg `in_contention`. When the thread count is more than 9, the benchmark numbers can be 30% ~ 40% lower without setting the bit.
Owner
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The main effect that setting the parked bit has is that it prevents threads from spinning (since we only spin when the parked bit is clear). This has the effect of causing threads to go directly to parking, which, as you said, is quite slow. However, since other threads are no longer actively trying to acquire the lock, it means that one thread can quickly acquire and release the lock since there is no cache interference from other threads. Although this may look good on benchmarks, it actually isn't good, since other threads are wasting time doing work that isn't useful instead of attempting to acquire the lock. This is effectively equivalent to just pausing for a longer period between attempts to acquire the lock.
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Perf stats show that setting the PARKED_BIT here leads to more context switches and a much higher cache-miss rate; this is the proof that more threads are staying on the CPU instead of going to sleep. The reason why PARKED_BIT will wake more threads is that some thread will acquire the lock without any competition during contention. For example, suppose thread A acquires the lock, and threads B and C are waiting for A. When A releases the lock and wakes thread B, another thread D that is on the CPU right now may acquire the lock earlier than thread B. There are two possible behaviors of thread D: it can acquire the lock directly, or it fails the try-lock and then tries to park but fails again due to validation. Setting the parked bit here utilizes the second situation, so that when D acquires the lock, it can still wake thread C later.
I noticed this performance pitfall when I tried to implement a linked list with a mutex, where the list is generally short but can occasionally grow very long. After this PR, there is no obvious performance difference between pthread and parking_lot.
Owner
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. In that situation, it's not thread D's job to set the parked bit: thread B will set it before parking itself.
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Thread B may not, as it may still be spinning trying to acquire the lock.
||||||
|
|
||||||
| // Timeout expired | ||||||
| ParkResult::TimedOut => return false, | ||||||
|
|
@@ -296,7 +305,7 @@ impl RawMutex { | |||||
| let callback = |result: UnparkResult| { | ||||||
| // If we are using a fair unlock then we should keep the | ||||||
| // mutex locked and hand it off to the unparked thread. | ||||||
| if result.unparked_threads != 0 && (force_fair || result.be_fair) { | ||||||
| if result.unparked_threads != 0 && force_fair { | ||||||
|
Owner
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The logic here is very different in the |
||||||
| // Clear the parked bit if there are no more parked | ||||||
| // threads. | ||||||
| if !result.have_more_threads { | ||||||
|
|
@@ -308,8 +317,12 @@ impl RawMutex { | |||||
| // Clear the locked bit, and the parked bit as well if there | ||||||
| // are no more parked threads. | ||||||
| if result.have_more_threads { | ||||||
| self.state.store(PARKED_BIT, Ordering::Release); | ||||||
| } else { | ||||||
| if force_fair { | ||||||
| self.state.store(PARKED_BIT, Ordering::Release); | ||||||
| } else { | ||||||
| return TOKEN_RESTORE_PARKED_BIT; | ||||||
| } | ||||||
| } else if force_fair { | ||||||
| self.state.store(0, Ordering::Release); | ||||||
| } | ||||||
| TOKEN_NORMAL | ||||||
|
|
||||||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
`TOKEN_HANDOFF` is actually reachable if we are requeued onto a mutex and then another thread unlocks that mutex with `unlock_fair`.