14 changes: 8 additions & 6 deletions db/db_wal_test.cc
@@ -3121,22 +3121,24 @@ TEST_F(DBWALTest, RecoveryFlushSwitchWALOnEmptyMemtable) {
Destroy(options);
}

TEST_F(DBWALTest, WALWriteErrorNoRecovery) {
TEST_F(DBWALTest, WALWriteErrorNoAutoRecovery) {
Options options = CurrentOptions();
auto fault_fs = std::make_shared<FaultInjectionTestFS>(FileSystem::Default());
std::unique_ptr<Env> fault_fs_env(NewCompositeEnv(fault_fs));
options.env = fault_fs_env.get();
options.manual_wal_flush = true;
options.atomic_flush = false;
DestroyAndReopen(options);
CreateAndReopenWithCF({"pikachu"}, options);

fault_fs->SetThreadLocalErrorContext(
FaultInjectionIOType::kWrite, 7 /* seed*/, 1 /* one_in */,
true /* retryable */, false /* has_data_loss*/);
fault_fs->EnableThreadLocalErrorInjection(FaultInjectionIOType::kWrite);

ASSERT_OK(Put("k", "v"));
Status s;
s = db_->FlushWAL(false);
Status s = Put("k", "v");
ASSERT_TRUE(s.IsIOError());
ASSERT_TRUE(
s.ToString().find("injected write error failed to write to WAL") !=
std::string::npos);
s = dbfull()->TEST_GetBGError();
ASSERT_EQ(s.severity(), Status::Severity::kFatalError);
ASSERT_FALSE(dbfull()->TEST_IsRecoveryInProgress());
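The updated test asserts that, with `manual_wal_flush` and multiple column families but `atomic_flush = false`, an injected WAL write error surfaces as a background error of `kFatalError` severity with no auto recovery in progress. As a minimal sketch that is not part of this change (`WalErrorObserver` is a hypothetical application-side class, and exactly which status a listener observes relative to the severity escalation in `ErrorHandler` is an assumption, not a contract), an application could watch for this condition through the public `EventListener::OnBackgroundError` callback:

```cpp
// Hypothetical application-side listener; not part of this PR.
#include <atomic>

#include "rocksdb/listener.h"
#include "rocksdb/status.h"

class WalErrorObserver : public ROCKSDB_NAMESPACE::EventListener {
 public:
  const char* Name() const override { return "WalErrorObserver"; }

  void OnBackgroundError(ROCKSDB_NAMESPACE::BackgroundErrorReason /*reason*/,
                         ROCKSDB_NAMESPACE::Status* bg_error) override {
    // With this change, a WAL write IOError on a multi-CF DB without
    // atomic_flush is escalated to kFatalError, so RocksDB will not attempt
    // auto recovery; the application should plan to close and reopen the DB.
    if (bg_error->severity() >=
        ROCKSDB_NAMESPACE::Status::Severity::kFatalError) {
      saw_fatal_error.store(true);
    }
  }

  std::atomic<bool> saw_fatal_error{false};
};
```

It would be registered the same way the error handler tests below attach their listener, e.g. `options.listeners.emplace_back(std::make_shared<WalErrorObserver>());`.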
19 changes: 11 additions & 8 deletions db/error_handler.cc
@@ -423,14 +423,17 @@ void ErrorHandler::SetBGError(const Status& bg_status,
reason == BackgroundErrorReason::kMemTable ||
reason == BackgroundErrorReason::kFlush);
}
if (db_options_.manual_wal_flush && wal_related && bg_io_err.IsIOError()) {
// With manual_wal_flush, a WAL write failure can drop buffered WAL writes.
// Memtables and WAL then become inconsistent. A successful memtable flush
// on one CF can cause CFs to be inconsistent upon restart. Before we fix
// the bug in auto recovery from WAL write failures that can flush one CF
// at a time, we set the error severity to fatal to disallow auto recovery.
// TODO: remove parameter `wal_related` once we can automatically recover
// from WAL write failures.

// When `atomic_flush = false` and there are multiple column families, a
// WAL-related IO error can leave the DB inconsistent: flushing individual
// CFs during auto recovery may advance some column families past the
// corruption point while others remain behind, preventing a successful
// database restart. Therefore we disallow auto recovery by raising the
// severity to `Status::Severity::kFatalError`.
bool has_multiple_cfs =
db_->versions_->GetColumnFamilySet()->NumberOfColumnFamilies() > 1;
if (!db_options_.atomic_flush && has_multiple_cfs && wal_related &&
bg_io_err.IsIOError()) {
bool auto_recovery = false;
Status bg_err(new_bg_io_err, Status::Severity::kFatalError);
CheckAndSetRecoveryAndBGError(bg_err);
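The new condition narrows the escalation to the only configuration that can actually become inconsistent: more than one column family with `atomic_flush` disabled. For contrast, here is a hedged sketch (the DB path and the "pikachu" CF name are illustrative, not part of this PR) of a multi-CF database opened with `atomic_flush = true`, the configuration under which flushes stay in lockstep and auto recovery from WAL write errors remains permitted:

```cpp
// Sketch under stated assumptions: path and CF names are illustrative only.
#include <string>
#include <vector>

#include "rocksdb/db.h"
#include "rocksdb/options.h"

using ROCKSDB_NAMESPACE::ColumnFamilyDescriptor;
using ROCKSDB_NAMESPACE::ColumnFamilyHandle;
using ROCKSDB_NAMESPACE::ColumnFamilyOptions;
using ROCKSDB_NAMESPACE::DB;
using ROCKSDB_NAMESPACE::DBOptions;
using ROCKSDB_NAMESPACE::Options;
using ROCKSDB_NAMESPACE::Status;

int main() {
  Options options;
  options.create_if_missing = true;
  options.create_missing_column_families = true;
  // All CFs flush together, so a dropped WAL write cannot leave one CF ahead
  // of another; this configuration does not trigger the kFatalError path.
  options.atomic_flush = true;

  std::vector<ColumnFamilyDescriptor> cf_descs = {
      {ROCKSDB_NAMESPACE::kDefaultColumnFamilyName, ColumnFamilyOptions()},
      {"pikachu", ColumnFamilyOptions()}};
  std::vector<ColumnFamilyHandle*> handles;
  DB* db = nullptr;
  Status s = DB::Open(DBOptions(options), "/tmp/atomic_flush_demo", cf_descs,
                      &handles, &db);
  if (!s.ok()) {
    return 1;
  }
  for (ColumnFamilyHandle* h : handles) {
    s = db->DestroyColumnFamilyHandle(h);
  }
  delete db;
  return 0;
}
```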
2 changes: 2 additions & 0 deletions db/error_handler_fs_test.cc
@@ -418,6 +418,7 @@ TEST_F(DBErrorHandlingFSTest, FlushWALWriteRetryableError) {
options.create_if_missing = true;
options.listeners.emplace_back(listener);
options.max_bgerror_resume_count = 0;
options.atomic_flush = true;
Status s;

IOStatus error_msg = IOStatus::IOError("Retryable IO Error");
@@ -1463,6 +1464,7 @@ TEST_F(DBErrorHandlingFSTest, MultiCFWALWriteError) {
options.create_if_missing = true;
options.writable_file_max_buffer_size = 32768;
options.listeners.emplace_back(listener);
options.atomic_flush = true;
Random rnd(301);

listener->EnableAutoRecovery();
3 changes: 2 additions & 1 deletion db_stress_tool/db_stress_test_base.h
@@ -354,7 +354,8 @@ class StressTest {
assert(!error_s.ok());
return error_s.getState() &&
FaultInjectionTestFS::IsInjectedError(error_s) &&
!status_to_io_status(Status(error_s)).GetDataLoss();
!status_to_io_status(Status(error_s)).GetDataLoss() &&
error_s.severity() <= Status::Severity::kHardError;
}

void ProcessStatus(SharedState* shared, std::string msg, const Status& s,
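The extra `severity()` check keeps the stress test from treating the newly escalated fatal errors as ignorable injected errors. A small self-contained sketch (the helper name is hypothetical) of how `Status` severity ordering drives that comparison:

```cpp
// Sketch only; `SeverityAllowsAutoRecovery` is a hypothetical helper that
// mirrors the predicate added to db_stress_test_base.h above.
#include <cassert>

#include "rocksdb/status.h"

using ROCKSDB_NAMESPACE::Status;

// An injected error is only "ignorable" if RocksDB could still auto recover
// from it, i.e. its severity is no worse than a hard error.
bool SeverityAllowsAutoRecovery(const Status& s) {
  return s.severity() <= Status::Severity::kHardError;
}

int main() {
  Status io_err = Status::IOError("injected write error");
  // Re-wrap with an explicit severity, as SetBGError() now does for WAL
  // write failures on multi-CF, non-atomic-flush DBs.
  Status fatal(io_err, Status::Severity::kFatalError);

  assert(SeverityAllowsAutoRecovery(io_err));  // plain IOError: recoverable
  assert(!SeverityAllowsAutoRecovery(fatal));  // escalated: not ignorable
  return 0;
}
```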
10 changes: 5 additions & 5 deletions tools/db_crashtest.py
@@ -996,7 +996,7 @@ def finalize_and_sanitize(src_params):
if (
dest_params.get("reopen", 0) > 0
or (
dest_params.get("manual_wal_flush_one_in")
dest_params.get("atomic_flush") != 1
and dest_params.get("column_families") != 1
)
or (
@@ -1010,10 +1010,10 @@ def finalize_and_sanitize(src_params):
# To simplify, we disable any WAL write error injection.
# TODO(hx235): support WAL write error injection with reopen
#
# 2. WAL write failure can drop buffered WAL data. This can cause
# inconsistency when one CF has a successful flush during auto
# recovery. Disable the fault injection in this path for now until
# we have a fix that allows auto recovery.
# 2. When `atomic_flush = false` with multiple column families, a WAL-related
# IO error can make individual CF flushes during auto recovery advance some
# column families past the corruption point while others remain behind,
# preventing a successful database restart. Disable auto recovery and skip
# testing this case in the crash test.
#
# 3. Pessimistic transactions use 2PC, which can't auto-recover from WAL write errors.
# This is because RocksDB cannot easily discard the corrupted WAL without risking the