Skip to content

Commit fcdbfe7

Browse files
pixcc and CyberROFL authored
Add a way to disable CMS (#27336)
Co-authored-by: Ilnaz Nizametdinov <[email protected]>
1 parent bfc5f0e commit fcdbfe7

File tree

7 files changed

+164
-3
lines changed

7 files changed

+164
-3
lines changed

ydb/core/cms/cms_impl.h

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -223,6 +223,17 @@ class TCms : public TActor<TCms>, public TTabletExecutedFlat {
223223
}
224224
}
225225

226+
// HFuncChecked(TEvType, HandleFunc): switch-case entry for events of type TEvType
// that is gated by the State->Config.Enable flag.
//  - `ev` (from the enclosing handler) is reinterpreted as a typed TEvType::TPtr.
//  - When State->Config.Enable is true, HandleFunc(typedEvent, ActorContext()) is called.
//  - Otherwise the event is answered via ReplyWithError with status ERROR_TEMP and
//    the reason "CMS is disabled"; HandleFunc is not called.
// NOTE(review): the error reply type is fixed to TEvCms::TEvPermissionResponse
// regardless of TEvType — confirm this is the right response for every event
// this macro is applied to.
#define HFuncChecked(TEvType, HandleFunc) \
227+
case TEvType::EventType: { \
228+
typename TEvType::TPtr* x = reinterpret_cast<typename TEvType::TPtr*>(&ev); \
229+
if (State->Config.Enable) { \
230+
HandleFunc(*x, this->ActorContext()); \
231+
} else { \
232+
ReplyWithError<TEvCms::TEvPermissionResponse>(*x, NKikimrCms::TStatus::ERROR_TEMP, "CMS is disabled", this->ActorContext()); \
233+
} \
234+
break; \
235+
} Y_SEMICOLON_GUARD
236+
226237
STFUNC(StateWork) {
227238
switch (ev->GetTypeRewrite()) {
228239
HFunc(TEvPrivate::TEvClusterInfo, Handle);
@@ -234,9 +245,9 @@ class TCms : public TActor<TCms>, public TTabletExecutedFlat {
234245
cFunc(TEvPrivate::EvStartCollecting, StartCollecting);
235246
cFunc(TEvPrivate::EvProcessQueue, ProcessQueue);
236247
FFunc(TEvCms::EvClusterStateRequest, EnqueueRequest);
237-
HFunc(TEvCms::TEvPermissionRequest, CheckAndEnqueueRequest);
248+
HFuncChecked(TEvCms::TEvPermissionRequest, CheckAndEnqueueRequest);
238249
HFunc(TEvCms::TEvManageRequestRequest, Handle);
239-
HFunc(TEvCms::TEvCheckRequest, CheckAndEnqueueRequest);
250+
HFuncChecked(TEvCms::TEvCheckRequest, CheckAndEnqueueRequest);
240251
HFunc(TEvCms::TEvManagePermissionRequest, Handle);
241252
HFunc(TEvCms::TEvConditionalPermissionRequest, CheckAndEnqueueRequest);
242253
HFunc(TEvCms::TEvNotification, CheckAndEnqueueRequest);

ydb/core/cms/cms_maintenance_api_ut.cpp

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -371,6 +371,67 @@ Y_UNIT_TEST_SUITE(TMaintenanceApiTest) {
371371
UNIT_ASSERT_VALUES_EQUAL(actionState.status(), ActionState::ACTION_STATUS_PERFORMED);
372372
UNIT_ASSERT(actionState.action().has_cordon_action());
373373
}
374+
375+
// Maintenance API behaviour while CMS is switched off through TCmsConfig.Enable:
//  1. task-1 locks node 0 and is performed; task-2 locks the same node and stays pending.
//  2. With Enable=false, completing task-1's action is still expected to succeed,
//     while creating task-3 and refreshing task-2 are expected to return UNAVAILABLE.
//  3. With Enable=true again, task-3 (node 9) is created and performed, and
//     refreshing task-2 reports its action as performed.
Y_UNIT_TEST(DisableCMS){
376+
TCmsTestEnv env(16);
377+
378+
auto r1 = env.CheckMaintenanceTaskCreate("task-1", Ydb::StatusIds::SUCCESS,
379+
MakeActionGroup(
380+
MakeLockAction(env.GetNodeId(0), TDuration::Minutes(10))
381+
)
382+
);
383+
UNIT_ASSERT_VALUES_EQUAL(r1.action_group_states().size(), 1);
384+
UNIT_ASSERT_VALUES_EQUAL(r1.action_group_states(0).action_states().size(), 1);
385+
const auto &a1 = r1.action_group_states(0).action_states(0);
386+
UNIT_ASSERT_VALUES_EQUAL(a1.status(), ActionState::ACTION_STATUS_PERFORMED);
387+
388+
// Pending task
389+
auto r2 = env.CheckMaintenanceTaskCreate("task-2", Ydb::StatusIds::SUCCESS,
390+
MakeActionGroup(
391+
MakeLockAction(env.GetNodeId(0), TDuration::Minutes(10))
392+
)
393+
);
394+
UNIT_ASSERT_VALUES_EQUAL(r2.action_group_states().size(), 1);
395+
UNIT_ASSERT_VALUES_EQUAL(r2.action_group_states(0).action_states().size(), 1);
396+
const auto &a2 = r2.action_group_states(0).action_states(0);
397+
UNIT_ASSERT_VALUES_EQUAL(a2.status(), ActionState::ACTION_STATUS_PENDING);
398+
399+
// Disable CMS
400+
NKikimrCms::TCmsConfig config;
401+
config.SetEnable(false);
402+
env.SetCmsConfig(config);
403+
404+
// Completing the already performed action is still expected to succeed while CMS is disabled
env.CheckCompleteAction(a1.action_uid(), Ydb::StatusIds::SUCCESS);
405+
406+
// Requests should fail
407+
env.CheckMaintenanceTaskCreate("task-3", Ydb::StatusIds::UNAVAILABLE,
408+
MakeActionGroup(
409+
MakeLockAction(env.GetNodeId(0), TDuration::Minutes(10))
410+
)
411+
);
412+
env.CheckMaintenanceTaskRefresh("task-2", Ydb::StatusIds::UNAVAILABLE);
413+
414+
// Enable CMS back
415+
config.SetEnable(true);
416+
env.SetCmsConfig(config);
417+
418+
// Requests should be ok
419+
auto r3 = env.CheckMaintenanceTaskCreate("task-3", Ydb::StatusIds::SUCCESS,
420+
MakeActionGroup(
421+
MakeLockAction(env.GetNodeId(9), TDuration::Minutes(10))
422+
)
423+
);
424+
UNIT_ASSERT_VALUES_EQUAL(r3.action_group_states().size(), 1);
425+
UNIT_ASSERT_VALUES_EQUAL(r3.action_group_states(0).action_states().size(), 1);
426+
const auto &a3 = r3.action_group_states(0).action_states(0);
427+
UNIT_ASSERT_VALUES_EQUAL(a3.status(), ActionState::ACTION_STATUS_PERFORMED);
428+
429+
// The previously pending task-2 is expected to be performed once it is refreshed
auto r4 = env.CheckMaintenanceTaskRefresh("task-2", Ydb::StatusIds::SUCCESS);
430+
UNIT_ASSERT_VALUES_EQUAL(r4.action_group_states().size(), 1);
431+
UNIT_ASSERT_VALUES_EQUAL(r4.action_group_states(0).action_states().size(), 1);
432+
const auto &a4 = r4.action_group_states(0).action_states(0);
433+
UNIT_ASSERT_VALUES_EQUAL(a4.status(), ActionState::ACTION_STATUS_PERFORMED);
434+
}
374435
}
375436

376437
} // namespace NKikimr::NCmsTest

ydb/core/cms/cms_ut.cpp

Lines changed: 62 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2773,7 +2773,68 @@ Y_UNIT_TEST_SUITE(TCmsTest) {
27732773
// tablet 'FLAT_BS_CONTROLLER' has too many unavailable nodes.
27742774
env.CheckPermissionRequest("user", false, false, false, true, MODE_MAX_AVAILABILITY, TStatus::DISALLOW_TEMP,
27752775
MakeAction(TAction::RESTART_SERVICES, env.GetNodeId(4), 60000000, "storage"));
2776-
2776+
}
2777+
2778+
// Permission-request behaviour while CMS is switched off through TCmsConfig.Enable:
//  1. r1 gets an ALLOW permission for node 0; r2 asks for the same node and gets DISALLOW_TEMP.
//  2. With Enable=false, the r1 permission is reported done, a new permission request
//     and a check of r2 are both expected to return ERROR_TEMP.
//  3. With Enable=true again, a new request for node 9 is allowed and the check of r2
//     is expected to return ALLOW.
// NOTE(review): CheckDonePermission's own assertions are not visible here — presumably
// it expects success while CMS is disabled; confirm against the helper.
Y_UNIT_TEST(DisableCMS){
2779+
TCmsTestEnv env(16);
2780+
2781+
auto r1 = env.CheckPermissionRequest("user", false, false, true, true, TStatus::ALLOW,
2782+
MakeAction(TAction::SHUTDOWN_HOST, env.GetNodeId(0), 60000000));
2783+
UNIT_ASSERT_VALUES_EQUAL(r1.PermissionsSize(), 1);
2784+
2785+
// Scheduled request
2786+
auto r2 = env.CheckPermissionRequest("user", false, false, /* scheduled */ true, true, TStatus::DISALLOW_TEMP,
2787+
MakeAction(TAction::SHUTDOWN_HOST, env.GetNodeId(0), 60000000));
2788+
2789+
// Disable CMS
2790+
NKikimrCms::TCmsConfig config;
2791+
config.SetEnable(false);
2792+
env.SetCmsConfig(config);
2793+
2794+
// Finish the permission granted before CMS was disabled
env.CheckDonePermission("user", r1.GetPermissions(0).GetId());
2795+
2796+
// Requests should fail
2797+
env.CheckPermissionRequest("user", false, false, true, true, TStatus::ERROR_TEMP,
2798+
MakeAction(TAction::SHUTDOWN_HOST, env.GetNodeId(9), 60000000));
2799+
env.CheckRequest("user", r2.GetRequestId(), true, TStatus::ERROR_TEMP);
2800+
2801+
// Enable CMS back
2802+
config.SetEnable(true);
2803+
env.SetCmsConfig(config);
2804+
2805+
// Requests should be ok
2806+
auto r3 = env.CheckPermissionRequest("user", false, false, true, true, TStatus::ALLOW,
2807+
MakeAction(TAction::SHUTDOWN_HOST, env.GetNodeId(9), 60000000));
2808+
UNIT_ASSERT_VALUES_EQUAL(r3.PermissionsSize(), 1);
2809+
env.CheckRequest("user", r2.GetRequestId(), true, TStatus::ALLOW, 1);
2810+
}
2811+
2812+
// Wall-E task behaviour while CMS is switched off through TCmsConfig.Enable:
//  1. task-1 (reboot of node 0) is allowed; task-2 for the same node gets DISALLOW_TEMP.
//  2. With Enable=false, task-1 is removed, while creating task-3 and checking task-2
//     are both expected to return ERROR_TEMP.
//  3. With Enable=true again, task-3 (node 9) is allowed and checking task-2 returns ALLOW.
// NOTE(review): CheckWalleRemoveTask's own assertions are not visible here — presumably
// removal is expected to succeed while CMS is disabled; confirm against the helper.
Y_UNIT_TEST(WalleDisableCMS){
2813+
TCmsTestEnv env(16);
2814+
2815+
env.CheckWalleCreateTask("task-1", "reboot", false, TStatus::ALLOW, env.GetNodeId(0));
2816+
2817+
// Scheduled request
2818+
env.CheckWalleCreateTask("task-2", "reboot", false, TStatus::DISALLOW_TEMP, env.GetNodeId(0));
2819+
2820+
// Disable CMS
2821+
NKikimrCms::TCmsConfig config;
2822+
config.SetEnable(false);
2823+
env.SetCmsConfig(config);
2824+
2825+
// Remove the task that was allowed before CMS was disabled
env.CheckWalleRemoveTask("task-1");
2826+
2827+
// Requests should fail
2828+
env.CheckWalleCreateTask("task-3", "reboot", false, TStatus::ERROR_TEMP, env.GetNodeId(9));
2829+
env.CheckWalleCheckTask("task-2", TStatus::ERROR_TEMP);
2830+
2831+
// Enable CMS back
2832+
config.SetEnable(true);
2833+
env.SetCmsConfig(config);
2834+
2835+
// Requests should be ok
2836+
env.CheckWalleCreateTask("task-3", "reboot", false, TStatus::ALLOW, env.GetNodeId(9));
2837+
env.CheckWalleCheckTask("task-2", TStatus::ALLOW);
27772838
}
27782839
}
27792840

ydb/core/cms/cms_ut_common.h

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -506,6 +506,24 @@ class TCmsTestEnv : public TTestBasicRuntime {
506506
return CheckMaintenanceTaskCreate(taskUid, code, Ydb::Maintenance::AVAILABILITY_MODE_STRONG, actionGroups...);
507507
}
508508

509+
// Test helper: sends a TEvCompleteActionRequest for a single action and checks the reply.
//  actionUid - uid of the action to complete; copied into the request's action_uids list.
//  code      - status the TEvManageActionResponse record is asserted to carry.
// Returns the result held by the response record.
Ydb::Maintenance::ManageActionResult CheckCompleteAction(
510+
const Ydb::Maintenance::ActionUid &actionUid,
511+
Ydb::StatusIds::StatusCode code)
512+
{
513+
auto ev = std::make_unique<NCms::TEvCms::TEvCompleteActionRequest>();
514+
515+
auto *req = ev->Record.MutableRequest();
516+
req->mutable_action_uids()->Add()->CopyFrom(actionUid);
517+
518+
// The raw pointer is released from the unique_ptr and handed to SendToPipe
SendToPipe(CmsId, Sender, ev.release(), 0, GetPipeConfigWithRetries());
519+
TAutoPtr<IEventHandle> handle;
520+
// Wait for the matching response event
auto reply = GrabEdgeEventRethrow<NCms::TEvCms::TEvManageActionResponse>(handle);
521+
522+
const auto &rec = reply->Record;
523+
UNIT_ASSERT_VALUES_EQUAL(rec.GetStatus(), code);
524+
return rec.GetResult();
525+
}
526+
509527
void EnableBSBaseConfig();
510528
void DisableBSBaseConfig();
511529

ydb/core/cms/config.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -257,6 +257,7 @@ struct TCmsLogConfig {
257257
};
258258

259259
struct TCmsConfig {
260+
bool Enable = true;
260261
TDuration DefaultRetryTime;
261262
TDuration DefaultPermissionDuration;
262263
TDuration DefaultWalleCleanupPeriod = TDuration::Minutes(1);
@@ -275,6 +276,7 @@ struct TCmsConfig {
275276
}
276277

277278
void Serialize(NKikimrCms::TCmsConfig &config) const {
279+
config.SetEnable(Enable);
278280
config.SetDefaultRetryTime(DefaultRetryTime.GetValue());
279281
config.SetDefaultPermissionDuration(DefaultPermissionDuration.GetValue());
280282
config.SetInfoCollectionTimeout(InfoCollectionTimeout.GetValue());
@@ -285,6 +287,7 @@ struct TCmsConfig {
285287
}
286288

287289
void Deserialize(const NKikimrCms::TCmsConfig &config) {
290+
Enable = config.GetEnable();
288291
DefaultRetryTime = TDuration::MicroSeconds(config.GetDefaultRetryTime());
289292
DefaultPermissionDuration = TDuration::MicroSeconds(config.GetDefaultPermissionDuration());
290293
InfoCollectionTimeout = TDuration::MicroSeconds(config.GetInfoCollectionTimeout());

ydb/core/cms/walle_api_handler.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -277,6 +277,12 @@ class TWalleCrateTaskHandler : public TActorBootstrapped<TWalleCrateTaskHandler>
277277
return false;
278278
}
279279

280+
if (status.GetCode() == TStatus::ERROR_TEMP) {
281+
auto err = Sprintf("HTTP/1.1 503 Service Unavailable\r\n\r\n%s", status.GetReason().data());
282+
ReplyWithError(err, ctx);
283+
return false;
284+
}
285+
280286
auto err = Sprintf("HTTP/1.1 500 Internal Server Error\r\n\r\n%s", status.GetReason().data());
281287
ReplyWithError(err, ctx);
282288
return false;

ydb/core/protos/cms.proto

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -514,6 +514,7 @@ message TCmsConfig {
514514
optional uint64 InfoCollectionTimeout = 6 [default = 15000000];
515515
optional TLogConfig LogConfig = 7;
516516
optional TSentinelConfig SentinelConfig = 8;
517+
optional bool Enable = 9 [default = true];
517518
}
518519

519520
message TPDiskID {

0 commit comments

Comments
 (0)