Skip to content

Commit

Permalink
Use token based approach for system-agent
Browse files Browse the repository at this point in the history
Reduce the footprint of the system-agent RBAC
Per each cluster there will be created:
- 1 system-agent ServiceAccount

Per each plan there will be temporarily created:
- 1 Role with access to all plan secrets for each machine
- 1 RoleBinding for the Role and the cluster system-agent ServiceAccount
On plan completion/failure the Role and RoleBinding will be revoked

Per each machine there will be created:
- 1 Secret for the system-agent authentication, with unique JWT bound to
  the secret existence in the API server, and a namespace/name pointer
  to the plan secret
- 1 Secret for the plan execution

Signed-off-by: Danil-Grigorev <[email protected]>
  • Loading branch information
Danil-Grigorev committed Oct 4, 2024
1 parent 54fc8ff commit c47f713
Show file tree
Hide file tree
Showing 9 changed files with 186 additions and 166 deletions.
2 changes: 1 addition & 1 deletion exp/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ export CLUSTER_NAMESPACE=default
export CLUSTER_NAME=rke2
export ETCD_MACHINE_SNAPSHOT_NAME="<snapshot_name_from_the_output>"

envsubst < etcdrestore/examples/etcd-restore.yaml | kubectl apply -f -
envsubst < exp/etcdrestore/examples/etcd-restore.yaml | kubectl apply -f -
```

## Cleanup
Expand Down
6 changes: 6 additions & 0 deletions exp/etcdrestore/config/rbac/role.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,12 @@ rules:
- patch
- update
- watch
- apiGroups:
- ""
resources:
- serviceaccounts/token
verbs:
- create
- apiGroups:
- authorization.k8s.io
resources:
Expand Down
45 changes: 31 additions & 14 deletions exp/etcdrestore/controllers/etcdsnapshotrestore_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,8 +39,8 @@ import (
snapshotrestorev1 "github.com/rancher/turtles/exp/etcdrestore/api/v1alpha1"
)

// InitMachine is a filter matching on init machine of the ETCD snapshot
func InitMachine(etcdMachineSnapshot *snapshotrestorev1.ETCDMachineSnapshot) collections.Func {
// initMachine is a filter matching on init machine of the ETCD snapshot
func initMachine(etcdMachineSnapshot *snapshotrestorev1.ETCDMachineSnapshot) collections.Func {
return func(machine *clusterv1.Machine) bool {
return machine.Name == etcdMachineSnapshot.Spec.MachineName
}
Expand Down Expand Up @@ -104,6 +104,7 @@ type scope struct {
//+kubebuilder:rbac:groups=cluster.x-k8s.io,resources=clusters/status,verbs=get;list;watch;create;update;patch;delete
//+kubebuilder:rbac:groups=cluster.x-k8s.io,resources=machines,verbs=get;list;watch;create;update;patch;delete
//+kubebuilder:rbac:groups="",resources=secrets;events;configmaps;serviceaccounts,verbs=get;list;watch;create;update;patch;delete
//+kubebuilder:rbac:groups="",resources=serviceaccounts/token,verbs=create
//+kubebuilder:rbac:groups="rbac.authorization.k8s.io",resources=roles;rolebindings,verbs=get;list;watch;create;update;patch;delete
//+kubebuilder:rbac:groups="management.cattle.io",resources=*,verbs=get;list;watch;create;update;patch;delete
//+kubebuilder:rbac:groups=bootstrap.cluster.x-k8s.io,resources=rke2configs;rke2configs/status;rke2configs/finalizers,verbs=get;list;watch;create;update;patch;delete
Expand Down Expand Up @@ -159,7 +160,7 @@ func (r *ETCDSnapshotRestoreReconciler) reconcileNormal(ctx context.Context, etc
return ctrl.Result{RequeueAfter: 30 * time.Second}, nil
}

if scope.machines.Filter(InitMachine(scope.etcdMachineSnapshot)).Len() != 1 {
if scope.machines.Filter(initMachine(scope.etcdMachineSnapshot)).Len() != 1 {
return ctrl.Result{}, fmt.Errorf(
"init machine %s for snapshot %s is not found",
scope.etcdMachineSnapshot.Spec.MachineName,
Expand Down Expand Up @@ -191,15 +192,13 @@ func (r *ETCDSnapshotRestoreReconciler) reconcileNormal(ctx context.Context, etc

return ctrl.Result{}, nil
case snapshotrestorev1.ETCDSnapshotRestorePhaseStarted:
etcdSnapshotRestore.Status.Phase = snapshotrestorev1.ETCDSnapshotRestorePhaseShutdown

return ctrl.Result{}, nil
return r.preparePlanPermissions(ctx, scope, etcdSnapshotRestore)
case snapshotrestorev1.ETCDSnapshotRestorePhaseShutdown:
// Stop RKE2 on all the machines.
return r.stopRKE2OnAllMachines(ctx, scope, etcdSnapshotRestore)
case snapshotrestorev1.ETCDSnapshotRestorePhaseRunning:
// Restore the etcd snapshot on the init machine.
return r.restoreSnaphotOnInitMachine(ctx, scope, etcdSnapshotRestore)
return r.restoreSnapshotOnInitMachine(ctx, scope, etcdSnapshotRestore)
case snapshotrestorev1.ETCDSnapshotRestorePhaseAgentRestart:
// Start RKE2 on all the machines.
return r.startRKE2OnAllMachines(ctx, scope, etcdSnapshotRestore)
Expand All @@ -212,7 +211,7 @@ func (r *ETCDSnapshotRestoreReconciler) reconcileNormal(ctx context.Context, etc
case snapshotrestorev1.ETCDSnapshotRestorePhaseJoinAgents:
return r.waitForMachinesToJoin(ctx, scope, etcdSnapshotRestore)
case snapshotrestorev1.ETCDSnapshotRestorePhaseFinished, snapshotrestorev1.ETCDSnapshotRestorePhaseFailed:
return ctrl.Result{}, nil
return r.revokePlanPermissions(ctx, scope, etcdSnapshotRestore)
}

return ctrl.Result{}, nil
Expand Down Expand Up @@ -251,6 +250,24 @@ func initScope(ctx context.Context, c client.Client, etcdSnapshotRestore *snapsh
}, nil
}

// preparePlanPermissions creates the temporary Role/RoleBinding pair that
// grants the cluster's system-agent ServiceAccount access to the plan
// secrets of every machine in scope, then advances the restore to the
// shutdown phase.
//
// The plan name is "restore" + the ETCDSnapshotRestore name, matching the
// name used by revokePlanPermissions.
func (r *ETCDSnapshotRestoreReconciler) preparePlanPermissions(ctx context.Context, scope *scope, etcdSnapshotRestore *snapshotrestorev1.ETCDSnapshotRestore) (ctrl.Result, error) {
	if err := Plan(ctx, r.Client, "restore"+etcdSnapshotRestore.Name, scope.machines.Newest(), scope.machines).Permit(ctx); err != nil {
		// Wrap so the caller's log shows what stage of the restore failed.
		return ctrl.Result{}, fmt.Errorf("permitting plan secret access: %w", err)
	}

	etcdSnapshotRestore.Status.Phase = snapshotrestorev1.ETCDSnapshotRestorePhaseShutdown

	return ctrl.Result{}, nil
}

// revokePlanPermissions deletes the temporary Role/RoleBinding created by
// preparePlanPermissions once the restore has finished or failed, so the
// system-agent ServiceAccount no longer has access to the plan secrets.
func (r *ETCDSnapshotRestoreReconciler) revokePlanPermissions(ctx context.Context, scope *scope, etcdSnapshotRestore *snapshotrestorev1.ETCDSnapshotRestore) (ctrl.Result, error) {
	if err := Plan(ctx, r.Client, "restore"+etcdSnapshotRestore.Name, scope.machines.Newest(), scope.machines).Revoke(ctx); err != nil {
		// Wrap so the caller's log shows what stage of the restore failed.
		return ctrl.Result{}, fmt.Errorf("revoking plan secret access: %w", err)
	}

	return ctrl.Result{}, nil
}

func (r *ETCDSnapshotRestoreReconciler) stopRKE2OnAllMachines(ctx context.Context, scope *scope, etcdSnapshotRestore *snapshotrestorev1.ETCDSnapshotRestore) (ctrl.Result, error) {
log := log.FromContext(ctx)

Expand All @@ -259,7 +276,7 @@ func (r *ETCDSnapshotRestoreReconciler) stopRKE2OnAllMachines(ctx context.Contex
log.Info("Stopping RKE2 on machine", "machine", machine.Name)

// Get the plan secret for the machine.
applied, err := Plan(ctx, r.Client, machine).Apply(ctx, RKE2KillAll())
applied, err := Plan(ctx, r.Client, "restore"+etcdSnapshotRestore.Name, machine, scope.machines).Apply(ctx, RKE2KillAll())
if err != nil {
return ctrl.Result{}, fmt.Errorf("failed to get plan secret for machine: %w", err)
}
Expand All @@ -286,15 +303,15 @@ func (r *ETCDSnapshotRestoreReconciler) stopRKE2OnAllMachines(ctx context.Contex
return ctrl.Result{}, nil
}

func (r *ETCDSnapshotRestoreReconciler) restoreSnaphotOnInitMachine(ctx context.Context, scope *scope, etcdSnapshotRestore *snapshotrestorev1.ETCDSnapshotRestore) (ctrl.Result, error) {
func (r *ETCDSnapshotRestoreReconciler) restoreSnapshotOnInitMachine(ctx context.Context, scope *scope, etcdSnapshotRestore *snapshotrestorev1.ETCDSnapshotRestore) (ctrl.Result, error) {
log := log.FromContext(ctx)

initMachine := scope.machines.Filter(InitMachine(scope.etcdMachineSnapshot)).UnsortedList()[0]
initMachine := scope.machines.Filter(initMachine(scope.etcdMachineSnapshot)).UnsortedList()[0]

log.Info("Filling plan secret with etcd restore instructions", "machine", initMachine.Name)

// Get the plan secret for the machine.
applied, err := Plan(ctx, r.Client, initMachine).Apply(
applied, err := Plan(ctx, r.Client, "restore"+etcdSnapshotRestore.Name, initMachine, scope.machines).Apply(
ctx,
RemoveServerURL(),
ManifestRemoval(),
Expand All @@ -318,7 +335,7 @@ func (r *ETCDSnapshotRestoreReconciler) restoreSnaphotOnInitMachine(ctx context.
func (r *ETCDSnapshotRestoreReconciler) startRKE2OnAllMachines(ctx context.Context, scope *scope, etcdSnapshotRestore *snapshotrestorev1.ETCDSnapshotRestore) (ctrl.Result, error) {
log := log.FromContext(ctx)

initMachine := scope.machines.Filter(InitMachine(scope.etcdMachineSnapshot)).UnsortedList()[0]
initMachine := scope.machines.Filter(initMachine(scope.etcdMachineSnapshot)).UnsortedList()[0]

// TODO: other registration methods
initMachineIP := getInternalMachineIP(initMachine)
Expand Down Expand Up @@ -350,7 +367,7 @@ func (r *ETCDSnapshotRestoreReconciler) startRKE2OnAllMachines(ctx context.Conte
StartRKE2())
}

applied, err := Plan(ctx, r.Client, machine).Apply(ctx, instructions...)
applied, err := Plan(ctx, r.Client, "restore"+etcdSnapshotRestore.Name, machine, scope.machines).Apply(ctx, instructions...)
if err != nil {
return ctrl.Result{}, fmt.Errorf("failed to patch plan secret: %w", err)
} else if !applied.Finished {
Expand Down
88 changes: 82 additions & 6 deletions exp/etcdrestore/controllers/planner.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,17 +33,21 @@ import (
bootstrapv1 "github.com/rancher/cluster-api-provider-rke2/bootstrap/api/v1beta1"
snapshotrestorev1 "github.com/rancher/turtles/exp/etcdrestore/api/v1alpha1"
corev1 "k8s.io/api/core/v1"
rbacv1 "k8s.io/api/rbac/v1"
clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
"sigs.k8s.io/cluster-api/util/collections"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/log"
)

// Planner is responsible for executing instructions on the underlying machine host
// in the specified order, and collecting output from executed steps.
type Planner struct {
	// Name identifies the plan; combined with the cluster name it forms the
	// names of the temporary Role/RoleBinding granting plan secret access.
	Name string
	client.Client
	// machine is the machine the plan is applied to.
	machine *clusterv1.Machine
	// machines is the full set of machines whose plan secrets the
	// temporary RBAC objects cover.
	machines collections.Machines
	// secret is the plan secret for the machine.
	secret *corev1.Secret
}

// Instructions is a one time operation, used to perform shell commands on the host
Expand All @@ -64,11 +68,13 @@ type plan struct {
}

// Plan is initializing Planner, used to perform instructions in a specific order and collect results
func Plan(ctx context.Context, c client.Client, machine *clusterv1.Machine) *Planner {
func Plan(ctx context.Context, c client.Client, name string, machine *clusterv1.Machine, machines collections.Machines) *Planner {
return &Planner{
Client: c,
machine: machine,
secret: initSecret(machine, map[string][]byte{}),
Client: c,
Name: name,
machine: machine,
machines: machines,
secret: initSecret(machine, map[string][]byte{}),
}
}

Expand Down Expand Up @@ -247,6 +253,76 @@ func (p *Planner) applied(plan, appliedChecksum []byte) bool {
return planHash == string(appliedChecksum)
}

// planRole returns the Role granting access to every machine's plan secret
// for this plan. The Role is named "<cluster-name>-<plan-name>" and lives in
// the machine's namespace.
func (p *Planner) planRole() *rbacv1.Role {
	// Collect the plan secret name for each machine; names follow the
	// "<bootstrap-config-name>-rke2config-plan" convention.
	secrets := make([]string, 0, len(p.machines))
	for _, machine := range p.machines.UnsortedList() {
		// NOTE(review): assumes machine.Spec.Bootstrap.ConfigRef is non-nil —
		// confirm callers only pass machines with a bootstrap config ref.
		planSecretName := strings.Join([]string{machine.Spec.Bootstrap.ConfigRef.Name, "rke2config", "plan"}, "-")
		secrets = append(secrets, planSecretName)
	}

	return &rbacv1.Role{
		ObjectMeta: metav1.ObjectMeta{
			Name:      p.machine.Labels[clusterv1.ClusterNameLabel] + "-" + p.Name,
			Namespace: p.machine.Namespace,
		},
		Rules: []rbacv1.PolicyRule{
			{
				Verbs:         []string{"watch", "get", "update", "list"},
				APIGroups:     []string{""},
				Resources:     []string{"secrets"},
				ResourceNames: secrets,
			},
		},
	}
}

// planRoleBinding returns the RoleBinding tying the plan Role to the
// cluster's system-agent ServiceAccount in the machine's namespace.
func (p *Planner) planRoleBinding() *rbacv1.RoleBinding {
	clusterName := p.machine.Labels[clusterv1.ClusterNameLabel]

	return &rbacv1.RoleBinding{
		ObjectMeta: metav1.ObjectMeta{
			Name:      clusterName + "-" + p.Name,
			Namespace: p.machine.Namespace,
		},
		Subjects: []rbacv1.Subject{{
			Kind:      "ServiceAccount",
			Name:      clusterName + "-system-agent",
			Namespace: p.machine.Namespace,
		}},
		RoleRef: rbacv1.RoleRef{
			APIGroup: rbacv1.GroupName,
			Kind:     "Role",
			Name:     clusterName + "-" + p.Name,
		},
	}
}

// Permit grants plan access by creating the plan Role and RoleBinding for
// the system-agent ServiceAccount. Already-existing objects are tolerated,
// so Permit is safe to call repeatedly.
func (p *Planner) Permit(ctx context.Context) error {
	if err := client.IgnoreAlreadyExists(p.Create(ctx, p.planRole())); err != nil {
		return fmt.Errorf("unable to create plan role: %w", err)
	}

	if err := client.IgnoreAlreadyExists(p.Create(ctx, p.planRoleBinding())); err != nil {
		return fmt.Errorf("unable to create plan role binding: %w", err)
	}

	return nil
}

// Revoke removes plan access by deleting the plan Role and RoleBinding.
// Missing objects are tolerated, so Revoke is safe to call repeatedly.
func (p *Planner) Revoke(ctx context.Context) error {
	if err := client.IgnoreNotFound(p.Delete(ctx, p.planRole())); err != nil {
		return fmt.Errorf("unable to delete plan role: %w", err)
	}

	if err := client.IgnoreNotFound(p.Delete(ctx, p.planRoleBinding())); err != nil {
		return fmt.Errorf("unable to delete plan role binding: %w", err)
	}

	return nil
}

func (p *Planner) updatePlanSecret(ctx context.Context, data []byte) error {
log := log.FromContext(ctx)

Expand Down
2 changes: 1 addition & 1 deletion exp/etcdrestore/examples/etcd-restore.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
apiversion: turtles-capi.cattle.io/v1alpha1
apiVersion: turtles-capi.cattle.io/v1alpha1
kind: ETCDSnapshotRestore
metadata:
name: example-restore
Expand Down
2 changes: 1 addition & 1 deletion exp/etcdrestore/go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ require (
k8s.io/client-go v0.29.4
k8s.io/component-base v0.29.4
k8s.io/klog/v2 v2.110.1
k8s.io/utils v0.0.0-20231127182322-b307cd553661
sigs.k8s.io/cluster-api v1.7.3
sigs.k8s.io/cluster-api-operator v0.13.0
sigs.k8s.io/controller-runtime v0.17.3
Expand Down Expand Up @@ -86,7 +87,6 @@ require (
k8s.io/apiextensions-apiserver v0.29.4 // indirect
k8s.io/cluster-bootstrap v0.29.3 // indirect
k8s.io/kube-openapi v0.0.0-20231010175941-2dd684a91f00 // indirect
k8s.io/utils v0.0.0-20231127182322-b307cd553661 // indirect
sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd // indirect
sigs.k8s.io/structured-merge-diff/v4 v4.4.1 // indirect
sigs.k8s.io/yaml v1.4.0 // indirect
Expand Down
Loading

0 comments on commit c47f713

Please sign in to comment.