From 2f82e428e6d8a6e24dc0b32745c4ae4d19e030c6 Mon Sep 17 00:00:00 2001 From: Derek Nola Date: Mon, 6 May 2024 12:11:56 -0700 Subject: [PATCH] Add RKE2 Basic Resource Profiling (#202) * Add resource profiling for RKE2 Signed-off-by: Derek Nola * Fix broken zh anchors Signed-off-by: Derek Nola * Cleanup comments on script Signed-off-by: Derek Nola --------- Signed-off-by: Derek Nola --- docs/reference/resource_profiling.md | 49 ++++++++++++++++ .../current/install/ha.md | 2 +- .../current/networking/multus_sriov.md | 4 +- .../current/networking/networking_services.md | 4 +- .../current/security/hardening_guide.md | 2 +- scripts/graph_cgtop.py | 56 +++++++++++++++++++ 6 files changed, 111 insertions(+), 6 deletions(-) create mode 100644 docs/reference/resource_profiling.md create mode 100644 scripts/graph_cgtop.py diff --git a/docs/reference/resource_profiling.md b/docs/reference/resource_profiling.md new file mode 100644 index 00000000..409a1d3d --- /dev/null +++ b/docs/reference/resource_profiling.md @@ -0,0 +1,49 @@ +--- +title: Resource Profiling +--- + +This section captures the results of tests to determine minimum resource requirements for RKE2. + +## Scope of Resource Testing + +The resource tests were intended to address the following problem statements: + +- On a single-node cluster, determine the legitimate minimum amount of CPU and memory entire RKE2 server stack, assuming that a real workload will be deployed on the cluster. +- On an agent node, determine the legitimate minimum amount of CPU and memory that should be set aside for the kubelet and RKE2 agent components. 
+ +### Environment and Components + +| Arch | OS | System | CPU | RAM | Disk | +|------|----|--------|--|----|------| +| x86_64 | Ubuntu 22.04 | AWS c6id.xlarge | Intel Xeon Platinum 8375C CPU, 4 Core 2.90 GHz | 8 GB | NVME SSD | + + +The tested components are: + +* RKE2 v1.27.12 with all packaged components enabled, canal as the CNI +* [Kubernetes Example Nginx Deployment](https://kubernetes.io/docs/tasks/run-application/run-stateless-application-deployment/) + +### Methodology + +`systemd-cgtop` was used to track systemd cgroup-level CPU and memory utilization. +- `system.slice/rke2-server.service` tracks resource utilization for both RKE2 and containerd components. +- `system.slice/rke2-agent.service` tracks resource utilization for the agent components. + +Utilization figures were based on 95th percentile readings from steady state operation on nodes running the described workloads, giving an upper bounds on typical resource usage. + +### RKE2 Server with a Workload + +These are the requirements for a single-node cluster in which the RKE2 server shares resources with a [simple workload](https://kubernetes.io/docs/tasks/run-application/run-stateless-application-deployment/). + +| System | CPU Core Usage | Memory | +|--------|----------------| ------ | +| Intel 8375C | 17% of a core | 4977 MB | + +### RKE2 Cluster with a Single Agent + +These are the baseline requirements for a RKE2 cluster with a RKE2 server node and a RKE2 agent, but no workload. 
+ +| Node | System | CPU Core Usage | Memory | +| ---- | -------|----------------| ------ | +| Server | Intel 8375C | 18% of a core | 4804 MB | +| Agent | Intel 8375C | 5% of a core | 3590 MB | diff --git a/i18n/zh/docusaurus-plugin-content-docs/current/install/ha.md b/i18n/zh/docusaurus-plugin-content-docs/current/install/ha.md index ed934171..e45c9581 100644 --- a/i18n/zh/docusaurus-plugin-content-docs/current/install/ha.md +++ b/i18n/zh/docusaurus-plugin-content-docs/current/install/ha.md @@ -96,7 +96,7 @@ tls-san: 因为 RKE2 Server 节点默认是可调度的,所以 HA RKE2 Server 集群的最小节点数是三个 Server 节点和零个 Agent 节点。要添加用于运行应用程序和服务的节点,请将 Agent 节点加入到你的集群中。 -在 HA 集群中加入 Agent 节点与[在单个 Server 集群中加入 Agent 节点](quickstart.md#linux-agent-worker-节点安装)是一样的。你只需要指定 Agent 应该注册的 URL 和要使用的 Token 即可。 +在 HA 集群中加入 Agent 节点与[在单个 Server 集群中加入 Agent 节点](quickstart.md#linux-agentworker节点安装)是一样的。你只需要指定 Agent 应该注册的 URL 和要使用的 Token 即可。 ```yaml server: https://my-kubernetes-domain.com:9345 diff --git a/i18n/zh/docusaurus-plugin-content-docs/current/networking/multus_sriov.md b/i18n/zh/docusaurus-plugin-content-docs/current/networking/multus_sriov.md index 4dd04e8d..4236f71b 100644 --- a/i18n/zh/docusaurus-plugin-content-docs/current/networking/multus_sriov.md +++ b/i18n/zh/docusaurus-plugin-content-docs/current/networking/multus_sriov.md @@ -57,7 +57,7 @@ host-local IPAM plugin allocates ip addresses out of a set of address ranges. It Multus provides an optional daemonset to deploy the DHCP daemon required to run the [DHCP IPAM plugin](https://www.cni.dev/plugins/current/ipam/dhcp/). -You can do this by using the following [HelmChartConfig](../helm.md#customizing-packaged-components-with-helmchartconfig): +You can do this by using the following [HelmChartConfig](../helm.md#使用-helmchartconfig-自定义打包组件): ```yaml # /var/lib/rancher/rke2/server/manifests/rke2-multus-config.yaml --- @@ -81,7 +81,7 @@ NOTE: You should write this file before starting rke2. 
[Whereabouts](https://github.com/k8snetworkplumbingwg/whereabouts) is an IP Address Management (IPAM) CNI plugin that assigns IP addresses cluster-wide. RKE2 includes the option to use Whereabouts with Multus to manage the IP addresses of the additional interfaces created through Multus. -In order to do this, you need to use [HelmChartConfig](../helm.md#customizing-packaged-components-with-helmchartconfig) to configure the Multus CNI to use Whereabouts. +In order to do this, you need to use [HelmChartConfig](../helm.md#使用-helmchartconfig-自定义打包组件) to configure the Multus CNI to use Whereabouts. You can do this by using the following HelmChartConfig: diff --git a/i18n/zh/docusaurus-plugin-content-docs/current/networking/networking_services.md b/i18n/zh/docusaurus-plugin-content-docs/current/networking/networking_services.md index 6f9d8ad6..f87ca4fd 100644 --- a/i18n/zh/docusaurus-plugin-content-docs/current/networking/networking_services.md +++ b/i18n/zh/docusaurus-plugin-content-docs/current/networking/networking_services.md @@ -14,7 +14,7 @@ CoreDNS is deployed by default when starting the server. To disable, run each se If you don't install CoreDNS, you will need to install a cluster DNS provider yourself. -CoreDNS is deployed with the [autoscaler](https://github.com/kubernetes-incubator/cluster-proportional-autoscaler) by default. To disable it or change its config, use the [HelmChartConfig](../helm.md#customizing-packaged-components-with-helmchartconfig) resource. +CoreDNS is deployed with the [autoscaler](https://github.com/kubernetes-incubator/cluster-proportional-autoscaler) by default. To disable it or change its config, use the [HelmChartConfig](../helm.md#使用-helmchartconfig-自定义打包组件) resource. ### NodeLocal DNSCache @@ -57,7 +57,7 @@ spec: `nginx-ingress` is deployed by default when starting the server. 
Ports 80 and 443 will be bound by the ingress controller in its default configuration, making these unusable for HostPort or NodePort services in the cluster. -Configuration options can be specified by creating a [HelmChartConfig manifest](../helm.md#customizing-packaged-components-with-helmchartconfig) to customize the `rke2-ingress-nginx` HelmChart values. For example, a HelmChartConfig at `/var/lib/rancher/rke2/server/manifests/rke2-ingress-nginx-config.yaml` with the following contents sets `use-forwarded-headers` to `"true"` in the ConfigMap storing the NGINX config: +Configuration options can be specified by creating a [HelmChartConfig manifest](../helm.md#使用-helmchartconfig-自定义打包组件) to customize the `rke2-ingress-nginx` HelmChart values. For example, a HelmChartConfig at `/var/lib/rancher/rke2/server/manifests/rke2-ingress-nginx-config.yaml` with the following contents sets `use-forwarded-headers` to `"true"` in the ConfigMap storing the NGINX config: ```yaml # /var/lib/rancher/rke2/server/manifests/rke2-ingress-nginx-config.yaml --- diff --git a/i18n/zh/docusaurus-plugin-content-docs/current/security/hardening_guide.md b/i18n/zh/docusaurus-plugin-content-docs/current/security/hardening_guide.md index a4cb00be..8d5a0d2c 100644 --- a/i18n/zh/docusaurus-plugin-content-docs/current/security/hardening_guide.md +++ b/i18n/zh/docusaurus-plugin-content-docs/current/security/hardening_guide.md @@ -110,7 +110,7 @@ When the `profile` flag is set it does the following: 3. Configures the Pod Security Admission Controller to enforce restricted mode in all namespaces, with the exception of the `kube-system`, `cis-operator-system`, and `tigera-operator` namespaces. These namespaces are exempted to allow system pods to run without restrictions, which is required for proper operation of the cluster. - For more information about the PSA configuration, see the default [Pod Security Admission configurations](pod_security_standards.md#pod-security-standards). 
#!/usr/bin/env python3
"""Parse the raw output of systemd-cgtop and report resource-usage statistics.

Used for the resource profiling page in the RKE2 documentation.
Generate the input file with:

    systemd-cgtop system.slice/rke2-server.service --raw -b -n 1200 -d 250ms > systemd_cgtop_output.txt

which pulls a sample every 0.25s for 5 minutes.

Raw data arrangement per line:
    cgroup name, # of tasks, CPU %, MEM usage (bytes)
"""

# import matplotlib.pyplot as plt  # optional, only needed for the histogram below
import re

import numpy

INPUT_FILE = "systemd_cgtop_output.txt"
CGROUP = "rke2-server"


def parse_cgtop(lines, cgroup):
    """Extract per-sample readings for *cgroup* from raw systemd-cgtop lines.

    lines: iterable of raw systemd-cgtop output lines.
    cgroup: service name matched under ``system.slice/`` (matched literally;
        escaped so ``-`` / ``.`` in the name are not regex metacharacters).

    Returns three parallel lists: task counts (int), CPU %% (float), and
    memory usage in bytes (int). Samples whose CPU field is "-" are skipped
    entirely — cgtop's first reading has no CPU delta yet.
    """
    pattern = re.compile(r'system\.slice/' + re.escape(cgroup))
    tasks, cpu_usage, memory_usage = [], [], []
    for line in lines:
        if not pattern.search(line):
            continue
        fields = line.split()
        # First cgtop sample reports "-" for CPU (no delta yet); drop the
        # whole sample so the three lists stay parallel.
        if fields[2] == "-":
            continue
        tasks.append(int(fields[1]))
        cpu_usage.append(float(fields[2]))
        memory_usage.append(int(fields[3]))
    return tasks, cpu_usage, memory_usage


def main():
    """Read the capture file, then print average tasks and 95th-percentile usage."""
    with open(INPUT_FILE, 'r') as infile:
        tasks, cpu_usage, memory_usage = parse_cgtop(infile, CGROUP)

    # numpy.percentile on an empty list raises an opaque IndexError; fail
    # with an actionable message instead.
    if not tasks:
        raise SystemExit(f"no usable samples for cgroup {CGROUP!r} in {INPUT_FILE}")

    # Convert memory usage from bytes to megabytes
    memory_usage = [usage / 1024 ** 2 for usage in memory_usage]

    tasks_avg = numpy.average(tasks)
    cpu_95th = numpy.percentile(cpu_usage, 95)
    memory_95th = numpy.percentile(memory_usage, 95)
    # Fix: the original passed tasks_avg as a second print() argument next to
    # a placeholder-less f-string; interpolate it like the other two lines.
    print(f'Number of Tasks: {tasks_avg}')
    print(f'95th Percentile CPU Usage: {cpu_95th:.2f}%')
    print(f'95th Percentile Memory Usage: {memory_95th:.2f} MB')

    # Optional Plotting
    # plt.hist(cpu_usage, bins=20, alpha=0.7, label='CPU Usage')
    # plt.hist(memory_usage, bins=20, alpha=0.7, label='Memory Usage')
    # plt.axvline(memory_95th, linestyle='dashed', linewidth=1,
    #             label=f'95th Percentile Memory ({memory_95th:.2f} MB)')
    # plt.xlabel('Usage')
    # plt.ylabel('Ticks')
    # plt.title('Systemd-cgtop Resource Usage Histogram')
    # plt.legend()
    # plt.show()


if __name__ == "__main__":
    main()