Skip to content

Commit 60bd2f2

Browse files
committed
Handle initial node taints via Terraform
1 parent 1aafe74 commit 60bd2f2

File tree

5 files changed

+146
-122
lines changed

5 files changed

+146
-122
lines changed

misc/nvme-bootstrap/README.md

Lines changed: 33 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ Materialize requires fast, locally-attached NVMe storage for optimal performance
1010

1111
1. Automatically detects NVMe instance store devices on your nodes
1212
2. Creates an LVM volume group from these devices
13-
3. Configures OpenEBS to provision persistent volumes from this storage
13+
3. Configures OpenEBS LVM Local-PV to provision persistent volumes from this storage
1414
4. Makes high-performance storage available to Materialize
1515

1616
## Prerequisites
@@ -36,10 +36,9 @@ module "materialize" {
3636
# Use an instance type with NVMe storage
3737
node_group_instance_types = ["r6gd.2xlarge"]
3838
node_group_ami_type = "BOTTLEROCKET_ARM_64"
39+
enable_nvme_storage = true
3940
40-
# Disable Materialize Operator installation as it will require the storage class
41-
# TODO: The module will support this configuration in the future
42-
install_materialize_operator = false
41+
install_materialize_operator = true
4342
4443
# Other module parameters...
4544
}
@@ -55,6 +54,8 @@ If you're setting up manually or need to customize the configuration, follow the
5554

5655
### Step 1: Build and Push the Container Image
5756

57+
> Note: This is temporary and will be replaced with a pre-built image which can be pulled from a public registry.
58+
5859
```bash
5960
# Clone the Materialize repository
6061
git clone https://github.com/MaterializeInc/materialize.git
@@ -70,32 +71,7 @@ docker build -t your-registry/nvme-bootstrap:latest .
7071
docker push your-registry/nvme-bootstrap:latest
7172
```
7273

73-
### Step 2: Install OpenEBS
74-
75-
OpenEBS provides the CSI driver that interfaces with LVM to provide persistent storage:
76-
77-
```bash
78-
# Add the OpenEBS Helm repository
79-
helm repo add openebs https://openebs.github.io/charts
80-
helm repo update
81-
82-
# Create namespace for OpenEBS
83-
kubectl create namespace openebs
84-
85-
# Install OpenEBS with only the necessary components
86-
helm install openebs openebs/openebs \
87-
--namespace openebs \
88-
--set engines.replicated.mayastor.enabled=false
89-
```
90-
91-
Verify the installation:
92-
93-
```bash
94-
# Check if the LVM controller is running
95-
kubectl get pods -n openebs -l role=openebs-lvm
96-
```
97-
98-
### Step 3: Deploy the NVMe Bootstrap Components
74+
### Step 2: Deploy the NVMe Bootstrap Components
9975

10076
```bash
10177
# Navigate to the Kubernetes manifests directory
@@ -121,9 +97,35 @@ The DaemonSet will:
12197
3. Create the "instance-store-vg" volume group
12298
4. Make the storage available for OpenEBS
12399

100+
### Step 3: Install OpenEBS
101+
102+
OpenEBS provides the CSI driver that interfaces with LVM to provide persistent storage:
103+
104+
```bash
105+
# Add the OpenEBS Helm repository
106+
helm repo add openebs https://openebs.github.io/charts
107+
helm repo update
108+
109+
# Create namespace for OpenEBS
110+
kubectl create namespace openebs
111+
112+
# Install OpenEBS with only the necessary components
113+
helm install openebs openebs/openebs \
114+
--namespace openebs \
115+
--set engines.replicated.mayastor.enabled=false
116+
```
117+
118+
Verify the installation:
119+
120+
```bash
121+
# Check if the LVM controller is running
122+
kubectl get pods -n openebs -l role=openebs-lvm
123+
```
124+
124125
### Step 4: Create and Test the Storage Class
125126

126127
```bash
128+
# TODO: remove this step once the Terraform module handles this
127129
# Create the StorageClass
128130
kubectl apply -f storageclass.yaml
129131

@@ -139,9 +141,8 @@ A successful test shows your storage class is working correctly.
139141
To clean up the test resources:
140142

141143
```bash
142-
# Delete the test PVC and StorageClass
144+
# Delete the test PVC
143145
kubectl delete -f test-pvc.yaml
144-
kubectl delete -f storageclass.yaml
145146
```
146147

147148
### Step 5: Configure Materialize to Use the Storage Class
@@ -246,19 +247,3 @@ kubectl get pvc -A | grep openebs-lvm-instance-store-ext4
246247
```bash
247248
kubectl get serviceaccount nvme-setup-sa -n kube-system
248249
```
249-
250-
## Clean Up
251-
252-
When you're done testing:
253-
254-
```bash
255-
# Delete test resources
256-
kubectl delete -f test-pvc.yaml
257-
kubectl delete -f daemonset.yaml
258-
kubectl delete -f rbac.yaml
259-
kubectl delete -f storageclass.yaml
260-
261-
# Delete OpenEBS if no longer needed
262-
helm uninstall openebs -n openebs
263-
kubectl delete namespace openebs
264-
```

misc/nvme-bootstrap/container/Dockerfile

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,5 +26,3 @@ COPY configure-disks.sh /usr/local/bin/configure-disks.sh
2626
COPY manage-taints.sh /usr/local/bin/manage-taints.sh
2727

2828
RUN chmod +x /usr/local/bin/configure-disks.sh /usr/local/bin/manage-taints.sh
29-
30-
ENTRYPOINT ["/usr/local/bin/configure-disks.sh"]

misc/nvme-bootstrap/container/configure-disks.sh

Lines changed: 90 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -9,47 +9,111 @@
99
# the Business Source License, use of this software will be governed
1010
# by the Apache License, Version 2.0.
1111

12-
set -xeo pipefail
12+
set -xeuo pipefail
1313

14-
# Volume group name
1514
VG_NAME="instance-store-vg"
15+
CLOUD_PROVIDER=""
16+
17+
while [[ $# -gt 0 ]]; do
18+
case $1 in
19+
--cloud-provider|-c)
20+
CLOUD_PROVIDER="$2"
21+
shift 2
22+
;;
23+
--vg-name|-v)
24+
VG_NAME="$2"
25+
shift 2
26+
;;
27+
--help|-h)
28+
echo "Usage: $0 [options]"
29+
echo "Options:"
30+
echo " --cloud-provider, -c PROVIDER Specify cloud provider (aws, gcp, azure, generic)"
31+
echo " --vg-name, -v NAME Specify volume group name (default: instance-store-vg)"
32+
echo " --help, -h Show this help message"
33+
exit 0
34+
;;
35+
*)
36+
echo "Unknown option: $1"
37+
exit 1
38+
;;
39+
esac
40+
done
1641

17-
# Function to detect cloud provider
1842
detect_cloud_provider() {
19-
# Check for AWS
20-
if curl -s -m 5 http://169.254.169.254/latest/meta-data/ >/dev/null; then
43+
# Only attempt detection if not explicitly provided
44+
if [[ -n "$CLOUD_PROVIDER" ]]; then
45+
echo "$CLOUD_PROVIDER"
46+
return
47+
fi
48+
49+
# Fall back to AWS detection if no provider is specified
50+
if curl -s -m 5 --fail http://169.254.169.254/latest/meta-data/ >/dev/null 2>&1; then
2151
echo "aws"
2252
return
2353
fi
2454

25-
# Default to generic
55+
# Default to generic if detection fails
2656
echo "generic"
2757
}
2858

29-
# Cloud provider-specific device detection
59+
find_aws_bottlerocket_devices() {
60+
local nvme_devices=()
61+
local BOTTLEROCKET_ROOT="/.bottlerocket/rootfs"
62+
63+
mapfile -t SSD_NVME_DEVICE_LIST < <(lsblk --json --output-all | \
64+
jq -r '.blockdevices[] | select(.model // empty | contains("Amazon EC2 NVMe Instance Storage")) | .path')
65+
66+
for device in "${SSD_NVME_DEVICE_LIST[@]}"; do
67+
nvme_devices+=("$BOTTLEROCKET_ROOT$device")
68+
done
69+
70+
echo "${nvme_devices[@]}"
71+
}
72+
73+
find_aws_standard_devices() {
74+
lsblk --json --output-all | \
75+
jq -r '.blockdevices[] | select(.model // empty | contains("Amazon EC2 NVMe Instance Storage")) | .path'
76+
}
77+
78+
find_aws_devices() {
79+
local nvme_devices=()
80+
81+
# Check if we're running in Bottlerocket
82+
if [[ -d "/.bottlerocket" ]]; then
83+
# Use mapfile to properly handle the output
84+
mapfile -t nvme_devices < <(find_aws_bottlerocket_devices)
85+
else
86+
# Use mapfile to properly handle the output
87+
mapfile -t nvme_devices < <(find_aws_standard_devices)
88+
fi
89+
90+
echo "${nvme_devices[@]}"
91+
}
92+
93+
find_generic_devices() {
94+
lsblk --json --output-all | \
95+
jq -r '.blockdevices[] | select(.name | startswith("nvme")) | select(.mountpoint == null and (.children | length == 0)) | .path'
96+
}
97+
3098
find_nvme_devices() {
3199
local cloud=$1
32100
local nvme_devices=()
33101

34102
case $cloud in
35103
aws)
36-
# Handle both standard Linux and Bottlerocket paths
37-
if [ -d "/.bottlerocket" ]; then
38-
# Bottlerocket specific path
39-
BOTTLEROCKET_ROOT="/.bottlerocket/rootfs"
40-
mapfile -t SSD_NVME_DEVICE_LIST < <(lsblk --json --output-all | jq -r '.blockdevices[] | select(.model // empty | contains("Amazon EC2 NVMe Instance Storage")) | .path')
41-
for device in "${SSD_NVME_DEVICE_LIST[@]}"; do
42-
nvme_devices+=("$BOTTLEROCKET_ROOT$device")
43-
done
44-
else
45-
# Standard EC2 instances
46-
mapfile -t nvme_devices < <(lsblk --json --output-all | jq -r '.blockdevices[] | select(.model // empty | contains("Amazon EC2 NVMe Instance Storage")) | .path')
47-
fi
104+
# Use mapfile to properly handle the output
105+
mapfile -t nvme_devices < <(find_aws_devices)
48106
;;
49-
# Add more cloud providers here
107+
# Add more cloud providers here as we support them
108+
# gcp)
109+
# mapfile -t nvme_devices < <(find_gcp_devices)
110+
# ;;
111+
# azure)
112+
# mapfile -t nvme_devices < <(find_azure_devices)
113+
# ;;
50114
*)
51-
# Generic approach - find all NVMe devices that are not mounted and don't have children (partitions)
52-
mapfile -t nvme_devices < <(lsblk --json --output-all | jq -r '.blockdevices[] | select(.name | startswith("nvme")) | select(.mountpoint == null and (.children | length == 0)) | .path')
115+
# Generic approach for any other cloud or environment
116+
mapfile -t nvme_devices < <(find_generic_devices)
53117
;;
54118
esac
55119

@@ -60,7 +124,7 @@ find_nvme_devices() {
60124
setup_lvm() {
61125
local -a devices=("$@")
62126

63-
if [ ${#devices[@]} -eq 0 ]; then
127+
if [[ ${#devices[@]} -eq 0 ]]; then
64128
echo "No suitable NVMe devices found"
65129
exit 1
66130
fi
@@ -89,27 +153,19 @@ setup_lvm() {
89153
return 0
90154
}
91155

92-
# Main execution
93156
echo "Starting NVMe disk configuration..."
94157

95-
# Detect cloud provider
158+
# Detect or use provided cloud provider
96159
CLOUD_PROVIDER=$(detect_cloud_provider)
97-
echo "Detected cloud provider: $CLOUD_PROVIDER"
160+
echo "Using cloud provider: $CLOUD_PROVIDER"
98161

99162
# Find NVMe devices
100163
mapfile -t NVME_DEVICES < <(find_nvme_devices "$CLOUD_PROVIDER")
101164

102165
# Setup LVM
103166
if setup_lvm "${NVME_DEVICES[@]}"; then
104167
echo "NVMe disk configuration completed successfully"
105-
# Call taint management script to remove the taint
106-
/usr/local/bin/manage-taints.sh remove
107-
108-
# Keep the container running
109-
echo "Setup complete. Container will now stay running for monitoring purposes."
110-
while true; do
111-
sleep 3600
112-
done
168+
exit 0
113169
else
114170
echo "NVMe disk configuration failed"
115171
exit 1

misc/nvme-bootstrap/container/manage-taints.sh

Lines changed: 6 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -9,40 +9,26 @@
99
# the Business Source License, use of this software will be governed
1010
# by the Apache License, Version 2.0.
1111

12-
set -e
12+
set -euo pipefail
1313

1414
# Node name is provided via downward API
1515
NODE_NAME=${NODE_NAME:-$(hostname)}
1616
TAINT_KEY="disk-unconfigured"
17-
TAINT_VALUE="true"
18-
TAINT_EFFECT="NoSchedule"
1917

2018
echo "Starting taint management for node: $NODE_NAME"
2119
echo "Action: $1"
2220

2321
# Check if necessary environment variables and files exist
2422
if [ -z "$KUBERNETES_SERVICE_HOST" ] || [ -z "$KUBERNETES_SERVICE_PORT" ]; then
2523
echo "Error: Kubernetes service environment variables not found"
26-
exit 0 # Exit with success to avoid crash loop
24+
exit 1
2725
fi
2826

2927
if [ ! -f "/var/run/secrets/kubernetes.io/serviceaccount/token" ]; then
3028
echo "Error: Service account token not found"
31-
exit 0 # Exit with success to avoid crash loop
29+
exit 1
3230
fi
3331

34-
# Add the taint to the node
35-
add_taint() {
36-
echo "Adding taint $TAINT_KEY=$TAINT_VALUE:$TAINT_EFFECT to node $NODE_NAME"
37-
38-
kubectl --server="https://${KUBERNETES_SERVICE_HOST}:${KUBERNETES_SERVICE_PORT}" \
39-
--token="$(cat /var/run/secrets/kubernetes.io/serviceaccount/token)" \
40-
--certificate-authority="/var/run/secrets/kubernetes.io/serviceaccount/ca.crt" \
41-
taint nodes "$NODE_NAME" "$TAINT_KEY=$TAINT_VALUE:$TAINT_EFFECT" --overwrite || {
42-
echo "Warning: Failed to add taint, but continuing anyway"
43-
}
44-
}
45-
4632
# Remove the taint from the node
4733
remove_taint() {
4834
echo "Removing taint $TAINT_KEY from node $NODE_NAME"
@@ -56,18 +42,15 @@ remove_taint() {
5642
}
5743

5844
# Main execution
59-
ACTION=${1:-"add"}
45+
ACTION=${1:-"remove"}
6046

6147
case "$ACTION" in
62-
add)
63-
add_taint
64-
;;
6548
remove)
6649
remove_taint
6750
;;
6851
*)
69-
echo "Usage: $0 [add|remove]"
70-
exit 0 # Exit with success to avoid crash loop
52+
echo "Usage: $0 [remove]"
53+
exit 1
7154
;;
7255
esac
7356

0 commit comments

Comments (0)