bottlerocket-os · etungsten · Jun 26, 2023 · Jun 22, 2023 · Jun 20, 2023
diff --git a/sources/api/pluto/src/aws.rs b/sources/api/pluto/src/aws.rs
@@ -0,0 +1,47 @@
+use aws_config::default_provider::credentials::DefaultCredentialsChain;
+use aws_config::imds;
+use aws_smithy_types::retry::{RetryConfig, RetryConfigBuilder};
+use aws_types::region::Region;
+use aws_types::SdkConfig;
+use snafu::{ResultExt, Snafu};
+use std::time::Duration;
+
+// Max request retry attempts; deliberately high so that the caller decides when to give up
+const MAX_ATTEMPTS: u32 = 100;
+const IMDS_CONNECT_TIMEOUT: Duration = Duration::from_secs(3);
+
+#[derive(Debug, Snafu)]
+pub(super) enum Error {
+    #[snafu(display("Failed to build IMDS client: {}", source))]
+    SdkImds {
+        source: imds::client::error::BuildError,
+    },
+}
+
+type Result<T> = std::result::Result<T, Error>;
+
+async fn sdk_imds_client() -> Result<imds::Client> {
+    imds::Client::builder()
+        .max_attempts(MAX_ATTEMPTS)
+        .connect_timeout(IMDS_CONNECT_TIMEOUT)
+        .build()
+        .await
+        .context(SdkImdsSnafu)
+}
+
+fn sdk_retry_config() -> RetryConfig {
+    RetryConfigBuilder::new().max_attempts(MAX_ATTEMPTS).build()
+}
+
+pub(crate) async fn sdk_config(region: &str) -> Result<SdkConfig> {
+    let provider = DefaultCredentialsChain::builder()
+        .imds_client(sdk_imds_client().await?)
+        .build()
+        .await;
+    Ok(aws_config::from_env()
+        .region(Region::new(region.to_owned()))
+        .credentials_provider(provider)
+        .retry_config(sdk_retry_config())
+        .load()
+        .await)
+}
diff --git a/sources/api/pluto/src/ec2.rs b/sources/api/pluto/src/ec2.rs
@@ -1,6 +1,6 @@
-use crate::proxy;
+use crate::aws::sdk_config;
+use crate::{aws, proxy};
 use aws_smithy_types::error::display::DisplayErrorContext;
-use aws_types::region::Region;
 use snafu::{OptionExt, ResultExt, Snafu};
 use std::time::Duration;
 
@@ -27,6 +27,9 @@ pub(super) enum Error {
 
     #[snafu(context(false), display("{}", source))]
     Proxy { source: proxy::Error },
+
+    #[snafu(context(false), display("{}", source))]
+    SdkConfig { source: aws::Error },
 }
 
 type Result<T> = std::result::Result<T, Error>;
@@ -35,10 +38,7 @@ pub(super) async fn get_private_dns_name(region: &str, instance_id: &str) -> Res
     // Respect proxy environment variables when making AWS EC2 API requests
     let (https_proxy, no_proxy) = proxy::fetch_proxy_env();
 
-    let config = aws_config::from_env()
-        .region(Region::new(region.to_owned()))
-        .load()
-        .await;
+    let config = sdk_config(region).await?;
 
     let client = if let Some(https_proxy) = https_proxy {
         let http_client = proxy::setup_http_client(https_proxy, no_proxy)?;

diff --git a/sources/api/pluto/src/eks.rs b/sources/api/pluto/src/eks.rs
@@ -1,6 +1,6 @@
-use crate::proxy;
+use crate::aws::sdk_config;
+use crate::{aws, proxy};
 use aws_sdk_eks::model::KubernetesNetworkConfigResponse;
-use aws_types::region::Region;
 use snafu::{OptionExt, ResultExt, Snafu};
 use std::time::Duration;
 
@@ -19,11 +19,14 @@ pub(super) enum Error {
     #[snafu(display("Timed-out waiting for EKS Describe Cluster API response: {}", source))]
     DescribeClusterTimeout { source: tokio::time::error::Elapsed },
 
-    #[snafu(display("Missing field '{}' EKS response", field))]
+    #[snafu(display("Missing field '{}' in EKS response", field))]
     Missing { field: &'static str },
 
     #[snafu(context(false), display("{}", source))]
     Proxy { source: proxy::Error },
+
+    #[snafu(context(false), display("{}", source))]
+    SdkConfig { source: aws::Error },
 }
 
 type Result<T> = std::result::Result<T, Error>;
@@ -37,10 +40,7 @@ pub(super) async fn get_cluster_network_config(
     // Respect proxy environment variables when making AWS EKS API requests
     let (https_proxy, no_proxy) = proxy::fetch_proxy_env();
 
-    let config = aws_config::from_env()
-        .region(Region::new(region.to_owned()))
-        .load()
-        .await;
+    let config = sdk_config(region).await?;
 
     let client = if let Some(https_proxy) = https_proxy {
         let http_client = proxy::setup_http_client(https_proxy, no_proxy)?;

diff --git a/sources/api/pluto/src/main.rs b/sources/api/pluto/src/main.rs
@@ -32,6 +32,7 @@ reasonable default is available.
 */
 
 mod api;
+mod aws;
 mod ec2;
 mod eks;
 mod proxy;

diff --git a/sources/api/sundog/Cargo.toml b/sources/api/sundog/Cargo.toml
@@ -20,7 +20,7 @@ serde = { version = "1", features = ["derive"] }
 serde_json = "1"
 simplelog = "0.12"
 snafu = "0.7"
-tokio = { version = "~1.25", default-features = false, features = ["macros", "rt-multi-thread"] }  # LTS
+tokio = { version = "~1.25", default-features = false, features = ["process", "macros", "rt-multi-thread"] }  # LTS
 
 [build-dependencies]
 generate-readme = { version = "0.1", path = "../../generate-readme" }
diff --git a/sources/api/sundog/src/main.rs b/sources/api/sundog/src/main.rs
@@ -17,10 +17,15 @@ use std::env;
 use std::path::Path;
 use std::process;
 use std::str::{self, FromStr};
+use std::time::Duration;
+use tokio::process::Command as AsyncCommand;
 
 use datastore::serialization::to_pairs_with_prefix;
 use datastore::{self, deserialization, Key, KeyType};
 
+// Wait at most 6 minutes for a settings generator to finish so boot does not hang for too long.
+const SETTINGS_GENERATOR_TIMEOUT: Duration = Duration::from_secs(360);
+
 /// Potential errors during Sundog execution
 mod error {
     use http::StatusCode;
@@ -38,6 +43,16 @@ mod error {
             source: std::io::Error,
         },
 
+        #[snafu(display(
+            "Timed-out waiting for settings generator '{}' to finish: {}",
+            generator,
+            source
+        ))]
+        CommandTimeout {
+            generator: String,
+            source: tokio::time::error::Elapsed,
+        },
+
         #[snafu(display("Generator command is invalid (empty, etc.) - '{}'", command))]
         InvalidCommand { command: String },
 
@@ -339,13 +354,20 @@ where
             command: generator.as_str(),
         })?;
 
-        let result = process::Command::new(command)
-            .envs(&proxy_envs)
-            .args(command_strings)
-            .output()
-            .context(error::CommandFailureSnafu {
-                program: generator.as_str(),
-            })?;
+        let result = tokio::time::timeout(
+            SETTINGS_GENERATOR_TIMEOUT,
+            AsyncCommand::new(command)
+                .envs(&proxy_envs)
+                .args(command_strings)
+                .output(),
+        )
+        .await
+        .context(error::CommandTimeoutSnafu {
+            generator: generator.as_str(),
+        })?
+        .context(error::CommandFailureSnafu {
+            program: generator.as_str(),
+        })?;
 
         // Match on the generator's exit code. This code lays the foundation
         // for handling alternative exit codes from generators.