Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[1.14.x] cherry-pick: pluto: add more retries for requests out to AWS #3224

Merged
merged 2 commits into from
Jun 26, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 47 additions & 0 deletions sources/api/pluto/src/aws.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
use aws_config::default_provider::credentials::DefaultCredentialsChain;
use aws_config::imds;
use aws_smithy_types::retry::{RetryConfig, RetryConfigBuilder};
use aws_types::region::Region;
use aws_types::SdkConfig;
use snafu::{ResultExt, Snafu};
use std::time::Duration;

// Maximum number of attempts per request. Deliberately very high: keep retrying and let the
// caller decide when to terminate.
const MAX_ATTEMPTS: u32 = 100;
// Connect timeout passed to the IMDS client builder in this module.
const IMDS_CONNECT_TIMEOUT: Duration = Duration::from_secs(3);

/// Errors returned by the helpers in this module.
#[derive(Debug, Snafu)]
pub(super) enum Error {
/// The IMDS client builder reported an error; the underlying `BuildError` is kept as `source`.
#[snafu(display("Failed to build IMDS client: {}", source))]
SdkImds {
source: imds::client::error::BuildError,
},
}

/// Result alias whose error type is this module's `Error`.
type Result<T> = std::result::Result<T, Error>;

/// Builds an IMDS client that uses `MAX_ATTEMPTS` as its attempt limit and
/// `IMDS_CONNECT_TIMEOUT` as its connect timeout.
///
/// # Errors
///
/// Returns `Error::SdkImds` when the client builder fails.
async fn sdk_imds_client() -> Result<imds::Client> {
    let configured_builder = imds::Client::builder()
        .max_attempts(MAX_ATTEMPTS)
        .connect_timeout(IMDS_CONNECT_TIMEOUT);

    let build_outcome = configured_builder.build().await;
    build_outcome.context(SdkImdsSnafu)
}

/// Returns the retry configuration shared by SDK clients: a default builder with only the
/// maximum attempt count overridden by `MAX_ATTEMPTS`.
fn sdk_retry_config() -> RetryConfig {
    let retry_builder = RetryConfigBuilder::new().max_attempts(MAX_ATTEMPTS);
    retry_builder.build()
}

/// Loads an `SdkConfig` for `region` from the environment, with two overrides:
/// the credentials chain uses this module's IMDS client, and requests use this
/// module's retry configuration.
///
/// # Errors
///
/// Returns `Error::SdkImds` when the IMDS client cannot be built.
pub(crate) async fn sdk_config(region: &str) -> Result<SdkConfig> {
    // Build the IMDS client first so a failure short-circuits before anything is loaded.
    let imds_client = sdk_imds_client().await?;

    let credentials = DefaultCredentialsChain::builder()
        .imds_client(imds_client)
        .build()
        .await;

    let loader = aws_config::from_env()
        .region(Region::new(region.to_owned()))
        .credentials_provider(credentials)
        .retry_config(sdk_retry_config());

    let loaded = loader.load().await;
    Ok(loaded)
}
12 changes: 6 additions & 6 deletions sources/api/pluto/src/ec2.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
use crate::proxy;
use crate::aws::sdk_config;
use crate::{aws, proxy};
use aws_smithy_types::error::display::DisplayErrorContext;
use aws_types::region::Region;
use snafu::{OptionExt, ResultExt, Snafu};
use std::time::Duration;

Expand All @@ -27,6 +27,9 @@ pub(super) enum Error {

#[snafu(context(false), display("{}", source))]
Proxy { source: proxy::Error },

#[snafu(context(false), display("{}", source))]
SdkConfig { source: aws::Error },
}

type Result<T> = std::result::Result<T, Error>;
Expand All @@ -35,10 +38,7 @@ pub(super) async fn get_private_dns_name(region: &str, instance_id: &str) -> Res
// Respect proxy environment variables when making AWS EC2 API requests
let (https_proxy, no_proxy) = proxy::fetch_proxy_env();

let config = aws_config::from_env()
.region(Region::new(region.to_owned()))
.load()
.await;
let config = sdk_config(region).await?;

let client = if let Some(https_proxy) = https_proxy {
let http_client = proxy::setup_http_client(https_proxy, no_proxy)?;
Expand Down
14 changes: 7 additions & 7 deletions sources/api/pluto/src/eks.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
use crate::proxy;
use crate::aws::sdk_config;
use crate::{aws, proxy};
use aws_sdk_eks::model::KubernetesNetworkConfigResponse;
use aws_types::region::Region;
use snafu::{OptionExt, ResultExt, Snafu};
use std::time::Duration;

Expand All @@ -19,11 +19,14 @@ pub(super) enum Error {
#[snafu(display("Timed-out waiting for EKS Describe Cluster API response: {}", source))]
DescribeClusterTimeout { source: tokio::time::error::Elapsed },

#[snafu(display("Missing field '{}' EKS response", field))]
#[snafu(display("Missing field '{}' in EKS response", field))]
Missing { field: &'static str },

#[snafu(context(false), display("{}", source))]
Proxy { source: proxy::Error },

#[snafu(context(false), display("{}", source))]
SdkConfig { source: aws::Error },
}

type Result<T> = std::result::Result<T, Error>;
Expand All @@ -37,10 +40,7 @@ pub(super) async fn get_cluster_network_config(
// Respect proxy environment variables when making AWS EKS API requests
let (https_proxy, no_proxy) = proxy::fetch_proxy_env();

let config = aws_config::from_env()
.region(Region::new(region.to_owned()))
.load()
.await;
let config = sdk_config(region).await?;

let client = if let Some(https_proxy) = https_proxy {
let http_client = proxy::setup_http_client(https_proxy, no_proxy)?;
Expand Down
1 change: 1 addition & 0 deletions sources/api/pluto/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ reasonable default is available.
*/

mod api;
mod aws;
mod ec2;
mod eks;
mod proxy;
Expand Down
2 changes: 1 addition & 1 deletion sources/api/sundog/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ serde = { version = "1", features = ["derive"] }
serde_json = "1"
simplelog = "0.12"
snafu = "0.7"
tokio = { version = "~1.25", default-features = false, features = ["macros", "rt-multi-thread"] } # LTS
tokio = { version = "~1.25", default-features = false, features = ["process", "macros", "rt-multi-thread"] } # LTS

[build-dependencies]
generate-readme = { version = "0.1", path = "../../generate-readme" }
36 changes: 29 additions & 7 deletions sources/api/sundog/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,15 @@ use std::env;
use std::path::Path;
use std::process;
use std::str::{self, FromStr};
use std::time::Duration;
use tokio::process::Command as AsyncCommand;

use datastore::serialization::to_pairs_with_prefix;
use datastore::{self, deserialization, Key, KeyType};

// Limit settings generator execution to at most 6 minutes to prevent boot from hanging for too long.
const SETTINGS_GENERATOR_TIMEOUT: Duration = Duration::from_secs(360);

/// Potential errors during Sundog execution
mod error {
use http::StatusCode;
Expand All @@ -38,6 +43,16 @@ mod error {
source: std::io::Error,
},

#[snafu(display(
"Timed-out waiting for settings generator '{}' to finish: {}",
generator,
source
))]
CommandTimeout {
generator: String,
source: tokio::time::error::Elapsed,
},

#[snafu(display("Generator command is invalid (empty, etc.) - '{}'", command))]
InvalidCommand { command: String },

Expand Down Expand Up @@ -339,13 +354,20 @@ where
command: generator.as_str(),
})?;

let result = process::Command::new(command)
.envs(&proxy_envs)
.args(command_strings)
.output()
.context(error::CommandFailureSnafu {
program: generator.as_str(),
})?;
let result = tokio::time::timeout(
SETTINGS_GENERATOR_TIMEOUT,
AsyncCommand::new(command)
.envs(&proxy_envs)
.args(command_strings)
.output(),
)
.await
.context(error::CommandTimeoutSnafu {
generator: generator.as_str(),
})?
.context(error::CommandFailureSnafu {
program: generator.as_str(),
})?;

// Match on the generator's exit code. This code lays the foundation
// for handling alternative exit codes from generators.
Expand Down