Skip to content

Commit

Permalink
Enhancing global alerting
Browse files Browse the repository at this point in the history
  • Loading branch information
Saluki committed Jun 19, 2023
1 parent d74a181 commit 37dbe5e
Show file tree
Hide file tree
Showing 23 changed files with 564 additions and 212 deletions.
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,6 @@ serde_json = "1.0"

# CLI & utilities
clap = "4.2"
reqwest = { version = "0.11", features = ["rustls-tls"], default-features = false }
reqwest = { version = "0.11", features = ["rustls-tls", "json"], default-features = false }
chrono = "0.4"
ansi_term = "0.12"
8 changes: 7 additions & 1 deletion Commands.toml
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,18 @@ server = { cmd = "cargo run server -c ./data/config.yaml", load_dotenv = true }
# Launch the watchdog relay for "region-north"
relay-north = { cmd = "cargo run relay --region region-north", load_dotenv = true }

# Launch the watchdog incident CLI
# Launch the watchdog relay for "region-west"
relay-west = { cmd = "cargo run relay --region region-west", load_dotenv = true }

# Launch the watchdog status CLI
status = { cmd = "cargo run status", load_dotenv = true }

# Launch the watchdog incident CLI
incident-ls = { cmd = "cargo run incident ls", load_dotenv = true }

# Launch the watchdog alerting test CLI
alerting-test = { cmd = "cargo run alerting test", load_dotenv = true }

# RELEASE COMMANDS
# ----------------

Expand Down
43 changes: 24 additions & 19 deletions data/config.yaml
Original file line number Diff line number Diff line change
@@ -1,43 +1,48 @@
alerters:
- name: telegram_default
medium: telegram
chat_env: TELEGRAM_CHAT
token_env: TELEGRAM_TOKEN
- name: sms
medium: spryng
recipients_env: SPRYNG_RECIPIENTS
token_env: SPRYNG_TOKEN

regions:
- name: region-north
interval: 5s
threshold: 3
send_interval: 5s
miss_threshold: 3
groups:
- name: default
threshold: 4
mediums: telegram
fail_threshold: 4
tests:
- ping 1.1.1.1
- name: region-west
interval: 5s
threshold: 3
send_interval: 5s
miss_threshold: 3
groups:
- name: default
threshold: 4
mediums: telegram
fail_threshold: 4
tests:
- http example.org
- http kongbytes.io
- name: region-south
interval: 5s
threshold: 2
send_interval: 5s
miss_threshold: 2
groups:
- name: egress
threshold: 2
mediums: telegram
fail_threshold: 2
tests:
- ping 1.1.1.1
- dns example.org
- http example.org
- dns kongbytes.io
- http kongbytes.io
- name: bars
mediums: telegram
threshold: 3
fail_threshold: 3
tests:
- ping 192.168.1.1
- ping 192.168.2.20
- ping 1.1.1.1
- name: other
mediums: telegram
threshold: 5
fail_threshold: 5
tests:
- ping 192.168.1.50
- ping 1.1.1.1
25 changes: 25 additions & 0 deletions src/cli/alerting.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
use serde::Deserialize;

use crate::common::error::Error;
use super::utils::api_post;

#[derive(Deserialize)]
struct AlertTestResponse {
alerts_sent: bool,
error: Option<String>
}

pub async fn test_alerting(base_url: &str, token: &str) -> Result<(), Error> {

let test_response: AlertTestResponse = api_post(base_url, token, "api/v1/alerting/test").await?;

if test_response.alerts_sent {
println!("Test alerts sent to all mediums");
}
else {
let error_message = test_response.error.unwrap_or("no details".into());
println!("Error while sending test alerts to all mediums ({})", error_message);
}

Ok(())
}
8 changes: 4 additions & 4 deletions src/cli/init.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ pub fn init_config() -> Result<(), Error> {
println!(" - A region named \"region-south\" with range 10.50.0.0/22");

let mut config = ConfigInput {
alerters: vec![],
regions: vec![]
};

Expand All @@ -32,12 +33,11 @@ pub fn init_config() -> Result<(), Error> {
"dns example.org".to_string(),
"http example.org".to_string()
],
mediums: "telegram".to_string(),
threshold: 4
fail_threshold: 4
}],
name: region_name,
interval: "5s".to_string(),
threshold: 3
send_interval: "5s".to_string(),
miss_threshold: 3
})
}

Expand Down
1 change: 1 addition & 0 deletions src/cli/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,4 @@ pub mod incident;
pub mod status;
pub mod utils;
pub mod init;
pub mod alerting;
29 changes: 29 additions & 0 deletions src/cli/utils.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ pub async fn api_get<T>(base_url: &str, token: &str, route: &str) -> Result<T, E
let http_client = Client::new();
let http_response = http_client.get(&get_api)
.header("Content-Type", "application/json")
.header("Accept", "application/json")
.header("Authorization", &authorization_header)
.send()
.await
Expand All @@ -32,6 +33,34 @@ pub async fn api_get<T>(base_url: &str, token: &str, route: &str) -> Result<T, E
Ok(json_response)
}

pub async fn api_post<T>(base_url: &str, token: &str, route: &str) -> Result<T, Error> where T: DeserializeOwned {

let post_api = format!("{}/{}", base_url, route);
let authorization_header = format!("Bearer {}", token);

let http_client = Client::new();
let http_response = http_client.post(&post_api)
.header("Content-Type", "application/json")
.header("Accept", "application/json")
.header("Authorization", &authorization_header)
.send()
.await
.map_err(|err| Error::new("An unknown network error triggered", err))?;

let http_status = &http_response.status();
if http_status.is_client_error() || http_status.is_server_error() {
let status_err = Error::basic(format!("Expected HTTP response code OK, but received {}", http_status));
return Err(status_err);
}

let body = http_response.text()
.await
.map_err(|err| Error::new("Could not decode response from server", err))?;

let json_response = serde_json::from_str::<T>(&body).map_err(|err| Error::new("Failed to decode JSON response", err))?;

Ok(json_response)
}

pub fn format_timestamp(timestamp: &str) -> String {

Expand Down
8 changes: 4 additions & 4 deletions src/common/error.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,18 +12,18 @@ pub struct Error {

impl Error {

pub fn new(message: &'static str, details: impl Display) -> Self {
pub fn new<M>(message: M, details: impl Display) -> Self where M: Into<String> {

Error {
message: message.to_string(),
message: message.into(),
details: Some(details.to_string())
}
}

pub fn basic(message: String) -> Self {
pub fn basic<M>(message: M) -> Self where M: Into<String> {

Error {
message,
message: message.into(),
details: None
}
}
Expand Down
26 changes: 25 additions & 1 deletion src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,11 @@ use std::env;
use std::process;

use clap::{Arg, Command};
use cli::alerting;

use crate::cli::{incident, status, init};
use crate::common::error::Error;

// TODO Should not launch Tokio for CLI
#[tokio::main]
async fn main() {

Expand Down Expand Up @@ -125,6 +125,22 @@ async fn main() {
}
}

},
Some(("alerting", alerting_matches)) => {

let (base_url, token) = extract_watchdog_env_or_fail();

match alerting_matches.subcommand() {
Some(("test", _)) => {
let cli_result = alerting::test_alerting(&base_url, &token).await;
handle_cli_failure(cli_result);
},
_ => {
eprintln!("Could not find command to launch");
process::exit(1)
}
}

},
_ => {
eprintln!("Could not find command to launch");
Expand Down Expand Up @@ -220,4 +236,12 @@ fn build_args() -> clap::Command {
.arg_required_else_help(true)
)
)
.subcommand(Command::new("alerting")
.about("Manage alerts & mediums")
.arg_required_else_help(true)
.subcommand(
Command::new("test")
.about("Test all alerting mediums")
)
)
}
2 changes: 2 additions & 0 deletions src/relay/api.rs
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ impl ServerApi {
let http_response = self.client.get(&self.config_route)
.header("Content-Type", "application/json")
.header("Authorization", &self.authorization_header)
.header("Accept", "application/json")
.send()
.await
.map_err(|err| Error::new("Could not fetch configuration from server", err))?;
Expand All @@ -61,6 +62,7 @@ impl ServerApi {
let response = self.client.put(&self.update_route)
.header("Content-Type", "application/json")
.header("Authorization", &self.authorization_header)
.header("Accept", "application/json")
.body(json_state)
.send()
.await
Expand Down
18 changes: 0 additions & 18 deletions src/relay/model.rs
Original file line number Diff line number Diff line change
Expand Up @@ -55,24 +55,6 @@ pub struct TestResult {
}

impl TestResult {

pub fn success<M>(target_name: M) -> TestResult where M: Into<String> {

TestResult {
target: target_name.into(),
result: ResultCategory::Success,
metrics: None
}
}

pub fn warning<M>(target_name: M) -> TestResult where M: Into<String> {

TestResult {
target: target_name.into(),
result: ResultCategory::Warning,
metrics: None
}
}

pub fn fail<M>(target_name: M) -> TestResult where M: Into<String> {

Expand Down
9 changes: 7 additions & 2 deletions src/relay/service.rs
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,10 @@ pub async fn launch(base_url: String, token: String, region_name: String) -> Res

let mut group_results: Vec<GroupResultInput> = vec![];
for group in &region_config.groups {

// Each monitoring group in a region has multiple tests (ping, http, ...) to ensure
// that the group is properly working. A group is working only if ALL tests are working
// and can have warnings.

let mut is_group_working = true;
let mut has_group_warnings: bool = false;
Expand All @@ -55,7 +59,8 @@ pub async fn launch(base_url: String, token: String, region_name: String) -> Res
match test_result {
Ok(test) => {

if test.result == ResultCategory::Success {
if test.result == ResultCategory::Fail {
error_message = Some("Test failed".to_string());
is_group_working = false;
}
else if test.result == ResultCategory::Warning {
Expand Down Expand Up @@ -98,7 +103,7 @@ pub async fn launch(base_url: String, token: String, region_name: String) -> Res

if !last_update.is_empty() {
region_config = api.fetch_region_conf().await.unwrap();
println!("Relay config reloaded");
println!("Relay config reloaded - version {}", last_update);
}

last_update = watchdog_update;
Expand Down
39 changes: 28 additions & 11 deletions src/relay/test/http.rs
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
use std::str;
use std::{str, collections::HashMap};
use tokio::time::Instant;

use reqwest::Client;

use crate::{common::error::Error, relay::model::TestResult};
use crate::{common::error::Error, relay::model::{TestResult, ResultCategory}};

pub struct HttpTest {
client: Client
Expand Down Expand Up @@ -30,24 +31,40 @@ impl HttpTest {
Some(domain) => {

let url = format!("http://{}", domain);
let request_result = self.client.get(url)
let builder = self.client.get(url)
.header("user-agent", "watchdog-relay")
.header("cache-control", "no-store")
.send()
.await;
.header("cache-control", "no-store");

// Measure the time between the request sent out time and the first byte
// received time (not 100% accurate - but still reasonable workaround)
let latency_chrono = Instant::now();
let request_result = builder.send().await;
let duration = latency_chrono.elapsed();

match request_result {
Ok(response) => {

let http_status = &response.status();
if http_status.is_client_error() || http_status.is_server_error() {
return Ok(TestResult::warning(domain));
}

return Ok(TestResult::success(domain));
let category = if http_status.is_client_error() || http_status.is_server_error() {
ResultCategory::Warning
} else {
ResultCategory::Success
};

let duration_ms: f32 = duration.as_millis() as f32;

let metrics: HashMap<String, f32> = HashMap::from([
("http_latency".to_string(), duration_ms)
]);

return Ok(TestResult::build(domain, category, Some(metrics)));

},
Err(_) => Ok(TestResult::fail(domain))
Err(_err) => {
// TODO Error lost (DNS failure, ...)
Ok(TestResult::fail(domain))
}
}
},
None => {
Expand Down
Loading

0 comments on commit 37dbe5e

Please sign in to comment.