From 1f0f224046e12786d87e9472d311d5bd3b9c33bd Mon Sep 17 00:00:00 2001 From: OnyemaAnthony Date: Sat, 28 Mar 2026 12:16:16 +0100 Subject: [PATCH] implement SLA/SLO operations --- README.md | 4 + docs/backend/SLA_SLO.md | 55 ++++++++++++ package-lock.json | 5 -- src/index.ts | 5 ++ src/operations/service-objectives.test.ts | 60 +++++++++++++ src/operations/service-objectives.ts | 105 ++++++++++++++++++++++ 6 files changed, 229 insertions(+), 5 deletions(-) create mode 100644 docs/backend/SLA_SLO.md create mode 100644 src/operations/service-objectives.test.ts create mode 100644 src/operations/service-objectives.ts diff --git a/README.md b/README.md index 662dd94..3c34ca7 100644 --- a/README.md +++ b/README.md @@ -40,6 +40,10 @@ npm start | `npm test` | Run Jest tests | | `npm run lint` | Run ESLint | +## Documentation + +- [SLA/SLO Definitions and Alert Thresholds](docs/backend/SLA_SLO.md) + ## Contributing 1. Fork the repo and create a branch from `main`. diff --git a/docs/backend/SLA_SLO.md b/docs/backend/SLA_SLO.md new file mode 100644 index 0000000..a8e629b --- /dev/null +++ b/docs/backend/SLA_SLO.md @@ -0,0 +1,55 @@ +# Service Level Agreements (SLA) & Service Level Objectives (SLO) + +This document outlines the Service Objectives and Alert Thresholds implemented for the TalentTrust Backend operations. Setting these operational metrics allows us to proactively track performance, ensure high availability, and configure reliable alerts when our service degrades. + +## Core Concepts + +- **Service Level Agreement (SLA)**: A contractual commitment to maintain specific performance benchmarks. This backend focuses on providing the tools and metrics to enforce internal SLAs. +- **Service Level Objective (SLO)**: Our internal target for a given operation. For example, maintaining a `99.9%` success rate on the API. +- **Alert Thresholds**: The limits at which error rates or response latencies are considered degraded and require developer or operator intervention. 
+ +## Defined Objectives + +### Health Check (`/health`) +- **Target Success Rate**: 99.99% +- **Target Latency (P95)**: 50ms +- **Alert Trigger**: Error Rate $\ge$ 0.1% OR Average Latency $\ge$ 150ms over a 5-minute rolling window. + +### Contracts API (`/api/v1/contracts`) +- **Target Success Rate**: 99.9% +- **Target Latency (P95)**: 200ms +- **Alert Trigger**: Error Rate $\ge$ 1.0% OR Average Latency $\ge$ 400ms over a 5-minute rolling window. + +## Usage in Codebase + +The definitions are maintained within `src/operations/service-objectives.ts`. +We use NatSpec-style comments to document types, thresholds, and objectives to align with secure systems and standard architectures. + +```typescript +import { isThresholdBreached, DefaultAlertThresholds } from './operations/service-objectives'; + +// Example: Evaluating if a breach occurred +const hasBreached = isThresholdBreached( + DefaultAlertThresholds.contractsApi, + currentErrorRate, // e.g., fetched from metrics store like Prometheus + currentAverageLatency // e.g., measured via APM +); + +if (hasBreached) { + // Trigger PagerDuty, Slack alert, or perform fallbacks +} +``` + +## Expanding Metrics + +To define SLOs for new API routes or operations: +1. Open `src/operations/service-objectives.ts` +2. Define a new entry inside `DefaultServiceObjectives`. +3. Define the corresponding alerting limits in `DefaultAlertThresholds`. +4. Ensure relevant automated tests run cleanly. + +## Security and Threat Assumptions + +Since SLO and SLA data can be sensitive and used for operational integrity: +- **Metric Spoofing**: Ensure that metrics collected for the `isThresholdBreached` function are generated by trusted internal observability tools, preventing malicious external sources from triggering false positive alerts (DDoS on monitoring). +- **Efficiency**: Evaluation logic is kept completely synchronous $O(1)$ and lightweight to prevent evaluating thresholds from becoming a bottleneck during high-load periods. 
diff --git a/package-lock.json b/package-lock.json index 6392ca6..72b4a9d 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1178,7 +1178,6 @@ "integrity": "sha512-F0R/h2+dsy5wJAUe3tAU6oqa2qbWY5TpNfL/RGmo1y38hiyO1w3x2jPtt76wmuaJI4DQnOBu21cNXQ2STIUUWg==", "dev": true, "license": "MIT", - "peer": true, "dependencies": { "undici-types": "~6.21.0" } @@ -1603,7 +1602,6 @@ } ], "license": "MIT", - "peer": true, "dependencies": { "baseline-browser-mapping": "^2.9.0", "caniuse-lite": "^1.0.30001759", @@ -2914,7 +2912,6 @@ "integrity": "sha512-NIy3oAFp9shda19hy4HK0HRTWKtPJmGdnvywu01nOqNC2vZg+Z+fvJDxpMQA88eb2I9EcafcdjYgsDthnYTvGw==", "dev": true, "license": "MIT", - "peer": true, "dependencies": { "@jest/core": "^29.7.0", "@jest/types": "^29.6.3", @@ -4788,7 +4785,6 @@ "integrity": "sha512-f0FFpIdcHgn8zcPSbf1dRevwt047YMnaiJM3u2w2RewrB+fob/zePZcrOyQoLMMO7aBIddLcQIEK5dYjkLnGrQ==", "dev": true, "license": "MIT", - "peer": true, "dependencies": { "@cspotcode/source-map-support": "^0.8.0", "@tsconfig/node10": "^1.0.7", @@ -4937,7 +4933,6 @@ "integrity": "sha512-jl1vZzPDinLr9eUt3J/t7V6FgNEw9QjvBPdysz9KfQDD41fQrC2Y4vKQdiaUpFT4bXlb1RHhLpp8wtm6M5TgSw==", "dev": true, "license": "Apache-2.0", - "peer": true, "bin": { "tsc": "bin/tsc", "tsserver": "bin/tsserver" diff --git a/src/index.ts b/src/index.ts index dd2fd8f..220ca22 100644 --- a/src/index.ts +++ b/src/index.ts @@ -1,4 +1,5 @@ import express, { Request, Response } from 'express'; +import { DefaultServiceObjectives, DefaultAlertThresholds } from './operations/service-objectives'; const app = express(); const PORT = process.env.PORT || 3001; @@ -15,4 +16,8 @@ app.get('/api/v1/contracts', (_req: Request, res: Response) => { app.listen(PORT, () => { console.log(`TalentTrust API listening on http://localhost:${PORT}`); + console.log('Active Service Objectives initialized:'); + Object.keys(DefaultServiceObjectives).forEach((key) => { + console.log(` - ${key}: Target ${DefaultServiceObjectives[key].targetSuccessRatePercent}% 
success, p95: ${DefaultServiceObjectives[key].targetLatencyP95Ms}ms`); + }); }); diff --git a/src/operations/service-objectives.test.ts b/src/operations/service-objectives.test.ts new file mode 100644 index 0000000..9b2a4ab --- /dev/null +++ b/src/operations/service-objectives.test.ts @@ -0,0 +1,60 @@ +import { + DefaultServiceObjectives, + DefaultAlertThresholds, + isThresholdBreached, + OperationType, +} from './service-objectives'; + +describe('Service Objectives and Alert Thresholds', () => { + describe('Default Configuration Validation', () => { + it('should have valid target success rates (<= 100%)', () => { + Object.values(DefaultServiceObjectives).forEach((objective) => { + expect(objective.targetSuccessRatePercent).toBeLessThanOrEqual(100); + expect(objective.targetSuccessRatePercent).toBeGreaterThan(0); + }); + }); + + it('should have logical latency goals (p95 <= p99)', () => { + Object.values(DefaultServiceObjectives).forEach((objective) => { + expect(objective.targetLatencyP95Ms).toBeLessThanOrEqual(objective.targetLatencyP99Ms); + expect(objective.targetLatencyP95Ms).toBeGreaterThan(0); + }); + }); + + it('should have positive alert thresholds', () => { + Object.values(DefaultAlertThresholds).forEach((threshold) => { + expect(threshold.maxErrorRatePercent).toBeGreaterThan(0); + expect(threshold.maxAverageLatencyMs).toBeGreaterThan(0); + expect(threshold.evaluationWindowSeconds).toBeGreaterThan(0); + }); + }); + }); + + describe('isThresholdBreached()', () => { + const mockThreshold = { + operationType: OperationType.API_REQUEST, + maxErrorRatePercent: 1.0, + maxAverageLatencyMs: 500, + evaluationWindowSeconds: 60, + }; + + it('should return false when metrics are within safe limits', () => { + expect(isThresholdBreached(mockThreshold, 0.5, 300)).toBe(false); + expect(isThresholdBreached(mockThreshold, 0.99, 499)).toBe(false); + }); + + it('should return true when error rate breaches the maximum limit', () => { + 
+ expect(isThresholdBreached(mockThreshold, 1.0, 300)).toBe(true); + expect(isThresholdBreached(mockThreshold, 5.0, 300)).toBe(true); + }); + + it('should return true when average latency breaches the maximum limit', () => { + expect(isThresholdBreached(mockThreshold, 0.5, 500)).toBe(true); + expect(isThresholdBreached(mockThreshold, 0.5, 1000)).toBe(true); + }); + + it('should return true when both metrics breach limits', () => { + expect(isThresholdBreached(mockThreshold, 2.0, 600)).toBe(true); + }); + }); +}); diff --git a/src/operations/service-objectives.ts b/src/operations/service-objectives.ts new file mode 100644 index 0000000..f79ff66 --- /dev/null +++ b/src/operations/service-objectives.ts @@ -0,0 +1,105 @@ +/** + * @title Service Objectives and Alert Thresholds + * @dev Defines the Service Level Objectives (SLOs) and Service Level Agreements (SLAs) for the backend operations. + */ + +export enum OperationType { + API_REQUEST = 'API_REQUEST', + DATABASE_QUERY = 'DATABASE_QUERY', + EXTERNAL_API_CALL = 'EXTERNAL_API_CALL', +} + +/** + * @dev Represents the target metrics for a specific service or operation to ensure high reliability. + */ +export interface ServiceObjective { + operationType: OperationType; + /** + * @dev Target availability/success rate as a percentage (e.g., 99.9). Must be <= 100. + */ + targetSuccessRatePercent: number; + /** + * @dev Maximum acceptable latency in milliseconds for the 95th percentile (p95). + */ + targetLatencyP95Ms: number; + /** + * @dev Maximum acceptable latency in milliseconds for the 99th percentile (p99). + */ + targetLatencyP99Ms: number; +} + +/** + * @dev Defines conditions under which an alert should be triggered for a specific operation. + */ +export interface AlertThreshold { + operationType: OperationType; + /** + * @dev Trigger alert if error rate percentage reaches or exceeds this value. + */ + maxErrorRatePercent: number; + /** + * @dev Trigger alert if average latency reaches or exceeds this value over the evaluation window. 
+ */ + maxAverageLatencyMs: number; + /** + * @dev The time window in seconds over which the metrics should be evaluated to trigger alerts. + */ + evaluationWindowSeconds: number; +} + +/** + * @dev Registry of default service objectives for key system operations. + */ +export const DefaultServiceObjectives: Record<string, ServiceObjective> = { + healthCheck: { + operationType: OperationType.API_REQUEST, + targetSuccessRatePercent: 99.99, + targetLatencyP95Ms: 50, + targetLatencyP99Ms: 100, + }, + contractsApi: { + operationType: OperationType.API_REQUEST, + targetSuccessRatePercent: 99.9, + targetLatencyP95Ms: 200, + targetLatencyP99Ms: 500, + }, +}; + +/** + * @dev Registry of default alert thresholds corresponding to the system operations. + */ +export const DefaultAlertThresholds: Record<string, AlertThreshold> = { + healthCheck: { + operationType: OperationType.API_REQUEST, + maxErrorRatePercent: 0.1, // Alert if error rate >= 0.1% + maxAverageLatencyMs: 150, + evaluationWindowSeconds: 300, // Evaluate over 5 minutes + }, + contractsApi: { + operationType: OperationType.API_REQUEST, + maxErrorRatePercent: 1.0, // Alert if error rate >= 1.0% + maxAverageLatencyMs: 400, + evaluationWindowSeconds: 300, + }, +}; + +/** + * @dev Evaluates whether the current metrics breach the defined alert threshold for an operation. + * @param threshold The threshold configuration to evaluate against. + * @param currentErrorRate The observed error rate percentage. + * @param currentAverageLatencyMs The observed average latency in ms. + * @returns true if an alert should be triggered, false otherwise. + */ +export function isThresholdBreached( + threshold: AlertThreshold, + currentErrorRate: number, + currentAverageLatencyMs: number +): boolean { + if (currentErrorRate >= threshold.maxErrorRatePercent) { + return true; + } + if (currentAverageLatencyMs >= threshold.maxAverageLatencyMs) { + return true; + } + return false; +}