Skip to content

Commit 54505aa

Browse files
Merge pull request #255 from dDevAhmed/feature/issue-253-scheduled-task-monitoring
feat(monitoring): add scheduled task tracking, timeout/missed alerts, and dashboard
2 parents fcc3de3 + 95e5a49 commit 54505aa

File tree

4 files changed

+427
-66
lines changed

4 files changed

+427
-66
lines changed

src/backup/backup.service.ts

Lines changed: 160 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -13,11 +13,18 @@ import { BackupResponseDto } from './dto/backup-response.dto';
1313
import { BackupJobData } from './interfaces/backup.interfaces';
1414
import { AlertingService } from '../monitoring/alerting/alerting.service';
1515
import { MetricsCollectionService } from '../monitoring/metrics/metrics-collection.service';
16+
import {
17+
ScheduledTaskConfig,
18+
ScheduledTaskMonitoringService,
19+
} from '../monitoring/scheduled-task-monitoring.service';
1620

1721
@Injectable()
1822
export class BackupService {
1923
private readonly logger = new Logger(BackupService.name);
2024
private readonly retentionDays: number;
25+
private readonly scheduledTaskRetryLimit: number;
26+
private readonly scheduledTaskRetryDelayMs: number;
27+
private readonly scheduledTaskTimeoutMs: number;
2128

2229
constructor(
2330
@InjectRepository(BackupRecord)
@@ -27,8 +34,32 @@ export class BackupService {
2734
private readonly configService: ConfigService,
2835
private readonly alertingService: AlertingService,
2936
private readonly metricsService: MetricsCollectionService,
37+
private readonly scheduledTaskMonitoringService: ScheduledTaskMonitoringService,
3038
) {
3139
this.retentionDays = this.configService.get<number>('BACKUP_RETENTION_DAYS', 30);
40+
this.scheduledTaskRetryLimit = this.configService.get<number>(
41+
'BACKUP_SCHEDULED_TASK_RETRY_LIMIT',
42+
2,
43+
);
44+
this.scheduledTaskRetryDelayMs = this.configService.get<number>(
45+
'BACKUP_SCHEDULED_TASK_RETRY_DELAY_MS',
46+
10000,
47+
);
48+
this.scheduledTaskTimeoutMs = this.configService.get<number>(
49+
'BACKUP_SCHEDULED_TASK_TIMEOUT_MS',
50+
30 * 60 * 1000,
51+
);
52+
53+
this.scheduledTaskMonitoringService.registerTask('weekly-database-backup', {
54+
expectedIntervalMs: 7 * 24 * 60 * 60 * 1000,
55+
timeoutMs: this.scheduledTaskTimeoutMs,
56+
maxRetries: this.scheduledTaskRetryLimit,
57+
});
58+
this.scheduledTaskMonitoringService.registerTask('cleanup-expired-backups', {
59+
expectedIntervalMs: 24 * 60 * 60 * 1000,
60+
timeoutMs: this.scheduledTaskTimeoutMs,
61+
maxRetries: this.scheduledTaskRetryLimit,
62+
});
3263
}
3364

3465
/**
@@ -39,58 +70,59 @@ export class BackupService {
3970
timeZone: 'UTC',
4071
})
4172
async handleScheduledBackup(): Promise<void> {
42-
this.logger.log('Starting scheduled weekly backup');
43-
44-
try {
45-
const region =
46-
(this.configService.get<string>('BACKUP_PRIMARY_REGION') as Region) || Region.US_EAST_1;
47-
const databaseName = this.configService.get<string>('DB_DATABASE', 'teachlink');
48-
49-
const expiresAt = new Date();
50-
expiresAt.setDate(expiresAt.getDate() + this.retentionDays);
51-
52-
const backupRecord = this.backupRepository.create({
53-
backupType: BackupType.FULL,
54-
status: BackupStatus.PENDING,
55-
region,
56-
databaseName,
57-
storageKey: '',
58-
expiresAt,
59-
metadata: {
60-
startTime: new Date(),
61-
},
62-
});
63-
64-
await this.backupRepository.save(backupRecord);
65-
66-
// Queue backup job
67-
await this.backupQueue.add(
68-
'create-backup',
69-
{
70-
backupRecordId: backupRecord.id,
73+
await this.executeMonitoredScheduledTask(
74+
'weekly-database-backup',
75+
{
76+
expectedIntervalMs: 7 * 24 * 60 * 60 * 1000,
77+
timeoutMs: this.scheduledTaskTimeoutMs,
78+
maxRetries: this.scheduledTaskRetryLimit,
79+
},
80+
async () => {
81+
this.logger.log('Starting scheduled weekly backup');
82+
83+
const region =
84+
(this.configService.get<string>('BACKUP_PRIMARY_REGION') as Region) || Region.US_EAST_1;
85+
const databaseName = this.configService.get<string>('DB_DATABASE', 'teachlink');
86+
87+
const expiresAt = new Date();
88+
expiresAt.setDate(expiresAt.getDate() + this.retentionDays);
89+
90+
const backupRecord = this.backupRepository.create({
7191
backupType: BackupType.FULL,
92+
status: BackupStatus.PENDING,
7293
region,
7394
databaseName,
74-
} as BackupJobData,
75-
{
76-
attempts: 3,
77-
backoff: {
78-
type: 'exponential',
79-
delay: 10000,
95+
storageKey: '',
96+
expiresAt,
97+
metadata: {
98+
startTime: new Date(),
8099
},
81-
timeout: 3600000, // 1 hour timeout
82-
},
83-
);
100+
});
84101

85-
this.logger.log(`Scheduled backup ${backupRecord.id} queued`);
86-
} catch (error) {
87-
this.logger.error('Failed to initiate scheduled backup:', error);
88-
this.alertingService.sendAlert(
89-
'BACKUP_SCHEDULED_FAILED',
90-
`Scheduled backup failed: ${error.message}`,
91-
'CRITICAL',
92-
);
93-
}
102+
await this.backupRepository.save(backupRecord);
103+
104+
// Queue backup job
105+
await this.backupQueue.add(
106+
'create-backup',
107+
{
108+
backupRecordId: backupRecord.id,
109+
backupType: BackupType.FULL,
110+
region,
111+
databaseName,
112+
} as BackupJobData,
113+
{
114+
attempts: 3,
115+
backoff: {
116+
type: 'exponential',
117+
delay: 10000,
118+
},
119+
timeout: 3600000, // 1 hour timeout
120+
},
121+
);
122+
123+
this.logger.log(`Scheduled backup ${backupRecord.id} queued`);
124+
},
125+
);
94126
}
95127

96128
/**
@@ -101,32 +133,96 @@ export class BackupService {
101133
timeZone: 'UTC',
102134
})
103135
async handleBackupCleanup(): Promise<void> {
104-
this.logger.log('Starting backup cleanup job');
136+
await this.executeMonitoredScheduledTask(
137+
'cleanup-expired-backups',
138+
{
139+
expectedIntervalMs: 24 * 60 * 60 * 1000,
140+
timeoutMs: this.scheduledTaskTimeoutMs,
141+
maxRetries: this.scheduledTaskRetryLimit,
142+
},
143+
async () => {
144+
this.logger.log('Starting backup cleanup job');
145+
146+
const expirationDate = new Date();
147+
expirationDate.setDate(expirationDate.getDate() - this.retentionDays);
148+
149+
const expiredBackups = await this.backupRepository.find({
150+
where: {
151+
createdAt: LessThan(expirationDate),
152+
status: BackupStatus.COMPLETED,
153+
},
154+
});
105155

106-
const expirationDate = new Date();
107-
expirationDate.setDate(expirationDate.getDate() - this.retentionDays);
156+
this.logger.log(`Found ${expiredBackups.length} expired backups to cleanup`);
108157

109-
const expiredBackups = await this.backupRepository.find({
110-
where: {
111-
createdAt: LessThan(expirationDate),
112-
status: BackupStatus.COMPLETED,
158+
for (const backup of expiredBackups) {
159+
await this.backupQueue.add(
160+
'delete-backup',
161+
{ backupRecordId: backup.id },
162+
{
163+
attempts: 3,
164+
backoff: { type: 'exponential', delay: 5000 },
165+
},
166+
);
167+
}
113168
},
169+
);
170+
}
171+
172+
private async executeMonitoredScheduledTask(
173+
taskName: string,
174+
config: ScheduledTaskConfig,
175+
taskRunner: () => Promise<void>,
176+
): Promise<void> {
177+
const executionId = this.scheduledTaskMonitoringService.startExecution(taskName, config, {
178+
source: BackupService.name,
114179
});
115180

116-
this.logger.log(`Found ${expiredBackups.length} expired backups to cleanup`);
181+
const maxAttempts = (config.maxRetries || 0) + 1;
117182

118-
for (const backup of expiredBackups) {
119-
await this.backupQueue.add(
120-
'delete-backup',
121-
{ backupRecordId: backup.id },
122-
{
123-
attempts: 3,
124-
backoff: { type: 'exponential', delay: 5000 },
125-
},
126-
);
183+
for (let attempt = 1; attempt <= maxAttempts; attempt += 1) {
184+
try {
185+
await taskRunner();
186+
this.scheduledTaskMonitoringService.markSuccess(executionId, {
187+
attempt,
188+
maxAttempts,
189+
retriesUsed: attempt - 1,
190+
});
191+
return;
192+
} catch (error) {
193+
const errorMessage = error instanceof Error ? error.message : String(error);
194+
const shouldRetry = attempt < maxAttempts;
195+
196+
this.logger.error(
197+
`Scheduled task ${taskName} failed on attempt ${attempt}/${maxAttempts}: ${errorMessage}`,
198+
error instanceof Error ? error.stack : undefined,
199+
);
200+
201+
if (shouldRetry) {
202+
this.scheduledTaskMonitoringService.recordRetry(taskName, attempt, maxAttempts - 1, errorMessage);
203+
await this.delay(this.scheduledTaskRetryDelayMs);
204+
continue;
205+
}
206+
207+
this.scheduledTaskMonitoringService.markFailure(executionId, errorMessage, {
208+
attempt,
209+
maxAttempts,
210+
retriesUsed: attempt - 1,
211+
});
212+
213+
this.alertingService.sendAlert(
214+
'BACKUP_SCHEDULED_FAILED',
215+
`Scheduled task ${taskName} failed after ${maxAttempts} attempt(s): ${errorMessage}`,
216+
'CRITICAL',
217+
);
218+
}
127219
}
128220
}
129221

222+
private async delay(ms: number): Promise<void> {
223+
await new Promise((resolve) => setTimeout(resolve, ms));
224+
}
225+
130226
async getLatestBackup(region?: Region): Promise<BackupRecord | null> {
131227
const where: any = {
132228
status: BackupStatus.COMPLETED,
Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,24 @@
11
import { Controller, Get, Res } from '@nestjs/common';
22
import { MetricsCollectionService } from './metrics/metrics-collection.service';
33
import { Response } from 'express';
4+
import { ScheduledTaskMonitoringService } from './scheduled-task-monitoring.service';
45

56
@Controller('metrics')
67
export class MonitoringController {
7-
constructor(private readonly metricsService: MetricsCollectionService) {}
8+
constructor(
9+
private readonly metricsService: MetricsCollectionService,
10+
private readonly scheduledTaskMonitoringService: ScheduledTaskMonitoringService,
11+
) {}
812

913
@Get()
1014
async getMetrics(@Res() res: Response) {
1115
const metrics = await this.metricsService.getMetrics();
1216
res.set('Content-Type', this.metricsService.getRegistry().contentType);
1317
res.send(metrics);
1418
}
19+
20+
@Get('scheduled-tasks/dashboard')
21+
getScheduledTasksDashboard() {
22+
return this.scheduledTaskMonitoringService.getDashboard();
23+
}
1524
}

src/monitoring/monitoring.module.ts

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ import { MetricsCollectionService } from './metrics/metrics-collection.service';
66
import { PerformanceAnalysisService } from './performance/performance-analysis.service';
77
import { OptimizationService } from './optimization/optimization.service';
88
import { AlertingService } from './alerting/alerting.service';
9+
import { ScheduledTaskMonitoringService } from './scheduled-task-monitoring.service';
910

1011
@Module({
1112
imports: [ScheduleModule.forRoot()],
@@ -16,7 +17,8 @@ import { AlertingService } from './alerting/alerting.service';
1617
PerformanceAnalysisService,
1718
OptimizationService,
1819
AlertingService,
20+
ScheduledTaskMonitoringService,
1921
],
20-
exports: [MetricsCollectionService, AlertingService],
22+
exports: [MetricsCollectionService, AlertingService, ScheduledTaskMonitoringService],
2123
})
2224
export class MonitoringModule {}

0 commit comments

Comments
 (0)