@@ -13,11 +13,18 @@ import { BackupResponseDto } from './dto/backup-response.dto';
1313import { BackupJobData } from './interfaces/backup.interfaces' ;
1414import { AlertingService } from '../monitoring/alerting/alerting.service' ;
1515import { MetricsCollectionService } from '../monitoring/metrics/metrics-collection.service' ;
16+ import {
17+ ScheduledTaskConfig ,
18+ ScheduledTaskMonitoringService ,
19+ } from '../monitoring/scheduled-task-monitoring.service' ;
1620
1721@Injectable ( )
1822export class BackupService {
1923 private readonly logger = new Logger ( BackupService . name ) ;
2024 private readonly retentionDays : number ;
25+ private readonly scheduledTaskRetryLimit : number ;
26+ private readonly scheduledTaskRetryDelayMs : number ;
27+ private readonly scheduledTaskTimeoutMs : number ;
2128
2229 constructor (
2330 @InjectRepository ( BackupRecord )
@@ -27,8 +34,32 @@ export class BackupService {
2734 private readonly configService : ConfigService ,
2835 private readonly alertingService : AlertingService ,
2936 private readonly metricsService : MetricsCollectionService ,
37+ private readonly scheduledTaskMonitoringService : ScheduledTaskMonitoringService ,
3038 ) {
3139 this . retentionDays = this . configService . get < number > ( 'BACKUP_RETENTION_DAYS' , 30 ) ;
40+ this . scheduledTaskRetryLimit = this . configService . get < number > (
41+ 'BACKUP_SCHEDULED_TASK_RETRY_LIMIT' ,
42+ 2 ,
43+ ) ;
44+ this . scheduledTaskRetryDelayMs = this . configService . get < number > (
45+ 'BACKUP_SCHEDULED_TASK_RETRY_DELAY_MS' ,
46+ 10000 ,
47+ ) ;
48+ this . scheduledTaskTimeoutMs = this . configService . get < number > (
49+ 'BACKUP_SCHEDULED_TASK_TIMEOUT_MS' ,
50+ 30 * 60 * 1000 ,
51+ ) ;
52+
53+ this . scheduledTaskMonitoringService . registerTask ( 'weekly-database-backup' , {
54+ expectedIntervalMs : 7 * 24 * 60 * 60 * 1000 ,
55+ timeoutMs : this . scheduledTaskTimeoutMs ,
56+ maxRetries : this . scheduledTaskRetryLimit ,
57+ } ) ;
58+ this . scheduledTaskMonitoringService . registerTask ( 'cleanup-expired-backups' , {
59+ expectedIntervalMs : 24 * 60 * 60 * 1000 ,
60+ timeoutMs : this . scheduledTaskTimeoutMs ,
61+ maxRetries : this . scheduledTaskRetryLimit ,
62+ } ) ;
3263 }
3364
3465 /**
@@ -39,58 +70,59 @@ export class BackupService {
3970 timeZone : 'UTC' ,
4071 } )
/**
 * Scheduled entry point for the weekly full backup.
 *
 * Persists a PENDING backup record and enqueues a `create-backup` job for it,
 * delegating retry handling and monitoring bookkeeping to
 * `executeMonitoredScheduledTask`.
 */
async handleScheduledBackup(): Promise<void> {
  const monitoringConfig: ScheduledTaskConfig = {
    expectedIntervalMs: 7 * 24 * 60 * 60 * 1000,
    timeoutMs: this.scheduledTaskTimeoutMs,
    maxRetries: this.scheduledTaskRetryLimit,
  };

  // NOTE(review): the wrapper re-invokes this runner on failure. If `save`
  // succeeds but `backupQueue.add` throws, each retry persists another
  // PENDING record — confirm that is acceptable or clean up on failure.
  const queueWeeklyBackup = async (): Promise<void> => {
    this.logger.log('Starting scheduled weekly backup');

    // Fall back to US_EAST_1 when no primary region is configured.
    const configuredRegion = this.configService.get<string>('BACKUP_PRIMARY_REGION') as Region;
    const targetRegion = configuredRegion || Region.US_EAST_1;
    const dbName = this.configService.get<string>('DB_DATABASE', 'teachlink');

    // Expiry is "now + retentionDays", computed on calendar days.
    const expiry = new Date();
    expiry.setDate(expiry.getDate() + this.retentionDays);

    const record = this.backupRepository.create({
      backupType: BackupType.FULL,
      status: BackupStatus.PENDING,
      region: targetRegion,
      databaseName: dbName,
      storageKey: '',
      expiresAt: expiry,
      metadata: { startTime: new Date() },
    });
    await this.backupRepository.save(record);

    // The record id is only read after the save above has completed.
    const jobPayload = {
      backupRecordId: record.id,
      backupType: BackupType.FULL,
      region: targetRegion,
      databaseName: dbName,
    } as BackupJobData;

    // Queue backup job
    await this.backupQueue.add('create-backup', jobPayload, {
      attempts: 3,
      backoff: {
        type: 'exponential',
        delay: 10000,
      },
      timeout: 3600000, // 1 hour timeout
    });

    this.logger.log(`Scheduled backup ${record.id} queued`);
  };

  await this.executeMonitoredScheduledTask(
    'weekly-database-backup',
    monitoringConfig,
    queueWeeklyBackup,
  );
}
95127
96128 /**
@@ -101,32 +133,96 @@ export class BackupService {
101133 timeZone : 'UTC' ,
102134 } )
/**
 * Scheduled entry point for purging old backups.
 *
 * Finds COMPLETED backup records created more than `retentionDays` ago and
 * enqueues one `delete-backup` job per record, delegating retry handling and
 * monitoring bookkeeping to `executeMonitoredScheduledTask`.
 */
async handleBackupCleanup(): Promise<void> {
  const monitoringConfig: ScheduledTaskConfig = {
    expectedIntervalMs: 24 * 60 * 60 * 1000,
    timeoutMs: this.scheduledTaskTimeoutMs,
    maxRetries: this.scheduledTaskRetryLimit,
  };

  const enqueueExpiredDeletions = async (): Promise<void> => {
    this.logger.log('Starting backup cleanup job');

    // Anything created before this cutoff is past its retention window.
    const cutoff = new Date();
    cutoff.setDate(cutoff.getDate() - this.retentionDays);

    const stale = await this.backupRepository.find({
      where: {
        createdAt: LessThan(cutoff),
        status: BackupStatus.COMPLETED,
      },
    });

    this.logger.log(`Found ${stale.length} expired backups to cleanup`);

    // Enqueue one job at a time, in the order the records were returned.
    for (const { id } of stale) {
      await this.backupQueue.add(
        'delete-backup',
        { backupRecordId: id },
        {
          attempts: 3,
          backoff: { type: 'exponential', delay: 5000 },
        },
      );
    }
  };

  await this.executeMonitoredScheduledTask(
    'cleanup-expired-backups',
    monitoringConfig,
    enqueueExpiredDeletions,
  );
}
171+
/**
 * Runs a scheduled task with bounded retries and reports the outcome to the
 * scheduled-task monitoring service.
 *
 * One monitored execution is opened via `startExecution`. `taskRunner` is
 * then invoked up to `maxRetries + 1` times, waiting
 * `scheduledTaskRetryDelayMs` between attempts. A successful attempt is
 * reported with `markSuccess`; once every attempt has failed the execution is
 * reported with `markFailure` and a CRITICAL `BACKUP_SCHEDULED_FAILED` alert
 * is sent. Task errors are logged and never rethrown.
 *
 * @param taskName Name of the task, passed to the monitoring service and
 *   used in log and alert messages.
 * @param config Monitoring configuration; only `maxRetries` is read here,
 *   the whole object is forwarded to `startExecution`.
 * @param taskRunner The unit of work to execute; a rejection counts as a
 *   failed attempt.
 */
private async executeMonitoredScheduledTask(
  taskName: string,
  config: ScheduledTaskConfig,
  taskRunner: () => Promise<void>,
): Promise<void> {
  const executionId = this.scheduledTaskMonitoringService.startExecution(taskName, config, {
    source: BackupService.name,
  });

  // Normalize the retry budget before doing arithmetic on it: a value that
  // originates from an environment variable can be a string at runtime
  // ("2" + 1 === "21"), and a negative value would otherwise yield zero
  // attempts so the task would never run and the execution never be closed.
  const requestedRetries = Number(config.maxRetries ?? 0);
  const retryBudget = Number.isFinite(requestedRetries)
    ? Math.max(0, Math.floor(requestedRetries))
    : 0;
  const maxAttempts = retryBudget + 1;

  for (let attempt = 1; attempt <= maxAttempts; attempt += 1) {
    try {
      await taskRunner();
    } catch (error: unknown) {
      const errorMessage = error instanceof Error ? error.message : String(error);

      this.logger.error(
        `Scheduled task ${taskName} failed on attempt ${attempt}/${maxAttempts}: ${errorMessage}`,
        error instanceof Error ? error.stack : undefined,
      );

      if (attempt < maxAttempts) {
        this.scheduledTaskMonitoringService.recordRetry(
          taskName,
          attempt,
          maxAttempts - 1,
          errorMessage,
        );
        await this.delay(this.scheduledTaskRetryDelayMs);
        continue;
      }

      // Retry budget exhausted: close the execution as failed and alert.
      this.scheduledTaskMonitoringService.markFailure(executionId, errorMessage, {
        attempt,
        maxAttempts,
        retriesUsed: attempt - 1,
      });

      this.alertingService.sendAlert(
        'BACKUP_SCHEDULED_FAILED',
        `Scheduled task ${taskName} failed after ${maxAttempts} attempt(s): ${errorMessage}`,
        'CRITICAL',
      );
      return;
    }

    // Success bookkeeping is deliberately outside the try above: an error
    // thrown by the monitoring call must not be mistaken for a task failure,
    // which would re-run work that has already completed.
    try {
      this.scheduledTaskMonitoringService.markSuccess(executionId, {
        attempt,
        maxAttempts,
        retriesUsed: attempt - 1,
      });
    } catch (monitoringError: unknown) {
      const detail =
        monitoringError instanceof Error ? monitoringError.message : String(monitoringError);
      this.logger.error(
        `Scheduled task ${taskName} succeeded but recording the result failed: ${detail}`,
        monitoringError instanceof Error ? monitoringError.stack : undefined,
      );
    }
    return;
  }
}
129221
/**
 * Resolves after roughly `ms` milliseconds; used to space out retry attempts.
 *
 * @param ms Time to wait, in milliseconds, passed straight to `setTimeout`.
 */
private async delay(ms: number): Promise<void> {
  return new Promise<void>((wake) => {
    setTimeout(wake, ms);
  });
}
225+
130226 async getLatestBackup ( region ?: Region ) : Promise < BackupRecord | null > {
131227 const where : any = {
132228 status : BackupStatus . COMPLETED ,
0 commit comments