Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions .github/deployments/monitoring/docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ services:
- GF_ANALYTICS_REPORTING_ENABLED=false
volumes:
- grafana-data:/var/lib/grafana
- ./grafana/provisioning/datasources:/etc/grafana/provisioning/datasources:ro
depends_on:
- prometheus
- loki
Expand All @@ -71,6 +72,21 @@ services:
networks:
- monitoring-network

# Promtail log-shipping agent; joins the monitoring network and starts after the loki service.
promtail:
image: grafana/promtail:2.9.0
container_name: ${CONTAINER_PREFIX:-plum}-promtail
restart: always
volumes:
# Promtail configuration, mounted read-only at the path passed to -config.file below.
- ./promtail/promtail-config.yaml:/etc/promtail/promtail-config.yaml:ro
# Host log directory and Docker's per-container directory, both read-only.
- /var/log:/var/log:ro
- /var/lib/docker/containers:/var/lib/docker/containers:ro
# Docker socket (read-only mount); presumably used for Docker service discovery
# by the Promtail config — confirm against promtail-config.yaml.
- /var/run/docker.sock:/var/run/docker.sock:ro
command: -config.file=/etc/promtail/promtail-config.yaml
networks:
- monitoring-network
depends_on:
- loki

volumes:
prometheus-data:
driver: local
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# Grafana data source auto-provisioning
# Registers Prometheus and Loki automatically

apiVersion: 1

datasources:
# Prometheus - metrics data (marked as the default data source)
- name: Prometheus
type: prometheus
access: proxy
url: http://prometheus:9090
isDefault: true
editable: true
jsonData:
timeInterval: 15s

# Loki - log data
- name: Loki
type: loki
access: proxy
url: http://loki:3100
editable: true
jsonData:
maxLines: 1000
6 changes: 5 additions & 1 deletion .github/deployments/monitoring/loki/loki-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,11 @@ storage_config:

# 제한 설정
limits_config:
retention_period: 168h # 7일 보관
reject_old_samples: false # 과거 샘플 거부 비활성화
reject_old_samples_max_age: 168h # 7일 이전 데이터도 허용
creation_grace_period: 12h # 미래 타임스탬프 12시간까지 허용 (KST +9시간 커버)
unordered_writes: true # 순서 없는 쓰기 허용
retention_period: 168h # 7일 보관
ingestion_rate_mb: 16
ingestion_burst_size_mb: 32
max_query_length: 721h # 30일 쿼리 가능
Expand Down
25 changes: 25 additions & 0 deletions .github/deployments/monitoring/promtail/promtail-config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# Promtail configuration: discovers Docker containers through the Docker socket
# and pushes their logs to Loki.
server:
http_listen_port: 9080
grpc_listen_port: 0

# File where Promtail records how far it has read.
# NOTE(review): /tmp is presumably not persisted across container re-creation — confirm.
positions:
filename: /tmp/positions.yaml

# Loki push endpoint that receives the collected logs.
clients:
- url: http://loki:3100/loki/api/v1/push

scrape_configs:
- job_name: docker
# Container discovery via the Docker socket, refreshed every 5 seconds.
docker_sd_configs:
- host: unix:///var/run/docker.sock
refresh_interval: 5s
relabel_configs:
# The regex captures the container name without its leading "/" into the `container` label.
- source_labels: ['__meta_docker_container_name']
regex: '/(.*)'
target_label: 'container'
# Log stream reported by Docker discovery -> `logstream` label.
- source_labels: ['__meta_docker_container_log_stream']
target_label: 'logstream'
# Docker Compose project and service container labels -> `project` / `service` labels.
- source_labels: ['__meta_docker_container_label_com_docker_compose_project']
target_label: 'project'
- source_labels: ['__meta_docker_container_label_com_docker_compose_service']
target_label: 'service'
14 changes: 8 additions & 6 deletions .github/workflows/cd.yml
Original file line number Diff line number Diff line change
Expand Up @@ -112,8 +112,8 @@ jobs:
echo "LOKI_HOST=${{ secrets.NCP_DEV_MONITORING_HOST }}" >> .env
fi

docker compose -f docker-compose.yml -f docker-compose.develop.yml pull backend
docker compose -f docker-compose.yml -f docker-compose.develop.yml up -d backend
docker compose -f docker-compose.yml -f docker-compose.develop.yml pull
docker compose -f docker-compose.yml -f docker-compose.develop.yml up -d

# 헬스체크 (30초 타임아웃)
echo "헬스체크 중..."
Expand Down Expand Up @@ -190,8 +190,8 @@ jobs:
echo "LOKI_HOST=${{ secrets.NCP_PROD_MONITORING_HOST }}" >> .env
fi

docker compose -f docker-compose.yml -f docker-compose.prod.yml pull backend
docker compose -f docker-compose.yml -f docker-compose.prod.yml up -d backend
docker compose -f docker-compose.yml -f docker-compose.prod.yml pull
docker compose -f docker-compose.yml -f docker-compose.prod.yml up -d

# 헬스체크 (60초 타임아웃, 운영 환경은 더 긴 시간)
echo "헬스체크 중..."
Expand Down Expand Up @@ -322,9 +322,10 @@ jobs:

# 설정 파일 복사
cp .github/deployments/monitoring/docker-compose.yml ./docker-compose.yml
mkdir -p prometheus loki
mkdir -p prometheus loki promtail
cp .github/deployments/monitoring/prometheus/prometheus-dev.yml ./prometheus/prometheus.yml
cp .github/deployments/monitoring/loki/loki-config.yaml ./loki/loki-config.yaml
cp .github/deployments/monitoring/promtail/promtail-config.yaml ./promtail/promtail-config.yaml

# Prometheus 설정 파일의 IP 플레이스홀더 치환
sed -i "s/BACKEND_HOST_PLACEHOLDER/${{ secrets.NCP_DEV_HOST }}/g" ./prometheus/prometheus.yml
Expand Down Expand Up @@ -457,9 +458,10 @@ jobs:
fi

cp .github/deployments/monitoring/docker-compose.yml ./docker-compose.yml
mkdir -p prometheus loki
mkdir -p prometheus loki promtail
cp .github/deployments/monitoring/prometheus/prometheus-prod.yml ./prometheus/prometheus.yml
cp .github/deployments/monitoring/loki/loki-config.yaml ./loki/loki-config.yaml
cp .github/deployments/monitoring/promtail/promtail-config.yaml ./promtail/promtail-config.yaml

# Prometheus 설정 파일의 IP 플레이스홀더 치환
sed -i "s/BACKEND_HOST_PLACEHOLDER/${{ secrets.NCP_PROD_HOST }}/g" ./prometheus/prometheus.yml
Expand Down
1 change: 1 addition & 0 deletions apps/backend/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
"@nestjs/websockets": "^10.4.20",
"@plum/shared-interfaces": "workspace:*",
"nest-winston": "^1.10.2",
"prom-client": "^15.1.0",
"reflect-metadata": "^0.2.2",
"rxjs": "^7.8.1",
"socket.io": "^4.8.1",
Expand Down
42 changes: 8 additions & 34 deletions apps/backend/promtail.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ positions:

clients:
# Loki 서버 주소 (환경변수로 설정)
- url: ${LOKI_URL:-http://localhost:3100}/loki/api/v1/push
- url: ${LOKI_URL}/loki/api/v1/push

scrape_configs:
# 일반 로그 파일 (app-*.log)
Expand All @@ -20,31 +20,18 @@ scrape_configs:
- localhost
labels:
job: plum-backend
env: ${NODE_ENV:-development}
env: ${NODE_ENV}
__path__: /app/logs/nestjs/app-*.log

# JSON 파싱과 타임스탬프만 처리 (동적 labels 제거)
pipeline_stages:
# JSON 파싱
- json:
expressions:
timestamp: timestamp
level: level
message: message
context: context
trace: trace
span: span

# 타임스탬프 파싱
- timestamp:
source: timestamp
format: '2006-01-02 15:04:05'

# 레이블 추가
- labels:
level:
context:

# 출력 포맷
# Go reference-time layout: the year token must be the literal 2006. A layout
# written as 2026 is not a year token, so the log timestamp cannot be parsed.
# NOTE(review): this layout carries no time zone; if the app logs in local time
# (e.g. KST), also set `location:` on this timestamp stage — confirm.
format: '2006-01-02 15:04:05'
- output:
source: message

Expand All @@ -54,31 +41,18 @@ scrape_configs:
- targets:
- localhost
labels:
job: plum-backend
env: ${NODE_ENV:-development}
level: error
job: plum-backend-error
env: ${NODE_ENV}
__path__: /app/logs/nestjs/error-*.log

# JSON 파싱과 타임스탬프만 처리 (동적 labels 제거)
pipeline_stages:
# JSON 파싱
- json:
expressions:
timestamp: timestamp
message: message
context: context
stack: stack
trace: trace
span: span

# 타임스탬프 파싱
- timestamp:
source: timestamp
format: '2006-01-02 15:04:05'

# 레이블 추가
- labels:
context:

# 출력 포맷
# Go reference-time layout: the year token must be the literal 2006. A layout
# written as 2026 is not a year token, so the log timestamp cannot be parsed.
# NOTE(review): this layout carries no time zone; if the app logs in local time
# (e.g. KST), also set `location:` on this timestamp stage — confirm.
format: '2006-01-02 15:04:05'
- output:
source: message
8 changes: 8 additions & 0 deletions apps/backend/src/app.module.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,13 @@ import { HttpExceptionFilter } from './common/filters/index.js';
import { MediaModule } from './media/media.module.js';
import { InteractionModule } from './interaction/interaction.module.js';
import { RoomModule } from './room/room.module.js';
import { PrometheusModule, MetricsInterceptor } from './prometheus/index.js';

@Module({
imports: [
HealthModule,
WinstonModule.forRoot(winstonConfig),
PrometheusModule,
MediaModule,
InteractionModule,
RoomModule,
Expand All @@ -25,6 +27,12 @@ import { RoomModule } from './room/room.module.js';
useClass: LoggingInterceptor,
},

// 전역 메트릭 수집 인터셉터
{
provide: APP_INTERCEPTOR,
useClass: MetricsInterceptor,
},

// 전역 예외 필터 (404 에러 등 로깅)
{
provide: APP_FILTER,
Expand Down
4 changes: 4 additions & 0 deletions apps/backend/src/prometheus/index.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
// Barrel file: re-exports the public pieces of the prometheus feature folder so
// consumers can import them from './prometheus/index.js'.
export { PrometheusModule } from './prometheus.module.js';
export { PrometheusService } from './prometheus.service.js';
export { PrometheusController } from './prometheus.controller.js';
export { MetricsInterceptor } from './metrics.interceptor.js';
85 changes: 85 additions & 0 deletions apps/backend/src/prometheus/metrics.interceptor.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
import { Injectable, NestInterceptor, ExecutionContext, CallHandler } from '@nestjs/common';
import { Observable } from 'rxjs';
import { tap } from 'rxjs/operators';
import { PrometheusService } from './prometheus.service.js';

/**
 * Interceptor that records Prometheus metrics for HTTP requests.
 *
 * Flow:
 *   request arrives → timer starts → controller handles the request →
 *   elapsed time is measured → the duration histogram and the request counter
 *   are updated with (method, route, status code) labels → response returns.
 *
 * Why it exists:
 * - Endpoints do not need their own metric code.
 * - Controllers stay untouched; registering this interceptor globally applies
 *   it to every HTTP request.
 *
 * Non-HTTP execution contexts (e.g. WebSocket gateways) are passed through
 * untouched: they expose no HTTP method, route or status code to label with,
 * and reading `statusCode` from them would throw.
 */
@Injectable()
export class MetricsInterceptor implements NestInterceptor {
  constructor(private readonly prometheusService: PrometheusService) {}

  /**
   * Wraps the route handler and records its duration and outcome.
   *
   * @param context - NestJS execution context (access to request/response)
   * @param next - the next handler in the chain (the actual controller logic)
   * @returns the handler's stream, unchanged apart from the metric side effects
   */
  intercept(context: ExecutionContext, next: CallHandler): Observable<unknown> {
    // Only HTTP requests carry the method/route/status labels used below.
    if (context.getType() !== 'http') {
      return next.handle();
    }

    const http = context.switchToHttp();
    const request = http.getRequest();
    const method = request.method; // GET, POST, PUT, DELETE, ...
    // Prefer the matched route pattern (e.g. /rooms/:id). When it is missing,
    // fall back to the URL without its query string so the `route` label
    // cannot take an unbounded number of values.
    const route = request.route?.path || this.pathOnly(request.url);

    // Start of request handling, used to compute the duration.
    const start = Date.now();

    return next.handle().pipe(
      tap({
        // Success: label with the status code currently set on the response.
        next: () => this.record(method, route, http.getResponse().statusCode, start),
        // Failure: label with the error's `status`, or 500 when it has none
        // (including when the thrown value is null/undefined).
        error: (error) => this.record(method, route, error?.status || 500, start),
      }),
    );
  }

  /** Returns `url` without its query string, or 'unknown' when it is not a string. */
  private pathOnly(url: unknown): string {
    return typeof url === 'string' ? url.split('?')[0] : 'unknown';
  }

  /** Observes the elapsed time (in seconds) and increments the request counter. */
  private record(method: string, route: string, statusCode: unknown, start: number): void {
    const status = String(statusCode);
    const seconds = (Date.now() - start) / 1000; // milliseconds → seconds

    // e.g. GET /health 200 → 0.05s
    this.prometheusService.httpRequestDuration.labels(method, route, status).observe(seconds);

    // e.g. GET /health 200 → +1
    this.prometheusService.httpRequestsTotal.labels(method, route, status).inc();
  }
}
13 changes: 13 additions & 0 deletions apps/backend/src/prometheus/prometheus.controller.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
import { Controller, Get, Header } from '@nestjs/common';
import { PrometheusService } from './prometheus.service.js';

/**
 * HTTP controller that exposes the metrics produced by PrometheusService.
 */
@Controller()
export class PrometheusController {
  constructor(private readonly prometheusService: PrometheusService) {}

  /**
   * GET /metrics — resolves with the string returned by
   * PrometheusService.getMetrics(), sent with a `text/plain; version=0.0.4`
   * Content-Type header.
   */
  @Get('/metrics')
  @Header('Content-Type', 'text/plain; version=0.0.4; charset=utf-8')
  async getMetrics(): Promise<string> {
    const payload = await this.prometheusService.getMetrics();
    return payload;
  }
}
10 changes: 10 additions & 0 deletions apps/backend/src/prometheus/prometheus.module.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
import { Module } from '@nestjs/common';
import { PrometheusService } from './prometheus.service.js';
import { PrometheusController } from './prometheus.controller.js';

/**
 * Feature module for Prometheus metrics: registers PrometheusController,
 * provides PrometheusService, and exports the service so importing modules
 * can inject it.
 */
@Module({
  controllers: [
    PrometheusController,
  ],
  providers: [
    PrometheusService,
  ],
  exports: [
    PrometheusService,
  ],
})
export class PrometheusModule {}
Loading
Loading