# =============================================================================
# Hive — Cloud GPU deployment override
#
# Usage:
# docker compose -f docker-compose.yaml -f docker-compose.cloud.yaml up -d
#
# Target: Linux cloud VMs with NVIDIA GPUs
#   AWS g5/p4, GCP a2/a3, Lambda Labs, CoreWeave, or any NVIDIA-equipped Linux desktop
#
# This file overrides only the fields that differ from the base compose file.
# All services, volumes, and networks are inherited from docker-compose.yaml.
#
# Requirements:
# - NVIDIA Container Toolkit installed and configured
# - docker info | grep -i runtime → should show "nvidia"
# - For rootless Docker: nvidia-ctk runtime configure --runtime=docker
# --config=$HOME/.config/docker/daemon.json && systemctl --user restart docker
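#
# Quick GPU sanity check (a suggested command, not part of this stack; the
# CUDA image tag is illustrative):
#   docker run --rm --gpus all nvidia/cuda:12.4.1-base-ubuntu22.04 nvidia-smi
# If nvidia-smi lists your GPUs, the toolkit is wired up correctly.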
# =============================================================================
name: hive
services:
# ---------------------------------------------------------------------------
# hive-api — bind to all interfaces so cloud load balancers can reach it
# ---------------------------------------------------------------------------
hive-api:
ports:
# Cloud: expose on all interfaces (sit behind nginx/caddy or security group)
- "0.0.0.0:8000:8000"
environment:
HIVE_DATA_DIR: /data
PYTHONUNBUFFERED: "1"
OLLAMA_HOST: "${OLLAMA_HOST:-http://ollama:11434}"
# Raise the API worker count for multi-core cloud VMs
WEB_CONCURRENCY: "${WEB_CONCURRENCY:-4}"
deploy:
resources:
limits:
# Cloud VMs typically have more RAM; raise ceiling
memory: 4G
reservations:
memory: 512M
healthcheck:
# Faster health checks — cloud orchestrators need quick convergence
interval: 15s
timeout: 5s
start_period: 20s
retries: 3
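  # gunicorn and uvicorn both read WEB_CONCURRENCY for their worker count
  # (assuming the base image runs one of them). Example .env for an
  # 8-vCPU VM: WEB_CONCURRENCY=8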
# ---------------------------------------------------------------------------
# hive-web — bind to all interfaces for external access
# ---------------------------------------------------------------------------
hive-web:
ports:
- "0.0.0.0:3000:3000"
environment:
      # On cloud instances, browsers reach the API via the instance's public
      # IP or domain; set NEXT_PUBLIC_API_URL in .env (example after this
      # service)
      NEXT_PUBLIC_API_URL: "${NEXT_PUBLIC_API_URL:-http://localhost:8000}"
NODE_ENV: production
deploy:
resources:
limits:
memory: 1G
reservations:
memory: 256M
healthcheck:
interval: 15s
timeout: 5s
start_period: 30s
retries: 3
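  # Example .env for a public deployment (address illustrative; use your
  # instance's public IP or domain):
  #   NEXT_PUBLIC_API_URL=http://203.0.113.10:8000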
# ---------------------------------------------------------------------------
# ollama — full GPU passthrough with cloud-tuned environment
# ---------------------------------------------------------------------------
ollama:
ports:
# Bind to loopback only; external access goes through the API
- "127.0.0.1:11434:11434"
# Explicit NVIDIA runtime declaration — required on some cloud providers
# (supplements the deploy.resources.reservations block in the base file)
runtime: nvidia
environment:
OLLAMA_HOST: "0.0.0.0:11434"
OLLAMA_KEEP_ALIVE: "${OLLAMA_KEEP_ALIVE:-30m}"
# Flash Attention significantly reduces VRAM usage on Ampere/Ada/Hopper GPUs
OLLAMA_FLASH_ATTENTION: "${OLLAMA_FLASH_ATTENTION:-1}"
# Allow more parallel requests — cloud VMs have more RAM and bandwidth
OLLAMA_NUM_PARALLEL: "${OLLAMA_NUM_PARALLEL:-4}"
# Pass-through NVIDIA runtime variables required by the NVIDIA Container Toolkit
NVIDIA_VISIBLE_DEVICES: "${NVIDIA_VISIBLE_DEVICES:-all}"
NVIDIA_DRIVER_CAPABILITIES: "${NVIDIA_DRIVER_CAPABILITIES:-compute,utility}"
      # Optional: reserve VRAM headroom so Ollama doesn't OOM other GPU
      # processes. OLLAMA_GPU_OVERHEAD is the amount of VRAM to set aside
      # per GPU, in bytes.
      # OLLAMA_GPU_OVERHEAD: "0"
deploy:
resources:
reservations:
devices:
- driver: nvidia
count: all
capabilities: [gpu]
limits:
          # Cap Ollama's host RAM so it can't starve the rest of the VM.
          # GPU memory is managed by CUDA and is unaffected by this limit.
memory: "${OLLAMA_MEMORY_LIMIT:-16G}"
healthcheck:
# Faster convergence on cloud — models load quickly from NVMe
interval: 10s
timeout: 10s
start_period: 20s
retries: 6
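  # To dedicate specific GPUs instead of reserving them all, the device
  # reservation above can list device_ids; a sketch (IDs illustrative):
  #   devices:
  #     - driver: nvidia
  #       device_ids: ["0", "1"]
  #       capabilities: [gpu]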
# ---------------------------------------------------------------------------
# ollama-init — pull the right model based on GPU VRAM
#
# Override OLLAMA_PULL_MODEL in your .env to choose a different model:
# OLLAMA_PULL_MODEL=nemotron # 70B — needs 48 GB+ VRAM (A100, H100)
# OLLAMA_PULL_MODEL=nemotron-small # 4B — runs on 8 GB+ VRAM (default)
# OLLAMA_PULL_MODEL=llama3.3:70b # Meta Llama 3.3 70B
# ---------------------------------------------------------------------------
ollama-init:
environment:
OLLAMA_HOST: "http://ollama:11434"
    # Note: "$$" escapes the shell variables below from compose-time
    # interpolation; ${OLLAMA_PULL_MODEL} keeps a single "$" so .env can
    # override it.
    command:
      - |
        MODEL="${OLLAMA_PULL_MODEL:-nemotron-small}"
        echo "Hive cloud init: checking for model: $$MODEL ..."
        if ollama list 2>/dev/null | grep -qF "$$MODEL"; then
          echo "$$MODEL already present — skipping pull."
        else
          echo "Pulling $$MODEL ..."
          ollama pull "$$MODEL" && echo "Pull complete." \
            || echo "Pull failed — retry with: docker compose exec ollama ollama pull $$MODEL"
        fi
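  # Example: pull one of the larger models listed above instead of the
  # default:
  #   OLLAMA_PULL_MODEL=llama3.3:70b docker compose -f docker-compose.yaml \
  #     -f docker-compose.cloud.yaml up -d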
# ---------------------------------------------------------------------------
# nginx — HTTPS reverse proxy (optional; activate with --profile https)
#
# Usage:
# docker compose -f docker-compose.yaml -f docker-compose.cloud.yaml \
# --profile https up -d
#
# Place your TLS certificate files at:
# ./nginx/certs/hive.crt
# ./nginx/certs/hive.key
# Or let Caddy handle ACME automatically (see caddy service below).
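  #
  # A minimal ./nginx/nginx.conf sketch matching the mounts below (an
  # assumption about your routing; the /api/ split is illustrative):
  #   events {}
  #   http {
  #     server {
  #       listen 443 ssl;
  #       ssl_certificate     /etc/nginx/certs/hive.crt;
  #       ssl_certificate_key /etc/nginx/certs/hive.key;
  #       location /api/ { proxy_pass http://hive-api:8000/; }
  #       location /     { proxy_pass http://hive-web:3000; }
  #     }
  #   }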
# ---------------------------------------------------------------------------
nginx:
image: nginx:1.27-alpine
container_name: hive-nginx
profiles: ["https"]
restart: unless-stopped
ports:
- "0.0.0.0:80:80"
- "0.0.0.0:443:443"
volumes:
- ./nginx/nginx.conf:/etc/nginx/nginx.conf:ro
- ./nginx/certs:/etc/nginx/certs:ro
networks:
- council-net
depends_on:
hive-api:
condition: service_healthy
hive-web:
condition: service_healthy
healthcheck:
test: ["CMD", "nginx", "-t"]
interval: 30s
timeout: 5s
retries: 3
# ---------------------------------------------------------------------------
# caddy — automatic HTTPS via ACME/Let's Encrypt (optional profile: caddy)
#
# Usage:
# HIVE_DOMAIN=hive.example.com \
# docker compose -f docker-compose.yaml -f docker-compose.cloud.yaml \
# --profile caddy up -d
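  #
  # A minimal ./caddy/Caddyfile sketch (an assumption about your routing; the
  # /api/* split is illustrative):
  #   {$HIVE_DOMAIN} {
  #     handle /api/* {
  #       reverse_proxy hive-api:8000
  #     }
  #     handle {
  #       reverse_proxy hive-web:3000
  #     }
  #   }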
# ---------------------------------------------------------------------------
caddy:
image: caddy:2-alpine
container_name: hive-caddy
profiles: ["caddy"]
restart: unless-stopped
ports:
- "0.0.0.0:80:80"
- "0.0.0.0:443:443"
- "0.0.0.0:443:443/udp" # HTTP/3
environment:
HIVE_DOMAIN: "${HIVE_DOMAIN:-localhost}"
volumes:
- ./caddy/Caddyfile:/etc/caddy/Caddyfile:ro
- caddy-data:/data
- caddy-config:/config
networks:
- council-net
depends_on:
hive-api:
condition: service_healthy
hive-web:
condition: service_healthy
healthcheck:
test: ["CMD", "caddy", "version"]
interval: 30s
timeout: 5s
retries: 3
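  # Note: for HTTP/3, the cloud firewall / security group must allow UDP 443
  # in addition to TCP 80/443.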
# ---------------------------------------------------------------------------
# Volumes: base volumes re-declared for completeness, plus storage for the
# optional proxy services (caddy-data, caddy-config)
# ---------------------------------------------------------------------------
volumes:
hive-data:
ollama-models:
caddy-data:
caddy-config: