dragonflight/docker-compose.worker.yml

131 lines
5.3 KiB
YAML
Raw Permalink Normal View History

# Wild Dragon MAM — Worker Node Stack
# ─────────────────────────────────────
# Deploy on any machine you want to join the cluster as a worker.
# The primary stack (mam-api, db, redis) continues running on TrueNAS.
#
# Required env vars (set in .env.worker or export before running):
# MAM_API_URL URL of the primary MAM API e.g. http://10.0.0.25:47432
# NODE_TOKEN Bearer token from the primary's Tokens page
# NODE_IP Host LAN IP to report (set by onboard-node.sh)
#
# Optional hardware overrides (if Docker can't see /dev directly):
# GPU_COUNT Number of NVIDIA GPUs on this node (default: auto-detect from /dev/nvidia*)
# BMD_COUNT Number of Blackmagic DeckLink cards (default: auto-detect from /dev/blackmagic/)
# BMD_MODEL Marketed card name (e.g. "DeckLink Duo 2") — drives the port-diagram UI
#
# Optional env vars (needed only if starting the worker or capture profiles):
# REDIS_URL, DATABASE_URL, S3_ENDPOINT, S3_BUCKET, S3_ACCESS_KEY, S3_SECRET_KEY
# BMD_DEVICE_0 DeckLink device path (default: /dev/blackmagic/dv0)
chore: 1.2 ship-prep sweep — close 38 issues Frontend / UX / a11y - Sidebar collapse/expand toggle with localStorage persistence (#142) - Settings sections wrap inputs in <form> with Enter-to-submit + native validation; password autocomplete=new-password (#141, #138) - Asset thumbnails get descriptive alt text (#140) - Production deploy now precompiles JSX via esbuild and loads the production React UMD instead of dev builds + in-browser Babel (#139, #122) - Search wrapper gets role=search; global search input gets aria-label, role=combobox, aria-controls/aria-expanded/aria-activedescendant wiring (#137, #135) - Dashboard and Library no longer share the same nav icon (#136) - Sidebar collapses off-canvas with a topbar menu button below 768 px; mobile default is collapsed (#134) - --text-3 bumped to #8B92A0 for WCAG AA contrast on --bg-0 (#133) - Schedule and Library routes were rendering empty inside the .main flex container — switched to flex:1 + min-height:0 (#131, #132, editor + asset detail get the same fix) - Jobs nav badge now polls /jobs?status=active every 10 s and reflects the live count (#130, #113) - aria-label sweep on every icon-only button (#126) - Premiere panel release list moved to window.PREMIERE_RELEASES in data.jsx; Editor + Settings read from the same source (#125) - Typo setPgMclips → setPgmClips (#124) - Stray console.error / console.warn calls gated behind window.DF_LOG.{warn,error} (#123) - Hardcoded /api/v1 paths route through window.ZAMPP_API_PREFIX (#115) - Schedule rows no longer crash on null recorder_id (#117) - EditorKeyboard guards against document.activeElement === null (#116) - Unmount-safe timers for PasswordResetModal, Containers, Editor (#111) - Player seek clamps below totalMs, server-side range clamping + uncached 416 on EOF, client-side EOF-stall watchdog (#143) - Duration badge overlap fix on narrow asset cards (#52) Backend / security / reliability - GET /recorders fixed N+1: single LATERAL JOIN for live_asset_id; Docker inspects bounded to actually-recording rows (#121) - Upload disk-storage (multer.diskStorage) streams parts to S3 instead of buffering 500 MB in RAM (#120) - /assets list clamps limit to MAX_LIMIT=500 to prevent OOM (#119) - SDK upload archive listing + post-extract sanitize block zip-slip / tar-slip and symlink escapes (#118) - Migrations track applied state in schema_migrations, run in a transaction, and exit non-zero on failure (#107) - node-agent BMD_COUNT override uses BMD_DEVICE_PREFIX; filesystem detection wins (#109, #127) - GPU_COUNT override now merges with nvidia-smi enrichment (#108) - /cluster/heartbeat requires a node-bound token or admin user; tokens carry bound_hostname (#106) - /recorders/:id/start error responses no longer echo the Docker create payload — env vars stay out of client responses (#105) - /recorders/probe restricts schemes (srt/rtmp/rtsp/udp/rtp), blocks private + loopback hosts for non-admins, denies common service ports (#104) - Scheduler tick guarded by a Postgres advisory lock; pending/running rows claimed via UPDATE...RETURNING + FOR UPDATE SKIP LOCKED to survive multi-node deploys (#103) - UUID validateUuid('id') param middleware on every /:id route (#102) - Error handler scrubs Postgres error messages and 5xx detail (#101) - Graceful SIGTERM/SIGINT shutdown — stops scheduler, drains the HTTP server, ends the pool, 25 s force-exit watchdog (#100) - AMPP sync moved from fire-and-forget to a persisted retry queue (ampp_sync_status / attempts / next_attempt_at + scheduler retry loop with exponential backoff) (#77) Migrations - 019: api_tokens.bound_hostname (#106) - 020: assets.ampp_sync_status + retry bookkeeping (#77) Other - Defer #92 Growing-files per-upload toggle, #80 Audio tab, #57 Dashboard redesign, #56 Editor SPA polish phase 3, #114 S3 migration tool to v1.3
2026-05-26 22:06:14 -04:00
# (DeckLink IO / Quad cards expose /dev/blackmagic/io* instead — set BMD_DEVICE_PREFIX=io)
# BMD_DEVICE_1 DeckLink device path (default: /dev/blackmagic/dv1)
chore: 1.2 ship-prep sweep — close 38 issues Frontend / UX / a11y - Sidebar collapse/expand toggle with localStorage persistence (#142) - Settings sections wrap inputs in <form> with Enter-to-submit + native validation; password autocomplete=new-password (#141, #138) - Asset thumbnails get descriptive alt text (#140) - Production deploy now precompiles JSX via esbuild and loads the production React UMD instead of dev builds + in-browser Babel (#139, #122) - Search wrapper gets role=search; global search input gets aria-label, role=combobox, aria-controls/aria-expanded/aria-activedescendant wiring (#137, #135) - Dashboard and Library no longer share the same nav icon (#136) - Sidebar collapses off-canvas with a topbar menu button below 768 px; mobile default is collapsed (#134) - --text-3 bumped to #8B92A0 for WCAG AA contrast on --bg-0 (#133) - Schedule and Library routes were rendering empty inside the .main flex container — switched to flex:1 + min-height:0 (#131, #132, editor + asset detail get the same fix) - Jobs nav badge now polls /jobs?status=active every 10 s and reflects the live count (#130, #113) - aria-label sweep on every icon-only button (#126) - Premiere panel release list moved to window.PREMIERE_RELEASES in data.jsx; Editor + Settings read from the same source (#125) - Typo setPgMclips → setPgmClips (#124) - Stray console.error / console.warn calls gated behind window.DF_LOG.{warn,error} (#123) - Hardcoded /api/v1 paths route through window.ZAMPP_API_PREFIX (#115) - Schedule rows no longer crash on null recorder_id (#117) - EditorKeyboard guards against document.activeElement === null (#116) - Unmount-safe timers for PasswordResetModal, Containers, Editor (#111) - Player seek clamps below totalMs, server-side range clamping + uncached 416 on EOF, client-side EOF-stall watchdog (#143) - Duration badge overlap fix on narrow asset cards (#52) Backend / security / reliability - GET /recorders fixed N+1: single LATERAL JOIN for live_asset_id; Docker inspects bounded to actually-recording rows (#121) - Upload disk-storage (multer.diskStorage) streams parts to S3 instead of buffering 500 MB in RAM (#120) - /assets list clamps limit to MAX_LIMIT=500 to prevent OOM (#119) - SDK upload archive listing + post-extract sanitize block zip-slip / tar-slip and symlink escapes (#118) - Migrations track applied state in schema_migrations, run in a transaction, and exit non-zero on failure (#107) - node-agent BMD_COUNT override uses BMD_DEVICE_PREFIX; filesystem detection wins (#109, #127) - GPU_COUNT override now merges with nvidia-smi enrichment (#108) - /cluster/heartbeat requires a node-bound token or admin user; tokens carry bound_hostname (#106) - /recorders/:id/start error responses no longer echo the Docker create payload — env vars stay out of client responses (#105) - /recorders/probe restricts schemes (srt/rtmp/rtsp/udp/rtp), blocks private + loopback hosts for non-admins, denies common service ports (#104) - Scheduler tick guarded by a Postgres advisory lock; pending/running rows claimed via UPDATE...RETURNING + FOR UPDATE SKIP LOCKED to survive multi-node deploys (#103) - UUID validateUuid('id') param middleware on every /:id route (#102) - Error handler scrubs Postgres error messages and 5xx detail (#101) - Graceful SIGTERM/SIGINT shutdown — stops scheduler, drains the HTTP server, ends the pool, 25 s force-exit watchdog (#100) - AMPP sync moved from fire-and-forget to a persisted retry queue (ampp_sync_status / attempts / next_attempt_at + scheduler retry loop with exponential backoff) (#77) Migrations - 019: api_tokens.bound_hostname (#106) - 020: assets.ampp_sync_status + retry bookkeeping (#77) Other - Defer #92 Growing-files per-upload toggle, #80 Audio tab, #57 Dashboard redesign, #56 Editor SPA polish phase 3, #114 S3 migration tool to v1.3
2026-05-26 22:06:14 -04:00
# BMD_DEVICE_PREFIX Naming prefix for synthesized BMD_COUNT-based devices (default: dv). Use 'io' for IO/Quad.
# LIVE_DIR Host path for HLS live segments (default: /mnt/NVME/MAM/wild-dragon-live)
#
# Profiles:
# (default) node-agent only — cluster visibility + hardware heartbeat
# --profile worker + CPU/GPU job worker (proxy generation, transcoding)
# --profile capture + SDI capture service (requires Blackmagic DeckLink card)
#
# To enable GPU transcoding, also apply docker-compose.gpu.yml:
# docker compose -f docker-compose.worker.yml -f docker-compose.gpu.yml --profile worker up -d
#
# NOTE: The node-agent mounts /var/run/docker.sock to spawn on-demand SDI
# capture sidecars when the primary mam-api routes a recorder to this node.
# Build the capture image before first use:
# docker compose -f docker-compose.worker.yml build capture
services:
# node-agent runs in host network mode so it can see the real host
# interfaces, GPU devices and DeckLink cards without bridging tricks.
# The reported IP / hostname will be the host's, not the container's.
node-agent:
build: ./services/node-agent
restart: unless-stopped
network_mode: host
environment:
MAM_API_URL: ${MAM_API_URL}
NODE_TOKEN: ${NODE_TOKEN:-}
NODE_ROLE: ${NODE_ROLE:-worker}
NODE_IP: ${NODE_IP:-}
AGENT_PORT: ${AGENT_PORT:-7436}
HEARTBEAT_MS: ${HEARTBEAT_MS:-30000}
chore: 1.2 ship-prep sweep — close 38 issues Frontend / UX / a11y - Sidebar collapse/expand toggle with localStorage persistence (#142) - Settings sections wrap inputs in <form> with Enter-to-submit + native validation; password autocomplete=new-password (#141, #138) - Asset thumbnails get descriptive alt text (#140) - Production deploy now precompiles JSX via esbuild and loads the production React UMD instead of dev builds + in-browser Babel (#139, #122) - Search wrapper gets role=search; global search input gets aria-label, role=combobox, aria-controls/aria-expanded/aria-activedescendant wiring (#137, #135) - Dashboard and Library no longer share the same nav icon (#136) - Sidebar collapses off-canvas with a topbar menu button below 768 px; mobile default is collapsed (#134) - --text-3 bumped to #8B92A0 for WCAG AA contrast on --bg-0 (#133) - Schedule and Library routes were rendering empty inside the .main flex container — switched to flex:1 + min-height:0 (#131, #132, editor + asset detail get the same fix) - Jobs nav badge now polls /jobs?status=active every 10 s and reflects the live count (#130, #113) - aria-label sweep on every icon-only button (#126) - Premiere panel release list moved to window.PREMIERE_RELEASES in data.jsx; Editor + Settings read from the same source (#125) - Typo setPgMclips → setPgmClips (#124) - Stray console.error / console.warn calls gated behind window.DF_LOG.{warn,error} (#123) - Hardcoded /api/v1 paths route through window.ZAMPP_API_PREFIX (#115) - Schedule rows no longer crash on null recorder_id (#117) - EditorKeyboard guards against document.activeElement === null (#116) - Unmount-safe timers for PasswordResetModal, Containers, Editor (#111) - Player seek clamps below totalMs, server-side range clamping + uncached 416 on EOF, client-side EOF-stall watchdog (#143) - Duration badge overlap fix on narrow asset cards (#52) Backend / security / reliability - GET /recorders fixed N+1: single LATERAL JOIN for live_asset_id; Docker inspects bounded to actually-recording rows (#121) - Upload disk-storage (multer.diskStorage) streams parts to S3 instead of buffering 500 MB in RAM (#120) - /assets list clamps limit to MAX_LIMIT=500 to prevent OOM (#119) - SDK upload archive listing + post-extract sanitize block zip-slip / tar-slip and symlink escapes (#118) - Migrations track applied state in schema_migrations, run in a transaction, and exit non-zero on failure (#107) - node-agent BMD_COUNT override uses BMD_DEVICE_PREFIX; filesystem detection wins (#109, #127) - GPU_COUNT override now merges with nvidia-smi enrichment (#108) - /cluster/heartbeat requires a node-bound token or admin user; tokens carry bound_hostname (#106) - /recorders/:id/start error responses no longer echo the Docker create payload — env vars stay out of client responses (#105) - /recorders/probe restricts schemes (srt/rtmp/rtsp/udp/rtp), blocks private + loopback hosts for non-admins, denies common service ports (#104) - Scheduler tick guarded by a Postgres advisory lock; pending/running rows claimed via UPDATE...RETURNING + FOR UPDATE SKIP LOCKED to survive multi-node deploys (#103) - UUID validateUuid('id') param middleware on every /:id route (#102) - Error handler scrubs Postgres error messages and 5xx detail (#101) - Graceful SIGTERM/SIGINT shutdown — stops scheduler, drains the HTTP server, ends the pool, 25 s force-exit watchdog (#100) - AMPP sync moved from fire-and-forget to a persisted retry queue (ampp_sync_status / attempts / next_attempt_at + scheduler retry loop with exponential backoff) (#77) Migrations - 019: api_tokens.bound_hostname (#106) - 020: assets.ampp_sync_status + retry bookkeeping (#77) Other - Defer #92 Growing-files per-upload toggle, #80 Audio tab, #57 Dashboard redesign, #56 Editor SPA polish phase 3, #114 S3 migration tool to v1.3
2026-05-26 22:06:14 -04:00
GPU_COUNT: ${GPU_COUNT:--1}
BMD_COUNT: ${BMD_COUNT:--1}
BMD_MODEL: ${BMD_MODEL:-}
BMD_DEVICE_PREFIX: ${BMD_DEVICE_PREFIX:-dv}
LIVE_DIR: ${LIVE_DIR:-/mnt/NVME/MAM/wild-dragon-live}
volumes:
- /var/run/docker.sock:/var/run/docker.sock
- /dev:/dev:ro
- /mnt/NVME/MAM/wild-dragon-live:/mnt/NVME/MAM/wild-dragon-live:ro
devices:
- /dev/blackmagic:/dev/blackmagic
worker:
build: ./services/worker
profiles: [worker]
restart: unless-stopped
environment:
REDIS_URL: ${REDIS_URL}
DATABASE_URL: ${DATABASE_URL}
S3_ENDPOINT: ${S3_ENDPOINT}
S3_BUCKET: ${S3_BUCKET}
S3_ACCESS_KEY: ${S3_ACCESS_KEY}
S3_SECRET_KEY: ${S3_SECRET_KEY}
S3_REGION: ${S3_REGION:-us-east-1}
NVENC_ENABLED: ${NVENC_ENABLED:-false}
networks:
- wild-dragon-worker
# SDI capture service — only start on nodes with Blackmagic DeckLink cards
# Set BMD_DEVICE_0 in .env.worker to the actual device path, e.g. /dev/blackmagic/dv0
capture:
build: ./services/capture
profiles: [capture]
restart: unless-stopped
environment:
REDIS_URL: ${REDIS_URL}
DATABASE_URL: ${DATABASE_URL}
S3_ENDPOINT: ${S3_ENDPOINT}
S3_BUCKET: ${S3_BUCKET}
S3_ACCESS_KEY: ${S3_ACCESS_KEY}
S3_SECRET_KEY: ${S3_SECRET_KEY}
CAPTURE_PORT: 3001
devices:
- ${BMD_DEVICE_0:-/dev/blackmagic/dv0}:/dev/blackmagic/dv0
- ${BMD_DEVICE_1:-/dev/blackmagic/dv1}:/dev/blackmagic/dv1
ports:
- "${CAPTURE_PORT:-7437}:3001"
networks:
- wild-dragon-worker
# worker-l4: HEAVY tier (proxy/conform/trim) on the L4 (NVENC). Talks to
# zampp1's Redis/Postgres/S3 over the LAN (.200). No promotion scanner here.
worker-l4:
build:
context: ./services/worker
dockerfile: Dockerfile.gpu
image: wild-dragon-worker-gpu:latest
runtime: nvidia
restart: unless-stopped
environment:
REDIS_URL: ${REDIS_URL}
DATABASE_URL: ${DATABASE_URL}
S3_ENDPOINT: ${S3_ENDPOINT}
S3_BUCKET: ${S3_BUCKET}
S3_ACCESS_KEY: ${S3_ACCESS_KEY}
S3_SECRET_KEY: ${S3_SECRET_KEY}
S3_REGION: ${S3_REGION:-us-east-1}
WORKER_QUEUES: proxy,conform,trim
PROXY_CONCURRENCY: "3"
NVIDIA_VISIBLE_DEVICES: GPU-13acf439-8bf4-a5e0-7804-c1071bca547a
WORKER_LABEL: "zampp2 / L4"
NVIDIA_DRIVER_CAPABILITIES: video,compute,utility
networks:
- wild-dragon-worker
networks:
wild-dragon-worker:
driver: bridge