From b27b9f6909ff4b64dc986c268d673a5411b4570c Mon Sep 17 00:00:00 2001 From: ZGaetano Date: Thu, 4 Jun 2026 12:53:28 +0000 Subject: [PATCH] fix(s3): keep-alive agents + long timeouts to end socket starvation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Root cause of stuck 'processing', failed deletes, and dead playback: The mam-api proxies media (/video, /hls pipe the full S3 body through Express), holding long-lived streaming sockets. With the SDK's default http agents (keep-alive, but capped at 50 sockets per origin) those streams starved control-plane calls — DeleteObject and the proxy worker's master download — which timed out (10s connectionTimeout) in bursts. Fixes: - mam-api S3 client: dedicated keep-alive http/https Agents (maxSockets 256) + requestTimeout raised 30s→300s so large master GETs finish. - worker S3 client: previously had NO handler config at all (SDK defaults). Added keep-alive agents + 600s requestTimeout so proxy/conform master downloads (hundreds of MB) don't stall and leave assets in 'processing'. --- services/mam-api/src/s3/client.js | 26 +++++++++++++++++++++++--- services/worker/src/s3/client.js | 17 +++++++++++++++++ 2 files changed, 40 insertions(+), 3 deletions(-) diff --git a/services/mam-api/src/s3/client.js b/services/mam-api/src/s3/client.js index 374bbb9..8526675 100644 --- a/services/mam-api/src/s3/client.js +++ b/services/mam-api/src/s3/client.js @@ -2,8 +2,20 @@ import { NodeHttpHandler } from '@smithy/node-http-handler'; import { S3Client, GetObjectCommand, DeleteObjectCommand, HeadBucketCommand, ListObjectsV2Command } from '@aws-sdk/client-s3'; import { getSignedUrl } from '@aws-sdk/s3-request-presigner'; import { Upload } from '@aws-sdk/lib-storage'; +import http from 'node:http'; +import https from 'node:https'; import pool from '../db/pool.js'; +// Dedicated keep-alive agents with a high socket ceiling. 
Without these the +// SDK uses its built-in agents (capped at 50 sockets per origin); when the +// API proxies media (/video, /hls pipe the full S3 body through Express) those +// long-lived streaming sockets starve control-plane calls (DeleteObject, the +// proxy worker's master download), which then time out → assets stuck in +// 'processing', "s3 delete failed", and dead browser playback. A large pool + +// keep-alive lets streams and control ops coexist. +const _s3HttpAgent = new http.Agent({ keepAlive: true, maxSockets: 256, maxFreeSockets: 32, timeout: 120_000 }); +const _s3HttpsAgent = new https.Agent({ keepAlive: true, maxSockets: 256, maxFreeSockets: 32, timeout: 120_000 }); + // ── Mutable config ──────────────────────────────────────────────────────────── let _cfg = { endpoint: process.env.S3_ENDPOINT || '', @@ -23,9 +35,17 @@ function buildClient(cfg) { secretAccessKey: cfg.secretKey, }, forcePathStyle: true, - // Hard request/connection timeouts so a stalled RustFS GET can't hang the - // /video and /hls endpoints forever (the original browser-playback hang). - requestHandler: new NodeHttpHandler({ requestTimeout: 30_000, connectionTimeout: 10_000 }), + // Keep-alive agents (above) prevent socket starvation between media streams + // and control-plane ops. requestTimeout is generous so the proxy worker's + // full-master download (hundreds of MB) doesn't abort mid-transfer and leave + // the asset stuck in 'processing'; connectionTimeout stays short so a dead + // endpoint fails fast rather than hanging /video. 
+ requestHandler: new NodeHttpHandler({ + httpAgent: _s3HttpAgent, + httpsAgent: _s3HttpsAgent, + requestTimeout: 300_000, + connectionTimeout: 10_000, + }), requestChecksumCalculation: 'WHEN_REQUIRED', responseChecksumValidation: 'WHEN_REQUIRED', }); diff --git a/services/worker/src/s3/client.js b/services/worker/src/s3/client.js index ece50c8..29bfca1 100644 --- a/services/worker/src/s3/client.js +++ b/services/worker/src/s3/client.js @@ -1,8 +1,11 @@ import { S3Client, GetObjectCommand, PutObjectCommand } from '@aws-sdk/client-s3'; +import { NodeHttpHandler } from '@smithy/node-http-handler'; import { createReadStream, createWriteStream } from 'fs'; import { readdir } from 'fs/promises'; import { join, extname } from 'path'; import { pipeline } from 'stream/promises'; +import http from 'node:http'; +import https from 'node:https'; const CONTENT_TYPES = { '.m3u8': 'application/vnd.apple.mpegurl', @@ -10,6 +13,14 @@ const CONTENT_TYPES = { '.mp4': 'video/mp4', }; +// Keep-alive agents + a long request timeout. The proxy/conform jobs download +// full master files (hundreds of MB) and upload HLS segments; the SDK defaults +// (50 sockets per origin, no request timeout) caused master downloads to +// stall and abort, leaving assets stuck in 'processing'. Generous timeout + +// pooled sockets make these large transfers reliable. +const _httpAgent = new http.Agent({ keepAlive: true, maxSockets: 128, timeout: 600_000 }); +const _httpsAgent = new https.Agent({ keepAlive: true, maxSockets: 128, timeout: 600_000 }); + const createS3Client = () => { return new S3Client({ region: process.env.S3_REGION || 'us-east-1', @@ -19,6 +30,12 @@ const createS3Client = () => { secretAccessKey: process.env.S3_SECRET_KEY, }, forcePathStyle: true, + requestHandler: new NodeHttpHandler({ + httpAgent: _httpAgent, + httpsAgent: _httpsAgent, + requestTimeout: 600_000, + connectionTimeout: 15_000, + }), }); };