fix(s3): keep-alive agents + long timeouts to end socket starvation

Root cause of assets stuck in 'processing', failed deletes, and dead playback:

The mam-api proxies media (/video and /hls pipe the full S3 body through
Express), so it holds long-lived streaming sockets. The S3 clients relied on
the SDK's default HTTP agents, whose socket pool is not sized for that load
(AWS SDK v3 documents a default cap of 50 sockets per agent). Those streams
therefore starved control-plane calls — DeleteObject and the proxy worker's
master download — which timed out in bursts (10s connectionTimeout).

Fixes:
- mam-api S3 client: dedicated keep-alive http/https Agents (maxSockets 256)
  + requestTimeout raised 30s→300s so large master GETs finish.
- worker S3 client: previously had NO handler config at all (SDK defaults).
  Added keep-alive agents + 600s requestTimeout so proxy/conform master
  downloads (hundreds of MB) don't stall and leave assets in 'processing'.
This commit is contained in:
Zac Gaetano 2026-06-04 12:53:28 +00:00
parent ac1d7e1e1f
commit b27b9f6909
2 changed files with 40 additions and 3 deletions

View file

@ -2,8 +2,20 @@ import { NodeHttpHandler } from '@smithy/node-http-handler';
import { S3Client, GetObjectCommand, DeleteObjectCommand, HeadBucketCommand, ListObjectsV2Command } from '@aws-sdk/client-s3'; import { S3Client, GetObjectCommand, DeleteObjectCommand, HeadBucketCommand, ListObjectsV2Command } from '@aws-sdk/client-s3';
import { getSignedUrl } from '@aws-sdk/s3-request-presigner'; import { getSignedUrl } from '@aws-sdk/s3-request-presigner';
import { Upload } from '@aws-sdk/lib-storage'; import { Upload } from '@aws-sdk/lib-storage';
import http from 'node:http';
import https from 'node:https';
import pool from '../db/pool.js'; import pool from '../db/pool.js';
// Dedicated keep-alive agents with a high socket ceiling. Without these the
// S3 client falls back to the SDK's default agents, whose socket pool is too
// small for this workload: when the API proxies media (/video and /hls pipe
// the full S3 body through Express), those long-lived streaming sockets starve
// control-plane calls (DeleteObject, the proxy worker's master download).
// Those calls then time out, which shows up as assets stuck in 'processing',
// "s3 delete failed" errors, and dead browser playback. A large pool plus
// keep-alive lets streams and control ops coexist.
// Builds an agent of the given class with the settings shared by both S3
// agents: keep-alive on, at most 256 sockets per host, at most 32 idle sockets
// kept open per host, and a 120 s socket timeout.
const _makeS3Agent = (AgentClass) =>
  new AgentClass({
    keepAlive: true,
    maxSockets: 256,
    maxFreeSockets: 32,
    timeout: 120_000,
  });
const _s3HttpAgent = _makeS3Agent(http.Agent);
const _s3HttpsAgent = _makeS3Agent(https.Agent);
// ── Mutable config ──────────────────────────────────────────────────────────── // ── Mutable config ────────────────────────────────────────────────────────────
let _cfg = { let _cfg = {
endpoint: process.env.S3_ENDPOINT || '', endpoint: process.env.S3_ENDPOINT || '',
@ -23,9 +35,17 @@ function buildClient(cfg) {
secretAccessKey: cfg.secretKey, secretAccessKey: cfg.secretKey,
}, },
forcePathStyle: true, forcePathStyle: true,
// Hard request/connection timeouts so a stalled RustFS GET can't hang the // Keep-alive agents (above) prevent socket starvation between media streams
// /video and /hls endpoints forever (the original browser-playback hang). // and control-plane ops. requestTimeout is generous so the proxy worker's
requestHandler: new NodeHttpHandler({ requestTimeout: 30_000, connectionTimeout: 10_000 }), // full-master download (hundreds of MB) doesn't abort mid-transfer and leave
// the asset stuck in 'processing'; connectionTimeout stays short so a dead
// endpoint fails fast rather than hanging /video.
requestHandler: new NodeHttpHandler({
httpAgent: _s3HttpAgent,
httpsAgent: _s3HttpsAgent,
requestTimeout: 300_000,
connectionTimeout: 10_000,
}),
requestChecksumCalculation: 'WHEN_REQUIRED', requestChecksumCalculation: 'WHEN_REQUIRED',
responseChecksumValidation: 'WHEN_REQUIRED', responseChecksumValidation: 'WHEN_REQUIRED',
}); });

View file

@ -1,8 +1,11 @@
import { S3Client, GetObjectCommand, PutObjectCommand } from '@aws-sdk/client-s3'; import { S3Client, GetObjectCommand, PutObjectCommand } from '@aws-sdk/client-s3';
import { NodeHttpHandler } from '@smithy/node-http-handler';
import { createReadStream, createWriteStream } from 'fs'; import { createReadStream, createWriteStream } from 'fs';
import { readdir } from 'fs/promises'; import { readdir } from 'fs/promises';
import { join, extname } from 'path'; import { join, extname } from 'path';
import { pipeline } from 'stream/promises'; import { pipeline } from 'stream/promises';
import http from 'node:http';
import https from 'node:https';
const CONTENT_TYPES = { const CONTENT_TYPES = {
'.m3u8': 'application/vnd.apple.mpegurl', '.m3u8': 'application/vnd.apple.mpegurl',
@ -10,6 +13,14 @@ const CONTENT_TYPES = {
'.mp4': 'video/mp4', '.mp4': 'video/mp4',
}; };
// Keep-alive agents + a long request timeout. The proxy/conform jobs download
// full master files (hundreds of MB) and upload HLS segments. This client
// previously had no request-handler config at all (SDK defaults), and under
// contention master downloads stalled and aborted, leaving assets stuck in
// 'processing'. A generous timeout + pooled sockets make these large
// transfers reliable.
// A single settings object feeds both worker agents so the HTTP and HTTPS
// variants cannot drift apart: keep-alive on, at most 128 sockets per host,
// and a 600 s socket timeout.
const _workerAgentOptions = { keepAlive: true, maxSockets: 128, timeout: 600_000 };
const [_httpAgent, _httpsAgent] = [http, https].map(
  (transport) => new transport.Agent({ ..._workerAgentOptions }),
);
const createS3Client = () => { const createS3Client = () => {
return new S3Client({ return new S3Client({
region: process.env.S3_REGION || 'us-east-1', region: process.env.S3_REGION || 'us-east-1',
@ -19,6 +30,12 @@ const createS3Client = () => {
secretAccessKey: process.env.S3_SECRET_KEY, secretAccessKey: process.env.S3_SECRET_KEY,
}, },
forcePathStyle: true, forcePathStyle: true,
requestHandler: new NodeHttpHandler({
httpAgent: _httpAgent,
httpsAgent: _httpsAgent,
requestTimeout: 600_000,
connectionTimeout: 15_000,
}),
}); });
}; };