diff --git a/services/mam-api/src/s3/client.js b/services/mam-api/src/s3/client.js index 374bbb9..8526675 100644 --- a/services/mam-api/src/s3/client.js +++ b/services/mam-api/src/s3/client.js @@ -2,8 +2,20 @@ import { NodeHttpHandler } from '@smithy/node-http-handler'; import { S3Client, GetObjectCommand, DeleteObjectCommand, HeadBucketCommand, ListObjectsV2Command } from '@aws-sdk/client-s3'; import { getSignedUrl } from '@aws-sdk/s3-request-presigner'; import { Upload } from '@aws-sdk/lib-storage'; +import http from 'node:http'; +import https from 'node:https'; import pool from '../db/pool.js'; +// Dedicated keep-alive agents with a high socket ceiling. Without these the +// SDK's NodeHttpHandler creates its own keep-alive agents capped at 50 sockets; when the +// API proxies media (/video, /hls pipe the full S3 body through Express) those +// long-lived streaming sockets starve control-plane calls (DeleteObject, the +// proxy worker's master download), which then time out → assets stuck in +// 'processing', "s3 delete failed", and dead browser playback. A large pool + +// keep-alive lets streams and control ops coexist. +const _s3HttpAgent = new http.Agent({ keepAlive: true, maxSockets: 256, maxFreeSockets: 32, timeout: 120_000 }); +const _s3HttpsAgent = new https.Agent({ keepAlive: true, maxSockets: 256, maxFreeSockets: 32, timeout: 120_000 }); + // ── Mutable config ──────────────────────────────────────────────────────────── let _cfg = { endpoint: process.env.S3_ENDPOINT || '', @@ -23,9 +35,17 @@ function buildClient(cfg) { secretAccessKey: cfg.secretKey, }, forcePathStyle: true, - // Hard request/connection timeouts so a stalled RustFS GET can't hang the - // /video and /hls endpoints forever (the original browser-playback hang). - requestHandler: new NodeHttpHandler({ requestTimeout: 30_000, connectionTimeout: 10_000 }), + // Keep-alive agents (above) prevent socket starvation between media streams + // and control-plane ops. 
requestTimeout is generous so the proxy worker's + // full-master download (hundreds of MB) doesn't abort mid-transfer and leave + // the asset stuck in 'processing'; connectionTimeout stays short so a dead + // endpoint fails fast rather than hanging /video. + requestHandler: new NodeHttpHandler({ + httpAgent: _s3HttpAgent, + httpsAgent: _s3HttpsAgent, + requestTimeout: 300_000, + connectionTimeout: 10_000, + }), requestChecksumCalculation: 'WHEN_REQUIRED', responseChecksumValidation: 'WHEN_REQUIRED', }); diff --git a/services/worker/src/s3/client.js b/services/worker/src/s3/client.js index ece50c8..29bfca1 100644 --- a/services/worker/src/s3/client.js +++ b/services/worker/src/s3/client.js @@ -1,8 +1,11 @@ import { S3Client, GetObjectCommand, PutObjectCommand } from '@aws-sdk/client-s3'; +import { NodeHttpHandler } from '@smithy/node-http-handler'; import { createReadStream, createWriteStream } from 'fs'; import { readdir } from 'fs/promises'; import { join, extname } from 'path'; import { pipeline } from 'stream/promises'; +import http from 'node:http'; +import https from 'node:https'; const CONTENT_TYPES = { '.m3u8': 'application/vnd.apple.mpegurl', @@ -10,6 +13,14 @@ const CONTENT_TYPES = { '.mp4': 'video/mp4', }; +// Keep-alive agents + a long request timeout. The proxy/conform jobs download +// full master files (hundreds of MB) and upload HLS segments; the SDK defaults +// (keep-alive, but only 50 sockets and no request timeout) caused master downloads to +// stall and abort under contention, leaving assets stuck in 'processing'. Generous timeout + +// pooled sockets make these large transfers reliable. 
+const _httpAgent = new http.Agent({ keepAlive: true, maxSockets: 128, timeout: 600_000 }); +const _httpsAgent = new https.Agent({ keepAlive: true, maxSockets: 128, timeout: 600_000 }); + const createS3Client = () => { return new S3Client({ region: process.env.S3_REGION || 'us-east-1', @@ -19,6 +30,12 @@ const createS3Client = () => { secretAccessKey: process.env.S3_SECRET_KEY, }, forcePathStyle: true, + requestHandler: new NodeHttpHandler({ + httpAgent: _httpAgent, + httpsAgent: _httpsAgent, + requestTimeout: 600_000, + connectionTimeout: 15_000, + }), }); };