fix(s3): keep-alive agents + long timeouts to end socket starvation
Root cause of stuck 'processing', failed deletes, and dead playback: the mam-api proxies media (/video and /hls pipe the full S3 body through Express), holding long-lived streaming sockets. With the SDK's default HTTP agents (the v3 NodeHttpHandler's built-in keep-alive pool, capped at 50 sockets per origin), those streams occupied the pool and starved control-plane calls — DeleteObject and the proxy worker's master download — which then timed out (10s connectionTimeout) in bursts.
Fixes:
- mam-api S3 client: dedicated keep-alive http/https Agents (maxSockets 256), and requestTimeout raised from 30s to 300s so large master GETs finish.
- worker S3 client: previously had NO handler config at all (SDK defaults). Added keep-alive agents (maxSockets 128) and a 600s requestTimeout so proxy/conform master downloads (hundreds of MB) don't stall and leave assets in 'processing'.
This commit is contained in:
parent
ac1d7e1e1f
commit
b27b9f6909
2 changed files with 40 additions and 3 deletions
|
|
@ -2,8 +2,20 @@ import { NodeHttpHandler } from '@smithy/node-http-handler';
|
|||
import { S3Client, GetObjectCommand, DeleteObjectCommand, HeadBucketCommand, ListObjectsV2Command } from '@aws-sdk/client-s3';
|
||||
import { getSignedUrl } from '@aws-sdk/s3-request-presigner';
|
||||
import { Upload } from '@aws-sdk/lib-storage';
|
||||
import http from 'node:http';
|
||||
import https from 'node:https';
|
||||
import pool from '../db/pool.js';
|
||||
|
||||
// Dedicated keep-alive agents with a high socket ceiling. Without these the
|
||||
// SDK builds its own agents (keep-alive, but capped at 50 sockets); when the
|
||||
// API proxies media (/video, /hls pipe the full S3 body through Express) those
|
||||
// long-lived streaming sockets starve control-plane calls (DeleteObject, the
|
||||
// proxy worker's master download), which then time out → assets stuck in
|
||||
// 'processing', "s3 delete failed", and dead browser playback. A large pool +
|
||||
// keep-alive lets streams and control ops coexist.
|
||||
const _s3HttpAgent = new http.Agent({ keepAlive: true, maxSockets: 256, maxFreeSockets: 32, timeout: 120_000 });
|
||||
const _s3HttpsAgent = new https.Agent({ keepAlive: true, maxSockets: 256, maxFreeSockets: 32, timeout: 120_000 });
|
||||
|
||||
// ── Mutable config ────────────────────────────────────────────────────────────
|
||||
let _cfg = {
|
||||
endpoint: process.env.S3_ENDPOINT || '',
|
||||
|
|
@ -23,9 +35,17 @@ function buildClient(cfg) {
|
|||
secretAccessKey: cfg.secretKey,
|
||||
},
|
||||
forcePathStyle: true,
|
||||
// Hard request/connection timeouts so a stalled RustFS GET can't hang the
|
||||
// /video and /hls endpoints forever (the original browser-playback hang).
|
||||
requestHandler: new NodeHttpHandler({ requestTimeout: 30_000, connectionTimeout: 10_000 }),
|
||||
// Keep-alive agents (above) prevent socket starvation between media streams
|
||||
// and control-plane ops. requestTimeout is generous so the proxy worker's
|
||||
// full-master download (hundreds of MB) doesn't abort mid-transfer and leave
|
||||
// the asset stuck in 'processing'; connectionTimeout stays short so a dead
|
||||
// endpoint fails fast rather than hanging /video.
|
||||
requestHandler: new NodeHttpHandler({
|
||||
httpAgent: _s3HttpAgent,
|
||||
httpsAgent: _s3HttpsAgent,
|
||||
requestTimeout: 300_000,
|
||||
connectionTimeout: 10_000,
|
||||
}),
|
||||
requestChecksumCalculation: 'WHEN_REQUIRED',
|
||||
responseChecksumValidation: 'WHEN_REQUIRED',
|
||||
});
|
||||
|
|
|
|||
|
|
@ -1,8 +1,11 @@
|
|||
import { S3Client, GetObjectCommand, PutObjectCommand } from '@aws-sdk/client-s3';
|
||||
import { NodeHttpHandler } from '@smithy/node-http-handler';
|
||||
import { createReadStream, createWriteStream } from 'fs';
|
||||
import { readdir } from 'fs/promises';
|
||||
import { join, extname } from 'path';
|
||||
import { pipeline } from 'stream/promises';
|
||||
import http from 'node:http';
|
||||
import https from 'node:https';
|
||||
|
||||
const CONTENT_TYPES = {
|
||||
'.m3u8': 'application/vnd.apple.mpegurl',
|
||||
|
|
@ -10,6 +13,14 @@ const CONTENT_TYPES = {
|
|||
'.mp4': 'video/mp4',
|
||||
};
|
||||
|
||||
// Keep-alive agents + a long request timeout. The proxy/conform jobs download
|
||||
// full master files (hundreds of MB) and upload HLS segments; the SDK defaults
|
||||
// (a 50-socket pool and no request timeout) caused master downloads to
|
||||
// stall and abort, leaving assets stuck in 'processing'. Generous timeout +
|
||||
// pooled sockets make these large transfers reliable.
|
||||
const _httpAgent = new http.Agent({ keepAlive: true, maxSockets: 128, timeout: 600_000 });
|
||||
const _httpsAgent = new https.Agent({ keepAlive: true, maxSockets: 128, timeout: 600_000 });
|
||||
|
||||
const createS3Client = () => {
|
||||
return new S3Client({
|
||||
region: process.env.S3_REGION || 'us-east-1',
|
||||
|
|
@ -19,6 +30,12 @@ const createS3Client = () => {
|
|||
secretAccessKey: process.env.S3_SECRET_KEY,
|
||||
},
|
||||
forcePathStyle: true,
|
||||
requestHandler: new NodeHttpHandler({
|
||||
httpAgent: _httpAgent,
|
||||
httpsAgent: _httpsAgent,
|
||||
requestTimeout: 600_000,
|
||||
connectionTimeout: 15_000,
|
||||
}),
|
||||
});
|
||||
};
|
||||
|
||||
|
|
|
|||
Loading…
Reference in a new issue