dragonflight/services/worker/src/index.js

116 lines
5 KiB
JavaScript
Raw Normal View History

2026-04-07 21:58:18 -04:00
import 'dotenv/config';
feat: server-side filmstrip worker + fix scheduler crash + fix clip freeze Root causes found: 1. Scheduler crashing every 15s: assets table has no error_message column. Fix: remove error_message from UPDATE in scheduler.js (#66 regression). 2. Clip freezing: client-side filmstrip seek loop runs on main thread, seeks same proxy the player is streaming → both stall → freeze. Fix: replace browser seek loop entirely with server-side FFmpeg worker. 3. No dedicated filmstrip worker: filmstrip was never pre-built server-side. Changes: - services/mam-api/src/db/migrations/018-add-filmstrip-s3-key.sql Add filmstrip_s3_key TEXT column to assets table - services/worker/src/workers/filmstrip.js (new) BullMQ worker: downloads proxy, runs FFmpeg fps filter to extract 28 evenly-spaced JPEG frames, base64-encodes them, uploads JSON array to S3 at filmstrips/<assetId>.json, stores key in DB - services/worker/src/workers/thumbnail.js Queue filmstrip job automatically after thumbnail completes - services/worker/src/index.js Register filmstrip worker (concurrency=2), export filmstripQueue singleton, close it on SIGTERM - services/mam-api/src/routes/assets.js - filmstripQueue added - POST /reprocess?type=filmstrip now supported - GET /:id/filmstrip returns signed S3 URL for JSON frames - services/mam-api/src/routes/jobs.js filmstrip queue visible in Jobs UI - services/web-ui/public/screens-asset.jsx Replace browser seek loop with fetch of /assets/:id/filmstrip → fetch S3 JSON → render frames. Zero browser-side video seeking. Right-click and Files tab re-generate via API endpoint.
2026-05-26 12:39:44 -04:00
import { Worker, Queue } from 'bullmq';
import { proxyWorker, thumbnailQueue as proxyThumbnailQueue } from './workers/proxy.js';
2026-04-07 21:58:18 -04:00
import { thumbnailWorker } from './workers/thumbnail.js';
feat: server-side filmstrip worker + fix scheduler crash + fix clip freeze Root causes found: 1. Scheduler crashing every 15s: assets table has no error_message column. Fix: remove error_message from UPDATE in scheduler.js (#66 regression). 2. Clip freezing: client-side filmstrip seek loop runs on main thread, seeks same proxy the player is streaming → both stall → freeze. Fix: replace browser seek loop entirely with server-side FFmpeg worker. 3. No dedicated filmstrip worker: filmstrip was never pre-built server-side. Changes: - services/mam-api/src/db/migrations/018-add-filmstrip-s3-key.sql Add filmstrip_s3_key TEXT column to assets table - services/worker/src/workers/filmstrip.js (new) BullMQ worker: downloads proxy, runs FFmpeg fps filter to extract 28 evenly-spaced JPEG frames, base64-encodes them, uploads JSON array to S3 at filmstrips/<assetId>.json, stores key in DB - services/worker/src/workers/thumbnail.js Queue filmstrip job automatically after thumbnail completes - services/worker/src/index.js Register filmstrip worker (concurrency=2), export filmstripQueue singleton, close it on SIGTERM - services/mam-api/src/routes/assets.js - filmstripQueue added - POST /reprocess?type=filmstrip now supported - GET /:id/filmstrip returns signed S3 URL for JSON frames - services/mam-api/src/routes/jobs.js filmstrip queue visible in Jobs UI - services/web-ui/public/screens-asset.jsx Replace browser seek loop with fetch of /assets/:id/filmstrip → fetch S3 JSON → render frames. Zero browser-side video seeking. Right-click and Files tab re-generate via API endpoint.
2026-05-26 12:39:44 -04:00
import { filmstripWorker } from './workers/filmstrip.js';
2026-04-07 21:58:18 -04:00
import { conformWorker } from './workers/conform.js';
import { youtubeImportWorker, proxyQueue as youtubeProxyQueue } from './workers/youtube-import.js';
import { trimWorker } from './workers/trimWorker.js';
import { startPromotionWorker } from './workers/promotion.js';
2026-04-07 21:58:18 -04:00
/**
 * Convert a Redis connection URL into the connection options object used
 * for every BullMQ Worker and Queue created in this file.
 *
 * @param {string} url - Redis URL, e.g. `redis://:secret@redis.internal:6380`.
 * @returns {{ host: string, port: number, password: (string|undefined) }}
 *   Host, numeric port (6379 when the URL omits it) and the decoded
 *   password, or `undefined` when the URL carries none.
 * @throws {TypeError} If `url` cannot be parsed by `new URL`.
 */
const parseRedisUrl = (url) => {
  const parsed = new URL(url);

  // `parsed.port` is '' when the URL has no explicit port, and
  // parseInt('', 10) is NaN — fall back to the standard Redis port.
  const port = parsed.port === '' ? 6379 : parseInt(parsed.port, 10);

  // URL keeps credentials percent-encoded (`p%40ss` for `p@ss`), so decode
  // before handing the password to the client. If the text is not valid
  // percent-encoding (e.g. a raw `%` in the password), keep it verbatim.
  let password;
  if (parsed.password) {
    try {
      password = decodeURIComponent(parsed.password);
    } catch (err) {
      password = parsed.password;
    }
  }

  return {
    host: parsed.hostname,
    port,
    password,
  };
};
// Shared Redis connection options for every Worker and Queue in this file.
// REDIS_URL wins when set to a non-empty value; otherwise a local default is used.
const DEFAULT_REDIS_URL = 'redis://localhost:6379';
const redisOptions = parseRedisUrl(
  process.env.REDIS_URL ? process.env.REDIS_URL : DEFAULT_REDIS_URL,
);
/**
 * Build a BullMQ Worker for `queueName` using the shared Redis connection,
 * the service-wide stall/lock defaults, and console logging for the
 * worker's lifecycle events.
 *
 * @param {string} queueName - Name of the queue the worker consumes.
 * @param {Function} handler - Job processor passed straight to `Worker`.
 * @param {object} [overrides={}] - Worker options merged over the defaults;
 *   keys given here win (e.g. `concurrency`, `lockDuration`).
 * @returns {Worker} The configured worker instance.
 */
const createWorker = (queueName, handler, overrides = {}) => {
  const worker = new Worker(queueName, handler, {
    connection: redisOptions,
    // Stall detection: if a worker dies mid-job, BullMQ moves it back to wait
    // after stalledInterval. Without this a crashed run sits in active forever.
    stalledInterval: 30000,
    maxStalledCount: 1,
    lockDuration: 60000,
    lockRenewTime: 15000,
    ...overrides,
  });

  worker.on('completed', (job) => {
    console.log(`[${queueName}] Job ${job.id} completed`);
  });

  worker.on('failed', (job, err) => {
    // BullMQ declares the `failed` event's job as possibly undefined, so
    // guard the dereference — a throwing log handler would mask the real
    // failure.
    const jobId = job ? job.id : 'unknown';
    const reason = err instanceof Error ? err.message : err;
    console.error(`[${queueName}] Job ${jobId} failed:`, reason);
  });

  worker.on('stalled', (jobId) => {
    console.warn(`[${queueName}] Job ${jobId} stalled — reclaimed`);
  });

  worker.on('progress', (job, progress) => {
    console.log(`[${queueName}] Job ${job.id} progress:`, progress);
  });

  // An EventEmitter with no 'error' listener throws when 'error' is emitted;
  // log worker-level errors (e.g. connection problems) instead.
  worker.on('error', (err) => {
    console.error(`[${queueName}] Worker error:`, err);
  });

  return worker;
};
fix(jobs): real cancel for active jobs + multi-threaded thumbnail worker DELETE /jobs/:id was throwing "404 not found" when the operator tried to cancel a running job. BullMQ refuses job.remove() while a job is in the active state; the route caught that error and fell through to the 404 branch, which was misleading because the job actually exists — the queue was just refusing to drop it from under the worker. Fix: - Detect 'active' state explicitly and call moveToFailed(err, '0', false) first. Token '0' bypasses the per-worker lock check (the operator-side cancel doesn't hold the worker lock). That transitions active -> failed and frees the queue's concurrency slot. - If moveToFailed itself fails (lock owned by a live worker), fall back to job.discard() so at least the result is thrown away. - If remove() then fails (stalled, broken state), drop the job's Redis key directly via queue.client. Last-resort obliteration. - Stop swallowing getJob() errors — if Redis is sad, surface it via next(err) instead of returning a misleading 404. - Return { cancelled: true } when the job was active, so the client can show "Cancelled" rather than "Removed" in any future toast. While here: thumbnail jobs now run with concurrency 4 by default (proxy 2, conform 1, import 1 unchanged). Every queue defaulted to concurrency 1 before, so a single stalled job blocked the entire queue. All three are overridable via PROXY_CONCURRENCY / THUMBNAIL_CONCURRENCY / CONFORM_CONCURRENCY env vars for nodes with more headroom. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-23 17:23:07 -04:00
// Per-queue concurrency. Defaults to 1, which serialises every job in a
// queue — meaning a single stalled job blocks every other one. We want
// thumbnails (cheap, parallel-safe) to run several at a time so a slow
// outlier doesn't back the rest of the catalog up. Proxy + conform are
// heavier (ffmpeg transcode) so we keep them lower to avoid trashing
// the box; tune via env if a node has more headroom.

/**
 * Read a per-queue concurrency setting from the environment.
 *
 * @param {string} envName - Environment variable to read.
 * @param {number} fallback - Value used when the variable is unset, empty,
 *   or does not parse to an integer >= 1.
 * @returns {number} A positive integer concurrency.
 */
const readConcurrency = (envName, fallback) => {
  const raw = process.env[envName];
  if (!raw) {
    return fallback;
  }
  const value = parseInt(raw, 10);
  if (Number.isInteger(value) && value >= 1) {
    return value;
  }
  // A typo such as PROXY_CONCURRENCY=two would otherwise reach the Worker
  // as NaN; warn with the variable name and keep the known-good default.
  console.warn(
    `[config] Ignoring invalid ${envName}=${JSON.stringify(raw)}; using default ${fallback}`,
  );
  return fallback;
};

const PROXY_CONCURRENCY = readConcurrency('PROXY_CONCURRENCY', 2);
const THUMBNAIL_CONCURRENCY = readConcurrency('THUMBNAIL_CONCURRENCY', 4);
const FILMSTRIP_CONCURRENCY = readConcurrency('FILMSTRIP_CONCURRENCY', 2);
const CONFORM_CONCURRENCY = readConcurrency('CONFORM_CONCURRENCY', 1);
const TRIM_CONCURRENCY = readConcurrency('TRIM_CONCURRENCY', 4);
fix(jobs): real cancel for active jobs + multi-threaded thumbnail worker DELETE /jobs/:id was throwing "404 not found" when the operator tried to cancel a running job. BullMQ refuses job.remove() while a job is in the active state; the route caught that error and fell through to the 404 branch, which was misleading because the job actually exists — the queue was just refusing to drop it from under the worker. Fix: - Detect 'active' state explicitly and call moveToFailed(err, '0', false) first. Token '0' bypasses the per-worker lock check (the operator-side cancel doesn't hold the worker lock). That transitions active -> failed and frees the queue's concurrency slot. - If moveToFailed itself fails (lock owned by a live worker), fall back to job.discard() so at least the result is thrown away. - If remove() then fails (stalled, broken state), drop the job's Redis key directly via queue.client. Last-resort obliteration. - Stop swallowing getJob() errors — if Redis is sad, surface it via next(err) instead of returning a misleading 404. - Return { cancelled: true } when the job was active, so the client can show "Cancelled" rather than "Removed" in any future toast. While here: thumbnail jobs now run with concurrency 4 by default (proxy 2, conform 1, import 1 unchanged). Every queue defaulted to concurrency 1 before, so a single stalled job blocked the entire queue. All three are overridable via PROXY_CONCURRENCY / THUMBNAIL_CONCURRENCY / CONFORM_CONCURRENCY env vars for nodes with more headroom. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-23 17:23:07 -04:00
2026-04-07 21:58:18 -04:00
// Every BullMQ worker this process runs, described as
// [queue name, job processor, option overrides] and created in this order.
const workerSpecs = [
  ['proxy', proxyWorker, { concurrency: PROXY_CONCURRENCY }],
  ['thumbnail', thumbnailWorker, { concurrency: THUMBNAIL_CONCURRENCY }],
  ['filmstrip', filmstripWorker, { concurrency: FILMSTRIP_CONCURRENCY }],
  ['conform', conformWorker, { concurrency: CONFORM_CONCURRENCY }],
  ['trim', trimWorker, { concurrency: TRIM_CONCURRENCY }],
  // The import worker runs one job at a time with a 10-minute lock that is
  // renewed every 60 s, overriding the shorter defaults in createWorker.
  [
    'import',
    youtubeImportWorker,
    {
      concurrency: 1,
      lockDuration: 10 * 60 * 1000,
      lockRenewTime: 60000,
    },
  ],
];

// Destructure explicitly so Array#map's index/array arguments never leak
// into createWorker's `overrides` parameter.
const workers = workerSpecs.map(
  ([queueName, processor, options]) => createWorker(queueName, processor, options),
);
feat: server-side filmstrip worker + fix scheduler crash + fix clip freeze Root causes found: 1. Scheduler crashing every 15s: assets table has no error_message column. Fix: remove error_message from UPDATE in scheduler.js (#66 regression). 2. Clip freezing: client-side filmstrip seek loop runs on main thread, seeks same proxy the player is streaming → both stall → freeze. Fix: replace browser seek loop entirely with server-side FFmpeg worker. 3. No dedicated filmstrip worker: filmstrip was never pre-built server-side. Changes: - services/mam-api/src/db/migrations/018-add-filmstrip-s3-key.sql Add filmstrip_s3_key TEXT column to assets table - services/worker/src/workers/filmstrip.js (new) BullMQ worker: downloads proxy, runs FFmpeg fps filter to extract 28 evenly-spaced JPEG frames, base64-encodes them, uploads JSON array to S3 at filmstrips/<assetId>.json, stores key in DB - services/worker/src/workers/thumbnail.js Queue filmstrip job automatically after thumbnail completes - services/worker/src/index.js Register filmstrip worker (concurrency=2), export filmstripQueue singleton, close it on SIGTERM - services/mam-api/src/routes/assets.js - filmstripQueue added - POST /reprocess?type=filmstrip now supported - GET /:id/filmstrip returns signed S3 URL for JSON frames - services/mam-api/src/routes/jobs.js filmstrip queue visible in Jobs UI - services/web-ui/public/screens-asset.jsx Replace browser seek loop with fetch of /assets/:id/filmstrip → fetch S3 JSON → render frames. Zero browser-side video seeking. Right-click and Files tab re-generate via API endpoint.
2026-05-26 12:39:44 -04:00
// Shared Queue handle for the 'filmstrip' queue, exported so other modules
// can enqueue filmstrip jobs (per the original note: the thumbnail worker).
// It is closed in the SIGTERM handler below.
export const filmstripQueue = new Queue('filmstrip', { connection: redisOptions });

// One-line summary of the effective per-queue concurrency at startup.
const concurrencySummary = [
  `proxy=${PROXY_CONCURRENCY}`,
  `thumbnail=${THUMBNAIL_CONCURRENCY}`,
  `filmstrip=${FILMSTRIP_CONCURRENCY}`,
  `conform=${CONFORM_CONCURRENCY}`,
  `trim=${TRIM_CONCURRENCY}`,
  'import=1',
].join(' ');
console.log(`Concurrency: ${concurrencySummary}`);
fix(jobs): real cancel for active jobs + multi-threaded thumbnail worker DELETE /jobs/:id was throwing "404 not found" when the operator tried to cancel a running job. BullMQ refuses job.remove() while a job is in the active state; the route caught that error and fell through to the 404 branch, which was misleading because the job actually exists — the queue was just refusing to drop it from under the worker. Fix: - Detect 'active' state explicitly and call moveToFailed(err, '0', false) first. Token '0' bypasses the per-worker lock check (the operator-side cancel doesn't hold the worker lock). That transitions active -> failed and frees the queue's concurrency slot. - If moveToFailed itself fails (lock owned by a live worker), fall back to job.discard() so at least the result is thrown away. - If remove() then fails (stalled, broken state), drop the job's Redis key directly via queue.client. Last-resort obliteration. - Stop swallowing getJob() errors — if Redis is sad, surface it via next(err) instead of returning a misleading 404. - Return { cancelled: true } when the job was active, so the client can show "Cancelled" rather than "Removed" in any future toast. While here: thumbnail jobs now run with concurrency 4 by default (proxy 2, conform 1, import 1 unchanged). Every queue defaulted to concurrency 1 before, so a single stalled job blocked the entire queue. All three are overridable via PROXY_CONCURRENCY / THUMBNAIL_CONCURRENCY / CONFORM_CONCURRENCY env vars for nodes with more headroom. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-23 17:23:07 -04:00
// Start the promotion worker and keep the value it returns.
// BUG FIX #4: startPromotionWorker() returns a shutdown function that
// clears the poll intervals and closes the promotion proxyQueue singleton.
// The SIGTERM handler below calls it as part of shutdown.
const stopPromotionWorker = startPromotionWorker();
2026-04-07 21:58:18 -04:00
// Startup banner: service name, Redis endpoint, and the queues/scans this
// process serves. The queue list mirrors the entries of `workers` —
// 'filmstrip' was previously missing from it.
console.log('Wild Dragon Worker Service started');
console.log(`Redis: ${redisOptions.host}:${redisOptions.port}`);
console.log('Active queues: proxy, thumbnail, filmstrip, conform, trim, import');
console.log('Background scans: promotion (growing-files → S3)');
2026-04-07 21:58:18 -04:00
// Graceful shutdown on SIGTERM.
//
// Workers process jobs; Queues dispatch them. Both hold open Redis
// connections, so every Worker and every Queue singleton is closed, and the
// promotion worker's shutdown function is invoked, before the process exits.
//
// All steps run to completion even if one fails (Promise.allSettled), each
// failure is logged, and process.exit() is always reached: a single failing
// close() can no longer turn shutdown into an unhandled rejection.
// Exit code is 0 on a clean shutdown and 1 if any step failed.
process.on('SIGTERM', async () => {
  console.log('SIGTERM received, shutting down...');
  let exitCode = 0;

  try {
    // Queue singletons are closed best-effort: a failure is logged rather
    // than silently dropped, but does not count as a failed shutdown.
    const closeQueue = (label, queue) =>
      queue.close().catch((err) => {
        console.warn(
          `Failed to close ${label} queue:`,
          err instanceof Error ? err.message : err,
        );
      });

    const outcomes = await Promise.allSettled([
      // Close all Worker instances (stops accepting new jobs, waits for active)
      ...workers.map((w) => w.close()),
      // proxyThumbnailQueue: thumbnailQueue in proxy.js (dispatches thumbnail jobs)
      closeQueue('thumbnail (proxy.js)', proxyThumbnailQueue),
      // youtubeProxyQueue: proxyQueue in youtube-import.js (dispatches proxy jobs)
      closeQueue('proxy (youtube-import.js)', youtubeProxyQueue),
      closeQueue('filmstrip', filmstripQueue),
      // Stop the promotion worker. The async wrapper turns a synchronous
      // throw into a rejection so it is reported like any other failure.
      (async () => stopPromotionWorker())(),
    ]);

    const failures = outcomes.filter((outcome) => outcome.status === 'rejected');
    for (const failure of failures) {
      console.error('Shutdown step failed:', failure.reason);
    }

    if (failures.length > 0) {
      exitCode = 1;
    } else {
      console.log('All workers and queues closed');
    }
  } catch (err) {
    // Synchronous failure while starting the shutdown steps.
    console.error('Shutdown failed:', err);
    exitCode = 1;
  } finally {
    process.exit(exitCode);
  }
});