dragonflight/services/mam-api/src/routes/jobs.js

340 lines
12 KiB
JavaScript
Raw Normal View History

import express from 'express';
import pool from '../db/pool.js';
import { Queue } from 'bullmq';
// Express router carrying every job endpoint declared in this file; it is the
// module's default export.
const router = express.Router();
// Note: job ids use the BullMQ-derived format "<queueType>:<bullId>"
// (e.g. "conform:42"), NOT UUIDs. The GET /:id, POST /:id/retry, and
// DELETE /:id handlers below split on the colon themselves and look up the
// queue. Adding a UUID validator here would 400 every BullMQ poll the panel
// makes (which is exactly what caused Export Timeline to stall on
// "Rendering Hi-Res" forever — fixed 2026-05-28).
// ── Redis connection ──────────────────────────────────────────────────────────
/**
 * Turn a Redis connection URL into the plain options object that is handed to
 * BullMQ as `connection`.
 *
 * Host and port are always present. Credentials, the database index and TLS
 * are carried over only when the URL actually specifies them, so a bare
 * `redis://host:port` URL still yields exactly `{ host, port }`.
 *
 * @param {string} url - e.g. `redis://queue:6379` or
 *   `rediss://user:secret@cache:6380/2`.
 * @returns {{host: string, port: number, username?: string, password?: string,
 *   db?: number, tls?: object}} Connection options. Falls back to
 *   `{ host: 'localhost', port: 6379 }` when `url` cannot be parsed.
 */
const parseRedisUrl = (url) => {
  // URL keeps userinfo percent-encoded; decode it, but never let a malformed
  // escape sequence discard an otherwise valid URL.
  const decode = (value) => {
    try {
      return decodeURIComponent(value);
    } catch {
      return value;
    }
  };

  let parsed;
  try {
    parsed = new URL(url);
  } catch {
    return { host: 'localhost', port: 6379 };
  }

  const conn = {
    host: parsed.hostname,
    // An absent port parses to NaN, which falls through to the Redis default.
    port: Number.parseInt(parsed.port, 10) || 6379,
  };
  if (parsed.username) conn.username = decode(parsed.username);
  if (parsed.password) conn.password = decode(parsed.password);

  // A purely numeric path segment ("/2") selects the logical database.
  const dbMatch = /^\/(\d+)$/.exec(parsed.pathname);
  if (dbMatch) conn.db = Number.parseInt(dbMatch[1], 10);

  // The "rediss" scheme requests TLS; an empty options object enables it with
  // default settings in ioredis-style clients.
  if (parsed.protocol === 'rediss:') conn.tls = {};

  return conn;
};
const redisConn = parseRedisUrl(process.env.REDIS_URL || 'redis://queue:6379');
feat: server-side filmstrip worker + fix scheduler crash + fix clip freeze Root causes found: 1. Scheduler crashing every 15s: assets table has no error_message column. Fix: remove error_message from UPDATE in scheduler.js (#66 regression). 2. Clip freezing: client-side filmstrip seek loop runs on main thread, seeks same proxy the player is streaming → both stall → freeze. Fix: replace browser seek loop entirely with server-side FFmpeg worker. 3. No dedicated filmstrip worker: filmstrip was never pre-built server-side. Changes: - services/mam-api/src/db/migrations/018-add-filmstrip-s3-key.sql Add filmstrip_s3_key TEXT column to assets table - services/worker/src/workers/filmstrip.js (new) BullMQ worker: downloads proxy, runs FFmpeg fps filter to extract 28 evenly-spaced JPEG frames, base64-encodes them, uploads JSON array to S3 at filmstrips/<assetId>.json, stores key in DB - services/worker/src/workers/thumbnail.js Queue filmstrip job automatically after thumbnail completes - services/worker/src/index.js Register filmstrip worker (concurrency=2), export filmstripQueue singleton, close it on SIGTERM - services/mam-api/src/routes/assets.js - filmstripQueue added - POST /reprocess?type=filmstrip now supported - GET /:id/filmstrip returns signed S3 URL for JSON frames - services/mam-api/src/routes/jobs.js filmstrip queue visible in Jobs UI - services/web-ui/public/screens-asset.jsx Replace browser seek loop with fetch of /assets/:id/filmstrip → fetch S3 JSON → render frames. Zero browser-side video seeking. Right-click and Files tab re-generate via API endpoint.
2026-05-26 12:39:44 -04:00
// One BullMQ queue handle per queue name. Each handle gets its own options
// object wrapping the shared Redis connection settings.
const openQueue = (queueName) => new Queue(queueName, { connection: redisConn });

const proxyQueue = openQueue('proxy');
const thumbnailQueue = openQueue('thumbnail');
const filmstripQueue = openQueue('filmstrip');
const conformQueue = openQueue('conform');
const importQueue = openQueue('import');
const trimQueue = openQueue('trim');
// Registry pairing each queue handle with the type label used as the prefix
// of public job ids ("<type>:<bullId>"). The list and per-id handlers iterate
// it in this order.
const QUEUES = [
  { queue: proxyQueue, type: 'proxy' },
  { queue: thumbnailQueue, type: 'thumbnail' },
  { queue: filmstripQueue, type: 'filmstrip' },
  { queue: conformQueue, type: 'conform' },
  { queue: importQueue, type: 'import' },
  { queue: trimQueue, type: 'trim' },
];
// BullMQ state → API status mapping.
// Four states are exposed to API clients under their own name; "delayed" and
// "paused" are both reported as "waiting".
const PASS_THROUGH_STATES = ['waiting', 'active', 'completed', 'failed'];
const REPORTED_AS_WAITING = ['delayed', 'paused'];
const STATE_MAP = Object.fromEntries([
  ...PASS_THROUGH_STATES.map((state) => [state, state]),
  ...REPORTED_AS_WAITING.map((state) => [state, 'waiting']),
]);

// State buckets, in the order they are fetched for the bulk listing. Reading
// whole buckets avoids an N+1 getState() call per job.
const STATE_BUCKETS = [
  'active',
  'waiting',
  'completed',
  'failed',
  'delayed',
  'paused',
];
/**
 * Flatten a BullMQ job into the JSON shape the jobs API returns.
 *
 * @param {object} bullJob - Job object; the fields read are `id`, `progress`,
 *   `data`, `timestamp`, `processedOn`, `finishedOn` and `failedReason`.
 * @param {string} type - Queue type label, used as the public id prefix.
 * @param {string} apiStatus - Status string already mapped for the API.
 * @returns {object} Plain job record with snake_case keys.
 */
function normalizeJob(bullJob, type, apiStatus) {
  // Epoch milliseconds → ISO-8601 string; a missing/zero stamp becomes null.
  const isoOrNull = (ms) => (ms ? new Date(ms).toISOString() : null);
  const payload = bullJob.data;

  return {
    id: `${type}:${bullJob.id}`,
    type,
    status: apiStatus,
    // Non-numeric progress values are reported as 0.
    progress: typeof bullJob.progress === 'number' ? bullJob.progress : 0,
    asset_id: payload?.assetId || null,
    asset_name: payload?.assetName || null,
    created_at: isoOrNull(bullJob.timestamp),
    started_at: isoOrNull(bullJob.processedOn),
    // finishedOn is attributed to whichever terminal status the job has.
    completed_at: apiStatus === 'completed' ? isoOrNull(bullJob.finishedOn) : null,
    failed_at: apiStatus === 'failed' ? isoOrNull(bullJob.finishedOn) : null,
    error: bullJob.failedReason || null,
    metadata: payload || {},
  };
}
/**
 * Fetch jobs from every registered queue, one state bucket at a time, and
 * normalize them for the API (no per-job getState() calls).
 *
 * Failures are isolated as narrowly as possible: a queue/bucket that cannot be
 * read is skipped, and a job that cannot be normalized is skipped on its own,
 * so one bad entry no longer hides the jobs listed after it in its bucket.
 *
 * @returns {Promise<object[]>} Normalized jobs, in queue order then bucket order.
 */
async function getAllBullMQJobs() {
  const results = [];
  for (const { queue, type } of QUEUES) {
    for (const bucket of STATE_BUCKETS) {
      let jobs;
      try {
        jobs = await queue.getJobs([bucket], 0, 200);
      } catch {
        // queue or bucket unavailable — skip
        continue;
      }
      const apiStatus = STATE_MAP[bucket] || bucket;
      for (const job of jobs ?? []) {
        // The result can contain empty slots, e.g. when a job is removed
        // between the id listing and the per-job read.
        // NOTE(review): confirm against the installed BullMQ version.
        if (!job) continue;
        try {
          results.push(normalizeJob(job, type, apiStatus));
        } catch {
          // malformed job record — drop just this one
        }
      }
    }
  }
  return results;
}
/**
 * Mutate `jobs` in place, filling in `asset_name` from the assets table for
 * every job that has an `asset_id` but no inline name in its payload. One bulk
 * SQL query per call, so enqueue sites don't have to remember to pass
 * assetName themselves.
 *
 * Only well-formed UUID strings are sent to the database. Previously a single
 * malformed id (e.g. from a stale BullMQ payload) made the `::uuid[]` cast
 * fail and left every job in the list without a name.
 *
 * @param {object[]} jobs - Normalized job records; modified in place.
 * @returns {Promise<void>}
 */
async function attachAssetNames(jobs) {
  const UUID_RE = /^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$/i;
  const needsName = (job) => Boolean(job.asset_id) && !job.asset_name;
  const isUuid = (value) => typeof value === 'string' && UUID_RE.test(value);

  const idsNeedingLookup = [...new Set(
    jobs.filter(needsName).map((job) => job.asset_id).filter(isUuid)
  )];
  if (idsNeedingLookup.length === 0) return;

  let rows = [];
  try {
    const result = await pool.query(
      'SELECT id, display_name, filename FROM assets WHERE id = ANY($1::uuid[])',
      [idsNeedingLookup]
    );
    rows = result.rows;
  } catch {
    // If the lookup fails (e.g. DB down), keep serving jobs without names
    // rather than 500-ing the whole list.
    return;
  }

  const byId = new Map(rows.map((row) => [row.id, row.display_name || row.filename]));
  for (const job of jobs) {
    if (!needsName(job)) continue;
    // Try the id as given, then lower-cased, in case the payload stored an
    // upper-case UUID while the database returns the lower-case form.
    const name = byId.get(job.asset_id) ?? byId.get(String(job.asset_id).toLowerCase());
    if (name) job.asset_name = name;
  }
}
// ── GET /events  Server-Sent Events stream of live job updates ───────────────
// Sends the BullMQ job list as an SSE "data:" frame immediately, then again
// 2 seconds after each previous push finishes, until the client disconnects.
router.get('/events', async (req, res) => {
  const sseHeaders = [
    ['Content-Type', 'text/event-stream'],
    ['Cache-Control', 'no-cache'],
    ['Connection', 'keep-alive'],
    // Asks a buffering reverse proxy (nginx honours this header) not to hold
    // frames back.
    ['X-Accel-Buffering', 'no'],
  ];
  for (const [name, value] of sseHeaders) {
    res.setHeader(name, value);
  }
  res.flushHeaders();

  let clientGone = false;
  req.on('close', () => {
    clientGone = true;
  });

  // Serialize one payload as a single SSE frame.
  const frame = (payload) => `data: ${JSON.stringify(payload)}\n\n`;

  async function tick() {
    if (clientGone) return;
    try {
      const jobs = await getAllBullMQJobs();
      await attachAssetNames(jobs);
      // The client may have left while the awaits above were pending.
      if (!clientGone) res.write(frame({ type: 'jobs', jobs }));
    } catch (err) {
      if (!clientGone) res.write(frame({ type: 'error', message: err.message }));
    }
    // Re-arm only while someone is still listening.
    if (!clientGone) setTimeout(tick, 2000);
  }

  await tick();
});
fix: close all 24 open issues (#40–#94) Bug fixes: - #91: dockerApi() 10s socket timeout (Docker daemon hang) - #77: await syncToAmpp() with .catch() — no longer fire-and-forget - #75: migration 016 — add 'proxy','import' to job_type enum; add 'completed' to job_status - #73: BullMQ orphan job cleanup on hard asset delete - #70: batch-trim jobs table gets expires_at; trim-status auto-expires stale rows - #66: scheduler tick marks stale live assets (>2h) as error - #63: migration 017 — partial unique index prevents concurrent live asset overwrite - #61: recorders.js uses getS3Bucket() not stale process.env.S3_BUCKET - #60: already fixed (copy nulls proxy/thumbnail keys, requeues proxy) - #40: already fixed (All projects clears openProject) - #64: already fixed (sourceType/needsProxy handled) - #90: GET /jobs now includes DB jobs table (trim jobs visible in UI) - #74: nginx Content-Type header preserved; multer 500MB file size limit - #68: GET /upload returns in-progress ingesting assets - #58: /stream and /video endpoints fall back to original file for all video types - #55: recorder poll .catch() logs auth errors cleanly; redirect stops interval - #52: thumb-status and thumb-duration moved inside position:relative wrapper - #50: ProjectCard gets onContextMenu handler with rename/delete menu - #49: project context menu dismisses on contextmenu + scroll events Features: - #93: POST /assets/:id/reprocess?type=proxy|thumbnail — force re-queue any asset Asset ⋯ menu now shows 'Re-generate proxy' and 'Re-generate thumbnail' buttons UI: - Logo: brightness(0) invert(1) filter applied consistently in sidebar, launcher, and login — white logo pops on dark UI; inline style removed from login.html
2026-05-26 10:10:44 -04:00
/**
 * Load DB-tracked jobs (e.g. trim) and reshape them like normalized BullMQ
 * jobs. Only non-expired rows are selected. Any failure — query or mapping —
 * yields an empty list instead of an error.
 *
 * @returns {Promise<object[]>} One record per job id, in query order.
 */
async function getDbJobs() {
  const sql = [
    'SELECT j.id, j.type, j.status, j.payload, j.created_at, j.updated_at,',
    'ts.asset_id',
    'FROM jobs j',
    'LEFT JOIN temp_segments ts ON ts.job_id = j.id',
    'WHERE (j.expires_at IS NULL OR j.expires_at > NOW())',
    'ORDER BY j.created_at DESC',
    'LIMIT 200',
  ].join('\n');
  const toIso = (value) => new Date(value).toISOString();

  try {
    const { rows } = await pool.query(sql);

    // The join produces one row per temp_segment; keep only the first row
    // seen for each job id (and therefore that row's asset_id).
    const firstRowByJob = new Map();
    for (const row of rows) {
      if (firstRowByJob.has(row.id)) continue;

      const { status } = row;
      const completed = status === 'completed';
      const failed = status === 'failed';

      // The table has no progress column in this query: finished → 100,
      // failed → 0, anything else is shown as half-way.
      let progress = 50;
      if (completed) {
        progress = 100;
      } else if (failed) {
        progress = 0;
      }

      firstRowByJob.set(row.id, {
        id: `trim:${row.id}`,
        type: row.type,
        status,
        progress,
        asset_id: row.asset_id || null,
        asset_name: null,
        created_at: row.created_at ? toIso(row.created_at) : null,
        started_at: null,
        // updated_at stands in for the finish time of terminal rows.
        completed_at: completed && row.updated_at ? toIso(row.updated_at) : null,
        failed_at: failed && row.updated_at ? toIso(row.updated_at) : null,
        error: null,
        metadata: row.payload || {},
      });
    }
    return Array.from(firstRowByJob.values());
  } catch {
    return [];
  }
}
// ── GET / - List jobs (BullMQ queues + DB trim jobs) ─────────────────────────
// Optional query filters: type, status, asset_id (exact match on the
// normalized fields). The response is a JSON array, newest created_at first.
router.get('/', async (req, res, next) => {
  try {
    const { type, status, asset_id } = req.query;

    // Redis-backed and DB-backed jobs are independent sources; read them
    // concurrently.
    const [bullJobs, dbJobs] = await Promise.all([getAllBullMQJobs(), getDbJobs()]);
    let jobs = [...bullJobs, ...dbJobs];

    // None of the filters depend on asset_name, so filter first and only look
    // up names for jobs that will actually be returned.
    if (type) jobs = jobs.filter((j) => j.type === type);
    if (status) jobs = jobs.filter((j) => j.status === status);
    if (asset_id) jobs = jobs.filter((j) => j.asset_id === asset_id);
    await attachAssetNames(jobs);

    // Jobs without a created_at sort as the epoch, i.e. last.
    jobs.sort((a, b) => new Date(b.created_at || 0) - new Date(a.created_at || 0));
    res.json(jobs);
  } catch (err) {
    next(err);
  }
});
// ── GET /:id - Single job ─────────────────────────────────────────────────────
// `id` is "<queueType>:<bullId>"; an id without a colon is searched in every
// queue. A failed queue lookup is remembered and reported as an error when no
// queue has the job, instead of being masked by a misleading 404 (the same
// policy the DELETE handler follows).
router.get('/:id', async (req, res, next) => {
  try {
    const { id } = req.params;
    const colonIdx = id.indexOf(':');
    const qType = colonIdx > -1 ? id.slice(0, colonIdx) : null;
    const bullId = colonIdx > -1 ? id.slice(colonIdx + 1) : id;

    let lookupErr = null;
    for (const { queue, type } of QUEUES) {
      if (qType && type !== qType) continue;

      let job;
      try {
        job = await queue.getJob(bullId);
      } catch (err) {
        // Keep trying the remaining queues, but don't forget the failure.
        lookupErr = err;
        continue;
      }
      if (!job) continue;

      const state = await job.getState();
      // States without a mapping are passed through unchanged.
      const apiStatus = STATE_MAP[state] || state;
      const normalized = normalizeJob(job, type, apiStatus);
      await attachAssetNames([normalized]);
      return res.json(normalized);
    }

    if (lookupErr) return next(lookupErr);
    res.status(404).json({ error: 'Job not found' });
  } catch (err) {
    next(err);
  }
});
// ── POST /:id/retry - Retry a failed job ──────────────────────────────────────
// `id` is "<queueType>:<bullId>"; an id without a colon is searched in every
// queue. Responses:
//   200 { id, status: 'queued' }  job was failed and has been re-queued
//   404                           no queue has the job
//   409                           job exists but is not in the failed state
// Queue lookup and retry errors are forwarded to the error handler rather
// than being reported as "Job not found".
router.post('/:id/retry', async (req, res, next) => {
  try {
    const { id } = req.params;
    const colonIdx = id.indexOf(':');
    const qType = colonIdx > -1 ? id.slice(0, colonIdx) : null;
    const bullId = colonIdx > -1 ? id.slice(colonIdx + 1) : id;

    let lookupErr = null;
    for (const { queue, type } of QUEUES) {
      if (qType && type !== qType) continue;

      let job;
      try {
        job = await queue.getJob(bullId);
      } catch (err) {
        // Keep trying the remaining queues, but don't forget the failure.
        lookupErr = err;
        continue;
      }
      if (!job) continue;

      // retry() is only meaningful for failed jobs; say so explicitly instead
      // of letting the rejection masquerade as a missing job.
      const state = await job.getState();
      if (state !== 'failed') {
        return res.status(409).json({
          error: `Job is ${state}; only failed jobs can be retried`,
        });
      }

      await job.retry();
      return res.json({ id, status: 'queued' });
    }

    if (lookupErr) return next(lookupErr);
    res.status(404).json({ error: 'Job not found' });
  } catch (err) {
    next(err);
  }
});
fix(jobs): real cancel for active jobs + multi-threaded thumbnail worker DELETE /jobs/:id was throwing "404 not found" when the operator tried to cancel a running job. BullMQ refuses job.remove() while a job is in the active state; the route caught that error and fell through to the 404 branch, which was misleading because the job actually exists — the queue was just refusing to drop it from under the worker. Fix: - Detect 'active' state explicitly and call moveToFailed(err, '0', false) first. Token '0' bypasses the per-worker lock check (the operator-side cancel doesn't hold the worker lock). That transitions active -> failed and frees the queue's concurrency slot. - If moveToFailed itself fails (lock owned by a live worker), fall back to job.discard() so at least the result is thrown away. - If remove() then fails (stalled, broken state), drop the job's Redis key directly via queue.client. Last-resort obliteration. - Stop swallowing getJob() errors — if Redis is sad, surface it via next(err) instead of returning a misleading 404. - Return { cancelled: true } when the job was active, so the client can show "Cancelled" rather than "Removed" in any future toast. While here: thumbnail jobs now run with concurrency 4 by default (proxy 2, conform 1, import 1 unchanged). Every queue defaulted to concurrency 1 before, so a single stalled job blocked the entire queue. All three are overridable via PROXY_CONCURRENCY / THUMBNAIL_CONCURRENCY / CONFORM_CONCURRENCY env vars for nodes with more headroom. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-23 17:23:07 -04:00
// ── DELETE /:id - Remove a job (also handles cancel for active jobs) ─────────
// BullMQ refuses job.remove() while a job is in the 'active' state. Before this
// fix the route caught that error and fell through to a misleading 404, so
// operators couldn't kill a stalled-active job from the UI. Now we detect the
// active state explicitly: moveToFailed with the magic '0' token bypasses the
// per-worker lock check and transitions active → failed (freeing the queue's
// concurrency slot), then remove() drops the row.
//
// Responses:
//   200 { success: true, cancelled }  cancelled is true when the job was active
//   404                               no queue has the job
// A failed queue lookup is forwarded to the error handler instead of being
// reported as 404.
router.delete('/:id', async (req, res, next) => {
  try {
    const { id } = req.params;
    const colonIdx = id.indexOf(':');
    const qType = colonIdx > -1 ? id.slice(0, colonIdx) : null;
    const bullId = colonIdx > -1 ? id.slice(colonIdx + 1) : id;

    let lastErr = null;
    for (const { queue, type } of QUEUES) {
      if (qType && type !== qType) continue;

      let job;
      try {
        job = await queue.getJob(bullId);
      } catch (err) {
        // Queue-level lookup error: remember it so we don't mask it with 404.
        lastErr = err;
        continue;
      }
      if (!job) continue;

      const state = await job.getState();
      if (state === 'active') {
        // Token '0' tells BullMQ to skip the worker-lock check — necessary
        // because the operator-side cancel doesn't hold the worker's lock.
        // NOTE(review): confirm this still holds for the installed BullMQ version.
        try {
          await job.moveToFailed(new Error('Cancelled by operator'), '0', false);
        } catch {
          // Lock owned by a still-living worker; fall back to discard + remove
          // so at least the result is thrown away and the row is gone.
          try {
            await job.discard();
          } catch {
            // Best effort only — the remove() below is the real cleanup.
          }
        }
      }

      try {
        await job.remove();
      } catch {
        // Last-resort obliteration of the job row via raw Redis. This is
        // the path stalled jobs hit when moveToFailed couldn't transition
        // them either. Only the key returned by queue.toKey(bullId) is deleted.
        const client = await queue.client;
        const jobKey = queue.toKey(bullId);
        await client.del(jobKey);
      }

      return res.json({ success: true, cancelled: state === 'active' });
    }

    if (lastErr) return next(lastErr);
    res.status(404).json({ error: 'Job not found' });
  } catch (err) {
    next(err);
  }
});
// ── POST /conform - Submit a conform (EDL export) job ────────────────────────
// Body: { edl, project_id, output_format } — all three are required.
// Replies 202 with the public job id ("conform:<bullId>") once the job has
// been added to the conform queue, or 400 when a field is missing.
router.post('/conform', async (req, res, next) => {
  try {
    const {
      edl,
      project_id: projectId,
      output_format: outputFormat,
    } = req.body;

    const hasAllFields = Boolean(edl && projectId && outputFormat);
    if (!hasAllFields) {
      const problem = { error: 'edl, project_id, and output_format are required' };
      return res.status(400).json(problem);
    }

    // The queue payload uses camelCase keys; the HTTP body uses snake_case.
    const payload = { edl, projectId, outputFormat };
    const { id: bullId } = await conformQueue.add('conform-task', payload);

    res.status(202).json({ id: `conform:${bullId}`, status: 'queued' });
  } catch (err) {
    next(err);
  }
});
export default router;