fix(jobs): stall detection + manual kill button so 5h-stuck actives can't happen
A thumbnail job from earlier stayed 'active' for 6+ hours: the worker was restarted at 70% progress, BullMQ left the job in the active set, and there was no stall reaper because the worker was created with only the default options. The worker now passes stalledInterval: 30000, lockDuration: 60000, lockRenewTime: 15000, and maxStalledCount: 1 to the Worker constructor. If a run dies, BullMQ reclaims the job back to waiting within 30s and a 'stalled' event is logged. While a run is still alive, its lock is renewed mid-job (lockRenewTime), so a healthy job is not reclaimed. The Jobs UI gains a 'Kill' button on each row, next to Details. It calls the existing DELETE /api/v1/jobs/:id endpoint, which already removes the job from Redis. Use it on any row that looks stuck.
This commit is contained in:
parent
e441176961
commit
562881f0db
2 changed files with 27 additions and 5 deletions
|
|
@ -656,14 +656,25 @@ function renderRow(job) {
|
|||
</td>
|
||||
<td class="time-cell">${dur}</td>
|
||||
<td>
|
||||
<button class="btn btn-ghost" style="font-size:var(--text-xs);padding:4px 10px" onclick="openDetail('${escHtml(job.id)}')">
|
||||
Details
|
||||
</button>
|
||||
<button class="btn btn-ghost" style="font-size:var(--text-xs);padding:4px 10px" onclick="openDetail('${escHtml(job.id)}')">Details</button>
|
||||
<button class="btn btn-ghost" style="font-size:var(--text-xs);padding:4px 10px;color:var(--signal-bad)" onclick="killJob('${escHtml(job.id)}', event)" title="Remove this job from the queue">Kill</button>
|
||||
</td>`;
|
||||
|
||||
return tr;
|
||||
}
|
||||
|
||||
/**
 * Remove a job from the queue after asking the user to confirm.
 *
 * Sends `DELETE /api/v1/jobs/:id` with credentials included, then reports the
 * outcome through `toast` and refreshes the job list via `fetchJobs` on success.
 * Request failures are reported via toast rather than thrown.
 *
 * @param {string} jobId - ID of the job to remove; URL-encoded before use.
 * @param {Event} [ev] - Click event from the Kill button. When provided, its
 *   propagation is stopped so the click does not reach ancestor elements.
 * @returns {Promise<void>} Resolves once the request outcome has been handled.
 */
async function killJob(jobId, ev) {
  // Tolerate programmatic calls that do not pass a DOM event.
  if (ev && typeof ev.stopPropagation === 'function') {
    ev.stopPropagation();
  }

  if (!confirm('Remove this job from the queue? If a worker is still processing it, the run is abandoned.')) {
    return;
  }

  try {
    const r = await fetch(`/api/v1/jobs/${encodeURIComponent(jobId)}`, {
      method: 'DELETE',
      credentials: 'include',
    });

    if (r.ok) {
      toast('Job removed', 'success');
      fetchJobs();
      return;
    }

    // The error body may be missing, not JSON, or JSON `null`; none of those
    // should mask the real failure with a parsing/TypeError message.
    const d = await r.json().catch(() => ({}));
    // statusText is empty on HTTP/2 responses, so fall back to the numeric code.
    const reason = (d && d.error) || r.statusText || `HTTP ${r.status}`;
    toast(`Remove failed: ${reason}`, 'error');
  } catch (err) {
    toast(`Remove failed: ${err.message}`, 'error');
  }
}
|
||||
|
||||
function statusBadge(status) {
|
||||
const map = {
|
||||
active: '<span class="badge badge-recording">Active</span>',
|
||||
|
|
|
|||
|
|
@ -16,7 +16,15 @@ const parseRedisUrl = (url) => {
|
|||
const redisOptions = parseRedisUrl(process.env.REDIS_URL || 'redis://localhost:6379');
|
||||
|
||||
const createWorker = (queueName, handler) => {
|
||||
const worker = new Worker(queueName, handler, { connection: redisOptions });
|
||||
const worker = new Worker(queueName, handler, {
|
||||
connection: redisOptions,
|
||||
// Stall detection: if a worker dies mid-job, BullMQ moves it back to wait
|
||||
// after stalledInterval. Without this a crashed run sits in active forever.
|
||||
stalledInterval: 30000,
|
||||
maxStalledCount: 1,
|
||||
lockDuration: 60000,
|
||||
lockRenewTime: 15000,
|
||||
});
|
||||
|
||||
worker.on('completed', (job) => {
|
||||
console.log(`[${queueName}] Job ${job.id} completed`);
|
||||
|
|
@ -26,7 +34,10 @@ const createWorker = (queueName, handler) => {
|
|||
console.error(`[${queueName}] Job ${job.id} failed:`, err.message);
|
||||
});
|
||||
|
||||
// Applies to the 'progress' handler further down: job.progress is a property (the value set by updateProgress), not a function
|
||||
worker.on('stalled', (jobId) => {
|
||||
console.warn(`[${queueName}] Job ${jobId} stalled — reclaimed`);
|
||||
});
|
||||
|
||||
worker.on('progress', (job, progress) => {
|
||||
console.log(`[${queueName}] Job ${job.id} progress:`, progress);
|
||||
});
|
||||
|
|
|
|||
Loading…
Reference in a new issue