/**
 * instance-lock.ts — single-writer guard for worker mode.
 *
 * Multiple maestro worker processes pointed at the same SQLite DB each run
 * their own WorkerManager + Scheduler. They share the `jobs` table and the
 * `config.yaml` file, so they fight over job claims, issue locks, and config
 * reloads — which in production caused jobs to be picked up and executed
 * twice. (An orphaned second process on a different port survived a restart
 * because nothing stopped it.)
 *
 * This lock makes a second worker process on the same host refuse to start
 * instead of silently double-scheduling. It is intentionally lightweight: a
 * JSON lock file next to the DB holding the owner pid + host. Liveness is
 * checked with `process.kill(pid, 0)` (same host only). A wedged lock can be
 * overridden with MAESTRO_FORCE_WORKER_LOCK=1.
 *
 * The lock file is created with the exclusive-create flag (`wx`), so two
 * workers starting at the same moment cannot both acquire a missing lock.
 */

import { readFileSync, writeFileSync, existsSync, unlinkSync } from 'fs';
import { hostname } from 'os';
import { logger } from './logger.js';

export interface WorkerLock {
  /** Remove the lock file if we still own it. Idempotent. */
  release(): void;
  /** Absolute/relative path of the lock file (for diagnostics + tests). */
  readonly path: string;
}

/** On-disk shape of the lock file (serialized as JSON). */
interface LockRecord {
  pid: number;
  host: string;
  startedAt: string;
}

/** Extract the errno-style `code` from a caught value, if it has one. */
function errnoCode(err: unknown): string | undefined {
  return (err as NodeJS.ErrnoException | null | undefined)?.code;
}

/**
 * Report whether a process with the given pid exists on this host.
 *
 * Uses signal 0, which performs the existence/permission check without
 * delivering a signal.
 */
function isAlive(pid: number): boolean {
  // pid 0 and negative pids address process groups rather than one process,
  // so a corrupt lock record must never be probed with them.
  if (!Number.isInteger(pid) || pid <= 0) return false;
  try {
    process.kill(pid, 0);
    return true;
  } catch (e) {
    // ESRCH → no such process (dead). EPERM → exists but not signalable by us (alive).
    return errnoCode(e) === 'EPERM';
  }
}

/**
 * Read and validate the lock file at `lockPath`.
 *
 * @throws if the file cannot be read, is not valid JSON, or does not carry a
 *   positive integer `pid` and a string `host`. A missing/invalid `startedAt`
 *   is tolerated because it is only used for diagnostics.
 */
function readLockRecord(lockPath: string): LockRecord {
  const parsed: unknown = JSON.parse(readFileSync(lockPath, 'utf-8'));
  if (typeof parsed !== 'object' || parsed === null) {
    throw new Error('lock file does not contain a JSON object');
  }
  const { pid, host, startedAt } = parsed as Record<string, unknown>;
  if (typeof pid !== 'number' || !Number.isInteger(pid) || pid <= 0) {
    throw new Error('lock file has a missing or invalid pid');
  }
  if (typeof host !== 'string') {
    throw new Error('lock file has a missing or invalid host');
  }
  return { pid, host, startedAt: typeof startedAt === 'string' ? startedAt : 'unknown' };
}

/**
 * Inspect an existing lock file and decide whether it may be taken over.
 *
 * Returns normally when the lock is stale, unreadable, already gone, or owned
 * by another host (liveness cannot be verified across hosts).
 *
 * @throws Error if a different, live process on this host owns the lock.
 */
function assertLockReclaimable(lockPath: string, host: string): void {
  let prev: LockRecord;
  try {
    prev = readLockRecord(lockPath);
  } catch (err) {
    // The owner may have released the lock between our create attempt and
    // this read; that is not worth a warning.
    if (errnoCode(err) !== 'ENOENT') {
      logger.warn(`[instance-lock] unreadable lock file ${lockPath}, reclaiming: ${err}`);
    }
    return;
  }

  if (prev.host !== host) {
    logger.warn(
      `[instance-lock] lock held by a different host (${prev.host}, pid=${prev.pid}); ` +
        `cannot verify liveness. Proceeding, but multiple worker hosts against one ` +
        `SQLite DB is unsupported and unsafe.`,
    );
    return;
  }

  if (prev.pid !== process.pid && isAlive(prev.pid)) {
    throw new Error(
      `Another maestro worker process is already running ` +
        `(pid=${prev.pid}, host=${host}, since ${prev.startedAt}). ` +
        `Lock file: ${lockPath}. Stop that process first, or set ` +
        `MAESTRO_FORCE_WORKER_LOCK=1 to override a wedged lock.`,
    );
  }

  logger.warn(`[instance-lock] reclaiming stale lock from dead pid=${prev.pid}`);
}

/**
 * Acquire the worker lock for `dbPath`. Throws if another live worker process
 * on this host already holds it. Returns a handle whose `release()` removes the
 * lock — wire it into the shutdown sequence.
 *
 * The lock file lives at `${dbPath}.worker.lock`. It is first created with the
 * exclusive `wx` flag so that concurrent fresh starts cannot both succeed. If
 * the file already exists it is inspected and, when reclaimable, overwritten.
 * Reclaiming a stale lock is still a read-then-write sequence, so two
 * processes reclaiming the same stale lock at the same instant can race; only
 * the fresh-start path is atomic.
 *
 * Setting MAESTRO_FORCE_WORKER_LOCK=1 skips every check and overwrites.
 *
 * @param dbPath Path of the SQLite database the worker operates on.
 * @returns Handle exposing the lock path and an idempotent `release()`.
 * @throws Error if a different, live process on this host owns the lock, or
 *   if the lock file cannot be written.
 */
export function acquireWorkerLock(dbPath: string): WorkerLock {
  const lockPath = `${dbPath}.worker.lock`;
  const host = hostname();
  const force = process.env['MAESTRO_FORCE_WORKER_LOCK'] === '1';
  const record: LockRecord = { pid: process.pid, host, startedAt: new Date().toISOString() };
  const payload = JSON.stringify(record);

  let created = false;
  if (force) {
    if (existsSync(lockPath)) {
      logger.warn(
        `[instance-lock] MAESTRO_FORCE_WORKER_LOCK=1 set; overwriting existing lock ${lockPath}`,
      );
    }
  } else {
    try {
      // 'wx' fails with EEXIST instead of overwriting, making creation atomic.
      writeFileSync(lockPath, payload, { encoding: 'utf-8', flag: 'wx' });
      created = true;
    } catch (err) {
      if (errnoCode(err) !== 'EEXIST') throw err;
      assertLockReclaimable(lockPath, host);
    }
  }

  // Forced override or reclaim of a stale/unreadable lock: replace the contents.
  if (!created) writeFileSync(lockPath, payload, 'utf-8');
  logger.info(`[instance-lock] acquired worker lock ${lockPath} (pid=${process.pid})`);

  let released = false;
  return {
    path: lockPath,
    release(): void {
      if (released) return;
      released = true;
      try {
        if (existsSync(lockPath)) {
          const cur = readLockRecord(lockPath);
          // Only remove the file if it still names this process on this host;
          // another worker may have legitimately taken the lock over.
          if (cur.pid === process.pid && cur.host === host) unlinkSync(lockPath);
        }
      } catch {
        /* best effort — never block shutdown on lock cleanup */
      }
    },
  };
}