maestro/src/instance-lock.ts
oss-sync c526adddc2
Some checks failed
CI / build-and-test (push) Has been cancelled
sync: update from private repo (402599f)
2026-06-04 13:41:33 +00:00

103 lines
3.7 KiB
TypeScript

/**
* instance-lock.ts — single-writer guard for worker mode.
*
* Multiple maestro worker processes pointed at the same SQLite DB each run
* their own WorkerManager + Scheduler. They share the `jobs` table and the
* `config.yaml` file, so they fight over job claims, issue locks, and config
* reloads — which in production caused jobs to be picked up and executed
* twice. (An orphaned second process on a different port survived a restart
* because nothing stopped it.)
*
* This lock makes a second worker process on the same host refuse to start
* instead of silently double-scheduling. It is intentionally lightweight: a
* JSON lock file next to the DB holding the owner pid + host. Liveness is
* checked with `process.kill(pid, 0)` (same host only). A wedged lock can be
* overridden with MAESTRO_FORCE_WORKER_LOCK=1.
*/
import { readFileSync, writeFileSync, existsSync, unlinkSync } from 'fs';
import { hostname } from 'os';
import { logger } from './logger.js';
/**
 * Handle returned to the holder of the worker lock.
 */
export interface WorkerLock {
  /** Location of the lock file, absolute or relative (used by diagnostics and tests). */
  readonly path: string;

  /**
   * Delete the lock file, provided this process is still its owner.
   * Calling it more than once is harmless.
   */
  release(): void;
}
/**
 * JSON payload stored in the lock file, identifying the process that owns it.
 */
interface LockRecord {
  /** Hostname of the machine the owning process runs on. */
  host: string;
  /** Process id of the owner. */
  pid: number;
  /** Timestamp string recorded when the lock was taken. */
  startedAt: string;
}
/**
 * Report whether a process with the given pid currently exists on this host.
 *
 * Uses signal 0, which performs the existence/permission check without
 * actually delivering a signal.
 *
 * @param pid Process id to probe (typically read back from a lock file).
 * @returns `true` if the process exists (even if we may not signal it),
 *          `false` if it does not exist or `pid` is not a positive integer.
 */
function isAlive(pid: number): boolean {
  // pid 0 and negative pids address process *groups* rather than one process,
  // so the probe below would succeed and a corrupt lock record would look like
  // a live owner. Only a positive integer can name a single process.
  if (!Number.isInteger(pid) || pid <= 0) return false;
  try {
    process.kill(pid, 0);
    return true;
  } catch (e) {
    // ESRCH → no such process (dead). EPERM → exists but not signalable by us (alive).
    return (e as NodeJS.ErrnoException)?.code === 'EPERM';
  }
}
/**
 * Minimal shape of a lock file that the ownership checks rely on.
 * `startedAt` is only interpolated into diagnostics, so it is not validated.
 */
interface StoredLock {
  pid: number;
  host: string;
  startedAt?: unknown;
}

/** Narrow parsed lock-file JSON to an object with a numeric pid and a string host. */
function isStoredLock(value: unknown): value is StoredLock {
  if (typeof value !== 'object' || value === null) return false;
  const rec = value as Record<string, unknown>;
  return typeof rec['pid'] === 'number' && typeof rec['host'] === 'string';
}

/**
 * Acquire the worker lock for `dbPath`.
 *
 * The lock is a JSON file at `${dbPath}.worker.lock` holding this process's
 * pid, hostname and start time. An existing lock file is handled as follows
 * (skipped entirely when MAESTRO_FORCE_WORKER_LOCK=1):
 *  - unreadable or malformed → warn and reclaim;
 *  - written by a different host → warn and proceed (liveness cannot be checked);
 *  - same host, different pid that is still alive → throw;
 *  - otherwise → warn and reclaim as stale.
 *
 * NOTE: the existence check and the write are separate filesystem calls, so
 * two processes starting at the same instant can both pass the check.
 *
 * @param dbPath Path of the DB file; the lock file path is derived from it.
 * @returns A handle whose `release()` removes the lock — wire it into the
 *          shutdown sequence.
 * @throws Error if another live worker process on this host already holds the
 *         lock. Errors from writing the lock file propagate unchanged.
 */
export function acquireWorkerLock(dbPath: string): WorkerLock {
  const lockPath = `${dbPath}.worker.lock`;
  const host = hostname();
  const force = process.env['MAESTRO_FORCE_WORKER_LOCK'] === '1';

  if (!force && existsSync(lockPath)) {
    // Only the read/parse step is guarded: the "already running" error below is
    // raised outside the try, so it can never be mistaken for a parse failure.
    let prev: StoredLock | undefined;
    try {
      const parsed: unknown = JSON.parse(readFileSync(lockPath, 'utf-8'));
      if (!isStoredLock(parsed)) {
        throw new Error('lock record is missing a numeric pid or string host');
      }
      prev = parsed;
    } catch (err) {
      logger.warn(`[instance-lock] unreadable lock file ${lockPath}, reclaiming: ${err}`);
    }

    if (prev !== undefined) {
      if (prev.host !== host) {
        logger.warn(
          `[instance-lock] lock held by a different host (${prev.host}, pid=${prev.pid}); ` +
            `cannot verify liveness. Proceeding, but multiple worker hosts against one ` +
            `SQLite DB is unsupported and unsafe.`,
        );
      } else if (prev.pid !== process.pid && isAlive(prev.pid)) {
        throw new Error(
          `Another maestro worker process is already running ` +
            `(pid=${prev.pid}, host=${host}, since ${prev.startedAt}). ` +
            `Lock file: ${lockPath}. Stop that process first, or set ` +
            `MAESTRO_FORCE_WORKER_LOCK=1 to override a wedged lock.`,
        );
      } else {
        logger.warn(`[instance-lock] reclaiming stale lock from dead pid=${prev.pid}`);
      }
    }
  }

  const record: LockRecord = { pid: process.pid, host, startedAt: new Date().toISOString() };
  writeFileSync(lockPath, JSON.stringify(record), 'utf-8');
  logger.info(`[instance-lock] acquired worker lock ${lockPath} (pid=${process.pid})`);

  let released = false;
  return {
    path: lockPath,
    release(): void {
      if (released) return;
      released = true;
      try {
        // A missing file makes readFileSync throw, which the catch absorbs.
        const cur: unknown = JSON.parse(readFileSync(lockPath, 'utf-8'));
        // Match host as well as pid: a worker on another host may have taken
        // the lock over, and pids are only unique per host.
        if (isStoredLock(cur) && cur.pid === process.pid && cur.host === host) {
          unlinkSync(lockPath);
        }
      } catch {
        /* best effort — never block shutdown on lock cleanup */
      }
    },
  };
}