103 lines
3.7 KiB
TypeScript
103 lines
3.7 KiB
TypeScript
/**
 * instance-lock.ts — single-writer guard for worker mode.
 *
 * Multiple maestro worker processes pointed at the same SQLite DB each run
 * their own WorkerManager + Scheduler. They share the `jobs` table and the
 * `config.yaml` file, so they fight over job claims, issue locks, and config
 * reloads — which in production caused jobs to be picked up and executed
 * twice. (An orphaned second process on a different port survived a restart
 * because nothing stopped it.)
 *
 * This lock makes a second worker process on the same host refuse to start
 * instead of silently double-scheduling. It is intentionally lightweight: a
 * JSON lock file next to the DB holding the owner pid + host. Liveness is
 * checked with `process.kill(pid, 0)` (same host only). A wedged lock can be
 * overridden with MAESTRO_FORCE_WORKER_LOCK=1.
 */
import { readFileSync, writeFileSync, existsSync, unlinkSync } from 'fs';
|
|
import { hostname } from 'os';
|
|
import { logger } from './logger.js';
|
|
|
|
/**
 * Handle describing a worker lock file and the means to give it up.
 */
export interface WorkerLock {
  /** Location of the lock file, absolute or relative; exposed for diagnostics and tests. */
  readonly path: string;

  /**
   * Delete the lock file, provided this process is still its owner.
   * Safe to call more than once.
   */
  release(): void;
}

/** Shape of the JSON document stored in the lock file. */
interface LockRecord {
  /** Hostname of the machine the owning process runs on. */
  host: string;
  /** Process id of the owner. */
  pid: number;
  /** Timestamp string recorded when the lock was written. */
  startedAt: string;
}

/**
 * Report whether a process with the given pid currently exists on this host.
 *
 * Probes with `process.kill(pid, 0)`: signal 0 delivers nothing, it only runs
 * the existence/permission check.
 *
 * @param pid - Process id to probe. Typically read from a lock file, so it may
 *   be garbage; anything that is not a positive integer is reported as dead.
 * @returns `true` if the process exists (including when we are not permitted
 *   to signal it), `false` otherwise.
 */
function isAlive(pid: number): boolean {
  // On POSIX, pid 0 and negative pids address process groups rather than a
  // single process, so the probe would "succeed" and a corrupt lock record
  // would look like a live owner forever. Reject them up front.
  if (!Number.isInteger(pid) || pid <= 0) return false;

  try {
    process.kill(pid, 0);
    return true;
  } catch (e: unknown) {
    // ESRCH → no such process (dead). EPERM → exists but not signalable by us (alive).
    return typeof e === 'object' && e !== null && (e as { code?: unknown }).code === 'EPERM';
  }
}

/** Result of reading and validating an existing lock file. */
type LockInspection =
  | { kind: 'missing' }
  | { kind: 'unreadable'; reason: unknown }
  | { kind: 'record'; record: LockRecord };

/** Extract a Node-style `code` string (e.g. `'ENOENT'`) from a caught value, if it has one. */
function errnoCode(err: unknown): string | undefined {
  if (typeof err !== 'object' || err === null) return undefined;
  const code = (err as { code?: unknown }).code;
  return typeof code === 'string' ? code : undefined;
}

/** Read the lock file and validate its shape; never throws. */
function inspectLockFile(lockPath: string): LockInspection {
  let raw: string;
  try {
    raw = readFileSync(lockPath, 'utf-8');
  } catch (err) {
    return errnoCode(err) === 'ENOENT' ? { kind: 'missing' } : { kind: 'unreadable', reason: err };
  }

  let parsed: unknown;
  try {
    parsed = JSON.parse(raw);
  } catch (err) {
    return { kind: 'unreadable', reason: err };
  }
  if (typeof parsed !== 'object' || parsed === null) {
    return { kind: 'unreadable', reason: new Error('lock file does not contain a JSON object') };
  }

  const { pid, host, startedAt } = parsed as Record<string, unknown>;
  // A pid that is not a positive integer cannot name a single process; treating
  // such a record as valid could make the lock look permanently held.
  if (typeof pid !== 'number' || !Number.isInteger(pid) || pid <= 0 || typeof host !== 'string') {
    return { kind: 'unreadable', reason: new Error('lock record has an invalid pid or host') };
  }
  return {
    kind: 'record',
    record: { pid, host, startedAt: typeof startedAt === 'string' ? startedAt : 'unknown' },
  };
}

/** Create the lock file only if it does not exist yet; returns false when it already exists. */
function tryCreateExclusive(lockPath: string, payload: string): boolean {
  try {
    writeFileSync(lockPath, payload, { encoding: 'utf-8', flag: 'wx' });
    return true;
  } catch (err) {
    if (errnoCode(err) === 'EEXIST') return false;
    throw err;
  }
}

/** Take ownership of the lock file, throwing if a live worker on this host holds it. */
function claimLock(lockPath: string, payload: string, host: string): void {
  const maxAttempts = 3;
  for (let attempt = 0; attempt < maxAttempts; attempt++) {
    // Exclusive create closes the window in which two workers starting at the
    // same moment both see "no lock file" and both write one.
    if (tryCreateExclusive(lockPath, payload)) return;

    const found = inspectLockFile(lockPath);
    if (found.kind === 'missing') continue; // holder released in between; try to create again

    if (found.kind === 'unreadable') {
      logger.warn(
        `[instance-lock] unreadable lock file ${lockPath}, reclaiming: ${String(found.reason)}`,
      );
    } else {
      const prev = found.record;
      if (prev.host !== host) {
        logger.warn(
          `[instance-lock] lock held by a different host (${prev.host}, pid=${prev.pid}); ` +
            `cannot verify liveness. Proceeding, but multiple worker hosts against one ` +
            `SQLite DB is unsupported and unsafe.`,
        );
      } else if (prev.pid !== process.pid && isAlive(prev.pid)) {
        throw new Error(
          `Another maestro worker process is already running ` +
            `(pid=${prev.pid}, host=${host}, since ${prev.startedAt}). ` +
            `Lock file: ${lockPath}. Stop that process first, or set ` +
            `MAESTRO_FORCE_WORKER_LOCK=1 to override a wedged lock.`,
        );
      } else {
        // Same host and either a dead pid or our own pid (a recycled pid left
        // over from an earlier run): the previous owner is gone.
        logger.warn(`[instance-lock] reclaiming stale lock from dead pid=${prev.pid}`);
      }
    }

    // NOTE: reclaiming overwrites in place, so two workers reclaiming the same
    // stale lock at the same instant can still both succeed.
    writeFileSync(lockPath, payload, 'utf-8');
    return;
  }

  // The file kept appearing and vanishing between our create and read; stop
  // retrying and write the lock unconditionally.
  writeFileSync(lockPath, payload, 'utf-8');
}

/**
 * Acquire the worker lock for `dbPath`.
 *
 * The lock is a JSON file at `${dbPath}.worker.lock` holding this process's
 * pid, hostname and start time. A stale lock (dead pid on this host), a lock
 * written by another host, or an unreadable/invalid lock file is reclaimed
 * with a warning. Setting `MAESTRO_FORCE_WORKER_LOCK=1` skips all checks and
 * overwrites whatever is there.
 *
 * @param dbPath - Path of the database file the lock guards.
 * @returns A handle whose `release()` removes the lock — wire it into the
 *   shutdown sequence.
 * @throws Error if another live worker process on this host already holds the
 *   lock, or if the lock file cannot be written.
 */
export function acquireWorkerLock(dbPath: string): WorkerLock {
  const lockPath = `${dbPath}.worker.lock`;
  const host = hostname();
  const force = process.env['MAESTRO_FORCE_WORKER_LOCK'] === '1';

  const record: LockRecord = { pid: process.pid, host, startedAt: new Date().toISOString() };
  const payload = JSON.stringify(record);

  if (force) {
    writeFileSync(lockPath, payload, 'utf-8');
  } else {
    claimLock(lockPath, payload, host);
  }
  logger.info(`[instance-lock] acquired worker lock ${lockPath} (pid=${process.pid})`);

  let released = false;
  return {
    path: lockPath,
    release(): void {
      if (released) return;
      released = true;
      try {
        if (!existsSync(lockPath)) return;
        const found = inspectLockFile(lockPath);
        // Compare host as well as pid: another host that took the lock over may
        // well have the same pid (e.g. pid 1 in containers), and its lock must
        // not be deleted by us.
        if (
          found.kind === 'record' &&
          found.record.pid === process.pid &&
          found.record.host === host
        ) {
          unlinkSync(lockPath);
        }
      } catch {
        /* best effort — never block shutdown on lock cleanup */
      }
    },
  };
}
