'use strict';

const path = require('node:path');
const { spawn, execFileSync } = require('node:child_process');
const {
  desktopCapturer,
  screen,
  BrowserWindow,
  nativeImage,
  Tray,
  Menu,
  Notification,
} = require('electron');
const { expandPlaceholders } = require('../core/placeholders');
const raster = require('../core/raster');
const { encodePng } = require('../core/png');
const {
  selectFrameForClick,
  frameUsableForClick,
  pointInBounds,
  DEFAULT_MAX_AGE_MS,
  DEFAULT_START_SLACK_MS,
} = require('./click-frames');
const { physicalToDip } = require('./coords');

/**
 * Capture service: full-screen, active-window, and region capture, plus a
 * click-marker annotation at the click position and a capture session
 * (start/pause/resume/finish).
 *
 * A session captures continuously, with three triggers layered by what the
 * platform supports:
 *   - click-capture via an OS adapter (xinput on X11, a low-level mouse hook
 *     on Windows),
 *   - a global hotkey (unreliable on some Wayland compositors),
 *   - interval auto-capture as the always-works fallback.
 *
 * Click captures are served from one of two frame recorders:
 *   - the stream backend (app/stream-backend.js): a hidden worker window
 *     samples a desktop media stream per display into a timestamped ring
 *     buffer, entirely off the main process. This is the preferred path —
 *     the main-process event loop stays free, so OS click events arrive on
 *     time, and the tight sampling cadence keeps a genuinely fresh pre-click
 *     frame available for every click;
 *   - the legacy in-process frame loop below, kept as the fallback when
 *     streams can't start (portal-less Wayland, exotic drivers).
 *
 * Either way the pairing rule is the same (click-frames.js): in strict mode
 * a click only ever gets a frame captured at or before the click — never one
 * whose grab started after it.
 *
 * Note: under Wayland/WSLg, screen capture may require portal support; all
 * failures surface as { ok: false, reason } instead of crashing.
 */

// Suppress only *duplicate deliveries* of one physical press (same button,
// same coordinates, a few ms apart). This deliberately replaces the old
// time-only debounce: real humans double-click ~50-100ms apart, and any
// purely temporal cutoff eventually drops a legitimate fast click, which
// reads as "my click didn't register". One hook/watcher event = one click.
const CLICK_EVENT_DUPLICATE_MS = 8;

// How long a Linux raw button event waits for its regular twin (the
// representation that carries root coordinates) before firing without them.
const LINUX_CLICK_TWIN_MS = 25;

// Idle gap between legacy frame-loop grabs. Must stay well above zero:
// grabbing back-to-back starves the main-process event loop, which delays
// delivery of click events from the OS watcher by whole seconds. (The
// stream backend exists precisely because of this constraint.)
const FRAME_LOOP_IDLE_MS = 200;

// A buffered frame older than this is too stale to pass off as "the screen
// at the instant of the click". Shared with click-frames.js.
const CLICK_FRAME_MAX_AGE_MS = DEFAULT_MAX_AGE_MS;

// How long a click waits for the in-flight grab before falling back to a
// one-off fresh shot.
const CLICK_FRAME_WAIT_MS = 2000;

// Balanced (non-strict) mode only: a loop grab that started at most this
// long after the click is still accepted. Strict mode never does this.
const CLICK_FRAME_START_SLACK_MS = DEFAULT_START_SLACK_MS;

const CLICK_CAPTURE_HIDE_DELAY_MS = 25;

// Frames hold raw images (~20MB each at 2880x1800), so keep the history
// window wide enough to outlast any processing hiccup but the count low.
const RECENT_FRAME_RETENTION_MS = 4000;
const RECENT_FRAME_LIMIT = 4;

// The click that stops/pauses a session via the tray reaches the OS hook at
// almost the same instant the tray handler fires.
We discard at most that // one click — and only when it matches the recorded gesture in *both* time // and position, so a fast workflow click that merely happens to land near // the stop is never mistaken for the stop itself. const SESSION_STOP_CLICK_WINDOW_MS = 700; const SESSION_STOP_CLICK_RADIUS_PX = 8; // Per-click diagnostics, enabled with STEPFORGE_CAPTURE_LOG=1. Cheap enough // to leave in: one line per click/frame decision, nothing per frame-loop tick. const CAPTURE_LOG = Boolean(process.env.STEPFORGE_CAPTURE_LOG); function clog(...args) { if (CAPTURE_LOG) console.log('[capture]', ...args); } function hasBinary(name) { try { execFileSync('which', [name], { stdio: 'pipe' }); return true; } catch { return false; } } class CaptureService { constructor({ store, settings, getWindow, notify, screenApi = screen }) { this.store = store; this.settings = settings; this.getWindow = getWindow; this.notify = notify; // Injectable for tests; the click/coordinate paths must never reach for // the global `screen` directly so coordinate handling stays testable. this.screen = screenApi; this.session = null; // { guideId, paused, count, intervalSec } this.intervalTimer = null; this.clickWatcher = null; this.frameLoopTimer = null; this.frameLoopRunning = false; this.frameWaiters = []; this.latestFrame = null; this.clickWatcherBuf = ''; this.clickWatcherErrTail = ''; this.linuxEvent = null; // event block currently being parsed this.pendingRawClick = null; // raw press waiting for its coordinate twin this.clickQueue = Promise.resolve(); this.frameLoopInFlight = false; this.frameLoopGrabStartedAt = null; this.recentFrames = []; this.shooting = false; this.lastClickEventByButton = new Map(); this.streamBackend = null; this.streamBackendStarting = false; } state() { return this.session ? 
{ active: true, paused: this.session.paused, guideId: this.session.guideId, count: this.session.count, intervalSec: this.session.intervalSec || 0, clickCapture: Boolean(this.clickWatcher), clickCaptureAvailable: this.clickCaptureAvailable(), clickFrameSource: this.streamBackend ? 'stream' : (this.frameLoopRunning ? 'loop' : 'idle'), strictClickFrames: this.strictClickFrames(), } : { active: false, clickCaptureAvailable: this.clickCaptureAvailable() }; } /** * Strict is the default: a stored step must never show the screen *after* * its click (a frame whose grab started post-click can already contain the * click's effects). The setting exists as an explicit escape hatch for * machines where capture is too slow to keep pre-click frames buffered — * there, the legacy slack heuristics trade accuracy for fewer fresh-shot * fallbacks. */ strictClickFrames() { return this.settings.get('capture.strictClickFrames') !== false; } clickCaptureAvailable() { if (this._clickAvail === undefined) { this._clickAvail = process.platform === 'win32' || (process.platform === 'linux' && hasBinary('xinput')); } return this._clickAvail; } startSession(guideId, { intervalSec = null } = {}) { this.finishSession(); // Default trigger: clicks when the platform supports it, otherwise an // interval so a session always produces steps even if the global hotkey // never fires (common under Wayland/WSLg). let interval = intervalSec; if (interval == null) { interval = this.clickCaptureAvailable() ? 0 : (this.settings.get('capture.autoIntervalSec') || 5); } // Sessions start paused: nothing hides and no capturing happens until // the user explicitly presses "Start recording" in the capture bar, so // New Capture never makes the window vanish out from under them. 
this.session = { guideId, paused: true, count: 0, intervalSec: interval }; if (this.settings.get('capture.captureOutsideClicks') !== false) this.startClickWatcher(); this.applyInterval(); this.notify('capture:state', this.state()); // (Skipped for the dev screenshot hook, which needs a visible page.) if (!process.env.STEPFORGE_SCREENSHOT) { this.createSessionTray(); const win = this.getWindow(); // Remember whether the window was visible when the session was set // up — that's what `togglePause` uses to decide whether to tuck the // app away once the user actually starts recording. this.hiddenForSession = Boolean(win && !win.isDestroyed() && win.isVisible()); try { new Notification({ title: 'StepForge is ready to capture', body: 'Click "Start recording" in the red capture bar when you’re ready. The window tucks away and the red tray icon takes over.', }).show(); } catch { /* notifications unavailable on this desktop */ } } } /** Red-dot tray icon with session controls, shown while recording. */ createSessionTray() { this.destroySessionTray(); try { const img = raster.createImage(16, 16, [0, 0, 0, 0]); raster.fillOval(img, 2, 2, 12, 12, [229, 72, 77, 255]); this.tray = new Tray(nativeImage.createFromBuffer(encodePng(img))); this.tray.setToolTip('StepForge — capture session running'); const rebuild = () => { if (!this.tray || this.tray.isDestroyed()) return; this.tray.setContextMenu(Menu.buildFromTemplate([ { label: `Captured ${this.session ? this.session.count : 0} steps`, enabled: false }, { type: 'separator' }, { label: 'Capture now', click: () => this.sessionCapture('manual').then(rebuild).catch(() => {}) }, { label: this.session && this.session.paused ? 
'Resume capturing' : 'Pause capturing', click: () => { this.noteUiStopGesture(); this.togglePause(); rebuild(); }, }, { label: 'Open StepForge (pauses capture)', click: () => { this.noteUiStopGesture(); this.togglePause(true); this.showWindow(); rebuild(); }, }, { type: 'separator' }, { label: 'Finish session', click: () => { this.noteUiStopGesture(); this.finishSession(); } }, ])); }; rebuild(); this.rebuildTrayMenu = rebuild; this.tray.on('click', () => { this.noteUiStopGesture(); this.togglePause(true); this.showWindow(); rebuild(); }); } catch { this.tray = null; // no tray on this desktop; cursor-over skip still protects clicks } } destroySessionTray() { if (this.tray && !this.tray.isDestroyed()) this.tray.destroy(); this.tray = null; this.rebuildTrayMenu = null; } /** * Record that the user just stopped/paused capture from StepForge's own UI * (tray icon or its menu). The physical click that did so is also seen by * the OS hook and would otherwise queue as a workflow step; isStopGesture * uses this to discard exactly that one click — matched by position, not * just time, so a real fast click elsewhere is never lost. */ noteUiStopGesture() { let pos = null; try { pos = this.screen.getCursorScreenPoint(); } catch { pos = null; } this.uiStopGesture = { at: Date.now(), pos }; } /** True when a queued click is the tray gesture that stopped the session. */ isStopGesture(clickPos, clickAt) { const g = this.uiStopGesture; if (!g) return false; if (Math.abs((clickAt || Date.now()) - g.at) > SESSION_STOP_CLICK_WINDOW_MS) return false; // No position to compare (e.g. cursor read failed): fall back to the // time window alone, but only consume the gesture once. 
if (!g.pos || !clickPos) { this.uiStopGesture = null; return true; } const near = Math.abs(clickPos.x - g.pos.x) <= SESSION_STOP_CLICK_RADIUS_PX && Math.abs(clickPos.y - g.pos.y) <= SESSION_STOP_CLICK_RADIUS_PX; if (near) this.uiStopGesture = null; // one stop click per gesture return near; } showWindow() { const win = this.getWindow(); if (win && !win.isDestroyed()) { win.show(); win.focus(); } } setInterval(intervalSec) { if (!this.session) return this.state(); this.session.intervalSec = Math.max(0, Number(intervalSec) || 0); this.applyInterval(); this.notify('capture:state', this.state()); return this.state(); } applyInterval() { if (this.intervalTimer) { clearInterval(this.intervalTimer); this.intervalTimer = null; } const sec = this.session && this.session.intervalSec; if (sec > 0) { this.intervalTimer = setInterval(() => { this.sessionCapture('interval').catch(() => {}); }, sec * 1000); } } togglePause(force) { if (!this.session) return; const wasPaused = this.session.paused; this.session.paused = typeof force === 'boolean' ? force : !this.session.paused; // Starting/resuming tucks the window away again for clean shots (after // a brief delay so the user sees it happen) and starts the frame // recorder that serves click captures. Pausing stops it and discards // buffered frames, so a resume can never serve a pre-pause screen. 
if (wasPaused && !this.session.paused) { const win = this.getWindow(); const arm = () => { if (!this.session || this.session.paused) return; if (this.hiddenForSession && win && !win.isDestroyed() && win.isVisible()) win.hide(); if (this.settings.get('capture.captureOutsideClicks') !== false && this.clickCaptureAvailable()) { this.startClickFrameBackend().catch(() => {}); } }; if (this.hiddenForSession && win && !win.isDestroyed()) setTimeout(arm, 400); else arm(); } else if (!wasPaused && this.session.paused) { this.stopFrameLoop(); this.stopClickFrameBackend(); } if (this.rebuildTrayMenu) this.rebuildTrayMenu(); this.notify('capture:state', this.state()); } finishSession() { if (this.intervalTimer) { clearInterval(this.intervalTimer); this.intervalTimer = null; } this.stopClickWatcher(); this.stopFrameLoop(); this.stopClickFrameBackend(); this.destroySessionTray(); this.session = null; if (this.hiddenForSession) { this.hiddenForSession = false; this.showWindow(); } this.notify('capture:state', this.state()); } /** * True when the user is interacting with StepForge itself. Deliberately * based on cursor position over the visible window, not isFocused(): * some compositors (WSLg) report focus as stuck-true, which would block * every automatic capture forever. */ userIsInApp() { const win = this.getWindow(); if (!win || win.isDestroyed() || !win.isVisible() || win.isMinimized()) return false; const cur = this.screen.getCursorScreenPoint(); const b = win.getBounds(); return cur.x >= b.x && cur.x <= b.x + b.width && cur.y >= b.y && cur.y <= b.y + b.height; } /** One capture inside the active session (hotkey/click/interval/manual). */ async sessionCapture(trigger = 'hotkey', clickPos = null, clickMeta = null) { // A click that was registered while recording carries its guide id // (see enqueueClickCapture) and must become a step even if the session // was paused or finished while it sat behind slower clicks in the // queue. 
Dropping queued clicks at stop time is how "I clicked five // times and only got two steps" happens on hosts with slow encodes. const queuedClickGuide = trigger === 'click' && clickMeta && clickMeta.guideId ? clickMeta.guideId : null; if (!this.session || this.session.paused) { if (!queuedClickGuide) return { ok: false, reason: 'no active capture session' }; } else if (trigger !== 'manual' && this.userIsInApp()) { // Automatic triggers stand down while the user is in StepForge, so the // app stays clickable mid-session and never screenshots itself. return { ok: false, reason: 'skipped — StepForge is focused' }; } // Clicks are served from the frame recorder: the chosen frame was // captured at (or moments before) the click instant, so the background // matches what the user clicked on. A click that lands while a grab is // in flight waits for that frame instead of being dropped, so fast // clicking still yields one step per click. if (trigger === 'click') { const clickAt = clickMeta && Number.isFinite(clickMeta.at) ? clickMeta.at : Date.now(); // Prefer the frame the click was paired with at event time (see // enqueueClickCapture); ask now only when no eager pairing happened. const frame = clickMeta && clickMeta.framePromise ? await clickMeta.framePromise : await this.frameForClick(clickPos, clickAt); const sessionLive = this.session && !this.session.paused; const guideId = sessionLive ? this.session.guideId : queuedClickGuide; if (!guideId) return { ok: false, reason: 'no active capture session' }; // The tray gesture that stopped the session is itself a hook click in // the queue — storing it would append a junk step of the menu. Discard // only that one click, matched by position so a fast workflow click is // never collateral damage. 
if (!sessionLive && this.isStopGesture(clickPos, clickAt)) { clog('click@', clickAt, 'discarded — it triggered the session stop'); return { ok: false, reason: 'click stopped the session' }; } if (frame) { clog('click@', clickAt, 'frame', frame.source || 'loop', 'started', frame.startedAt - clickAt, 'ms, captured', frame.capturedAt - clickAt, 'ms rel. click'); const result = this.storeFrameAsStep(guideId, frame.mode, frame, clickPos); if (result.ok) this.noteStepAdded(result.step, trigger, guideId); return result; } // No usable frame: fall through to a one-off fresh shot — but only // while still recording. After a stop, a fresh shot would show // whatever replaced the user's workflow on screen. clog('click@', clickAt, 'no frame qualified — falling back to a fresh (post-click) shot'); if (!sessionLive) return { ok: false, reason: 'session ended before the fallback shot' }; } if (this.shooting) return { ok: false, reason: 'capture already in progress' }; this.shooting = true; try { const mode = this.settings.get('capture.mode') || 'fullscreen'; const grabMode = mode === 'region' ? 'fullscreen' : mode; const finalResult = await this.shoot({ guideId: this.session.guideId, mode: grabMode, delayMs: 0, hideWindowDelayMs: trigger === 'click' ? CLICK_CAPTURE_HIDE_DELAY_MS : null, refocus: false, // don't steal focus from the app the user is documenting clickPos, }); if (finalResult.ok) this.noteStepAdded(finalResult.step, trigger); return finalResult; } finally { this.shooting = false; } } noteStepAdded(step, trigger, guideId = null) { // Steps from queued clicks can land after the session object is gone. 
if (this.session) this.session.count += 1; this.notify('capture:added', { guideId: guideId || (this.session && this.session.guideId), step, trigger, }); this.notify('capture:state', this.state()); if (this.rebuildTrayMenu) this.rebuildTrayMenu(); // refresh step counter } hotkeyCapture() { return this.sessionCapture('hotkey'); } // ---- click-triggered capture -------------------------------------------- /** * Fallback frame recorder: a continuous screen-grab loop in the main * process, used only when the stream backend can't run. It keeps the most * recent frames buffered so a click can be served from a frame grabbed at * (or moments before) the instant of the click — a fresh grab started * after the click would land hundreds of ms late and show the click's * effects instead of what the user clicked on. Its cadence is capped at * FRAME_LOOP_IDLE_MS because tighter grabbing here starves the event loop * and delays the very click events it serves. */ startFrameLoop() { if (this.frameLoopRunning) return; this.frameLoopRunning = true; const tick = async () => { if (!this.frameLoopRunning) return; if (!this.session || this.session.paused) { this.frameLoopRunning = false; this.frameLoopInFlight = false; return; } try { if (!this.shooting) { this.frameLoopInFlight = true; this.frameLoopGrabStartedAt = Date.now(); const mode = this.settings.get('capture.mode') || 'fullscreen'; const grabMode = mode === 'region' ? 'fullscreen' : mode; const frame = await this.captureCurrentFrame(grabMode, null, this.frameLoopGrabStartedAt); if (this.frameLoopRunning) this.acceptFrame(frame); } } catch { // Grab failures are fine — clicks fall back to a one-off fresh shot. 
} finally { this.frameLoopInFlight = false; this.frameLoopGrabStartedAt = null; if (this.frameLoopRunning && this.session && !this.session.paused) { this.frameLoopTimer = setTimeout(tick, FRAME_LOOP_IDLE_MS); } } }; this.frameLoopTimer = setTimeout(tick, 0); } /** Store a grabbed frame and hand it to any clicks waiting on it. */ acceptFrame(frame) { this.latestFrame = frame; this.recentFrames.push(frame); const cutoff = Date.now() - RECENT_FRAME_RETENTION_MS; this.recentFrames = this.recentFrames .filter((f) => f && f.capturedAt >= cutoff) .slice(-RECENT_FRAME_LIMIT); const waiters = this.frameWaiters; this.frameWaiters = []; for (const resolve of waiters) resolve(frame); } /** Resolves with the next frame the loop grabs (null on timeout/stop). */ nextFrame(timeoutMs) { return new Promise((resolve) => { const entry = (frame) => { clearTimeout(timer); resolve(frame); }; const timer = setTimeout(() => { this.frameWaiters = this.frameWaiters.filter((w) => w !== entry); resolve(null); }, timeoutMs); this.frameWaiters.push(entry); }); } stopFrameLoop() { if (this.frameLoopTimer) { clearTimeout(this.frameLoopTimer); this.frameLoopTimer = null; } this.frameLoopRunning = false; this.frameLoopGrabStartedAt = null; this.latestFrame = null; this.recentFrames = []; const waiters = this.frameWaiters; this.frameWaiters = []; for (const resolve of waiters) resolve(null); } /** * Frame representing the screen at the instant of one click. * * Order of preference: * 1. the stream backend's ring buffer (off-main-process, tight cadence); * 2. the legacy loop's buffered frames; * 3. waiting for the loop grab that was already in flight when the user * clicked. * Selection semantics live in click-frames.js. In strict mode every path * obeys the same rule — never a frame whose grab started after the click — * and when nothing qualifies this returns null so the caller takes the * *explicit* fresh-shot fallback rather than silently passing a post-click * frame off as the click-time screen. 
*/ async frameForClick(clickPos = null, clickAt = Date.now()) { const mode = this.settings.get('capture.mode') || 'fullscreen'; const grabMode = mode === 'region' ? 'fullscreen' : mode; const clickTime = Number.isFinite(clickAt) ? clickAt : Date.now(); const strict = this.strictClickFrames(); const opts = { clickAt: clickTime, clickPos, mode: grabMode, strict, maxAgeMs: CLICK_FRAME_MAX_AGE_MS, startSlackMs: CLICK_FRAME_START_SLACK_MS, }; if (this.streamBackend && this.streamBackend.isActive() && grabMode === 'fullscreen') { const frame = await this.streamBackend.frameForClick({ clickPos, clickAt: clickTime, strict }); if (frame) return frame; // No qualifying frame (or the backend just went unhealthy): fall // through to the loop buffer / fresh-shot fallbacks below. } const buffered = selectFrameForClick( [...this.recentFrames, this.latestFrame].filter((f, i, arr) => f && arr.indexOf(f) === i), opts, ); if (buffered) return buffered; if (!this.frameLoopRunning) return null; if (strict) { // Only a grab already in flight when the user clicked can still // qualify: its pixels predate the click even though it completes // after. Any grab starting later is post-click by definition, so // don't wait around for one — return immediately and let the caller // take the fresh-shot fallback. const inFlightStartedBeforeClick = this.frameLoopInFlight && Number.isFinite(this.frameLoopGrabStartedAt) && this.frameLoopGrabStartedAt <= clickTime; if (!inFlightStartedBeforeClick) return null; const next = await this.nextFrame(CLICK_FRAME_WAIT_MS); return frameUsableForClick(next, { ...opts, allowInFlight: true }) ? next : null; } // Balanced (legacy) mode: wait for the next loop frame and accept it if // its grab started within the slack window after the click. 
const deadline = Date.now() + CLICK_FRAME_WAIT_MS; while (this.frameLoopRunning && Date.now() < deadline) { const next = await this.nextFrame(Math.max(1, deadline - Date.now())); if (frameUsableForClick(next, { ...opts, allowInFlight: true })) return next; if (next && Number.isFinite(next.startedAt) && next.startedAt > clickTime + CLICK_FRAME_START_SLACK_MS) { // Grabs only get later from here; let the fresh-shot path handle it. return null; } } return null; } // ---- click-frame backends ------------------------------------------------- /** * Bring up the frame recorder for a recording run. The stream backend is * the architecture path (capture entirely off the main process); the * in-process frame loop is the fallback when streams can't start — and the * automatic degradation target if the worker stops answering mid-session. */ async startClickFrameBackend() { const mode = this.settings.get('capture.mode') || 'fullscreen'; // The worker streams screens; window-mode grabs need the loop's // source-filtering logic. 
if (this.settings.get('capture.streamCapture') === false || mode === 'window') { this.startFrameLoop(); return; } if (this.streamBackend || this.streamBackendStarting) return; this.streamBackendStarting = true; try { // eslint-disable-next-line global-require const { StreamCaptureBackend, createElectronHost } = require('./stream-backend'); const backend = new StreamCaptureBackend({ createHost: createElectronHost, onUnhealthy: () => this.degradeToFrameLoop(), }); const displays = this.screen.getAllDisplays(); const sources = await desktopCapturer.getSources({ types: ['screen'], thumbnailSize: { width: 1, height: 1 }, // ids only — skip thumbnail work }); const ok = await backend.start({ displays, sources: sources.map((s) => ({ id: s.id, display_id: s.display_id })), sampleMs: this.settings.get('capture.frameSampleMs') || 100, }); if (!ok || !this.session || this.session.paused) { backend.stop(); if (this.session && !this.session.paused) { console.error('[stepforge] stream capture backend failed to start — using in-process frame loop'); this.startFrameLoop(); } return; } this.streamBackend = backend; clog('stream capture backend active'); this.notify('capture:state', this.state()); } catch (err) { if (this.session && !this.session.paused) { console.error(`[stepforge] stream capture backend error (${err && err.message}) — using in-process frame loop`); this.startFrameLoop(); } } finally { this.streamBackendStarting = false; } } stopClickFrameBackend() { if (!this.streamBackend) return; const backend = this.streamBackend; this.streamBackend = null; backend.stop(); } /** * The worker stopped answering frame requests. Capture must not silently * stop mid-session: drop the backend and run the in-process loop for the * rest of the recording. 
*/ degradeToFrameLoop() { this.streamBackend = null; console.error('[stepforge] stream capture backend unhealthy — falling back to in-process frame loop'); if (this.session && !this.session.paused) this.startFrameLoop(); this.notify('capture:state', this.state()); } startClickWatcher() { this.stopClickWatcher(); try { this.clickWatcherBuf = ''; this.linuxEvent = null; if (process.platform === 'linux' && hasBinary('xinput')) { // Stream raw button events from the X server; one capture per press. // xinput block-buffers stdout when piped, so a press event can sit // in its buffer until later motion events flush it — by then the // cursor read in onOsClick lands where the mouse moved *after* the // click. stdbuf -oL forces line-buffering so events (and the cursor // read) line up with the actual click instant. const argv = hasBinary('stdbuf') ? ['stdbuf', '-oL', 'xinput', 'test-xi2', '--root'] : ['xinput', 'test-xi2', '--root']; this.clickWatcher = spawn(argv[0], argv.slice(1), { stdio: ['ignore', 'pipe', 'ignore'] }); this.clickWatcher.stdout.on('data', (chunk) => { this.ingestClickWatcherChunk(chunk.toString(), 'linux'); }); } else if (process.platform === 'win32') { // Use a low-level Windows mouse hook instead of polling // GetAsyncKeyState. The low bit from GetAsyncKeyState can be consumed // by other processes and a polling loop can miss short clicks under // load; WH_MOUSE_LL gives us one event for each button-down, with the // hook-time cursor position and timestamp. 
const ps = ` $ErrorActionPreference = 'Stop' Add-Type -TypeDefinition @' using System; using System.Collections.Concurrent; using System.Runtime.InteropServices; using System.Threading; public static class SFMouseHook { private const int WH_MOUSE_LL = 14; private const int WM_LBUTTONDOWN = 0x0201; private const int WM_RBUTTONDOWN = 0x0204; private const int WM_MBUTTONDOWN = 0x0207; private const int WM_XBUTTONDOWN = 0x020B; private const long UnixEpochMilliseconds = 62135596800000L; private static IntPtr hook = IntPtr.Zero; private static LowLevelMouseProc proc = HookCallback; private static readonly ConcurrentQueue queue = new ConcurrentQueue(); private static readonly AutoResetEvent signal = new AutoResetEvent(false); [StructLayout(LayoutKind.Sequential)] private struct POINT { public int x; public int y; } [StructLayout(LayoutKind.Sequential)] private struct MSLLHOOKSTRUCT { public POINT pt; public uint mouseData; public uint flags; public uint time; public UIntPtr dwExtraInfo; } [StructLayout(LayoutKind.Sequential)] private struct MSG { public IntPtr hwnd; public uint message; public UIntPtr wParam; public IntPtr lParam; public uint time; public POINT pt; } private delegate IntPtr LowLevelMouseProc(int nCode, IntPtr wParam, IntPtr lParam); [DllImport("user32.dll", SetLastError = true)] private static extern IntPtr SetWindowsHookEx(int idHook, LowLevelMouseProc lpfn, IntPtr hMod, uint dwThreadId); [DllImport("user32.dll", SetLastError = true)] private static extern bool UnhookWindowsHookEx(IntPtr hhk); [DllImport("user32.dll")] private static extern IntPtr CallNextHookEx(IntPtr hhk, int nCode, IntPtr wParam, IntPtr lParam); [DllImport("kernel32.dll", CharSet = CharSet.Auto, SetLastError = true)] private static extern IntPtr GetModuleHandle(string lpModuleName); [DllImport("user32.dll")] private static extern int GetMessage(out MSG lpMsg, IntPtr hWnd, uint wMsgFilterMin, uint wMsgFilterMax); [DllImport("user32.dll")] private static extern bool 
TranslateMessage(ref MSG lpMsg); [DllImport("user32.dll")] private static extern IntPtr DispatchMessage(ref MSG lpMsg); [DllImport("user32.dll")] private static extern bool SetProcessDpiAwarenessContext(IntPtr value); public static void Run() { try { SetProcessDpiAwarenessContext(new IntPtr(-4)); } catch { } Thread writer = new Thread(WriterLoop); writer.IsBackground = true; writer.Start(); hook = SetWindowsHookEx(WH_MOUSE_LL, proc, GetModuleHandle(null), 0); if (hook == IntPtr.Zero) { throw new System.ComponentModel.Win32Exception(Marshal.GetLastWin32Error()); } Console.Out.WriteLine("READY"); Console.Out.Flush(); MSG msg; while (GetMessage(out msg, IntPtr.Zero, 0, 0) > 0) { TranslateMessage(ref msg); DispatchMessage(ref msg); } UnhookWindowsHookEx(hook); } private static void WriterLoop() { while (true) { signal.WaitOne(); string line; while (queue.TryDequeue(out line)) { Console.Out.WriteLine(line); } Console.Out.Flush(); } } private static IntPtr HookCallback(int nCode, IntPtr wParam, IntPtr lParam) { if (nCode >= 0) { int message = wParam.ToInt32(); string button = ButtonName(message, lParam); if (button != null) { MSLLHOOKSTRUCT data = (MSLLHOOKSTRUCT)Marshal.PtrToStructure(lParam, typeof(MSLLHOOKSTRUCT)); long unixMs = DateTime.UtcNow.Ticks / TimeSpan.TicksPerMillisecond - UnixEpochMilliseconds; queue.Enqueue("CLICK " + data.pt.x + " " + data.pt.y + " " + button + " " + unixMs); signal.Set(); } } return CallNextHookEx(hook, nCode, wParam, lParam); } private static string ButtonName(int message, IntPtr lParam) { if (message == WM_LBUTTONDOWN) return "left"; if (message == WM_RBUTTONDOWN) return "right"; if (message == WM_MBUTTONDOWN) return "middle"; if (message == WM_XBUTTONDOWN) { MSLLHOOKSTRUCT data = (MSLLHOOKSTRUCT)Marshal.PtrToStructure(lParam, typeof(MSLLHOOKSTRUCT)); uint xButton = (data.mouseData >> 16) & 0xffff; return xButton == 1 ? 
"x1" : "x2"; } return null; } } '@ [SFMouseHook]::Run() `; this.clickWatcher = spawn('powershell.exe', ['-NoProfile', '-NonInteractive', '-ExecutionPolicy', 'Bypass', '-Command', ps], { stdio: ['ignore', 'pipe', 'pipe'], windowsHide: true, }); this.clickWatcher.stdout.on('data', (chunk) => { this.ingestClickWatcherChunk(chunk.toString(), 'win32'); }); } if (this.clickWatcher) { const child = this.clickWatcher; this.clickWatcherErrTail = ''; if (child.stderr) { child.stderr.on('data', (chunk) => { this.clickWatcherErrTail = String(chunk).slice(-400); }); } const lost = (reason) => { if (this.clickWatcher !== child) return; // stopped deliberately this.clickWatcher = null; this.handleClickWatcherLoss(reason); }; child.on('error', (err) => lost(err && err.message)); child.on('exit', (code) => lost(`exited with code ${code}`)); } } catch { this.clickWatcher = null; } } /** * The watcher process died mid-session (crashed X server, PowerShell * blocked by policy, …). Captures must not silently stop: log why, switch * the session to interval captures, and tell the UI. */ handleClickWatcherLoss(reason) { this.linuxEvent = null; this.discardPendingRawClick(); const detail = [reason, this.clickWatcherErrTail].filter(Boolean).join(' — '); console.error(`[stepforge] click watcher stopped${detail ? `: ${detail}` : ''}`); if (!this.session) return; if (!this.session.intervalSec) { this.session.intervalSec = this.settings.get('capture.autoIntervalSec') || 5; this.applyInterval(); } this.notify('capture:state', this.state()); } stopClickWatcher() { if (this.clickWatcher) { try { this.clickWatcher.kill(); } catch { /* already gone */ } this.clickWatcher = null; } this.clickWatcherBuf = ''; this.linuxEvent = null; this.discardPendingRawClick(); this.lastClickEventByButton.clear(); } /** * Buffer stdout chunks and only parse complete lines: a chunk boundary * can split an event line in half, which used to corrupt press/release * parsing and swallow clicks. 
*/ ingestClickWatcherChunk(chunk, platform = process.platform) { this.clickWatcherBuf += String(chunk); const cut = this.clickWatcherBuf.lastIndexOf('\n'); if (cut === -1) return; const complete = this.clickWatcherBuf.slice(0, cut); this.clickWatcherBuf = this.clickWatcherBuf.slice(cut + 1); this.processClickWatcherData(complete, platform); } processClickWatcherData(text, platform = process.platform) { const lines = String(text).split(/\r?\n/); if (platform === 'linux') { // xinput test-xi2 --root prints each event as a multi-line block: // // EVENT type 4 (ButtonPress) EVENT type 15 (RawButtonPress) // device: 11 (10) device: 11 (11) // detail: 1 detail: 1 // root: 644.52/343.55 valuators: … // // Regular (non-raw) blocks carry the event-time root coordinates — // exactly what the click marker needs, because a cursor read at parse // time drifts whenever delivery is delayed or the pointer keeps // moving after the click. Raw blocks have no coordinates, but on many // servers they are the only representation delivered for the root // window, so both kinds must fire. One physical press can produce // *both* representations; that duplication is resolved structurally // in fireLinuxClick (raw press briefly waits for its regular twin and // they merge into one click), never by a time-only debounce that // could swallow legitimate fast clicks. for (const line of lines) { if (!line) continue; const header = /EVENT type \d+ \(([A-Za-z]+)\)/.exec(line); if (header) { this.finishLinuxEvent(); const name = header[1]; this.linuxEvent = /ButtonPress$/.test(name) ? { name, raw: /^Raw/.test(name), button: null, at: Date.now(), fired: false } : null; continue; } const ev = this.linuxEvent; if (!ev || ev.fired) continue; const detail = /detail:\s*(\d+)/.exec(line); if (detail) { ev.button = Number(detail[1]); if (ev.button >= 4 && ev.button <= 7) { // Scroll-wheel ticks (X11 buttons 4-7) are not clicks. 
this.linuxEvent = null; } else if (ev.raw) { // Raw blocks never carry coordinates; this one is complete. ev.fired = true; this.linuxEvent = null; this.fireLinuxClick(ev.at, null, ev.button, { raw: true }); } continue; } const root = /root:\s*(-?[\d.]+)\/(-?[\d.]+)/.exec(line); if (root && !ev.raw && ev.button != null) { ev.fired = true; this.linuxEvent = null; this.fireLinuxClick(ev.at, { x: Math.round(parseFloat(root[1])), y: Math.round(parseFloat(root[2])), }, ev.button, { raw: false }); } } return; } if (platform === 'win32') { for (const line of lines) { const m = /^CLICK(?:\s+(-?\d+)\s+(-?\d+)(?:\s+([A-Za-z0-9_-]+))?(?:\s+(\d+))?)?\s*$/.exec(line.trim()); if (m) { const osPoint = m[1] === undefined ? null : { x: Number(m[1]), y: Number(m[2]) }; const eventAt = m[4] === undefined ? Date.now() : Number(m[4]); this.onOsClick(Number.isFinite(eventAt) ? eventAt : Date.now(), osPoint, m[3] || 'mouse'); } } } } /** * A new event header arrived while a press block was still open: the block * ended without the line we fire on. Old xinput builds sometimes omit * detail lines entirely — treat such a press as a plain click rather than * dropping it. */ finishLinuxEvent() { const ev = this.linuxEvent; this.linuxEvent = null; if (!ev || ev.fired) return; if (ev.button == null) { this.onOsClick(ev.at, null, 'mouse'); } else if (!ev.raw) { // Regular press whose root line never showed up — fire without // coordinates; onOsClick falls back to a cursor read. this.fireLinuxClick(ev.at, null, ev.button, { raw: false }); } } /** * Funnel for parsed Linux button presses. Raw and regular blocks for the * same physical press are merged here: a raw press (no coordinates) is * held for LINUX_CLICK_TWIN_MS; if the regular twin (with root * coordinates) arrives inside that window the pair fires once, with the * raw block's earlier timestamp and the regular block's coordinates. * Distinct presses always fire — there is no time-based dropping. 
*/ fireLinuxClick(at, osPoint, button, { raw = false } = {}) { const pending = this.pendingRawClick; if (raw) { // Two raw presses can't be one click — release the held one first. this.flushPendingRawClick(); const entry = { button, at, timer: null }; entry.timer = setTimeout(() => { if (this.pendingRawClick !== entry) return; this.pendingRawClick = null; this.onOsClick(entry.at, null, `button-${entry.button}`); }, LINUX_CLICK_TWIN_MS); if (entry.timer.unref) entry.timer.unref(); this.pendingRawClick = entry; return; } if (pending && pending.button === button) { // The regular twin of the held raw press: one physical click. this.pendingRawClick = null; clearTimeout(pending.timer); this.onOsClick(Math.min(pending.at, at), osPoint, `button-${button}`); return; } this.onOsClick(at, osPoint, `button-${button}`); } /** Fire the held raw press immediately (its twin is not coming). */ flushPendingRawClick() { const pending = this.pendingRawClick; if (!pending) return; this.pendingRawClick = null; clearTimeout(pending.timer); this.onOsClick(pending.at, null, `button-${pending.button}`); } discardPendingRawClick() { if (!this.pendingRawClick) return; clearTimeout(this.pendingRawClick.timer); this.pendingRawClick = null; } onOsClick(at = Date.now(), osPoint = null, button = 'mouse') { if (!this.session || this.session.paused) return; const clickAt = Number.isFinite(at) ? at : Date.now(); // Source-aware dedupe, not a debounce: each hook/watcher event is one // click however fast it follows the previous one. Only an *identical* // event a few ms later — duplicate delivery of one physical press — is // suppressed. 
if (this.isDuplicateClickEvent(clickAt, osPoint, button)) { clog('click@', clickAt, button, 'suppressed as duplicate delivery'); return; } // Prefer the position the watcher sampled with the button-down event // (physical px -> DIP); otherwise read the cursor synchronously, right // now, so the marker lands where the user clicked even if the shot // itself takes a moment to grab. (Clicks on StepForge itself are // filtered by the cursor-position check in sessionCapture, not by // window focus — WSLg reports focus unreliably.) let clickPos = osPoint ? this.osPointToDip(osPoint) : null; if (!clickPos) clickPos = this.screen.getCursorScreenPoint(); clog('click@', clickAt, button, 'os', osPoint, '-> dip', clickPos); this.enqueueClickCapture(clickPos, clickAt, button || 'mouse'); } isDuplicateClickEvent(at, osPoint, button) { const key = button || 'mouse'; const last = this.lastClickEventByButton.get(key); this.lastClickEventByButton.set(key, { at, osPoint }); if (!last) return false; if (at < last.at || at - last.at >= CLICK_EVENT_DUPLICATE_MS) return false; // Same button within a few ms: duplicate only if it is the *same* event // (same coordinates, or neither delivery carried coordinates). if (osPoint && last.osPoint) { return osPoint.x === last.osPoint.x && osPoint.y === last.osPoint.y; } return !osPoint && !last.osPoint; } /** * Physical (OS event) pixels -> DIP. Windows exposes the canonical * conversion; on Linux/X11 it is reconstructed from display geometry (see * app/coords.js). Without this, the click marker drifts on any display * scaled away from 100% and on secondary monitors. */ osPointToDip(osPoint) { if (this.screen && typeof this.screen.screenToDipPoint === 'function') { try { const dip = this.screen.screenToDipPoint(osPoint); if (dip && Number.isFinite(dip.x) && Number.isFinite(dip.y)) return dip; } catch { /* fall through to manual conversion */ } } try { const displays = this.screen && typeof this.screen.getAllDisplays === 'function' ? 
this.screen.getAllDisplays() : []; const dip = physicalToDip(osPoint, displays); if (dip) return dip; } catch { /* no display geometry available */ } return osPoint; } /** * Serialize click captures: a click that lands while an earlier capture is * still being stored queues behind it instead of being dropped by the * "capture already in progress" guard. The marker position was already * read at click time, so a queued step still circles the right spot. * * Crucially, only the *storing* is serialized. The click is paired with * its frame right here, at event time: behind a slow store or PNG encode * the queue can run seconds late, and a frame request issued that late * could find the click-time frame already evicted from the ring buffer. * Eager pairing keeps one-click-one-frame semantics intact no matter how * fast the user clicks or how slow the encoder is. */ enqueueClickCapture(clickPos, clickAt = Date.now(), button = 'mouse') { const clickMeta = { at: Number.isFinite(clickAt) ? clickAt : Date.now(), button: button || 'mouse' }; if (this.session && !this.session.paused && !this.userIsInApp()) { // The guide id pins the click to its recording so it can still be // stored if the session stops while this click waits in the queue. clickMeta.guideId = this.session.guideId; clickMeta.framePromise = this.frameForClick(clickPos, clickMeta.at) .catch(() => null); } this.clickQueue = this.clickQueue .then(() => this.sessionCapture('click', clickPos, clickMeta)) .catch(() => {}); return this.clickQueue; } async captureCurrentFrame(mode, capturePoint = null, startedAt = Date.now()) { const grabbed = await this.grab(mode, capturePoint); return { mode, // Keep the raw image and defer PNG encoding to storeFrameAsStep: // toPNG() on a full-resolution frame blocks the main thread for // hundreds of ms, and doing it every frame-loop tick starved the // event loop so badly that click events arrived seconds late. 
// Encoding once per *stored* step is cheap; encoding per grab is not. image: grabbed.image, size: grabbed.image.getSize(), display: grabbed.display, cursor: capturePoint || grabbed.cursor, startedAt, capturedAt: Date.now(), }; } storeFrameAsStep(guideId, mode, frame, clickPos = null) { if (!frame) return { ok: false, reason: 'no capture frame available' }; const annotations = []; // The click position (DIP, read at event time) wins over the frame's // grab-time cursor; stream-backend frames carry no cursor at all. const cursor = clickPos || frame.cursor || null; if (cursor && mode !== 'window' && this.settings.get('capture.clickMarker')) { const fx = (cursor.x - frame.display.bounds.x) / frame.display.bounds.width; const fy = (cursor.y - frame.display.bounds.y) / frame.display.bounds.height; if (fx >= 0 && fx <= 1 && fy >= 0 && fy <= 1) { const d = 0.035; annotations.push({ type: 'oval', x: fx - d / 2, y: fy - (d * frame.size.width / frame.size.height) / 2, w: d, h: d * frame.size.width / frame.size.height, style: { stroke: this.settings.get('capture.clickMarkerColor') || '#E5484D', strokeWidth: 4, fill: 'transparent', }, }); } } const step = this.store.addStep(guideId, { title: this.autoTitle(mode), annotations, focusedView: { enabled: Boolean(this.settings.get('editor.focusedViewDefaultForNewSteps')), zoom: 1, panX: 0.5, panY: 0.5, }, }, frame.png || frame.image.toPNG(), frame.size); return { ok: true, step }; } autoTitle(mode) { const tplStr = this.settings.get('editor.autoTitleTemplate') || '[[Mode]] capture [[Time]]'; const now = new Date(); const pad = (n) => String(n).padStart(2, '0'); return expandPlaceholders(tplStr, { Mode: { fullscreen: 'Screen', window: 'Window', region: 'Region' }[mode] || 'Screen', Time: `${pad(now.getHours())}:${pad(now.getMinutes())}:${pad(now.getSeconds())}`, Date: `${now.getFullYear()}-${pad(now.getMonth() + 1)}-${pad(now.getDate())}`, }); } /** Grab the screen/window image as { image, display } or throw. 
*/ async grab(mode, cursorPoint = null) { const cursor = cursorPoint || this.screen.getCursorScreenPoint(); const display = this.screen.getDisplayNearestPoint(cursor); const { width, height } = display.size; const scale = display.scaleFactor || 1; // Ask for both kinds: some compositors (WSLg/Wayland portals) expose no // individual window sources, so window mode falls back to the screen. const sources = await desktopCapturer.getSources({ types: mode === 'window' ? ['window', 'screen'] : ['screen'], thumbnailSize: { width: Math.round(width * scale), height: Math.round(height * scale) }, }); if (!sources.length) throw new Error('no capture sources available (portal/permissions?)'); let source = null; if (mode === 'window') { const win = this.getWindow(); const ownTitle = win ? win.getTitle() : ''; const windows = sources.filter((s) => s.id.startsWith('window:')); source = windows.find((s) => s.name && s.name !== ownTitle && !/stepforge/i.test(s.name)) || windows[0] || sources.find((s) => s.id.startsWith('screen:')); } else { const screens = sources.filter((s) => s.id.startsWith('screen:')); source = screens.find((s) => String(s.display_id) === String(display.id)) || screens[0] || sources[0]; } if (!source) throw new Error('no capture source matched'); const image = source.thumbnail; if (!image || image.isEmpty()) throw new Error('capture returned an empty image'); return { image, display, cursor }; } /** * Hide the app window while `fn` runs so screenshots show the user's work, * not StepForge itself. Restores visibility afterwards. 
*/ async withWindowHidden(fn, { refocus = true, pauseMs = 350 } = {}) { const win = this.getWindow(); const wasVisible = win && !win.isDestroyed() && win.isVisible() && !win.isMinimized(); if (wasVisible) { win.hide(); if (pauseMs > 0) { await new Promise((r) => setTimeout(r, pauseMs)); // let the compositor repaint } } try { return await fn(); } finally { if (wasVisible && win && !win.isDestroyed()) { if (refocus) { win.show(); win.focus(); } else { win.showInactive(); } } } } /** * Take a screenshot and append it to the guide as a new image step. * Adds a click-marker annotation at the cursor position when enabled. */ async shoot({ guideId, mode = 'fullscreen', delayMs = null, hideWindow = true, refocus = true, hideWindowDelayMs = null, clickPos = null, }) { const delay = delayMs == null ? this.settings.get('capture.delayMs') || 0 : delayMs; if (delay > 0) await new Promise((resolve) => setTimeout(resolve, delay)); let frame; try { frame = hideWindow ? await this.withWindowHidden(() => this.captureCurrentFrame(mode, clickPos), { refocus, pauseMs: hideWindowDelayMs == null ? 350 : hideWindowDelayMs, }) : await this.captureCurrentFrame(mode, clickPos); } catch (err) { return { ok: false, reason: err.message }; } return this.storeFrameAsStep(guideId, mode, frame, clickPos); } /** * Region capture: shoot the full screen, then let the user drag a * rectangle in a fullscreen overlay; the crop becomes the step image. 
*/ async regionCapture(guideId) { let grabbed; try { grabbed = await this.withWindowHidden(() => this.grab('fullscreen')); } catch (err) { return { ok: false, reason: err.message }; } const { image, display } = grabbed; const rect = await this.pickRegion(display, image); if (!rect) return { ok: false, reason: 'selection cancelled' }; const cropped = image.crop(rect); const size = cropped.getSize(); if (!size.width || !size.height) return { ok: false, reason: 'empty selection' }; const step = this.store.addStep(guideId, { title: this.autoTitle('region') }, cropped.toPNG(), size); return { ok: true, step }; } /** Fullscreen overlay window that resolves with a crop rect (image px). */ pickRegion(display, image) { return new Promise((resolve) => { const overlay = new BrowserWindow({ x: display.bounds.x, y: display.bounds.y, width: display.bounds.width, height: display.bounds.height, frame: false, transparent: true, alwaysOnTop: true, fullscreen: true, skipTaskbar: true, webPreferences: { preload: path.join(__dirname, 'region-preload.js'), contextIsolation: true, }, }); let settled = false; const finish = (rect) => { if (settled) return; settled = true; if (!overlay.isDestroyed()) overlay.close(); resolve(rect); }; const { ipcMain } = require('electron'); const onPick = (event, rect) => { if (event.sender !== overlay.webContents) return; ipcMain.removeListener('region:picked', onPick); if (!rect) return finish(null); const imgSize = image.getSize(); const sx = imgSize.width / display.bounds.width; const sy = imgSize.height / display.bounds.height; finish({ x: Math.round(rect.x * sx), y: Math.round(rect.y * sy), width: Math.round(rect.w * sx), height: Math.round(rect.h * sy), }); }; ipcMain.on('region:picked', onPick); overlay.on('closed', () => finish(null)); overlay.loadFile(path.join(__dirname, 'renderer', 'region.html')); }); } } module.exports = CaptureService;